DrakopoulosAj
5 years ago
3 changed files with 126 additions and 0 deletions
@ -0,0 +1,20 @@ |
|||
# Builds the bde2020 Hadoop docker images and runs a demo wordcount job.

# docker-compose network name (project dir "docker-hadoop" + "_default").
DOCKER_NETWORK = docker-hadoop_default
ENV_FILE = hadoop.env

# Tag images with the current git branch so branch builds don't collide.
current_branch := $(shell git rev-parse --abbrev-ref HEAD)

# Hadoop install prefix inside the base image (see base/Dockerfile).
HADOOP_HOME = /opt/hadoop-3.2.1

# These targets are commands, not files: mark them phony so a stray file
# named "build" or "wordcount" can't shadow them.
.PHONY: build wordcount

# Build every service image, tagged with the current branch.
build:
	docker build -t bde2020/hadoop-base:$(current_branch) ./base
	docker build -t bde2020/hadoop-namenode:$(current_branch) ./namenode
	docker build -t bde2020/hadoop-datanode:$(current_branch) ./datanode
	docker build -t bde2020/hadoop-resourcemanager:$(current_branch) ./resourcemanager
	docker build -t bde2020/hadoop-nodemanager:$(current_branch) ./nodemanager
	docker build -t bde2020/hadoop-historyserver:$(current_branch) ./historyserver
	docker build -t bde2020/hadoop-submit:$(current_branch) ./submit

# Run the example wordcount job against a running cluster:
# stage input into HDFS, submit the job, print the output, then clean up.
# --rm: these are throwaway helper containers; don't leave them behind.
wordcount:
	docker build -t hadoop-wordcount ./submit
	docker run --rm --network $(DOCKER_NETWORK) --env-file $(ENV_FILE) bde2020/hadoop-base:$(current_branch) hdfs dfs -mkdir -p /input/
	docker run --rm --network $(DOCKER_NETWORK) --env-file $(ENV_FILE) bde2020/hadoop-base:$(current_branch) hdfs dfs -copyFromLocal -f $(HADOOP_HOME)/README.txt /input/
	docker run --rm --network $(DOCKER_NETWORK) --env-file $(ENV_FILE) hadoop-wordcount
	docker run --rm --network $(DOCKER_NETWORK) --env-file $(ENV_FILE) bde2020/hadoop-base:$(current_branch) hdfs dfs -cat /output/*
	docker run --rm --network $(DOCKER_NETWORK) --env-file $(ENV_FILE) bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /output
	docker run --rm --network $(DOCKER_NETWORK) --env-file $(ENV_FILE) bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /input
@ -0,0 +1,63 @@ |
|||
[![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/big-data-europe/Lobby) |
|||
|
|||
# Changes |
|||
|
|||
Version 2.0.0 introduces the use of the wait_for_it script for cluster startup
|||
|
|||
# Hadoop Docker |
|||
|
|||
## Supported Hadoop Versions |
|||
See repository branches for supported hadoop versions |
|||
|
|||
## Quick Start |
|||
|
|||
To deploy an example HDFS cluster, run: |
|||
``` |
|||
docker-compose up |
|||
``` |
|||
|
|||
Run example wordcount job: |
|||
``` |
|||
make wordcount |
|||
``` |
|||
|
|||
Or deploy in swarm: |
|||
``` |
|||
docker stack deploy -c docker-compose-v3.yml hadoop |
|||
``` |
|||
|
|||
`docker-compose` creates a docker network that can be found by running `docker network list`, e.g. `docker-hadoop_default`.
|||
|
|||
Run `docker network inspect` on the network (e.g. `docker-hadoop_default`) to find the IP the hadoop interfaces are published on. Access these interfaces with the following URLs:
|||
|
|||
* Namenode: http://<dockerhadoop_IP_address>:9870/dfshealth.html#tab-overview |
|||
* History server: http://<dockerhadoop_IP_address>:8188/applicationhistory |
|||
* Datanode: http://<dockerhadoop_IP_address>:9864/ |
|||
* Nodemanager: http://<dockerhadoop_IP_address>:8042/node |
|||
* Resource manager: http://<dockerhadoop_IP_address>:8088/ |
|||
|
|||
## Configure Environment Variables |
|||
|
|||
The configuration parameters can be specified in the hadoop.env file or as environmental variables for specific services (e.g. namenode, datanode etc.): |
|||
``` |
|||
CORE_CONF_fs_defaultFS=hdfs://namenode:8020 |
|||
``` |
|||
|
|||
CORE_CONF corresponds to core-site.xml. fs_defaultFS=hdfs://namenode:8020 will be transformed into: |
|||
``` |
|||
<property><name>fs.defaultFS</name><value>hdfs://namenode:8020</value></property> |
|||
``` |
|||
To define a dash inside a configuration parameter, use a triple underscore, such as YARN_CONF_yarn_log___aggregation___enable=true (yarn-site.xml):
|||
``` |
|||
<property><name>yarn.log-aggregation-enable</name><value>true</value></property> |
|||
``` |
|||
|
|||
The available configurations are: |
|||
* /etc/hadoop/core-site.xml CORE_CONF |
|||
* /etc/hadoop/hdfs-site.xml HDFS_CONF |
|||
* /etc/hadoop/yarn-site.xml YARN_CONF |
|||
* /etc/hadoop/httpfs-site.xml HTTPFS_CONF |
|||
* /etc/hadoop/kms-site.xml KMS_CONF |
|||
* /etc/hadoop/mapred-site.xml MAPRED_CONF |
|||
|
|||
If you need to extend some other configuration file, refer to base/entrypoint.sh bash script. |
@ -0,0 +1,43 @@ |
|||
# Hadoop cluster configuration, injected into every container via --env-file.
# Naming convention (per the README; implemented in base/entrypoint.sh):
#   prefix selects the target file (CORE_CONF -> core-site.xml,
#   HDFS_CONF -> hdfs-site.xml, YARN_CONF -> yarn-site.xml,
#   MAPRED_CONF -> mapred-site.xml, HTTPFS_CONF/KMS_CONF likewise),
#   a single underscore becomes a dot, and a triple underscore becomes a dash.

# core-site.xml
CORE_CONF_fs_defaultFS=hdfs://namenode:9000
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec

# hdfs-site.xml
HDFS_CONF_dfs_webhdfs_enabled=true
# Permissions/registration checks relaxed for a demo cluster; not for production.
HDFS_CONF_dfs_permissions_enabled=false
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

# yarn-site.xml
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
# ResourceManager endpoints resolve via the compose network service name.
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
# NOTE(review): "resource__tracker" uses a DOUBLE underscore, unlike the
# triple-underscore dash convention used elsewhere in this file -- verify
# against base/entrypoint.sh that this maps to "resource-tracker.address".
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_timeline___service_hostname=historyserver
# NOTE(review): these two mapreduce.* / mapred.* keys carry the YARN_CONF
# prefix, so they land in yarn-site.xml rather than mapred-site.xml --
# confirm that is intentional.
YARN_CONF_mapreduce_map_output_compress=true
YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
# NodeManager resources: 16 GB RAM, 8 vcores per node.
YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle

# mapred-site.xml
MAPRED_CONF_mapreduce_framework_name=yarn
MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
# Heap sizes (-Xmx) are set below the container memory limits above them.
MAPRED_CONF_mapreduce_map_memory_mb=4096
MAPRED_CONF_mapreduce_reduce_memory_mb=8192
MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
# Hadoop install prefix inside the images; must match the base image layout.
MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
Loading…
Reference in new issue