diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..da7cb7d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+DOCKER_NETWORK = docker-hadoop_default
+ENV_FILE = hadoop.env
+current_branch := $(shell git rev-parse --abbrev-ref HEAD)
+build:
+	docker build -t bde2020/hadoop-base:$(current_branch) ./base
+	docker build -t bde2020/hadoop-namenode:$(current_branch) ./namenode
+	docker build -t bde2020/hadoop-datanode:$(current_branch) ./datanode
+	docker build -t bde2020/hadoop-resourcemanager:$(current_branch) ./resourcemanager
+	docker build -t bde2020/hadoop-nodemanager:$(current_branch) ./nodemanager
+	docker build -t bde2020/hadoop-historyserver:$(current_branch) ./historyserver
+	docker build -t bde2020/hadoop-submit:$(current_branch) ./submit
+
+wordcount:
+	docker build -t hadoop-wordcount ./submit
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -mkdir -p /input/
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -copyFromLocal -f /opt/hadoop-3.2.1/README.txt /input/
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} hadoop-wordcount
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -cat /output/*
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /output
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /input
diff --git a/README.md b/README.md
index d4b7d38..e836e34 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,78 @@
-# DockerHadoop
+[![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/big-data-europe/Lobby)
 
-Cloud project
\ No newline at end of file
+# Changes
+
+Version 2.0.0 introduces the wait_for_it script for cluster startup.
+
+# Hadoop Docker
+
+## Supported Hadoop Versions
+See the repository branches for the supported Hadoop versions.
+
+## Quick Start
+
+To deploy an example HDFS cluster, run:
+```
+  docker-compose up
+```
+
+Run the example wordcount job:
+```
+  make wordcount
+```
+
+Or deploy in swarm:
+```
+docker stack deploy -c docker-compose-v3.yml hadoop
+```
+
+`docker-compose` creates a docker network that can be found by running `docker network list`, e.g. `dockerhadoop_default`.
+
+Run `docker network inspect` on the network (e.g. `dockerhadoop_default`) to find the IP the hadoop interfaces are published on. Access these interfaces with the following URLs:
+
+* Namenode: http://<dockerhadoop_IP_address>:9870/dfshealth.html#tab-overview
+* History server: http://<dockerhadoop_IP_address>:8188/applicationhistory
+* Datanode: http://<dockerhadoop_IP_address>:9864/
+* Nodemanager: http://<dockerhadoop_IP_address>:8042/node
+* Resource manager: http://<dockerhadoop_IP_address>:8088/
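+
+For example, to list each container's IP on that network (assuming the network is named `dockerhadoop_default`; adjust to whatever `docker network list` reports):
+```
+docker network inspect dockerhadoop_default --format '{{range .Containers}}{{.Name}}: {{.IPv4Address}}{{println}}{{end}}'
+```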
+
+## Configure Environment Variables
+
+The configuration parameters can be specified in the hadoop.env file or as environment variables for specific services (e.g. namenode, datanode etc.):
+```
+  CORE_CONF_fs_defaultFS=hdfs://namenode:8020
+```
+
+CORE_CONF corresponds to core-site.xml. fs_defaultFS=hdfs://namenode:8020 will be transformed into:
+```
+  <property><name>fs.defaultFS</name><value>hdfs://namenode:8020</value></property>
+```
+To define a dash inside a configuration parameter, use a triple underscore, such as YARN_CONF_yarn_log___aggregation___enable=true (yarn-site.xml):
+```
+  <property><name>yarn.log-aggregation-enable</name><value>true</value></property>
+```
+
+The available configurations are:
+* /etc/hadoop/core-site.xml CORE_CONF
+* /etc/hadoop/hdfs-site.xml HDFS_CONF
+* /etc/hadoop/yarn-site.xml YARN_CONF
+* /etc/hadoop/httpfs-site.xml HTTPFS_CONF
+* /etc/hadoop/kms-site.xml KMS_CONF
+* /etc/hadoop/mapred-site.xml MAPRED_CONF
+
+If you need to extend some other configuration file, refer to the base/entrypoint.sh bash script.
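+
+For illustration, a minimal shell sketch of that name conversion (an assumption inferred from the rules above and the variable names in hadoop.env; base/entrypoint.sh is the authoritative implementation):
+```
+# Sketch only: ___ becomes a dash, __ a literal underscore, _ a dot
+to_property() {
+  echo "$1" | sed -e 's/___/-/g' -e 's/__/_/g' -e 's/_/./g'
+}
+to_property "yarn_log___aggregation___enable"   # yarn.log-aggregation-enable
+to_property "fs_defaultFS"                      # fs.defaultFS
+```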
diff --git a/docker-compose-v3.yml b/docker-compose-v3.yml
new file mode 100644
index 0000000..f538ce0
--- /dev/null
+++ b/docker-compose-v3.yml
@@ -0,0 +1,111 @@
+version: '3'
+
+services:
+  namenode:
+    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
+    networks:
+      - hadoop-net
+    volumes:
+      - namenode:/hadoop/dfs/name
+    environment:
+      - CLUSTER_NAME=test
+    env_file:
+      - ./hadoop.env
+    deploy:
+      mode: replicated
+      replicas: 1
+      restart_policy:
+        condition: on-failure
+      placement:
+        constraints:
+          - node.hostname == akswnc4.aksw.uni-leipzig.de
+      labels:
+        traefik.docker.network: hbase
+        traefik.port: 9870
+
+  datanode:
+    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
+    networks:
+      - hadoop-net
+    volumes:
+      - datanode:/hadoop/dfs/data
+    env_file:
+      - ./hadoop.env
+    environment:
+      SERVICE_PRECONDITION: "namenode:9870"
+    deploy:
+      mode: global
+      restart_policy:
+        condition: on-failure
+      labels:
+        traefik.docker.network: hbase
+        traefik.port: 9864
+
+  resourcemanager:
+    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
+    networks:
+      - hadoop-net
+    environment:
+      SERVICE_PRECONDITION: "namenode:9870 datanode:9864"
+    env_file:
+      - ./hadoop.env
+    deploy:
+      mode: replicated
+      replicas: 1
+      restart_policy:
+        condition: on-failure
+      placement:
+        constraints:
+          - node.hostname == akswnc4.aksw.uni-leipzig.de
+      labels:
+        traefik.docker.network: hbase
+        traefik.port: 8088
+    healthcheck:
+      disable: true
+
+  nodemanager:
+    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
+    networks:
+      - hadoop-net
+    environment:
+      SERVICE_PRECONDITION: "namenode:9870 datanode:9864 resourcemanager:8088"
+    env_file:
+      - ./hadoop.env
+    deploy:
+      mode: global
+      restart_policy:
+        condition: on-failure
+      labels:
+        traefik.docker.network: hbase
+        traefik.port: 8042
+
+  historyserver:
+    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
+    networks:
+      - hadoop-net
+    volumes:
+      - hadoop_historyserver:/hadoop/yarn/timeline
+    environment:
+      SERVICE_PRECONDITION: "namenode:9870 datanode:9864 resourcemanager:8088"
+    env_file:
+      - ./hadoop.env
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname == akswnc4.aksw.uni-leipzig.de
+      labels:
+        traefik.docker.network: hbase
+        traefik.port: 8188
+
+volumes:
+  datanode:
+  namenode:
+  hadoop_historyserver:
+
+networks:
+  hadoop-net:
+  #default:
+  #  external:
+  #    name: hadoop-net
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..ed40dc6
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,69 @@
+version: "3"
+
+services:
+  namenode:
+    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
+    container_name: namenode
+    restart: always
+    ports:
+      - 9870:9870
+      - 9000:9000
+    volumes:
+      - hadoop_namenode:/hadoop/dfs/name
+    environment:
+      - CLUSTER_NAME=test
+    env_file:
+      - ./hadoop.env
+
+  datanode:
+    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
+    container_name: datanode
+    restart: always
+    volumes:
+      - hadoop_datanode:/hadoop/dfs/data
+    environment:
+      SERVICE_PRECONDITION: "namenode:9870"
+    env_file:
+      - ./hadoop.env
+
+  resourcemanager:
+    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
+    container_name: resourcemanager
+    restart: always
+    environment:
+      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864"
+    env_file:
+      - ./hadoop.env
+
+  nodemanager1:
+    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
+    container_name: nodemanager
+    restart: always
+    environment:
+      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
+    env_file:
+      - ./hadoop.env
+
+  historyserver:
+    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
+    container_name: historyserver
+    restart: always
+    environment:
+      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
+    volumes:
+      - hadoop_historyserver:/hadoop/yarn/timeline
+    env_file:
+      - ./hadoop.env
+
+volumes:
+  hadoop_namenode:
+  hadoop_datanode:
+  hadoop_historyserver:
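+
+# NOTE (assumption): SERVICE_PRECONDITION is read by the base image's
+# entrypoint, which waits for every listed "host:port" to accept TCP
+# connections before starting the service (the wait_for_it behaviour the
+# README mentions). A rough shell equivalent:
+#   for hp in $SERVICE_PRECONDITION; do
+#     until nc -z "${hp%:*}" "${hp#*:}"; do sleep 5; done
+#   done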
diff --git a/hadoop.env b/hadoop.env
new file mode 100644
index 0000000..95b3d10
--- /dev/null
+++ b/hadoop.env
@@ -0,0 +1,49 @@
+CORE_CONF_fs_defaultFS=hdfs://namenode:9000
+CORE_CONF_hadoop_http_staticuser_user=root
+CORE_CONF_hadoop_proxyuser_hue_hosts=*
+CORE_CONF_hadoop_proxyuser_hue_groups=*
+CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
+
+HDFS_CONF_dfs_webhdfs_enabled=true
+HDFS_CONF_dfs_permissions_enabled=false
+HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
+
+YARN_CONF_yarn_log___aggregation___enable=true
+YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
+YARN_CONF_yarn_resourcemanager_recovery_enabled=true
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
+YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
+YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
+YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
+YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
+YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
+YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
+YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
+YARN_CONF_yarn_timeline___service_enabled=true
+YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
+YARN_CONF_yarn_timeline___service_hostname=historyserver
+YARN_CONF_mapreduce_map_output_compress=true
+YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
+YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
+YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
+YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
+YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
+
+MAPRED_CONF_mapreduce_framework_name=yarn
+MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
+MAPRED_CONF_mapreduce_map_memory_mb=4096
+MAPRED_CONF_mapreduce_reduce_memory_mb=8192
+MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
+MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
+MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
+MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
+MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
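+
+# Values in this file apply to every service that loads it via env_file.
+# A service's environment: block takes precedence over env_file, so a single
+# service can override one of these settings, e.g. (hypothetical override):
+#   environment:
+#     MAPRED_CONF_mapreduce_map_memory_mb: "2048"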