From 5321c186e439b1b398ea5a8565b2e2d4903c4057 Mon Sep 17 00:00:00 2001
From: DrakopoulosAj
Date: Fri, 12 Jun 2020 18:48:01 +0000
Subject: [PATCH] Upload files to ''

---
 Makefile   | 20 +++++++++++++++++
 README.md  | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hadoop.env | 43 +++++++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+)
 create mode 100644 Makefile
 create mode 100644 README.md
 create mode 100644 hadoop.env

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..da7cb7d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+DOCKER_NETWORK = docker-hadoop_default
+ENV_FILE = hadoop.env
+current_branch := $(shell git rev-parse --abbrev-ref HEAD)
+build:
+	docker build -t bde2020/hadoop-base:$(current_branch) ./base
+	docker build -t bde2020/hadoop-namenode:$(current_branch) ./namenode
+	docker build -t bde2020/hadoop-datanode:$(current_branch) ./datanode
+	docker build -t bde2020/hadoop-resourcemanager:$(current_branch) ./resourcemanager
+	docker build -t bde2020/hadoop-nodemanager:$(current_branch) ./nodemanager
+	docker build -t bde2020/hadoop-historyserver:$(current_branch) ./historyserver
+	docker build -t bde2020/hadoop-submit:$(current_branch) ./submit
+
+wordcount:
+	docker build -t hadoop-wordcount ./submit
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -mkdir -p /input/
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -copyFromLocal -f /opt/hadoop-3.2.1/README.txt /input/
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} hadoop-wordcount
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -cat /output/*
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /output
+	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /input
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e836e34
--- /dev/null
+++ b/README.md
@@ -0,0 +1,63 @@
+[![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/big-data-europe/Lobby)
+
+# Changes
+
+Version 2.0.0 introduces the use of the wait_for_it script for cluster startup.
+
+# Hadoop Docker
+
+## Supported Hadoop Versions
+See the repository branches for the supported Hadoop versions.
+
+## Quick Start
+
+To deploy an example HDFS cluster, run:
+```
+  docker-compose up
+```
+
+Run the example wordcount job:
+```
+  make wordcount
+```
+
+Or deploy in Docker Swarm:
+```
+docker stack deploy -c docker-compose-v3.yml hadoop
+```
+
+`docker-compose` creates a Docker network whose name can be found by running `docker network list`, e.g. `dockerhadoop_default`.
+
+Run `docker network inspect` on the network (e.g. `dockerhadoop_default`) to find the IP address the Hadoop interfaces are published on. Access these interfaces at the following URLs:
+
+* Namenode: http://<dockerhadoop_IP_address>:9870/dfshealth.html#tab-overview
+* History server: http://<dockerhadoop_IP_address>:8188/applicationhistory
+* Datanode: http://<dockerhadoop_IP_address>:9864/
+* Nodemanager: http://<dockerhadoop_IP_address>:8042/node
+* Resource manager: http://<dockerhadoop_IP_address>:8088/
+
+## Configure Environment Variables
+
+The configuration parameters can be specified in the hadoop.env file or as environment variables for specific services (e.g. namenode, datanode, etc.):
+```
+  CORE_CONF_fs_defaultFS=hdfs://namenode:8020
+```
+
+CORE_CONF corresponds to core-site.xml. fs_defaultFS=hdfs://namenode:8020 will be transformed into:
+```
+  <property><name>fs.defaultFS</name><value>hdfs://namenode:8020</value></property>
+```
+To define a dash inside a configuration parameter name, use a triple underscore, such as YARN_CONF_yarn_log___aggregation___enable=true (yarn-site.xml):
+```
+  <property><name>yarn.log-aggregation-enable</name><value>true</value></property>
+```
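+
+As a rough sketch of the naming convention (the actual conversion is handled by the base/entrypoint.sh script; the one-liner below is only illustrative), strip the CONF prefix, then map triple underscores to dashes and the remaining underscores to dots:
+```
+  echo "yarn_log___aggregation___enable" | sed -e 's/___/-/g' -e 's/_/./g'
+  # prints: yarn.log-aggregation-enable
+```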
+
+The available configurations are:
+* /etc/hadoop/core-site.xml CORE_CONF
+* /etc/hadoop/hdfs-site.xml HDFS_CONF
+* /etc/hadoop/yarn-site.xml YARN_CONF
+* /etc/hadoop/httpfs-site.xml HTTPFS_CONF
+* /etc/hadoop/kms-site.xml KMS_CONF
+* /etc/hadoop/mapred-site.xml MAPRED_CONF
+
+If you need to extend some other configuration file, refer to the base/entrypoint.sh bash script.
diff --git a/hadoop.env b/hadoop.env
new file mode 100644
index 0000000..95b3d10
--- /dev/null
+++ b/hadoop.env
@@ -0,0 +1,43 @@
+CORE_CONF_fs_defaultFS=hdfs://namenode:9000
+CORE_CONF_hadoop_http_staticuser_user=root
+CORE_CONF_hadoop_proxyuser_hue_hosts=*
+CORE_CONF_hadoop_proxyuser_hue_groups=*
+CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec
+
+HDFS_CONF_dfs_webhdfs_enabled=true
+HDFS_CONF_dfs_permissions_enabled=false
+HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
+
+YARN_CONF_yarn_log___aggregation___enable=true
+YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
+YARN_CONF_yarn_resourcemanager_recovery_enabled=true
+YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
+YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
+YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
+YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
+YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
+YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
+YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
+YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
+YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
+YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
+YARN_CONF_yarn_timeline___service_enabled=true
+YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
+YARN_CONF_yarn_timeline___service_hostname=historyserver
+YARN_CONF_mapreduce_map_output_compress=true
+YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
+YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
+YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
+YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
+YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
+YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle
+
+MAPRED_CONF_mapreduce_framework_name=yarn
+MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
+MAPRED_CONF_mapreduce_map_memory_mb=4096
+MAPRED_CONF_mapreduce_reduce_memory_mb=8192
+MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
+MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
+MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
+MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
+MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
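
As a quick sanity check, the generated configuration files listed in the README can be inspected inside a running container to confirm that these hadoop.env entries were written out as XML properties (the container name `namenode` is assumed here; adjust it to whatever `docker ps` shows):
```
# illustrative only: look up the fs.defaultFS property generated in core-site.xml
docker exec namenode grep fs.defaultFS /etc/hadoop/core-site.xml
```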