5 changed files with 297 additions and 2 deletions
Makefile
@@ -0,0 +1,20 @@
DOCKER_NETWORK = docker-hadoop_default
ENV_FILE = hadoop.env
current_branch := $(shell git rev-parse --abbrev-ref HEAD)

build:
	docker build -t bde2020/hadoop-base:$(current_branch) ./base
	docker build -t bde2020/hadoop-namenode:$(current_branch) ./namenode
	docker build -t bde2020/hadoop-datanode:$(current_branch) ./datanode
	docker build -t bde2020/hadoop-resourcemanager:$(current_branch) ./resourcemanager
	docker build -t bde2020/hadoop-nodemanager:$(current_branch) ./nodemanager
	docker build -t bde2020/hadoop-historyserver:$(current_branch) ./historyserver
	docker build -t bde2020/hadoop-submit:$(current_branch) ./submit

wordcount:
	docker build -t hadoop-wordcount ./submit
	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -mkdir -p /input/
	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -copyFromLocal -f /opt/hadoop-3.2.1/README.txt /input/
	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} hadoop-wordcount
	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -cat /output/*
	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /output
	docker run --network ${DOCKER_NETWORK} --env-file ${ENV_FILE} bde2020/hadoop-base:$(current_branch) hdfs dfs -rm -r /input
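As a quick usage note (the image names are the ones defined in this Makefile; the grep filter is only illustrative), building and checking the branch-tagged images looks like:
```
make build                      # builds every bde2020/hadoop-* image, tagged with the current git branch
docker images | grep bde2020    # verify the freshly tagged images are present
```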
README.md
@@ -1,3 +1,63 @@
# DockerHadoop
[![Gitter chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://gitter.im/big-data-europe/Lobby)

Cloud project

# Changes

Version 2.0.0 introduces the wait_for_it script for the cluster startup.
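As a rough illustration of what that startup gate does (the SERVICE_PRECONDITION variable appears in the compose files below; this loop is a simplified sketch, not the actual wait_for_it script shipped in the images):
```
# Hypothetical sketch: block until every host:port in SERVICE_PRECONDITION accepts TCP connections.
for precondition in $SERVICE_PRECONDITION; do
  host="${precondition%:*}"
  port="${precondition#*:}"
  until nc -z "$host" "$port"; do
    echo "waiting for $host:$port ..."
    sleep 5
  done
done
```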

# Hadoop Docker

## Supported Hadoop Versions

See the repository branches for supported Hadoop versions.

## Quick Start

To deploy an example HDFS cluster, run:
```
docker-compose up
```

Run the example wordcount job:
```
make wordcount
```

Or deploy in swarm:
```
docker stack deploy -c docker-compose-v3.yml hadoop
```

`docker-compose` creates a docker network that can be found by running `docker network list`, e.g. `dockerhadoop_default`.

Run `docker network inspect` on the network (e.g. `dockerhadoop_default`) to find the IP the Hadoop interfaces are published on; a one-liner for this follows the list below. Access these interfaces with the following URLs:

* Namenode: http://<dockerhadoop_IP_address>:9870/dfshealth.html#tab-overview
* History server: http://<dockerhadoop_IP_address>:8188/applicationhistory
* Datanode: http://<dockerhadoop_IP_address>:9864/
* Nodemanager: http://<dockerhadoop_IP_address>:8042/node
* Resource manager: http://<dockerhadoop_IP_address>:8088/
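A hedged one-liner for the same lookup (the network name is an example; substitute whatever `docker network list` reports on your machine):
```
# Print each attached container together with its IPv4 address on the compose network.
docker network inspect -f \
  '{{range .Containers}}{{.Name}}: {{.IPv4Address}}{{printf "\n"}}{{end}}' \
  dockerhadoop_default
```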

## Configure Environment Variables

The configuration parameters can be specified in the hadoop.env file or as environment variables for specific services (e.g. namenode, datanode, etc.):
```
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
```

CORE_CONF corresponds to core-site.xml. fs_defaultFS=hdfs://namenode:8020 will be transformed into:
```
<property><name>fs.defaultFS</name><value>hdfs://namenode:8020</value></property>
```

To define a dash inside a configuration parameter, use a triple underscore, e.g. YARN_CONF_yarn_log___aggregation___enable=true (yarn-site.xml):
```
<property><name>yarn.log-aggregation-enable</name><value>true</value></property>
```

The available configurations are:
* /etc/hadoop/core-site.xml CORE_CONF
* /etc/hadoop/hdfs-site.xml HDFS_CONF
* /etc/hadoop/yarn-site.xml YARN_CONF
* /etc/hadoop/httpfs-site.xml HTTPFS_CONF
* /etc/hadoop/kms-site.xml KMS_CONF
* /etc/hadoop/mapred-site.xml MAPRED_CONF

If you need to extend some other configuration file, refer to the base/entrypoint.sh bash script.
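For orientation only, a minimal bash sketch of the prefix-to-XML mapping described above; the authoritative logic lives in base/entrypoint.sh and may differ in detail:
```
# Illustrative sketch: render every exported CORE_CONF_* variable as a core-site.xml property.
# A triple underscore becomes a dash, remaining single underscores become dots.
for var in $(compgen -e CORE_CONF_); do
  name=$(echo "${var#CORE_CONF_}" | sed 's/___/-/g; s/_/./g')
  value="${!var}"
  echo "<property><name>${name}</name><value>${value}</value></property>"
done
```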
docker-compose-v3.yml
@@ -0,0 +1,111 @@
version: '3'

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    networks:
      - hadoop-net
    volumes:
      - namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == akswnc4.aksw.uni-leipzig.de
      labels:
        traefik.docker.network: hbase
        traefik.port: 50070

  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    networks:
      - hadoop-net
    volumes:
      - datanode:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
    environment:
      SERVICE_PRECONDITION: "namenode:50070"
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      labels:
        traefik.docker.network: hbase
        traefik.port: 50075

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    networks:
      - hadoop-net
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode:50075"
    env_file:
      - ./hadoop.env
    deploy:
      mode: replicated
      replicas: 1
      restart_policy:
        condition: on-failure
      placement:
        constraints:
          - node.hostname == akswnc4.aksw.uni-leipzig.de
      labels:
        traefik.docker.network: hbase
        traefik.port: 8088
    healthcheck:
      disable: true

  nodemanager:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    networks:
      - hadoop-net
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode:50075 resourcemanager:8088"
    env_file:
      - ./hadoop.env
    deploy:
      mode: global
      restart_policy:
        condition: on-failure
      labels:
        traefik.docker.network: hbase
        traefik.port: 8042

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
    networks:
      - hadoop-net
    volumes:
      - hadoop_historyserver:/hadoop/yarn/timeline
    environment:
      SERVICE_PRECONDITION: "namenode:50070 datanode:50075 resourcemanager:8088"
    env_file:
      - ./hadoop.env
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == akswnc4.aksw.uni-leipzig.de
      labels:
        traefik.docker.network: hbase
        traefik.port: 8188

volumes:
  datanode:
  namenode:
  hadoop_historyserver:

networks:
  hadoop-net:
  #default:
  #  external:
  #    name: hadoop-net
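A hedged deployment check for the swarm variant (the stack name hadoop comes from the README; note that the node.hostname placement constraints above must match a node in your swarm, or be removed):
```
docker stack deploy -c docker-compose-v3.yml hadoop
docker stack services hadoop            # replicated namenode/resourcemanager/historyserver, global datanode/nodemanager
docker service logs hadoop_namenode     # follow namenode startup
```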
docker-compose.yml
@@ -0,0 +1,61 @@
version: "3"

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    container_name: namenode
    restart: always
    ports:
      - 9870:9870
      - 9000:9000
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env

  datanode:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    container_name: datanode
    restart: always
    volumes:
      - hadoop_datanode:/hadoop/dfs/data
    environment:
      SERVICE_PRECONDITION: "namenode:9870"
    env_file:
      - ./hadoop.env

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    container_name: resourcemanager
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864"
    env_file:
      - ./hadoop.env

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    container_name: nodemanager
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
    env_file:
      - ./hadoop.env

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
    container_name: historyserver
    restart: always
    environment:
      SERVICE_PRECONDITION: "namenode:9000 namenode:9870 datanode:9864 resourcemanager:8088"
    volumes:
      - hadoop_historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env

volumes:
  hadoop_namenode:
  hadoop_datanode:
  hadoop_historyserver:
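Once `docker-compose up` has brought the containers up, a quick hedged sanity check against the container names defined above (standard HDFS commands, nothing specific to these images):
```
docker exec namenode hdfs dfsadmin -report   # the datanode should be reported as a live node
docker exec namenode hdfs dfs -ls /          # the HDFS root should be reachable
```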
hadoop.env
@@ -0,0 +1,43 @@
CORE_CONF_fs_defaultFS=hdfs://namenode:9000
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
CORE_CONF_io_compression_codecs=org.apache.hadoop.io.compress.SnappyCodec

HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false

YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_scheduler_class=org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___mb=8192
YARN_CONF_yarn_scheduler_capacity_root_default_maximum___allocation___vcores=4
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource__tracker_address=resourcemanager:8031
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_mapreduce_map_output_compress=true
YARN_CONF_mapred_map_output_compress_codec=org.apache.hadoop.io.compress.SnappyCodec
YARN_CONF_yarn_nodemanager_resource_memory___mb=16384
YARN_CONF_yarn_nodemanager_resource_cpu___vcores=8
YARN_CONF_yarn_nodemanager_disk___health___checker_max___disk___utilization___per___disk___percentage=98.5
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_nodemanager_aux___services=mapreduce_shuffle

MAPRED_CONF_mapreduce_framework_name=yarn
MAPRED_CONF_mapred_child_java_opts=-Xmx4096m
MAPRED_CONF_mapreduce_map_memory_mb=4096
MAPRED_CONF_mapreduce_reduce_memory_mb=8192
MAPRED_CONF_mapreduce_map_java_opts=-Xmx3072m
MAPRED_CONF_mapreduce_reduce_java_opts=-Xmx6144m
MAPRED_CONF_yarn_app_mapreduce_am_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
MAPRED_CONF_mapreduce_map_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
MAPRED_CONF_mapreduce_reduce_env=HADOOP_MAPRED_HOME=/opt/hadoop-3.2.1/
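To confirm that a value from this file reached the rendered configuration, the generated XML files listed in the README can be inspected inside a running container (a hedged check, assuming the standard compose setup above):
```
docker exec namenode grep -A1 'fs.defaultFS' /etc/hadoop/core-site.xml
docker exec resourcemanager grep -A1 'yarn.log-aggregation-enable' /etc/hadoop/yarn-site.xml
```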