
HADOOP-18682. Move hadoop docker scripts under the main source code (#6483). Contributed by Christos Bisias.

Christos Bisias 5 months ago
commit 66baf1eb51

+ 68 - 0
hadoop-common-project/hadoop-common/src/site/markdown/HadoopDocker.md

@@ -0,0 +1,68 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+## Hadoop Docker
+
+### Running from existing setups
+
+There are dedicated branches for running Hadoop in Docker.
+
+The `docker-hadoop-runner*` branches contain scripts that set up base images that can be used for running any Hadoop version.
+
+* [docker-hadoop-runner-latest](https://github.com/apache/hadoop/tree/docker-hadoop-runner-latest)
+* [docker-hadoop-runner-jdk11](https://github.com/apache/hadoop/tree/docker-hadoop-runner-jdk11)
+* [docker-hadoop-runner-jdk8](https://github.com/apache/hadoop/tree/docker-hadoop-runner-jdk8)
+* [docker-hadoop-runner](https://github.com/apache/hadoop/tree/docker-hadoop-runner)
+
+The `docker-hadoop*` branches can be used for running a specific Hadoop version.
+
+* [docker-hadoop-3](https://github.com/apache/hadoop/tree/docker-hadoop-3)
+  * `hadoop-3.3.6`
+* [docker-hadoop-2](https://github.com/apache/hadoop/tree/docker-hadoop-2)
+  * `hadoop-2.10.2`
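+
+For example, assuming the corresponding `apache/hadoop` images built from these branches have been published to Docker Hub (their tags typically match the Hadoop version), a throwaway single-node container can be started directly:
+
+```shell
+> docker run --rm -it apache/hadoop:3.3.6 bash
+bash-4.2$ hadoop version
+```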
+
+### Running from the source code
+
+There is a setup under `hadoop-dist` that contains Docker Compose definitions
+for running the current version of Hadoop in a multi-node Docker environment.
+
+This is meant for testing and debugging code changes locally.
+
+The base image used by the Docker setup is built as part of the Maven lifecycle.
+The distribution files generated by building the project with the `-Pdist` profile enabled
+are used for running Hadoop inside the containers.
+
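+Optionally, assuming a local Docker daemon is available, the freshly built distribution can also be packaged into an image by additionally enabling the `docker-build` profile defined in `hadoop-dist/pom.xml` (the image is tagged `<user.name>/hadoop:<version>` by default); a sketch:
+
+```shell
+> mvn clean install -Dmaven.javadoc.skip=true -DskipTests -DskipShade -Pdist,docker-build
+```
+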
+To start the Docker environment, do the following:
+* Build the project with the `-Pdist` profile
+  ```shell
+  > mvn clean install -Dmaven.javadoc.skip=true -DskipTests -DskipShade -Pdist,src
+  ```
+* From the project root, navigate to the Docker Compose directory under the generated dist directory
+  ```shell
+  > cd hadoop-dist/target/hadoop-<current-version>/compose/hadoop
+  ```
+* Start the Docker environment
+  ```shell
+  > docker-compose up -d --scale datanode=3
+  ```
+* Connect to a container to execute commands
+  ```shell
+  > docker exec -it hadoop_datanode_1 bash
+  bash-4.2$ hdfs dfs -mkdir /test
+  ```
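+
+Once the cluster is up, the NameNode web UI is published at `http://localhost:9870` and the ResourceManager UI at `http://localhost:8088` (the ports mapped in `docker-compose.yaml`). As an optional smoke test (a sketch that assumes the MapReduce examples jar shipped with the distribution), a sample job can be submitted from inside a container and the environment torn down from the host afterwards:
+
+```shell
+> docker exec -it hadoop_datanode_1 bash
+bash-4.2$ yarn jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-*.jar pi 2 5
+bash-4.2$ exit
+> docker-compose down
+```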
+
+### Config files
+
+To add or remove properties in the `core-site.xml`, `hdfs-site.xml`, etc. files used in the Docker environment,
+edit the `config` file before starting the containers. The changes are persisted in the Docker environment.
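+
+Each entry follows the same `<FILENAME>_<property>=<value>` convention used in the existing `config` file. For example, to append an extra HDFS property before bringing the cluster up (a sketch, the value is illustrative only):
+
+```shell
+> echo "HDFS-SITE.XML_dfs.blocksize=64m" >> config
+> docker-compose up -d --scale datanode=3
+```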

+ 6 - 0
hadoop-common-project/hadoop-common/src/site/markdown/SingleCluster.md.vm

@@ -236,3 +236,9 @@ Fully-Distributed Operation
 ---------------------------
 
 For information on setting up fully-distributed, non-trivial clusters see [Cluster Setup](./ClusterSetup.html).
+
+Hadoop in Docker containers
+---------------------------
+
+For information on setting up Hadoop in Docker, using either the official releases or the main source code,
+see [Hadoop Docker](./HadoopDocker.html).

+ 94 - 0
hadoop-dist/pom.xml

@@ -29,6 +29,13 @@
   <name>Apache Hadoop Distribution</name>
   <packaging>jar</packaging>
 
+  <properties>
+    <file.encoding>UTF-8</file.encoding>
+    <downloadSources>true</downloadSources>
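+    <!-- Tag of the apache/hadoop-runner base image; filtered into the Dockerfile and the compose .env file at package time -->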
+    <docker.hadoop-runner.version>docker-hadoop-runner</docker.hadoop-runner.version>
+    <maven.test.skip>true</maven.test.skip>
+  </properties>
+
   <!-- Using dependencies to ensure this module is the last one -->
   <dependencies>
     <dependency>
@@ -151,6 +158,43 @@
           </execution>
         </executions>
       </plugin>
+      <plugin>
+        <artifactId>maven-resources-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>copy-compose-files</id>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-resources</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>${project.build.directory}/hadoop-${project.version}/compose</outputDirectory>
+              <resources>
+                <resource>
+                  <directory>src/main/compose</directory>
+                  <filtering>true</filtering>
+                </resource>
+              </resources>
+            </configuration>
+          </execution>
+          <execution>
+            <id>copy-and-filter-dockerfile</id>
+            <phase>package</phase>
+            <goals>
+              <goal>copy-resources</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>${project.build.directory}/hadoop-${project.version}</outputDirectory>
+              <resources>
+                <resource>
+                  <directory>src/main/docker</directory>
+                  <filtering>true</filtering>
+                </resource>
+              </resources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
     </plugins>
   </build>
 
@@ -230,6 +274,56 @@
         </plugins>
       </build>
     </profile>
+    <profile>
+      <id>docker-build</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>io.fabric8</groupId>
+            <artifactId>docker-maven-plugin</artifactId>
+            <executions>
+              <execution>
+                <goals>
+                  <goal>build</goal>
+                </goals>
+                <phase>package</phase>
+              </execution>
+            </executions>
+            <configuration>
+              <images>
+                <image>
+                  <name>${docker.image}</name>
+                  <build>
+                    <dockerFileDir>
+                      ${project.build.directory}/hadoop-${project.version}
+                    </dockerFileDir>
+                  </build>
+                </image>
+              </images>
+            </configuration>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+    <profile>
+      <id>docker-push</id>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>io.fabric8</groupId>
+            <artifactId>docker-maven-plugin</artifactId>
+            <executions>
+              <execution>
+                <goals>
+                  <goal>push</goal>
+                </goals>
+                <phase>package</phase>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
   </profiles>
 
 </project>

+ 18 - 0
hadoop-dist/src/main/compose/hadoop/.env

@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
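+# These values are consumed by docker-compose.yaml. HADOOP_RUNNER_VERSION is
+# resolved from the docker.hadoop-runner.version Maven property when the
+# compose files are copied and filtered during the package phase.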
+HADOOP_IMAGE=apache/hadoop
+HADOOP_RUNNER_VERSION=${docker.hadoop-runner.version}
+HADOOP_RUNNER_IMAGE=apache/hadoop-runner

+ 50 - 0
hadoop-dist/src/main/compose/hadoop/config

@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+CORE-SITE.XML_fs.default.name=hdfs://namenode
+CORE-SITE.XML_fs.defaultFS=hdfs://namenode
+
+HDFS-SITE.XML_dfs.namenode.rpc-address=namenode:8020
+HDFS-SITE.XML_dfs.replication=1
+
+MAPRED-SITE.XML_mapreduce.framework.name=yarn
+MAPRED-SITE.XML_yarn.app.mapreduce.am.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
+MAPRED-SITE.XML_mapreduce.map.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
+MAPRED-SITE.XML_mapreduce.reduce.env=HADOOP_MAPRED_HOME=$HADOOP_HOME
+
+YARN-SITE.XML_yarn.resourcemanager.hostname=resourcemanager
+YARN-SITE.XML_yarn.nodemanager.pmem-check-enabled=false
+YARN-SITE.XML_yarn.nodemanager.delete.debug-delay-sec=600
+YARN-SITE.XML_yarn.nodemanager.vmem-check-enabled=false
+YARN-SITE.XML_yarn.nodemanager.aux-services=mapreduce_shuffle
+
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-applications=10000
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.maximum-am-resource-percent=0.1
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.resource-calculator=org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.queues=default
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.capacity=100
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.user-limit-factor=1
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.maximum-capacity=100
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.state=RUNNING
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_submit_applications=*
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.root.default.acl_administer_queue=*
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.node-locality-delay=40
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings=
+CAPACITY-SCHEDULER.XML_yarn.scheduler.capacity.queue-mappings-override.enable=false
+
+LOG4J.PROPERTIES_log4j.rootLogger=INFO, stdout
+LOG4J.PROPERTIES_log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+LOG4J.PROPERTIES_log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+LOG4J.PROPERTIES_log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n

+ 46 - 0
hadoop-dist/src/main/compose/hadoop/docker-compose.yaml

@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: "3.8"
+
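+# Settings shared by all services: each container runs the apache/hadoop-runner
+# image, mounts the generated distribution (two directories up, hadoop-<version>/)
+# at /opt/hadoop, and loads the Hadoop configuration from ./config.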
+x-common-config:
+  &common-config
+  image: ${HADOOP_RUNNER_IMAGE}:${HADOOP_RUNNER_VERSION}
+  volumes:
+    - ../..:/opt/hadoop
+  env_file:
+    - ./config
+
+services:
+  namenode:
+    <<: *common-config
+    hostname: namenode
+    command: ["hdfs", "namenode"]
+    ports:
+      - 9870:9870
+    environment:
+      ENSURE_NAMENODE_DIR: "/tmp/hadoop-root/dfs/name"
+  datanode:
+    <<: *common-config
+    command: ["hdfs", "datanode"]
+  resourcemanager:
+    <<: *common-config
+    hostname: resourcemanager
+    command: ["yarn", "resourcemanager"]
+    ports:
+      - 8088:8088
+  nodemanager:
+    <<: *common-config
+    command: ["yarn", "nodemanager"]

+ 26 - 0
hadoop-dist/src/main/docker/Dockerfile

@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
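+# @docker.hadoop-runner.version@ is replaced with the docker.hadoop-runner.version
+# Maven property when this Dockerfile is filtered into the dist directory during
+# the package phase.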
+FROM apache/hadoop-runner:@docker.hadoop-runner.version@
+
+COPY . /opt/hadoop
+
+WORKDIR /opt/hadoop
+
+USER root
+
+RUN chown -R hadoop:users /opt/hadoop
+
+USER hadoop

+ 14 - 0
pom.xml

@@ -82,6 +82,8 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x
     <!-- required as child projects with different version can't use ${project.version} -->
     <hadoop.version>3.5.0-SNAPSHOT</hadoop.version>
 
+    <docker.image>apache/hadoop:${project.version}</docker.image>
+
     <distMgmtSnapshotsId>apache.snapshots.https</distMgmtSnapshotsId>
     <distMgmtSnapshotsName>Apache Development Snapshot Repository</distMgmtSnapshotsName>
     <distMgmtSnapshotsUrl>https://repository.apache.org/content/repositories/snapshots</distMgmtSnapshotsUrl>
@@ -119,6 +121,7 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x
     <jsonschema2pojo-maven-plugin.version>1.1.1</jsonschema2pojo-maven-plugin.version>
     <maven-compiler-plugin.version>3.10.1</maven-compiler-plugin.version>
     <cyclonedx.version>2.7.10</cyclonedx.version>
+    <docker-maven-plugin.version>0.29.0</docker-maven-plugin.version>
 
     <shell-executable>bash</shell-executable>
 
@@ -150,6 +153,11 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x
           <artifactId>maven-dependency-plugin</artifactId>
           <version>${maven-dependency-plugin.version}</version>
         </plugin>
+        <plugin>
+          <groupId>io.fabric8</groupId>
+          <artifactId>docker-maven-plugin</artifactId>
+          <version>${docker-maven-plugin.version}</version>
+        </plugin>
         <plugin>
           <groupId>org.apache.maven.plugins</groupId>
           <artifactId>maven-enforcer-plugin</artifactId>
@@ -892,5 +900,11 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/x
       </activation>
     </profile>
 
+    <profile>
+      <id>docker-build</id>
+      <properties>
+        <docker.image>${user.name}/hadoop:${project.version}</docker.image>
+      </properties>
+    </profile>
   </profiles>
 </project>