瀏覽代碼

HADOOP-13145 In DistCp, prevent unnecessary getFileStatus call when not preserving metadata. Contributed by Chris Nauroth.

Steve Loughran 9 年之前
父節點
當前提交
c918286b17

+ 6 - 0
hadoop-project/pom.xml

@@ -353,6 +353,12 @@
         <artifactId>hadoop-distcp</artifactId>
         <version>${project.version}</version>
       </dependency>
+      <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>hadoop-distcp</artifactId>
+        <version>${project.version}</version>
+        <type>test-jar</type>
+      </dependency>
       <dependency>
         <groupId>org.apache.hadoop</groupId>
         <artifactId>hadoop-datajoin</artifactId>

+ 11 - 0
hadoop-tools/hadoop-aws/pom.xml

@@ -252,5 +252,16 @@
       <scope>test</scope>
       <type>jar</type>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
   </dependencies>
 </project>

+ 9 - 0
hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md

@@ -731,6 +731,15 @@ or in batch runs.
 Smaller values should result in faster test runs, especially when the object
 store is a long way away.
 
+DistCp tests targeting S3A support a configurable file size.  The default is
+10 MB, but the configuration value is expressed in KB so that it can be tuned
+smaller to achieve faster test runs.
+
+      <property>
+        <name>scale.test.distcp.file.size.kb</name>
+        <value>10240</value>
+      </property>
+
 ### Running the Tests
 
 After completing the configuration, execute the test run through Maven.

+ 46 - 0
hadoop-tools/hadoop-aws/src/test/java/org/apache/hadoop/fs/contract/s3a/TestS3AContractDistCp.java

@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.contract.s3a;
+
+import static org.apache.hadoop.fs.s3a.Constants.MIN_MULTIPART_THRESHOLD;
+import static org.apache.hadoop.fs.s3a.Constants.MULTIPART_SIZE;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
+
+/**
+ * Contract test suite covering S3A integration with DistCp.
+ */
+public class TestS3AContractDistCp extends AbstractContractDistCpTest {
+
+  private static final long MULTIPART_SETTING = 8 * 1024 * 1024; // 8 MB
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration newConf = super.createConfiguration();
+    newConf.setLong(MIN_MULTIPART_THRESHOLD, MULTIPART_SETTING);
+    newConf.setLong(MULTIPART_SIZE, MULTIPART_SETTING);
+    return newConf;
+  }
+
+  @Override
+  protected S3AContract createContract(Configuration conf) {
+    return new S3AContract(conf);
+  }
+}

+ 19 - 0
hadoop-tools/hadoop-azure/pom.xml

@@ -193,6 +193,25 @@
       <type>test-jar</type>
     </dependency>
     
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-distcp</artifactId>
+      <scope>test</scope>
+      <type>test-jar</type>
+    </dependency>
+
     <dependency>
       <groupId>org.mockito</groupId>
       <artifactId>mockito-all</artifactId>

+ 33 - 0
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azure/contract/TestAzureNativeContractDistCp.java

@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azure.contract;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.tools.contract.AbstractContractDistCpTest;
+
+/**
+ * Contract test suite covering WASB integration with DistCp.
+ */
+public class TestAzureNativeContractDistCp extends AbstractContractDistCpTest {
+
+  @Override
+  protected NativeAzureFileSystemContract createContract(Configuration conf) {
+    return new NativeAzureFileSystemContract(conf);
+  }
+}

+ 16 - 0
hadoop-tools/hadoop-distcp/pom.xml

@@ -186,6 +186,22 @@
             </manifest>
           </archive>
         </configuration>
+        <executions>
+          <execution>
+            <id>prepare-jar</id>
+            <phase>prepare-package</phase>
+            <goals>
+              <goal>jar</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>prepare-test-jar</id>
+            <phase>prepare-package</phase>
+            <goals>
+              <goal>test-jar</goal>
+            </goals>
+          </execution>
+        </executions>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>

+ 7 - 3
hadoop-tools/hadoop-distcp/src/main/java/org/apache/hadoop/tools/util/DistCpUtils.java

@@ -195,9 +195,13 @@ public class DistCpUtils {
                               EnumSet<FileAttribute> attributes,
                               boolean preserveRawXattrs) throws IOException {
 
-    FileStatus targetFileStatus = targetFS.getFileStatus(path);
-    String group = targetFileStatus.getGroup();
-    String user = targetFileStatus.getOwner();
+    // If not preserving anything from FileStatus, don't bother fetching it.
+    FileStatus targetFileStatus = attributes.isEmpty() ? null :
+        targetFS.getFileStatus(path);
+    String group = targetFileStatus == null ? null :
+        targetFileStatus.getGroup();
+    String user = targetFileStatus == null ? null :
+        targetFileStatus.getOwner();
     boolean chown = false;
 
     if (attributes.contains(FileAttribute.ACL)) {

+ 204 - 0
hadoop-tools/hadoop-distcp/src/test/java/org/apache/hadoop/tools/contract/AbstractContractDistCpTest.java

@@ -0,0 +1,204 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.tools.contract;
+
+import static org.apache.hadoop.fs.contract.ContractTestUtils.*;
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.contract.AbstractFSContractTestBase;
+import org.apache.hadoop.fs.contract.ContractTestUtils;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.tools.DistCp;
+import org.apache.hadoop.tools.DistCpOptions;
+
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TestName;
+
+/**
+ * Contract test suite covering a file system's integration with DistCp.  The
+ * tests coordinate two file system instances: one "local", which is the local
+ * file system, and the other "remote", which is the file system implementation
+ * under test.  The tests in the suite cover both copying from local to remote
+ * (e.g. a backup use case) and copying from remote to local (e.g. a restore use
+ * case).
+ */
+public abstract class AbstractContractDistCpTest
+    extends AbstractFSContractTestBase {
+
+  @Rule
+  public TestName testName = new TestName();
+
+  private Configuration conf;
+  private FileSystem localFS, remoteFS;
+  private Path localDir, remoteDir;
+
+  @Override
+  protected Configuration createConfiguration() {
+    Configuration newConf = new Configuration();
+    newConf.set("mapred.job.tracker", "local");
+    return newConf;
+  }
+
+  @Before
+  @Override
+  public void setup() throws Exception {
+    super.setup();
+    conf = getContract().getConf();
+    localFS = FileSystem.getLocal(conf);
+    remoteFS = getFileSystem();
+    // Test paths are isolated by concrete subclass name and test method name.
+    // All paths are fully qualified including scheme (not taking advantage of
+    // default file system), so if something fails, the messages will make it
+    // clear which paths are local and which paths are remote.
+    Path testSubDir = new Path(getClass().getSimpleName(),
+        testName.getMethodName());
+    localDir = localFS.makeQualified(new Path(new Path(
+        GenericTestUtils.getTestDir().toURI()), testSubDir));
+    mkdirs(localFS, localDir);
+    remoteDir = remoteFS.makeQualified(
+        new Path(getContract().getTestPath(), testSubDir));
+    mkdirs(remoteFS, remoteDir);
+  }
+
+  @Test
+  public void deepDirectoryStructureToRemote() throws Exception {
+    describe("copy a deep directory structure from local to remote");
+    deepDirectoryStructure(localFS, localDir, remoteFS, remoteDir);
+  }
+
+  @Test
+  public void largeFilesToRemote() throws Exception {
+    describe("copy multiple large files from local to remote");
+    largeFiles(localFS, localDir, remoteFS, remoteDir);
+  }
+
+  @Test
+  public void deepDirectoryStructureFromRemote() throws Exception {
+    describe("copy a deep directory structure from remote to local");
+    deepDirectoryStructure(remoteFS, remoteDir, localFS, localDir);
+  }
+
+  @Test
+  public void largeFilesFromRemote() throws Exception {
+    describe("copy multiple large files from remote to local");
+    largeFiles(remoteFS, remoteDir, localFS, localDir);
+  }
+
+  /**
+   * Executes a test using a file system sub-tree with multiple nesting levels.
+   *
+   * @param srcFS source FileSystem
+   * @param srcDir source directory
+   * @param dstFS destination FileSystem
+   * @param dstDir destination directory
+   * @throws Exception if there is a failure
+   */
+  private void deepDirectoryStructure(FileSystem srcFS, Path srcDir,
+      FileSystem dstFS, Path dstDir) throws Exception {
+    Path inputDir = new Path(srcDir, "inputDir");
+    Path inputSubDir1 = new Path(inputDir, "subDir1");
+    Path inputSubDir2 = new Path(inputDir, "subDir2/subDir3");
+    Path inputFile1 = new Path(inputDir, "file1");
+    Path inputFile2 = new Path(inputSubDir1, "file2");
+    Path inputFile3 = new Path(inputSubDir2, "file3");
+    mkdirs(srcFS, inputSubDir1);
+    mkdirs(srcFS, inputSubDir2);
+    byte[] data1 = dataset(100, 33, 43);
+    createFile(srcFS, inputFile1, true, data1);
+    byte[] data2 = dataset(200, 43, 53);
+    createFile(srcFS, inputFile2, true, data2);
+    byte[] data3 = dataset(300, 53, 63);
+    createFile(srcFS, inputFile3, true, data3);
+    Path target = new Path(dstDir, "outputDir");
+    runDistCp(inputDir, target);
+    ContractTestUtils.assertIsDirectory(dstFS, target);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file1"), data1);
+    verifyFileContents(dstFS,
+        new Path(target, "inputDir/subDir1/file2"), data2);
+    verifyFileContents(dstFS,
+        new Path(target, "inputDir/subDir2/subDir3/file3"), data3);
+  }
+
+  /**
+   * Executes a test using multiple large files.
+   *
+   * @param srcFS source FileSystem
+   * @param srcDir source directory
+   * @param dstFS destination FileSystem
+   * @param dstDir destination directory
+   * @throws Exception if there is a failure
+   */
+  private void largeFiles(FileSystem srcFS, Path srcDir, FileSystem dstFS,
+      Path dstDir) throws Exception {
+    Path inputDir = new Path(srcDir, "inputDir");
+    Path inputFile1 = new Path(inputDir, "file1");
+    Path inputFile2 = new Path(inputDir, "file2");
+    Path inputFile3 = new Path(inputDir, "file3");
+    mkdirs(srcFS, inputDir);
+    int fileSizeKb = conf.getInt("scale.test.distcp.file.size.kb", 10 * 1024);
+    int fileSizeMb = fileSizeKb * 1024;
+    getLog().info("{} with file size {}", testName.getMethodName(), fileSizeMb);
+    byte[] data1 = dataset((fileSizeMb + 1) * 1024 * 1024, 33, 43);
+    createFile(srcFS, inputFile1, true, data1);
+    byte[] data2 = dataset((fileSizeMb + 2) * 1024 * 1024, 43, 53);
+    createFile(srcFS, inputFile2, true, data2);
+    byte[] data3 = dataset((fileSizeMb + 3) * 1024 * 1024, 53, 63);
+    createFile(srcFS, inputFile3, true, data3);
+    Path target = new Path(dstDir, "outputDir");
+    runDistCp(inputDir, target);
+    ContractTestUtils.assertIsDirectory(dstFS, target);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file1"), data1);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file2"), data2);
+    verifyFileContents(dstFS, new Path(target, "inputDir/file3"), data3);
+  }
+
+  /**
+   * Executes DistCp and asserts that the job finished successfully.
+   *
+   * @param src source path
+   * @param dst destination path
+   * @throws Exception if there is a failure
+   */
+  private void runDistCp(Path src, Path dst) throws Exception {
+    DistCpOptions options = new DistCpOptions(Arrays.asList(src), dst);
+    Job job = new DistCp(conf, options).execute();
+    assertNotNull("Unexpected null job returned from DistCp execution.", job);
+    assertTrue("DistCp job did not complete.", job.isComplete());
+    assertTrue("DistCp job did not complete successfully.", job.isSuccessful());
+  }
+
+  /**
+   * Creates a directory and any ancestor directories required.
+   *
+   * @param fs FileSystem in which to create directories
+   * @param dir path of directory to create
+   * @throws Exception if there is a failure
+   */
+  private static void mkdirs(FileSystem fs, Path dir) throws Exception {
+    assertTrue("Failed to mkdir " + dir, fs.mkdirs(dir));
+  }
+}