|
@@ -19,10 +19,17 @@ package org.apache.hadoop.mapreduce.lib.input;
|
|
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
import java.io.IOException;
|
|
|
+import java.util.Arrays;
|
|
|
+import java.util.Collection;
|
|
|
import java.util.List;
|
|
|
+import java.util.Set;
|
|
|
+
|
|
|
+import javax.annotation.Nullable;
|
|
|
|
|
|
import junit.framework.Assert;
|
|
|
|
|
|
+import org.apache.commons.logging.Log;
|
|
|
+import org.apache.commons.logging.LogFactory;
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
import org.apache.hadoop.fs.BlockLocation;
|
|
|
import org.apache.hadoop.fs.FileStatus;
|
|
@@ -34,55 +41,90 @@ import org.apache.hadoop.fs.RawLocalFileSystem;
|
|
|
import org.apache.hadoop.fs.RemoteIterator;
|
|
|
import org.apache.hadoop.mapreduce.InputSplit;
|
|
|
import org.apache.hadoop.mapreduce.Job;
|
|
|
+import org.junit.After;
|
|
|
+import org.junit.Before;
|
|
|
import org.junit.Test;
|
|
|
+import org.junit.runner.RunWith;
|
|
|
+import org.junit.runners.Parameterized;
|
|
|
+import org.junit.runners.Parameterized.Parameters;
|
|
|
+
|
|
|
+import com.google.common.base.Function;
|
|
|
+import com.google.common.collect.Iterables;
|
|
|
+import com.google.common.collect.Lists;
|
|
|
+import com.google.common.collect.Sets;
|
|
|
|
|
|
+@RunWith(value = Parameterized.class)
|
|
|
public class TestFileInputFormat {
|
|
|
+
|
|
|
+ private static final Log LOG = LogFactory.getLog(TestFileInputFormat.class);
|
|
|
+
|
|
|
+ private static String testTmpDir = System.getProperty("test.build.data", "/tmp");
|
|
|
+ private static final Path TEST_ROOT_DIR = new Path(testTmpDir, "TestFIF");
|
|
|
+
|
|
|
+ private static FileSystem localFs;
|
|
|
+
|
|
|
+ private int numThreads;
|
|
|
+
|
|
|
+ public TestFileInputFormat(int numThreads) {
|
|
|
+ this.numThreads = numThreads;
|
|
|
+ LOG.info("Running with numThreads: " + numThreads);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Parameters
|
|
|
+ public static Collection<Object[]> data() {
|
|
|
+ Object[][] data = new Object[][] { { 1 }, { 5 }};
|
|
|
+ return Arrays.asList(data);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Before
|
|
|
+ public void setup() throws IOException {
|
|
|
+ LOG.info("Using Test Dir: " + TEST_ROOT_DIR);
|
|
|
+ localFs = FileSystem.getLocal(new Configuration());
|
|
|
+ localFs.delete(TEST_ROOT_DIR, true);
|
|
|
+ localFs.mkdirs(TEST_ROOT_DIR);
|
|
|
+ }
|
|
|
+
|
|
|
+ @After
|
|
|
+ public void cleanup() throws IOException {
|
|
|
+ localFs.delete(TEST_ROOT_DIR, true);
|
|
|
+ }
|
|
|
|
|
|
@Test
|
|
|
public void testNumInputFilesRecursively() throws Exception {
|
|
|
Configuration conf = getConfiguration();
|
|
|
conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
Job job = Job.getInstance(conf);
|
|
|
FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
|
|
|
List<InputSplit> splits = fileInputFormat.getSplits(job);
|
|
|
Assert.assertEquals("Input splits are not correct", 3, splits.size());
|
|
|
- Assert.assertEquals("test:/a1/a2/file2", ((FileSplit) splits.get(0))
|
|
|
- .getPath().toString());
|
|
|
- Assert.assertEquals("test:/a1/a2/file3", ((FileSplit) splits.get(1))
|
|
|
- .getPath().toString());
|
|
|
- Assert.assertEquals("test:/a1/file1", ((FileSplit) splits.get(2)).getPath()
|
|
|
- .toString());
|
|
|
-
|
|
|
+ verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3",
|
|
|
+ "test:/a1/file1"), splits);
|
|
|
+
|
|
|
// Using the deprecated configuration
|
|
|
conf = getConfiguration();
|
|
|
conf.set("mapred.input.dir.recursive", "true");
|
|
|
job = Job.getInstance(conf);
|
|
|
splits = fileInputFormat.getSplits(job);
|
|
|
- Assert.assertEquals("Input splits are not correct", 3, splits.size());
|
|
|
- Assert.assertEquals("test:/a1/a2/file2", ((FileSplit) splits.get(0))
|
|
|
- .getPath().toString());
|
|
|
- Assert.assertEquals("test:/a1/a2/file3", ((FileSplit) splits.get(1))
|
|
|
- .getPath().toString());
|
|
|
- Assert.assertEquals("test:/a1/file1", ((FileSplit) splits.get(2)).getPath()
|
|
|
- .toString());
|
|
|
+ verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3",
|
|
|
+ "test:/a1/file1"), splits);
|
|
|
}
|
|
|
|
|
|
@Test
|
|
|
public void testNumInputFilesWithoutRecursively() throws Exception {
|
|
|
Configuration conf = getConfiguration();
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
Job job = Job.getInstance(conf);
|
|
|
FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
|
|
|
List<InputSplit> splits = fileInputFormat.getSplits(job);
|
|
|
Assert.assertEquals("Input splits are not correct", 2, splits.size());
|
|
|
- Assert.assertEquals("test:/a1/a2", ((FileSplit) splits.get(0)).getPath()
|
|
|
- .toString());
|
|
|
- Assert.assertEquals("test:/a1/file1", ((FileSplit) splits.get(1)).getPath()
|
|
|
- .toString());
|
|
|
+ verifySplits(Lists.newArrayList("test:/a1/a2", "test:/a1/file1"), splits);
|
|
|
}
|
|
|
|
|
|
@Test
|
|
|
public void testListLocatedStatus() throws Exception {
|
|
|
Configuration conf = getConfiguration();
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
conf.setBoolean("fs.test.impl.disable.cache", false);
|
|
|
conf.set(FileInputFormat.INPUT_DIR, "test:///a1/a2");
|
|
|
MockFileSystem mockFs =
|
|
@@ -95,8 +137,226 @@ public class TestFileInputFormat {
|
|
|
Assert.assertEquals("Input splits are not correct", 2, splits.size());
|
|
|
Assert.assertEquals("listLocatedStatuss calls",
|
|
|
1, mockFs.numListLocatedStatusCalls);
|
|
|
+ FileSystem.closeAll();
|
|
|
+ }
|
|
|
+
|
|
|
+ @Test
|
|
|
+ public void testListStatusSimple() throws IOException {
|
|
|
+ Configuration conf = new Configuration();
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
+
|
|
|
+ List<Path> expectedPaths = configureTestSimple(conf, localFs);
|
|
|
+
|
|
|
+ Job job = Job.getInstance(conf);
|
|
|
+ FileInputFormat<?, ?> fif = new TextInputFormat();
|
|
|
+ List<FileStatus> statuses = fif.listStatus(job);
|
|
|
+
|
|
|
+ verifyFileStatuses(expectedPaths, statuses, localFs);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Test
|
|
|
+ public void testListStatusNestedRecursive() throws IOException {
|
|
|
+ Configuration conf = new Configuration();
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
+
|
|
|
+ List<Path> expectedPaths = configureTestNestedRecursive(conf, localFs);
|
|
|
+ Job job = Job.getInstance(conf);
|
|
|
+ FileInputFormat<?, ?> fif = new TextInputFormat();
|
|
|
+ List<FileStatus> statuses = fif.listStatus(job);
|
|
|
+
|
|
|
+ verifyFileStatuses(expectedPaths, statuses, localFs);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ @Test
|
|
|
+ public void testListStatusNestedNonRecursive() throws IOException {
|
|
|
+ Configuration conf = new Configuration();
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
+
|
|
|
+ List<Path> expectedPaths = configureTestNestedNonRecursive(conf, localFs);
|
|
|
+ Job job = Job.getInstance(conf);
|
|
|
+ FileInputFormat<?, ?> fif = new TextInputFormat();
|
|
|
+ List<FileStatus> statuses = fif.listStatus(job);
|
|
|
+
|
|
|
+ verifyFileStatuses(expectedPaths, statuses, localFs);
|
|
|
}
|
|
|
|
|
|
+ @Test
|
|
|
+ public void testListStatusErrorOnNonExistantDir() throws IOException {
|
|
|
+ Configuration conf = new Configuration();
|
|
|
+ conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
|
|
|
+
|
|
|
+ configureTestErrorOnNonExistantDir(conf, localFs);
|
|
|
+ Job job = Job.getInstance(conf);
|
|
|
+ FileInputFormat<?, ?> fif = new TextInputFormat();
|
|
|
+ try {
|
|
|
+ fif.listStatus(job);
|
|
|
+ Assert.fail("Expecting an IOException for a missing Input path");
|
|
|
+ } catch (IOException e) {
|
|
|
+ Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
|
|
|
+ expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
|
|
|
+ Assert.assertTrue(e instanceof InvalidInputException);
|
|
|
+ Assert.assertEquals(
|
|
|
+ "Input path does not exist: " + expectedExceptionPath.toString(),
|
|
|
+ e.getMessage());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public static List<Path> configureTestSimple(Configuration conf, FileSystem localFs)
|
|
|
+ throws IOException {
|
|
|
+ Path base1 = new Path(TEST_ROOT_DIR, "input1");
|
|
|
+ Path base2 = new Path(TEST_ROOT_DIR, "input2");
|
|
|
+ conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
|
|
|
+ localFs.makeQualified(base1) + "," + localFs.makeQualified(base2));
|
|
|
+ localFs.mkdirs(base1);
|
|
|
+ localFs.mkdirs(base2);
|
|
|
+
|
|
|
+ Path in1File1 = new Path(base1, "file1");
|
|
|
+ Path in1File2 = new Path(base1, "file2");
|
|
|
+ localFs.createNewFile(in1File1);
|
|
|
+ localFs.createNewFile(in1File2);
|
|
|
+
|
|
|
+ Path in2File1 = new Path(base2, "file1");
|
|
|
+ Path in2File2 = new Path(base2, "file2");
|
|
|
+ localFs.createNewFile(in2File1);
|
|
|
+ localFs.createNewFile(in2File2);
|
|
|
+ List<Path> expectedPaths = Lists.newArrayList(in1File1, in1File2, in2File1,
|
|
|
+ in2File2);
|
|
|
+ return expectedPaths;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static List<Path> configureTestNestedRecursive(Configuration conf,
|
|
|
+ FileSystem localFs) throws IOException {
|
|
|
+ Path base1 = new Path(TEST_ROOT_DIR, "input1");
|
|
|
+ conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
|
|
|
+ localFs.makeQualified(base1).toString());
|
|
|
+ conf.setBoolean(
|
|
|
+ org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE,
|
|
|
+ true);
|
|
|
+ localFs.mkdirs(base1);
|
|
|
+
|
|
|
+ Path inDir1 = new Path(base1, "dir1");
|
|
|
+ Path inDir2 = new Path(base1, "dir2");
|
|
|
+ Path inFile1 = new Path(base1, "file1");
|
|
|
+
|
|
|
+ Path dir1File1 = new Path(inDir1, "file1");
|
|
|
+ Path dir1File2 = new Path(inDir1, "file2");
|
|
|
+
|
|
|
+ Path dir2File1 = new Path(inDir2, "file1");
|
|
|
+ Path dir2File2 = new Path(inDir2, "file2");
|
|
|
+
|
|
|
+ localFs.mkdirs(inDir1);
|
|
|
+ localFs.mkdirs(inDir2);
|
|
|
+
|
|
|
+ localFs.createNewFile(inFile1);
|
|
|
+ localFs.createNewFile(dir1File1);
|
|
|
+ localFs.createNewFile(dir1File2);
|
|
|
+ localFs.createNewFile(dir2File1);
|
|
|
+ localFs.createNewFile(dir2File2);
|
|
|
+
|
|
|
+ List<Path> expectedPaths = Lists.newArrayList(inFile1, dir1File1,
|
|
|
+ dir1File2, dir2File1, dir2File2);
|
|
|
+ return expectedPaths;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static List<Path> configureTestNestedNonRecursive(Configuration conf,
|
|
|
+ FileSystem localFs) throws IOException {
|
|
|
+ Path base1 = new Path(TEST_ROOT_DIR, "input1");
|
|
|
+ conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
|
|
|
+ localFs.makeQualified(base1).toString());
|
|
|
+ conf.setBoolean(
|
|
|
+ org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE,
|
|
|
+ false);
|
|
|
+ localFs.mkdirs(base1);
|
|
|
+
|
|
|
+ Path inDir1 = new Path(base1, "dir1");
|
|
|
+ Path inDir2 = new Path(base1, "dir2");
|
|
|
+ Path inFile1 = new Path(base1, "file1");
|
|
|
+
|
|
|
+ Path dir1File1 = new Path(inDir1, "file1");
|
|
|
+ Path dir1File2 = new Path(inDir1, "file2");
|
|
|
+
|
|
|
+ Path dir2File1 = new Path(inDir2, "file1");
|
|
|
+ Path dir2File2 = new Path(inDir2, "file2");
|
|
|
+
|
|
|
+ localFs.mkdirs(inDir1);
|
|
|
+ localFs.mkdirs(inDir2);
|
|
|
+
|
|
|
+ localFs.createNewFile(inFile1);
|
|
|
+ localFs.createNewFile(dir1File1);
|
|
|
+ localFs.createNewFile(dir1File2);
|
|
|
+ localFs.createNewFile(dir2File1);
|
|
|
+ localFs.createNewFile(dir2File2);
|
|
|
+
|
|
|
+ List<Path> expectedPaths = Lists.newArrayList(inFile1, inDir1, inDir2);
|
|
|
+ return expectedPaths;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static List<Path> configureTestErrorOnNonExistantDir(Configuration conf,
|
|
|
+ FileSystem localFs) throws IOException {
|
|
|
+ Path base1 = new Path(TEST_ROOT_DIR, "input1");
|
|
|
+ Path base2 = new Path(TEST_ROOT_DIR, "input2");
|
|
|
+ conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
|
|
|
+ localFs.makeQualified(base1) + "," + localFs.makeQualified(base2));
|
|
|
+ conf.setBoolean(
|
|
|
+ org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE,
|
|
|
+ true);
|
|
|
+ localFs.mkdirs(base1);
|
|
|
+
|
|
|
+ Path inFile1 = new Path(base1, "file1");
|
|
|
+ Path inFile2 = new Path(base1, "file2");
|
|
|
+
|
|
|
+ localFs.createNewFile(inFile1);
|
|
|
+ localFs.createNewFile(inFile2);
|
|
|
+
|
|
|
+ List<Path> expectedPaths = Lists.newArrayList();
|
|
|
+ return expectedPaths;
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void verifyFileStatuses(List<Path> expectedPaths,
|
|
|
+ List<FileStatus> fetchedStatuses, final FileSystem localFs) {
|
|
|
+ Assert.assertEquals(expectedPaths.size(), fetchedStatuses.size());
|
|
|
+
|
|
|
+ Iterable<Path> fqExpectedPaths = Iterables.transform(expectedPaths,
|
|
|
+ new Function<Path, Path>() {
|
|
|
+ @Override
|
|
|
+ public Path apply(Path input) {
|
|
|
+ return localFs.makeQualified(input);
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ Set<Path> expectedPathSet = Sets.newHashSet(fqExpectedPaths);
|
|
|
+ for (FileStatus fileStatus : fetchedStatuses) {
|
|
|
+ if (!expectedPathSet.remove(localFs.makeQualified(fileStatus.getPath()))) {
|
|
|
+ Assert.fail("Found extra fetched status: " + fileStatus.getPath());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Assert.assertEquals(
|
|
|
+ "Not all expectedPaths matched: " + expectedPathSet.toString(), 0,
|
|
|
+ expectedPathSet.size());
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ private void verifySplits(List<String> expected, List<InputSplit> splits) {
|
|
|
+ Iterable<String> pathsFromSplits = Iterables.transform(splits,
|
|
|
+ new Function<InputSplit, String>() {
|
|
|
+ @Override
|
|
|
+ public String apply(@Nullable InputSplit input) {
|
|
|
+ return ((FileSplit) input).getPath().toString();
|
|
|
+ }
|
|
|
+ });
|
|
|
+
|
|
|
+ Set<String> expectedSet = Sets.newHashSet(expected);
|
|
|
+ for (String splitPathString : pathsFromSplits) {
|
|
|
+ if (!expectedSet.remove(splitPathString)) {
|
|
|
+ Assert.fail("Found extra split: " + splitPathString);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ Assert.assertEquals(
|
|
|
+ "Not all expectedPaths matched: " + expectedSet.toString(), 0,
|
|
|
+ expectedSet.size());
|
|
|
+ }
|
|
|
+
|
|
|
private Configuration getConfiguration() {
|
|
|
Configuration conf = new Configuration();
|
|
|
conf.set("fs.test.impl.disable.cache", "true");
|