|
@@ -0,0 +1,154 @@
|
|
|
+/**
|
|
|
+ * Licensed to the Apache Software Foundation (ASF) under one
|
|
|
+ * or more contributor license agreements. See the NOTICE file
|
|
|
+ * distributed with this work for additional information
|
|
|
+ * regarding copyright ownership. The ASF licenses this file
|
|
|
+ * to you under the Apache License, Version 2.0 (the
|
|
|
+ * "License"); you may not use this file except in compliance
|
|
|
+ * with the License. You may obtain a copy of the License at
|
|
|
+ *
|
|
|
+ * http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+ *
|
|
|
+ * Unless required by applicable law or agreed to in writing, software
|
|
|
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+ * See the License for the specific language governing permissions and
|
|
|
+ * limitations under the License.
|
|
|
+ */
|
|
|
+
|
|
|
+package org.apache.hadoop.contrib.utils.join;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+
|
|
|
+import junit.framework.Test;
|
|
|
+import junit.framework.TestCase;
|
|
|
+import junit.framework.TestSuite;
|
|
|
+import junit.extensions.TestSetup;
|
|
|
+
|
|
|
+import org.apache.hadoop.conf.Configuration;
|
|
|
+import org.apache.hadoop.dfs.MiniDFSCluster;
|
|
|
+import org.apache.hadoop.fs.FSDataInputStream;
|
|
|
+import org.apache.hadoop.fs.FileStatus;
|
|
|
+import org.apache.hadoop.fs.FileSystem;
|
|
|
+import org.apache.hadoop.fs.Path;
|
|
|
+import org.apache.hadoop.io.LongWritable;
|
|
|
+import org.apache.hadoop.io.SequenceFile;
|
|
|
+import org.apache.hadoop.io.Text;
|
|
|
+import org.apache.hadoop.mapred.*;
|
|
|
+
|
|
|
+public class TestDataJoin extends TestCase {
|
|
|
+
|
|
|
+ private static MiniDFSCluster cluster = null;
|
|
|
+ public static Test suite() {
|
|
|
+ TestSetup setup = new TestSetup(new TestSuite(TestDataJoin.class)) {
|
|
|
+ protected void setUp() throws Exception {
|
|
|
+ Configuration conf = new Configuration();
|
|
|
+ cluster = new MiniDFSCluster(conf, 2, true, null);
|
|
|
+ }
|
|
|
+ protected void tearDown() throws Exception {
|
|
|
+ if (cluster != null) {
|
|
|
+ cluster.shutdown();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+ return setup;
|
|
|
+ }
|
|
|
+
|
|
|
+ public void testDataJoin() throws Exception {
|
|
|
+ final int srcs = 4;
|
|
|
+ JobConf job = new JobConf();
|
|
|
+ Path base = cluster.getFileSystem().makeQualified(new Path("/inner"));
|
|
|
+ Path[] src = writeSimpleSrc(base, job, srcs);
|
|
|
+ job.setInputFormat(SequenceFileInputFormat.class);
|
|
|
+ Path outdir = new Path(base, "out");
|
|
|
+ FileOutputFormat.setOutputPath(job, outdir);
|
|
|
+
|
|
|
+ job.setMapperClass(SampleDataJoinMapper.class);
|
|
|
+ job.setReducerClass(SampleDataJoinReducer.class);
|
|
|
+ job.setMapOutputKeyClass(Text.class);
|
|
|
+ job.setMapOutputValueClass(SampleTaggedMapOutput.class);
|
|
|
+ job.setOutputKeyClass(Text.class);
|
|
|
+ job.setOutputValueClass(Text.class);
|
|
|
+ job.setOutputFormat(TextOutputFormat.class);
|
|
|
+ job.setNumMapTasks(1);
|
|
|
+ job.setNumReduceTasks(1);
|
|
|
+ FileInputFormat.setInputPaths(job, src);
|
|
|
+ try {
|
|
|
+ JobClient.runJob(job);
|
|
|
+ confirmOutput(outdir, job, srcs);
|
|
|
+ } finally {
|
|
|
+ base.getFileSystem(job).delete(base, true);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void confirmOutput(Path out, JobConf job, int srcs)
|
|
|
+ throws IOException {
|
|
|
+ FileSystem fs = out.getFileSystem(job);
|
|
|
+ FileStatus[] outlist = fs.listStatus(out);
|
|
|
+ assertEquals(1, outlist.length);
|
|
|
+ assertTrue(0 < outlist[0].getLen());
|
|
|
+ FSDataInputStream in = fs.open(outlist[0].getPath());
|
|
|
+ LineRecordReader rr = new LineRecordReader(in, 0, Integer.MAX_VALUE, job);
|
|
|
+ LongWritable k = new LongWritable();
|
|
|
+ Text v = new Text();
|
|
|
+ int count = 0;
|
|
|
+ while (rr.next(k, v)) {
|
|
|
+ String[] vals = v.toString().split("\t");
|
|
|
+ assertEquals(srcs + 1, vals.length);
|
|
|
+ int[] ivals = new int[vals.length];
|
|
|
+ for (int i = 0; i < vals.length; ++i)
|
|
|
+ ivals[i] = Integer.parseInt(vals[i]);
|
|
|
+ assertEquals(0, ivals[0] % (srcs * srcs));
|
|
|
+ for (int i = 1; i < vals.length; ++i) {
|
|
|
+ assertEquals((ivals[i] - (i - 1)) * srcs, 10 * ivals[0]);
|
|
|
+ }
|
|
|
+ ++count;
|
|
|
+ }
|
|
|
+ assertEquals(4, count);
|
|
|
+ }
|
|
|
+
|
|
|
+ private static SequenceFile.Writer[] createWriters(Path testdir,
|
|
|
+ JobConf conf, int srcs, Path[] src) throws IOException {
|
|
|
+ for (int i = 0; i < srcs; ++i) {
|
|
|
+ src[i] = new Path(testdir, Integer.toString(i + 10, 36));
|
|
|
+ }
|
|
|
+ SequenceFile.Writer out[] = new SequenceFile.Writer[srcs];
|
|
|
+ for (int i = 0; i < srcs; ++i) {
|
|
|
+ out[i] = new SequenceFile.Writer(testdir.getFileSystem(conf), conf,
|
|
|
+ src[i], Text.class, Text.class);
|
|
|
+ }
|
|
|
+ return out;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static Path[] writeSimpleSrc(Path testdir, JobConf conf,
|
|
|
+ int srcs) throws IOException {
|
|
|
+ SequenceFile.Writer out[] = null;
|
|
|
+ Path[] src = new Path[srcs];
|
|
|
+ try {
|
|
|
+ out = createWriters(testdir, conf, srcs, src);
|
|
|
+ final int capacity = srcs * 2 + 1;
|
|
|
+ Text key = new Text();
|
|
|
+ key.set("ignored");
|
|
|
+ Text val = new Text();
|
|
|
+ for (int k = 0; k < capacity; ++k) {
|
|
|
+ for (int i = 0; i < srcs; ++i) {
|
|
|
+ val.set(Integer.toString(k % srcs == 0 ? k * srcs : k * srcs + i) +
|
|
|
+ "\t" + Integer.toString(10 * k + i));
|
|
|
+ out[i].append(key, val);
|
|
|
+ if (i == k) {
|
|
|
+ // add duplicate key
|
|
|
+ out[i].append(key, val);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } finally {
|
|
|
+ if (out != null) {
|
|
|
+ for (int i = 0; i < srcs; ++i) {
|
|
|
+ if (out[i] != null)
|
|
|
+ out[i].close();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return src;
|
|
|
+ }
|
|
|
+}
|