
MAPREDUCE-3118. Backport Gridmix and Rumen features to branch-0.20-security (Ravi Gummadi via amarrk)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.20-security@1185694 13f79535-47bb-0310-9956-ffa450edef68
Amar Kamat, 14 years ago
parent
commit
e755f16fe4
85 changed files with 10155 additions and 505 deletions
  1. 2 0
      CHANGES.txt
  2. 3 0
      build.xml
  3. 2 0
      src/contrib/build-contrib.xml
  4. 34 4
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/AvgRecordFactory.java
  5. 116 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/ClusterSummarizer.java
  6. 573 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java
  7. 543 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/DistributedCacheEmulator.java
  8. 10 6
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/EchoUserResolver.java
  9. 307 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/ExecutionSummarizer.java
  10. 4 6
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/FileQueue.java
  11. 111 3
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java
  12. 259 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateDistCacheData.java
  13. 267 41
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
  14. 245 15
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java
  15. 44 1
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixKey.java
  16. 57 1
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixRecord.java
  17. 13 1
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/InputStriper.java
  18. 53 7
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobCreator.java
  19. 28 7
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobFactory.java
  20. 2 2
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobMonitor.java
  21. 1 1
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobSubmitter.java
  22. 213 5
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/LoadJob.java
  23. 32 1
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/LoadSplit.java
  24. 25 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Progressive.java
  25. 337 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/PseudoLocalFs.java
  26. 147 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java
  27. 51 31
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RoundRobinUserResolver.java
  28. 5 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/SleepJob.java
  29. 4 3
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Statistics.java
  30. 11 2
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/SubmitterUserResolver.java
  31. 75 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Summarizer.java
  32. 18 11
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/UserResolver.java
  33. 315 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/CumulativeCpuUsageEmulatorPlugin.java
  34. 63 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/ResourceUsageEmulatorPlugin.java
  35. 80 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/ResourceUsageMatcher.java
  36. 258 0
      src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/TotalHeapUsageEmulatorPlugin.java
  37. BIN
      src/contrib/gridmix/src/test/data/wordcount.json.gz
  38. 10 5
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/DebugJobProducer.java
  39. 563 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestCompressionEmulationUtils.java
  40. 498 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestDistCacheEmulation.java
  41. 453 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixMemoryEmulation.java
  42. 3 2
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixRecord.java
  43. 195 54
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java
  44. 371 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSummary.java
  45. 202 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestHighRamJob.java
  46. 233 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestPseudoLocalFs.java
  47. 84 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestRandomTextDataGenerator.java
  48. 612 0
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestResourceUsageEmulators.java
  49. 109 33
      src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestUserResolve.java
  50. 9 0
      src/core/org/apache/hadoop/util/Progress.java
  51. 757 139
      src/docs/src/documentation/content/xdocs/gridmix.xml
  52. 454 0
      src/docs/src/documentation/content/xdocs/rumen.xml
  53. 1 0
      src/docs/src/documentation/content/xdocs/site.xml
  54. 2 2
      src/mapred/org/apache/hadoop/mapred/JobConf.java
  55. 2 2
      src/mapred/org/apache/hadoop/mapred/JobTracker.java
  56. 10 0
      src/mapred/org/apache/hadoop/mapred/Reporter.java
  57. 5 0
      src/mapred/org/apache/hadoop/mapred/Task.java
  58. 6 0
      src/mapred/org/apache/hadoop/mapreduce/StatusReporter.java
  59. 4 0
      src/mapred/org/apache/hadoop/mapreduce/TaskInputOutputContext.java
  60. 4 0
      src/mapred/org/apache/hadoop/mapreduce/lib/map/MultithreadedMapper.java
  61. 11 2
      src/test/org/apache/hadoop/mapred/UtilsForTests.java
  62. 4 0
      src/test/org/apache/hadoop/mapreduce/MapReduceTestUtil.java
  63. 404 52
      src/test/org/apache/hadoop/tools/rumen/TestRumenJobTraces.java
  64. BIN
      src/test/tools/data/rumen/small-trace-test/counters-test-trace.json.gz
  65. BIN
      src/test/tools/data/rumen/small-trace-test/dispatch-trace-output.json.gz
  66. BIN
      src/test/tools/data/rumen/small-trace-test/job-tracker-logs-trace-output.gz
  67. 4 1
      src/test/tools/data/rumen/small-trace-test/sample-conf.file.xml
  68. 3 0
      src/test/tools/data/rumen/small-trace-test/truncated-trace-output
  69. BIN
      src/test/tools/data/rumen/small-trace-test/v20-resource-usage-log.gz
  70. 2 2
      src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java
  71. 1 1
      src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java
  72. 34 0
      src/tools/org/apache/hadoop/tools/rumen/HadoopLogsAnalyzer.java
  73. 10 2
      src/tools/org/apache/hadoop/tools/rumen/JobBuilder.java
  74. 3 18
      src/tools/org/apache/hadoop/tools/rumen/JobConfigurationParser.java
  75. 1 1
      src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java
  76. 51 0
      src/tools/org/apache/hadoop/tools/rumen/LoggedJob.java
  77. 45 0
      src/tools/org/apache/hadoop/tools/rumen/LoggedTaskAttempt.java
  78. 1 1
      src/tools/org/apache/hadoop/tools/rumen/Node.java
  79. 5 0
      src/tools/org/apache/hadoop/tools/rumen/ParsedConfigFile.java
  80. 160 0
      src/tools/org/apache/hadoop/tools/rumen/ResourceUsageMetrics.java
  81. 1 2
      src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java
  82. 14 0
      src/tools/org/apache/hadoop/tools/rumen/TaskInfo.java
  83. 111 36
      src/tools/org/apache/hadoop/tools/rumen/TraceBuilder.java
  84. 18 2
      src/tools/org/apache/hadoop/tools/rumen/ZombieJob.java
  85. 377 0
      src/tools/org/apache/hadoop/tools/rumen/package-info.java

+ 2 - 0
CHANGES.txt

@@ -4,6 +4,8 @@ Release 0.20.206.0 - unreleased
 
   NEW FEATURES
 
+    MAPREDUCE-3118. Backport Gridmix and Rumen features to 
+                    branch-0.20-security (Ravi Gummadi via amarrk)
   BUG FIXES
 
     HDFS-2305. Running multiple 2NNs can result in corrupt file system. (atm)

+ 3 - 0
build.xml

@@ -1291,6 +1291,7 @@
         <packageset dir="${mapred.src.dir}"/>
         <packageset dir="${hdfs.src.dir}"/>        	
     	<packageset dir="${examples.dir}"/>
+    	<packageset dir="${tools.src}"/>
 
     	<packageset dir="src/contrib/streaming/src/java"/>
     	<packageset dir="src/contrib/data_join/src/java"/>
@@ -1371,6 +1372,8 @@
        <packageset dir="src/core"/>
        <packageset dir="src/mapred"/>
        <packageset dir="src/tools"/>
+       <packageset dir="${tools.src}"/>
+       <packageset dir="${tools.src}"/>
        <classpath >
          <path refid="classpath" />
          <path refid="jdiff-classpath" />

+ 2 - 0
src/contrib/build-contrib.xml

@@ -33,6 +33,7 @@
 
   <property name="src.dir"  location="${root}/src/java"/>
   <property name="src.test" location="${root}/src/test"/>
+  <property name="src.test.data" location="${root}/src/test/data"/>
   <!-- Property added for contrib system tests -->
   <property name="src.test.system" location="${root}/src/test/system"/>
 
@@ -289,6 +290,7 @@
       
       <sysproperty key="test.build.data" value="${build.test}/data"/>
       <sysproperty key="build.test" value="${build.test}"/>
+      <sysproperty key="src.test.data" value="${src.test.data}"/>
       <sysproperty key="contrib.name" value="${name}"/>
       
       <!-- requires fork=yes for: 
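
A minimal sketch, assuming a hypothetical TestDataLocator helper, of how a contrib test can pick up the src.test.data system property that the junit task above now forwards:

import java.io.File;

// Hypothetical contrib-test helper: resolve a checked-in data file
// (e.g. a gzipped Rumen trace) via the src.test.data system property
// exported by build-contrib.xml, falling back to the in-tree default
// when the test runs outside of ant.
class TestDataLocator {
  static File testDataFile(String name) {
    String dir = System.getProperty("src.test.data", "src/test/data");
    return new File(dir, name);
  }
}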

+ 34 - 4
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/AvgRecordFactory.java

@@ -40,6 +40,8 @@ class AvgRecordFactory extends RecordFactory {
   private final int keyLen;
   private long accBytes = 0L;
   private long accRecords = 0L;
+  private int unspilledBytes = 0;
+  private int minSpilledBytes = 0;
 
   /**
    * @param targetBytes Expected byte count.
@@ -48,6 +50,14 @@ class AvgRecordFactory extends RecordFactory {
    */
   public AvgRecordFactory(long targetBytes, long targetRecords,
       Configuration conf) {
+    this(targetBytes, targetRecords, conf, 0);
+  }
+  
+  /**
+   * @param minSpilledBytes Minimum amount of data expected per record
+   */
+  public AvgRecordFactory(long targetBytes, long targetRecords,
+      Configuration conf, int minSpilledBytes) {
     this.targetBytes = targetBytes;
     this.targetRecords = targetRecords <= 0 && this.targetBytes >= 0
       ? Math.max(1,
@@ -58,6 +68,7 @@ class AvgRecordFactory extends RecordFactory {
     avgrec = (int) Math.min(Integer.MAX_VALUE, tmp + 1);
     keyLen = Math.max(1,
         (int)(tmp * Math.min(1.0f, conf.getFloat(GRIDMIX_KEY_FRC, 0.1f))));
+    this.minSpilledBytes = minSpilledBytes;
   }
 
   @Override
@@ -67,14 +78,33 @@ class AvgRecordFactory extends RecordFactory {
     }
     final int reclen = accRecords++ >= step ? avgrec - 1 : avgrec;
     final int len = (int) Math.min(targetBytes - accBytes, reclen);
+    
+    unspilledBytes += len;
+    
     // len != reclen?
     if (key != null) {
-      key.setSize(keyLen);
-      val.setSize(len - key.getSize());
+      if (unspilledBytes < minSpilledBytes && accRecords < targetRecords) {
+        key.setSize(1);
+        val.setSize(1);
+        accBytes += key.getSize() + val.getSize();
+        unspilledBytes -= (key.getSize() + val.getSize());
+      } else {
+        key.setSize(keyLen);
+        val.setSize(unspilledBytes - key.getSize());
+        accBytes += unspilledBytes;
+        unspilledBytes = 0;
+      }
     } else {
-      val.setSize(len);
+      if (unspilledBytes < minSpilledBytes && accRecords < targetRecords) {
+        val.setSize(1);
+        accBytes += val.getSize();
+        unspilledBytes -= val.getSize();
+      } else {
+        val.setSize(unspilledBytes);
+        accBytes += unspilledBytes;
+        unspilledBytes = 0;
+      }
     }
-    accBytes += len;
     return true;
   }
 

+ 116 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/ClusterSummarizer.java

@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import org.apache.commons.lang.time.FastDateFormat;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.mapred.JobTracker;
+import org.apache.hadoop.mapred.gridmix.Statistics.ClusterStats;
+
+/**
+ * Summarizes the Hadoop cluster used in this {@link Gridmix} run. 
+ * Statistics that are reported are
+ * <ul>
+ *   <li>Total number of active trackers in the cluster</li>
+ *   <li>Total number of blacklisted trackers in the cluster</li>
+ *   <li>Max map task capacity of the cluster</li>
+ *   <li>Max reduce task capacity of the cluster</li>
+ * </ul>
+ * 
+ * Apart from these statistics, {@link JobTracker} and {@link FileSystem} 
+ * addresses are also recorded in the summary.
+ */
+class ClusterSummarizer implements StatListener<ClusterStats> {
+  static final Log LOG = LogFactory.getLog(ClusterSummarizer.class);
+  
+  private int numBlacklistedTrackers;
+  private int numActiveTrackers;
+  private int maxMapTasks;
+  private int maxReduceTasks;
+  private String jobTrackerInfo = Summarizer.NA;
+  private String namenodeInfo = Summarizer.NA;
+  
+  @Override
+  @SuppressWarnings("deprecation")
+  public void update(ClusterStats item) {
+    try {
+      numBlacklistedTrackers = item.getStatus().getBlacklistedTrackers();
+      numActiveTrackers = item.getStatus().getTaskTrackers();
+      maxMapTasks = item.getStatus().getMaxMapTasks();
+      maxReduceTasks = item.getStatus().getMaxReduceTasks();
+    } catch (Exception e) {
+      long time = System.currentTimeMillis();
+      LOG.info("Error in processing cluster status at " 
+               + FastDateFormat.getInstance().format(time));
+    }
+  }
+  
+  /**
+   * Summarizes the cluster used for this {@link Gridmix} run.
+   */
+  @Override
+  public String toString() {
+    StringBuilder builder = new StringBuilder();
+    builder.append("Cluster Summary:-");
+    builder.append("\nJobTracker: ").append(getJobTrackerInfo());
+    builder.append("\nFileSystem: ").append(getNamenodeInfo());
+    builder.append("\nNumber of blacklisted trackers: ")
+           .append(getNumBlacklistedTrackers());
+    builder.append("\nNumber of active trackers: ")
+           .append(getNumActiveTrackers());
+    builder.append("\nMax map task capacity: ")
+           .append(getMaxMapTasks());
+    builder.append("\nMax reduce task capacity: ").append(getMaxReduceTasks());
+    builder.append("\n\n");
+    return builder.toString();
+  }
+  
+  void start(Configuration conf) {
+    jobTrackerInfo = conf.get("mapred.job.tracker");
+    namenodeInfo = conf.get(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY);
+  }
+  
+  // Getters
+  protected int getNumBlacklistedTrackers() {
+    return numBlacklistedTrackers;
+  }
+  
+  protected int getNumActiveTrackers() {
+    return numActiveTrackers;
+  }
+  
+  protected int getMaxMapTasks() {
+    return maxMapTasks;
+  }
+  
+  protected int getMaxReduceTasks() {
+    return maxReduceTasks;
+  }
+  
+  protected String getJobTrackerInfo() {
+    return jobTrackerInfo;
+  }
+  
+  protected String getNamenodeInfo() {
+    return namenodeInfo;
+  }
+}
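
A minimal usage sketch; ClusterSummarizer is package-private and is normally driven by the Statistics module as a StatListener<ClusterStats>, so the standalone caller below is hypothetical:

package org.apache.hadoop.mapred.gridmix;

import org.apache.hadoop.conf.Configuration;

class ClusterSummaryExample {
  static String summarize(Configuration conf) {
    ClusterSummarizer summarizer = new ClusterSummarizer();
    // Record the JobTracker and default FileSystem addresses up front.
    summarizer.start(conf);
    // In a real run, Statistics pushes cluster status snapshots through
    // summarizer.update(ClusterStats) before the summary is printed.
    return summarizer.toString();   // starts with "Cluster Summary:-"
  }
}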

+ 573 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java

@@ -0,0 +1,573 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CodecPool;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.CompressionInputStream;
+import org.apache.hadoop.io.compress.Decompressor;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
+import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics;
+import org.apache.hadoop.mapred.gridmix.GenerateData.GenDataFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * This is a utility class for all the compression related modules.
+ */
+class CompressionEmulationUtil {
+  static final Log LOG = LogFactory.getLog(CompressionEmulationUtil.class);
+  
+  /**
+   * Enable compression usage in GridMix runs.
+   */
+  private static final String COMPRESSION_EMULATION_ENABLE = 
+    "gridmix.compression-emulation.enable";
+  
+  /**
+   * Enable input data decompression.
+   */
+  private static final String INPUT_DECOMPRESSION_EMULATION_ENABLE = 
+    "gridmix.compression-emulation.input-decompression.enable";
+  
+  /**
+   * Configuration property for setting the compression ratio for map input 
+   * data.
+   */
+  private static final String GRIDMIX_MAP_INPUT_COMPRESSION_RATIO = 
+    "gridmix.compression-emulation.map-input.decompression-ratio";
+  
+  /**
+   * Configuration property for setting the compression ratio of map output.
+   */
+  private static final String GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO = 
+    "gridmix.compression-emulation.map-output.compression-ratio";
+  
+  /**
+   * Configuration property for setting the compression ratio of reduce output.
+   */
+  private static final String GRIDMIX_REDUCE_OUTPUT_COMPRESSION_RATIO = 
+    "gridmix.compression-emulation.reduce-output.compression-ratio";
+  
+  /**
+   * Default compression ratio.
+   */
+  static final float DEFAULT_COMPRESSION_RATIO = 0.5F;
+  
+  private static final CompressionRatioLookupTable COMPRESSION_LOOKUP_TABLE = 
+    new CompressionRatioLookupTable();
+  
+  /**
+   * This is a {@link Mapper} implementation for generating random text data.
+   * It uses {@link RandomTextDataGenerator} for generating text data and the
+   * output files are compressed.
+   */
+  public static class RandomTextDataMapper
+  extends Mapper<NullWritable, LongWritable, Text, Text> {
+    private RandomTextDataGenerator rtg;
+
+    @Override
+    protected void setup(Context context)
+        throws IOException, InterruptedException {
+      Configuration conf = context.getConfiguration();
+      int listSize = 
+        RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf);
+      int wordSize = 
+        RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf);
+      rtg = new RandomTextDataGenerator(listSize, wordSize);
+    }
+    
+    /**
+     * Emits random words sequence of desired size. Note that the desired output
+     * size is passed as the value parameter to this map.
+     */
+    @Override
+    public void map(NullWritable key, LongWritable value, Context context)
+    throws IOException, InterruptedException {
+      //TODO Control the extra data written ..
+      //TODO Should the key\tvalue\n be considered for measuring size?
+      //     Can counters like BYTES_WRITTEN be used? What will be the value of
+      //     such counters in LocalJobRunner?
+      for (long bytes = value.get(); bytes > 0;) {
+        String randomKey = rtg.getRandomWord();
+        String randomValue = rtg.getRandomWord();
+        context.write(new Text(randomKey), new Text(randomValue));
+        bytes -= (randomValue.getBytes().length + randomKey.getBytes().length);
+      }
+    }
+  }
+  
+  /**
+   * Configure the {@link Job} for enabling compression emulation.
+   */
+  static void configure(final Job job) throws IOException, InterruptedException,
+                                              ClassNotFoundException {
+    // set the random text mapper
+    job.setMapperClass(RandomTextDataMapper.class);
+    job.setNumReduceTasks(0);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Text.class);
+    job.setInputFormatClass(GenDataFormat.class);
+    job.setJarByClass(GenerateData.class);
+
+    // set the output compression true
+    FileOutputFormat.setCompressOutput(job, true);
+    try {
+      FileInputFormat.addInputPath(job, new Path("ignored"));
+    } catch (IOException e) {
+      LOG.error("Error while adding input path ", e);
+    }
+  }
+
+  /**
+   * This is the lookup table for mapping compression ratio to the size of the 
+   * word in the {@link RandomTextDataGenerator}'s dictionary. 
+   * 
+   * Note that this table is computed (empirically) using a dictionary of 
+   * default length i.e {@value RandomTextDataGenerator#DEFAULT_LIST_SIZE}.
+   */
+  private static class CompressionRatioLookupTable {
+    private static Map<Float, Integer> map = new HashMap<Float, Integer>(60);
+    private static final float MIN_RATIO = 0.07F;
+    private static final float MAX_RATIO = 0.68F;
+    
+    // add the empirically obtained data points in the lookup table
+    CompressionRatioLookupTable() {
+      map.put(.07F,30);
+      map.put(.08F,25);
+      map.put(.09F,60);
+      map.put(.10F,20);
+      map.put(.11F,70);
+      map.put(.12F,15);
+      map.put(.13F,80);
+      map.put(.14F,85);
+      map.put(.15F,90);
+      map.put(.16F,95);
+      map.put(.17F,100);
+      map.put(.18F,105);
+      map.put(.19F,110);
+      map.put(.20F,115);
+      map.put(.21F,120);
+      map.put(.22F,125);
+      map.put(.23F,130);
+      map.put(.24F,140);
+      map.put(.25F,145);
+      map.put(.26F,150);
+      map.put(.27F,155);
+      map.put(.28F,160);
+      map.put(.29F,170);
+      map.put(.30F,175);
+      map.put(.31F,180);
+      map.put(.32F,190);
+      map.put(.33F,195);
+      map.put(.34F,205);
+      map.put(.35F,215);
+      map.put(.36F,225);
+      map.put(.37F,230);
+      map.put(.38F,240);
+      map.put(.39F,250);
+      map.put(.40F,260);
+      map.put(.41F,270);
+      map.put(.42F,280);
+      map.put(.43F,295);
+      map.put(.44F,310);
+      map.put(.45F,325);
+      map.put(.46F,335);
+      map.put(.47F,355);
+      map.put(.48F,375);
+      map.put(.49F,395);
+      map.put(.50F,420);
+      map.put(.51F,440);
+      map.put(.52F,465);
+      map.put(.53F,500);
+      map.put(.54F,525);
+      map.put(.55F,550);
+      map.put(.56F,600);
+      map.put(.57F,640);
+      map.put(.58F,680);
+      map.put(.59F,734);
+      map.put(.60F,813);
+      map.put(.61F,905);
+      map.put(.62F,1000);
+      map.put(.63F,1055);
+      map.put(.64F,1160);
+      map.put(.65F,1355);
+      map.put(.66F,1510);
+      map.put(.67F,1805);
+      map.put(.68F,2170);
+    }
+    
+    /**
+     * Returns the size of the word in {@link RandomTextDataGenerator}'s 
+     * dictionary that can generate text with the desired compression ratio.
+     * 
+     * @throws RuntimeException If ratio is less than {@value #MIN_RATIO} or 
+     *                          greater than {@value #MAX_RATIO}.
+     */
+    int getWordSizeForRatio(float ratio) {
+      ratio = standardizeCompressionRatio(ratio);
+      if (ratio >= MIN_RATIO && ratio <= MAX_RATIO) {
+        return map.get(ratio);
+      } else {
+        throw new RuntimeException("Compression ratio should be in the range [" 
+          + MIN_RATIO + "," + MAX_RATIO + "]. Configured compression ratio is " 
+          + ratio + ".");
+      }
+    }
+  }
+  
+  /**
+   * Setup the data generator's configuration to generate compressible random 
+   * text data with the desired compression ratio.
+   * Note that the compression ratio, if configured, will set the 
+   * {@link RandomTextDataGenerator}'s list-size and word-size based on 
+   * empirical values using the compression ratio set in the configuration. 
+   * 
+   * Hence to achieve the desired compression ratio, 
+   * {@link RandomTextDataGenerator}'s list-size will be set to the default 
+   * value i.e {@value RandomTextDataGenerator#DEFAULT_LIST_SIZE}.
+   */
+  static void setupDataGeneratorConfig(Configuration conf) {
+    boolean compress = isCompressionEmulationEnabled(conf);
+    if (compress) {
+      float ratio = getMapInputCompressionEmulationRatio(conf);
+      LOG.info("GridMix is configured to generate compressed input data with "
+               + " a compression ratio of " + ratio);
+      int wordSize = COMPRESSION_LOOKUP_TABLE.getWordSizeForRatio(ratio);
+      RandomTextDataGenerator.setRandomTextDataGeneratorWordSize(conf, 
+                                                                 wordSize);
+
+      // since the compression ratios are computed using the default value of 
+      // list size
+      RandomTextDataGenerator.setRandomTextDataGeneratorListSize(conf, 
+          RandomTextDataGenerator.DEFAULT_LIST_SIZE);
+    }
+  }
+  
+  /**
+   * Returns a {@link RandomTextDataGenerator} that generates random 
+   * compressible text with the desired compression ratio.
+   */
+  static RandomTextDataGenerator getRandomTextDataGenerator(float ratio, 
+                                                            long seed) {
+    int wordSize = COMPRESSION_LOOKUP_TABLE.getWordSizeForRatio(ratio);
+    RandomTextDataGenerator rtg = 
+      new RandomTextDataGenerator(RandomTextDataGenerator.DEFAULT_LIST_SIZE, 
+            seed, wordSize);
+    return rtg;
+  }
+  
+  /** Publishes compression related data statistics. Following statistics are
+   * published
+   * <ul>
+   *   <li>Total compressed input data size</li>
+   *   <li>Number of compressed input data files</li>
+   *   <li>Compression Ratio</li>
+   *   <li>Text data dictionary size</li>
+   *   <li>Random text word size</li>
+   * </ul>
+   */
+  static DataStatistics publishCompressedDataStatistics(Path inputDir, 
+                          Configuration conf, long uncompressedDataSize) 
+  throws IOException {
+    FileSystem fs = inputDir.getFileSystem(conf);
+    CompressionCodecFactory compressionCodecs = 
+      new CompressionCodecFactory(conf);
+
+    // iterate over compressed files and sum up the compressed file sizes
+    long compressedDataSize = 0;
+    int numCompressedFiles = 0;
+    // obtain input data file statuses
+    FileStatus[] outFileStatuses = 
+      fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
+    for (FileStatus status : outFileStatuses) {
+      // check if the input file is compressed
+      if (compressionCodecs != null) {
+        CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
+        if (codec != null) {
+          ++numCompressedFiles;
+          compressedDataSize += status.getLen();
+        }
+      }
+    }
+
+    LOG.info("Gridmix is configured to use compressed input data.");
+    // publish the input data size
+    LOG.info("Total size of compressed input data : " 
+             + StringUtils.humanReadableInt(compressedDataSize));
+    LOG.info("Total number of compressed input data files : " 
+             + numCompressedFiles);
+
+    if (numCompressedFiles == 0) {
+      throw new RuntimeException("No compressed file found in the input" 
+          + " directory : " + inputDir.toString() + ". To enable compression"
+          + " emulation, run Gridmix either with "
+          + " an input directory containing compressed input file(s) or" 
+          + " use the -generate option to (re)generate it. If compression"
+          + " emulation is not desired, disable it by setting '" 
+          + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
+    }
+    
+    // publish compression ratio only if its generated in this gridmix run
+    if (uncompressedDataSize > 0) {
+      // compute the compression ratio
+      double ratio = ((double)compressedDataSize) / uncompressedDataSize;
+
+      // publish the compression ratio
+      LOG.info("Input Data Compression Ratio : " + ratio);
+    }
+    
+    return new DataStatistics(compressedDataSize, numCompressedFiles, true);
+  }
+  
+  /**
+   * Enables/Disables compression emulation.
+   * @param conf Target configuration where the parameter 
+   * {@value #COMPRESSION_EMULATION_ENABLE} will be set. 
+   * @param val The value to be set.
+   */
+  static void setCompressionEmulationEnabled(Configuration conf, boolean val) {
+    conf.setBoolean(COMPRESSION_EMULATION_ENABLE, val);
+  }
+  
+  /**
+   * Checks if compression emulation is enabled or not. Default is {@code true}.
+   */
+  static boolean isCompressionEmulationEnabled(Configuration conf) {
+    return conf.getBoolean(COMPRESSION_EMULATION_ENABLE, true);
+  }
+  
+  /**
+   * Enables/Disables input decompression emulation.
+   * @param conf Target configuration where the parameter 
+   * {@value #INPUT_DECOMPRESSION_EMULATION_ENABLE} will be set. 
+   * @param val The value to be set.
+   */
+  static void setInputCompressionEmulationEnabled(Configuration conf, 
+                                                  boolean val) {
+    conf.setBoolean(INPUT_DECOMPRESSION_EMULATION_ENABLE, val);
+  }
+  
+  /**
+   * Check if input decompression emulation is enabled or not. 
+   * Default is {@code false}.
+   */
+  static boolean isInputCompressionEmulationEnabled(Configuration conf) {
+    return conf.getBoolean(INPUT_DECOMPRESSION_EMULATION_ENABLE, false);
+  }
+  
+  /**
+   * Set the map input data compression ratio in the given conf.
+   */
+  static void setMapInputCompressionEmulationRatio(Configuration conf, 
+                                                   float ratio) {
+    conf.setFloat(GRIDMIX_MAP_INPUT_COMPRESSION_RATIO, ratio);
+  }
+  
+  /**
+   * Get the map input data compression ratio using the given configuration.
+   * If the compression ratio is not set in the configuration then use the 
+   * default value i.e {@value #DEFAULT_COMPRESSION_RATIO}.
+   */
+  static float getMapInputCompressionEmulationRatio(Configuration conf) {
+    return conf.getFloat(GRIDMIX_MAP_INPUT_COMPRESSION_RATIO, 
+                         DEFAULT_COMPRESSION_RATIO);
+  }
+  
+  /**
+   * Set the map output data compression ratio in the given configuration.
+   */
+  static void setMapOutputCompressionEmulationRatio(Configuration conf, 
+                                                    float ratio) {
+    conf.setFloat(GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO, ratio);
+  }
+  
+  /**
+   * Get the map output data compression ratio using the given configuration.
+   * If the compression ratio is not set in the configuration then use the 
+   * default value i.e {@value #DEFAULT_COMPRESSION_RATIO}.
+   */
+  static float getMapOutputCompressionEmulationRatio(Configuration conf) {
+    return conf.getFloat(GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO, 
+                         DEFAULT_COMPRESSION_RATIO);
+  }
+  
+  /**
+   * Set the reduce output data compression ratio in the given configuration.
+   */
+  static void setReduceOutputCompressionEmulationRatio(Configuration conf, 
+                                                       float ratio) {
+    conf.setFloat(GRIDMIX_REDUCE_OUTPUT_COMPRESSION_RATIO, ratio);
+  }
+  
+  /**
+   * Get the reduce output data compression ratio using the given configuration.
+   * If the compression ratio is not set in the configuration then use the 
+   * default value i.e {@value #DEFAULT_COMPRESSION_RATIO}.
+   */
+  static float getReduceOutputCompressionEmulationRatio(Configuration conf) {
+    return conf.getFloat(GRIDMIX_REDUCE_OUTPUT_COMPRESSION_RATIO, 
+                         DEFAULT_COMPRESSION_RATIO);
+  }
+  
+  /**
+   * Standardize the compression ratio i.e round off the compression ratio to
+   * only 2 significant digits.
+   */
+  static float standardizeCompressionRatio(float ratio) {
+    // round off to 2 significant digits
+    int significant = (int)Math.round(ratio * 100);
+    return ((float)significant)/100;
+  }
+  
+  /**
+   * Returns a {@link InputStream} for a file that might be compressed.
+   */
+  static InputStream getPossiblyDecompressedInputStream(Path file, 
+                                                        Configuration conf,
+                                                        long offset)
+  throws IOException {
+    FileSystem fs = file.getFileSystem(conf);
+    if (isCompressionEmulationEnabled(conf)
+        && isInputCompressionEmulationEnabled(conf)) {
+      CompressionCodecFactory compressionCodecs = 
+        new CompressionCodecFactory(conf);
+      CompressionCodec codec = compressionCodecs.getCodec(file);
+      if (codec != null) {
+        Decompressor decompressor = CodecPool.getDecompressor(codec);
+        if (decompressor != null) {
+          CompressionInputStream in = 
+            codec.createInputStream(fs.open(file), decompressor);
+          //TODO Seek doesn't work with compressed input stream.
+          //     Use SplittableCompressionCodec?
+          return (InputStream)in;
+        }
+      }
+    }
+    FSDataInputStream in = fs.open(file);
+    in.seek(offset);
+    return (InputStream)in;
+  }
+  
+  /**
+   * Returns a {@link OutputStream} for a file that might need 
+   * compression.
+   */
+  static OutputStream getPossiblyCompressedOutputStream(Path file, 
+                                                        Configuration conf)
+  throws IOException {
+    FileSystem fs = file.getFileSystem(conf);
+    JobConf jConf = new JobConf(conf);
+    if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
+      // get the codec class
+      Class<? extends CompressionCodec> codecClass =
+        org.apache.hadoop.mapred.FileOutputFormat
+                                .getOutputCompressorClass(jConf, 
+                                                          GzipCodec.class);
+      // get the codec implementation
+      CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
+
+      // add the appropriate extension
+      file = file.suffix(codec.getDefaultExtension());
+
+      if (isCompressionEmulationEnabled(conf)) {
+        FSDataOutputStream fileOut = fs.create(file, false);
+        return new DataOutputStream(codec.createOutputStream(fileOut));
+      }
+    }
+    return fs.create(file, false);
+  }
+  
+  /**
+   * Extracts compression/decompression related configuration parameters from 
+   * the source configuration to the target configuration.
+   */
+  static void configureCompressionEmulation(Configuration source, 
+                                            Configuration target) {
+    // enable output compression
+    target.setBoolean("mapred.output.compress",
+    		source.getBoolean("mapred.output.compress", false));
+
+    // set the job output compression codec
+    String jobOutputCompressionCodec = 
+      source.get("mapred.output.compression.codec");
+    if (jobOutputCompressionCodec != null) {
+      target.set("mapred.output.compression.codec", jobOutputCompressionCodec);
+    }
+
+    // set the job output compression type
+    String jobOutputCompressionType = 
+      source.get("mapred.output.compression.type");
+    if (jobOutputCompressionType != null) {
+      target.set("mapred.output.compression.type", jobOutputCompressionType);
+    }
+
+    // enable map output compression
+    target.setBoolean("mapred.compress.map.output",
+        source.getBoolean("mapred.compress.map.output", false));
+
+    // set the map output compression codecs
+    String mapOutputCompressionCodec = 
+      source.get("mapred.map.output.compression.codec");
+    if (mapOutputCompressionCodec != null) {
+      target.set("mapred.map.output.compression.codec", 
+                 mapOutputCompressionCodec);
+    }
+
+    // enable input decompression
+    //TODO replace with mapInputBytes and hdfsBytesRead
+    Path[] inputs = 
+      org.apache.hadoop.mapred.FileInputFormat
+         .getInputPaths(new JobConf(source));
+    boolean needsCompressedInput = false;
+    CompressionCodecFactory compressionCodecs = 
+      new CompressionCodecFactory(source);
+    for (Path input : inputs) {
+      CompressionCodec codec = compressionCodecs.getCodec(input);
+      if (codec != null) {
+        needsCompressedInput = true;
+      }
+    }
+    setInputCompressionEmulationEnabled(target, needsCompressedInput);
+  }
+}
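
The emulation knobs above are plain configuration properties with package-private setters; a sketch (hypothetical caller inside the gridmix package) of wiring them up before data generation:

package org.apache.hadoop.mapred.gridmix;

import org.apache.hadoop.conf.Configuration;

class CompressionEmulationExample {
  static Configuration configure() {
    Configuration conf = new Configuration();
    // Compression emulation defaults to enabled; set it explicitly and
    // target a map-input ratio of 0.40 and a map-output ratio of 0.50
    // (compressed size / uncompressed size).
    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
    CompressionEmulationUtil.setMapInputCompressionEmulationRatio(conf, 0.40F);
    CompressionEmulationUtil.setMapOutputCompressionEmulationRatio(conf, 0.50F);
    // Picks a RandomTextDataGenerator word size from the empirical lookup
    // table (supported ratios: 0.07 to 0.68) so generated text compresses
    // at roughly the requested map-input ratio.
    CompressionEmulationUtil.setupDataGeneratorConfig(conf);
    return conf;
  }
}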

+ 543 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/DistributedCacheEmulator.java

@@ -0,0 +1,543 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred.gridmix;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.tools.rumen.JobStory;
+import org.apache.hadoop.tools.rumen.JobStoryProducer;
+import org.apache.hadoop.tools.rumen.Pre21JobHistoryConstants;
+
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Emulation of Distributed Cache Usage in gridmix.
+ * <br> Emulation of Distributed Cache Load in gridmix will put load on
+ * TaskTrackers and affects execution time of tasks because of localization of
+ * distributed cache files by TaskTrackers.
+ * <br> Gridmix creates distributed cache files for simulated jobs by launching
+ * a MapReduce job {@link GenerateDistCacheData} in advance i.e. before
+ * launching simulated jobs.
+ * <br> The distributed cache file paths used in the original cluster are mapped
+ * to unique file names in the simulated cluster.
+ * <br> All HDFS-based distributed cache files generated by gridmix are
+ * public distributed cache files. But Gridmix makes sure that load incurred due
+ * to localization of private distributed cache files on the original cluster
+ * is also faithfully simulated. Gridmix emulates the load due to private
+ * distributed cache files by mapping private distributed cache files of
+ * different users in the original cluster to different public distributed cache
+ * files in the simulated cluster.
+ *
+ * <br> The configuration properties like
+ * {@link DistributedCache#CACHE_FILES},
+ * {@link JobContext#CACHE_FILE_VISIBILITIES},
+ * {@link DistributedCache#CACHE_FILES_SIZES} and
+ * {@link DistributedCache#CACHE_FILES_TIMESTAMPS} obtained from trace are used
+ * to decide
+ * <li> file size of each distributed cache file to be generated
+ * <li> whether a distributed cache file is already seen in this trace file
+ * <li> whether a distributed cache file was considered public or private.
+ * <br>
+ * <br> Gridmix configures these generated files as distributed cache files for
+ * the simulated jobs.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+class DistributedCacheEmulator {
+  private static final Log LOG =
+      LogFactory.getLog(DistributedCacheEmulator.class);
+
+  static final long AVG_BYTES_PER_MAP = 128 * 1024 * 1024L;// 128MB
+
+  // If at least 1 distributed cache file is missing in the expected
+  // distributed cache dir, Gridmix cannot proceed with emulation of
+  // distributed cache load.
+  int MISSING_DIST_CACHE_FILES_ERROR = 1;
+
+  private Path distCachePath;
+
+  /**
+   * Map between simulated cluster's distributed cache file paths and their
+   * file sizes. Unique distributed cache files are entered into this map.
+   * 2 distributed cache files are considered same if and only if their
+   * file paths, visibilities and timestamps are same.
+   */
+  private Map<String, Long> distCacheFiles = new HashMap<String, Long>();
+
+  /**
+   * Configuration property for whether gridmix should emulate
+   * distributed cache usage or not. Default value is true.
+   */
+  static final String GRIDMIX_EMULATE_DISTRIBUTEDCACHE =
+      "gridmix.distributed-cache-emulation.enable";
+
+  // Whether to emulate distributed cache usage or not
+  boolean emulateDistributedCache = true;
+
+  // Whether to generate distributed cache data or not
+  boolean generateDistCacheData = false;
+
+  Configuration conf; // gridmix configuration
+
+  // Pseudo local file system where local FS based distributed cache files are
+  // created by gridmix.
+  FileSystem pseudoLocalFs = null;
+
+  /**
+   * @param conf gridmix configuration
+   * @param ioPath &lt;ioPath&gt;/distributedCache/ is the gridmix Distributed
+   *               Cache directory
+   */
+  public DistributedCacheEmulator(Configuration conf, Path ioPath) {
+    this.conf = conf;
+    distCachePath = new Path(ioPath, "distributedCache");
+    this.conf.setClass("fs.pseudo.impl", PseudoLocalFs.class, FileSystem.class);
+  }
+
+  /**
+   * This is to be called before any other method of DistributedCacheEmulator.
+   * <br> Checks if emulation of distributed cache load is needed and is feasible.
+   *  Sets the flags generateDistCacheData and emulateDistributedCache to the
+   *  appropriate values.
+   * <br> Gridmix does not emulate distributed cache load if
+   * <ol><li> the specific gridmix job type doesn't need emulation of
+   * distributed cache load OR
+   * <li> the trace is coming from a stream instead of file OR
+   * <li> the distributed cache dir where distributed cache data is to be
+   * generated by gridmix is on local file system OR
+   * <li> execute permission is not there for any of the ascendant directories
+   * of &lt;ioPath&gt; till root. This is because for emulation of distributed
+   * cache load, distributed cache files created under
+   * &lt;ioPath/distributedCache/public/&gt; should be considered by hadoop
+   * as public distributed cache files.
+   * <li> creation of pseudo local file system fails.</ol>
+   * <br> For (2), (3), (4) and (5), generation of distributed cache data
+   * is also disabled.
+   * 
+   * @param traceIn trace file path. If this is '-', then trace comes from the
+   *                stream stdin.
+   * @param jobCreator job creator of gridmix jobs of a specific type
+   * @param generate  true if -generate option was specified
+   * @throws IOException
+   */
+  void init(String traceIn, JobCreator jobCreator, boolean generate)
+      throws IOException {
+    emulateDistributedCache = jobCreator.canEmulateDistCacheLoad()
+        && conf.getBoolean(GRIDMIX_EMULATE_DISTRIBUTEDCACHE, true);
+    generateDistCacheData = generate;
+
+    if (generateDistCacheData || emulateDistributedCache) {
+      if ("-".equals(traceIn)) {// trace is from stdin
+        LOG.warn("Gridmix will not emulate Distributed Cache load because "
+            + "the input trace source is a stream instead of file.");
+        emulateDistributedCache = generateDistCacheData = false;
+      } else if (FileSystem.getLocal(conf).getUri().getScheme().equals(
+          distCachePath.toUri().getScheme())) {// local FS
+        LOG.warn("Gridmix will not emulate Distributed Cache load because "
+            + "<iopath> provided is on local file system.");
+        emulateDistributedCache = generateDistCacheData = false;
+      } else {
+        // Check if execute permission is there for all the ascendant
+        // directories of distCachePath till root.
+        FileSystem fs = FileSystem.get(conf);
+        Path cur = distCachePath.getParent();
+        while (cur != null) {
+          if (cur.toString().length() > 0) {
+            FsPermission perm = fs.getFileStatus(cur).getPermission();
+            if (!perm.getOtherAction().and(FsAction.EXECUTE).equals(
+                FsAction.EXECUTE)) {
+              LOG.warn("Gridmix will not emulate Distributed Cache load "
+                  + "because the ascendant directory (of distributed cache "
+                  + "directory) " + cur + " doesn't have execute permission "
+                  + "for others.");
+              emulateDistributedCache = generateDistCacheData = false;
+              break;
+            }
+          }
+          cur = cur.getParent();
+        }
+      }
+    }
+
+    // Check if pseudo local file system can be created
+    try {
+      pseudoLocalFs = FileSystem.get(new URI("pseudo:///"), conf);
+    } catch (URISyntaxException e) {
+      LOG.warn("Gridmix will not emulate Distributed Cache load because "
+          + "creation of pseudo local file system failed.");
+      e.printStackTrace();
+      emulateDistributedCache = generateDistCacheData = false;
+      return;
+    }
+  }
+
+  /**
+   * @return true if gridmix should emulate distributed cache load
+   */
+  boolean shouldEmulateDistCacheLoad() {
+    return emulateDistributedCache;
+  }
+
+  /**
+   * @return true if gridmix should generate distributed cache data
+   */
+  boolean shouldGenerateDistCacheData() {
+    return generateDistCacheData;
+  }
+
+  /**
+   * @return the distributed cache directory path
+   */
+  Path getDistributedCacheDir() {
+    return distCachePath;
+  }
+
+  /**
+   * Create distributed cache directories.
+   * Also create a file that contains the list of distributed cache files
+   * that will be used as distributed cache files for all the simulated jobs.
+   * @param jsp job story producer for the trace
+   * @return exit code
+   * @throws IOException
+   */
+  int setupGenerateDistCacheData(JobStoryProducer jsp)
+      throws IOException {
+
+    createDistCacheDirectory();
+    return buildDistCacheFilesList(jsp);
+  }
+
+  /**
+   * Create distributed cache directory where distributed cache files will be
+   * created by the MapReduce job {@link GenerateDistCacheData#JOB_NAME}.
+   * @throws IOException
+   */
+  private void createDistCacheDirectory() throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    FileSystem.mkdirs(fs, distCachePath, new FsPermission((short) 0777));
+  }
+
+  /**
+   * Create the list of unique distributed cache files needed for all the
+   * simulated jobs and write the list to a special file.
+   * @param jsp job story producer for the trace
+   * @return exit code
+   * @throws IOException
+   */
+  private int buildDistCacheFilesList(JobStoryProducer jsp) throws IOException {
+    // Read all the jobs from the trace file and build the list of unique
+    // distributed cache files.
+    JobStory jobStory;
+    while ((jobStory = jsp.getNextJob()) != null) {
+      if (jobStory.getOutcome() == Pre21JobHistoryConstants.Values.SUCCESS && 
+         jobStory.getSubmissionTime() >= 0) {
+        updateHDFSDistCacheFilesList(jobStory);
+      }
+    }
+    jsp.close();
+
+    return writeDistCacheFilesList();
+  }
+
+  /**
+   * For the job to be simulated, identify the needed distributed cache files by
+   * mapping original cluster's distributed cache file paths to the simulated cluster's
+   * paths and add these paths in the map {@code distCacheFiles}.
+   *<br>
+   * JobStory should contain distributed cache related properties like
+   * <li> {@link DistributedCache#CACHE_FILES}
+   * <li> {@link JobContext#CACHE_FILE_VISIBILITIES}
+   * <li> {@link DistributedCache#CACHE_FILES_SIZES}
+   * <li> {@link DistributedCache#CACHE_FILES_TIMESTAMPS}
+   * <li> {@link DistributedCache#CLASSPATH_FILES}
+   *
+   * <li> {@link DistributedCache#CACHE_ARCHIVES}
+   * <li> {@link JobContext#CACHE_ARCHIVES_VISIBILITIES}
+   * <li> {@link DistributedCache#CACHE_ARCHIVES_SIZES}
+   * <li> {@link DistributedCache#CACHE_ARCHIVES_TIMESTAMPS}
+   * <li> {@link DistributedCache#CLASSPATH_ARCHIVES}
+   *
+   * <li> {@link DistributedCache#CACHE_SYMLINK}
+   *
+   * @param jobdesc JobStory of original job obtained from trace
+   * @throws IOException
+   */
+  void updateHDFSDistCacheFilesList(JobStory jobdesc) throws IOException {
+
+    // Map original job's distributed cache file paths to simulated cluster's
+    // paths, to be used by this simulated job.
+    JobConf jobConf = jobdesc.getJobConf();
+
+    String[] files = jobConf.getStrings(DistributedCache.CACHE_FILES);
+    if (files != null) {
+
+      String[] fileSizes = jobConf.getStrings(
+                               DistributedCache.CACHE_FILES_SIZES);
+      String[] visibilities =
+        jobConf.getStrings(JobContext.CACHE_FILE_VISIBILITIES);
+      String[] timeStamps =
+        jobConf.getStrings(DistributedCache.CACHE_FILES_TIMESTAMPS);
+
+      FileSystem fs = FileSystem.get(conf);
+      String user = jobConf.getUser();
+      for (int i = 0; i < files.length; i++) {
+        // Check if visibilities are available because older hadoop versions
+        // didn't have public, private Distributed Caches separately.
+        boolean visibility =
+            (visibilities == null) ? true : Boolean.valueOf(visibilities[i]);
+        if (isLocalDistCacheFile(files[i], user, visibility)) {
+          // local FS based distributed cache file.
+          // Create this file on the pseudo local FS on the fly (i.e. when the
+          // simulated job is submitted).
+          continue;
+        }
+        // distributed cache file on hdfs
+        String mappedPath = mapDistCacheFilePath(files[i], timeStamps[i],
+                                                 visibility, user);
+
+        // No need to add a distributed cache file path to the list if
+        // (1) the mapped path is already there in the list OR
+        // (2) the file with the mapped path already exists.
+        // In any of the above 2 cases, file paths, timestamps, file sizes and
+        // visibilities match. File sizes should match if file paths and
+        // timestamps match because single file path with single timestamp
+        // should correspond to a single file size.
+        if (distCacheFiles.containsKey(mappedPath) ||
+            fs.exists(new Path(mappedPath))) {
+          continue;
+        }
+        distCacheFiles.put(mappedPath, Long.valueOf(fileSizes[i]));
+      }
+    }
+  }
+
+  /**
+   * Check if the file path provided was constructed by MapReduce for a
+   * distributed cache file on local file system.
+   * @param filePath path of the distributed cache file
+   * @param user job submitter of the job for which &lt;filePath&gt; is a
+   *             distributed cache file
+   * @param visibility <code>true</code> for public distributed cache file
+   * @return true if the path provided is of a local file system based
+   *              distributed cache file
+   */
+  private boolean isLocalDistCacheFile(String filePath, String user,
+                                       boolean visibility) {
+    return (!visibility && filePath.contains(user + "/.staging"));
+  }
+
+  /**
+   * Map the HDFS based distributed cache file path from original cluster to
+   * a unique file name on the simulated cluster.
+   * <br> Unique  distributed file names on simulated cluster are generated
+   * using original cluster's <li>file path, <li>timestamp and <li> the
+   * job-submitter for private distributed cache file.
+   * <br> This implies that if on original cluster, a single HDFS file
+   * considered as two private distributed cache files for two jobs of
+   * different users, then the corresponding simulated jobs will have two
+   * different files of the same size in public distributed cache, one for each
+   * user. Both these simulated jobs will not share these distributed cache
+   * files, thus leading to the same load as seen in the original cluster.
+   * @param file distributed cache file path
+   * @param timeStamp time stamp of dist cache file
+   * @param isPublic true if this distributed cache file is a public
+   *                 distributed cache file
+   * @param user job submitter on original cluster
+   * @return the mapped path on simulated cluster
+   */
+  private String mapDistCacheFilePath(String file, String timeStamp,
+      boolean isPublic, String user) {
+    String id = file + timeStamp;
+    if (!isPublic) {
+      // consider job-submitter for private distributed cache file
+      id = id.concat(user);
+    }
+    return new Path(distCachePath, MD5Hash.digest(id).toString()).toUri()
+               .getPath();
+  }
+
+  /**
+   * Write the list of distributed cache files in the decreasing order of
+   * file sizes into the sequence file. This file will be input to the job
+   * {@link GenerateDistCacheData}.
+   * Also validates if -generate option is missing and distributed cache files
+   * are missing.
+   * @return exit code
+   * @throws IOException
+   */
+  private int writeDistCacheFilesList()
+      throws IOException {
+    // Sort the distributed cache files in the decreasing order of file sizes.
+    List dcFiles = new ArrayList(distCacheFiles.entrySet());
+    Collections.sort(dcFiles, new Comparator() {
+      public int compare(Object dc1, Object dc2) {
+        return ((Comparable) ((Map.Entry) (dc2)).getValue())
+            .compareTo(((Map.Entry) (dc1)).getValue());
+      }
+    });
+
+    // write the sorted distributed cache files to the sequence file
+    FileSystem fs = FileSystem.get(conf);
+    Path distCacheFilesList = new Path(distCachePath, "_distCacheFiles.txt");
+    conf.set(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST,
+        distCacheFilesList.toString());
+    SequenceFile.Writer src_writer = SequenceFile.createWriter(fs, conf,
+        distCacheFilesList, LongWritable.class, BytesWritable.class,
+        SequenceFile.CompressionType.NONE);
+
+    // Total number of unique distributed cache files
+    int fileCount = dcFiles.size();
+    long byteCount = 0;// Total size of all distributed cache files
+    long bytesSync = 0;// Bytes since previous sync; used to add sync markers
+
+    for (Iterator it = dcFiles.iterator(); it.hasNext();) {
+      Map.Entry entry = (Map.Entry)it.next();
+      LongWritable fileSize =
+          new LongWritable(Long.valueOf(entry.getValue().toString()));
+      BytesWritable filePath =
+          new BytesWritable(entry.getKey().toString().getBytes());
+
+      byteCount += fileSize.get();
+      bytesSync += fileSize.get();
+      if (bytesSync > AVG_BYTES_PER_MAP) {
+        src_writer.sync();
+        bytesSync = fileSize.get();
+      }
+      src_writer.append(fileSize, filePath);
+    }
+    if (src_writer != null) {
+      src_writer.close();
+    }
+    // Set delete on exit for 'dist cache files list' as it is not needed later.
+    fs.deleteOnExit(distCacheFilesList);
+
+    conf.setInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, fileCount);
+    conf.setLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, byteCount);
+    LOG.info("Number of HDFS based distributed cache files to be generated is "
+        + fileCount + ". Total size of HDFS based distributed cache files "
+        + "to be generated is " + byteCount);
+
+    if (!shouldGenerateDistCacheData() && fileCount > 0) {
+      LOG.error("Missing " + fileCount + " distributed cache files under the "
+          + " directory\n" + distCachePath + "\nthat are needed for gridmix"
+          + " to emulate distributed cache load. Either use -generate\noption"
+          + " to generate distributed cache data along with input data OR "
+          + "disable\ndistributed cache emulation by configuring '"
+          + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+          + "' to false.");
+      return MISSING_DIST_CACHE_FILES_ERROR;
+    }
+    return 0;
+  }
+
+  /**
+   * If gridmix needs to emulate distributed cache load, then configure
+   * distributed cache files of a simulated job by mapping the original
+   * cluster's distributed cache file paths to the simulated cluster's paths and
+   * setting these mapped paths in the job configuration of the simulated job.
+   * <br>
+   * Configure local FS based distributed cache files through the property
+   * "tmpfiles" and hdfs based distributed cache files through the property
+   * {@link DistributedCache#CACHE_FILES}.
+   * @param conf configuration for the simulated job to be run
+   * @param jobConf job configuration of original cluster's job, obtained from
+   *                trace
+   * @throws IOException
+   */
+  void configureDistCacheFiles(Configuration conf, JobConf jobConf)
+      throws IOException {
+    if (shouldEmulateDistCacheLoad()) {
+
+      String[] files = jobConf.getStrings(DistributedCache.CACHE_FILES);
+      if (files != null) {
+        // hdfs based distributed cache files to be configured for simulated job
+        List<String> cacheFiles = new ArrayList<String>();
+        // local FS based distributed cache files to be configured for
+        // simulated job
+        List<String> localCacheFiles = new ArrayList<String>();
+
+        String[] visibilities =
+          jobConf.getStrings(JobContext.CACHE_FILE_VISIBILITIES);
+        String[] timeStamps =
+          jobConf.getStrings(DistributedCache.CACHE_FILES_TIMESTAMPS);
+        String[] fileSizes =
+          jobConf.getStrings(DistributedCache.CACHE_FILES_SIZES);
+
+        String user = jobConf.getUser();
+        for (int i = 0; i < files.length; i++) {
+          // Check if visibilities are available because older hadoop versions
+          // didn't distinguish between public and private distributed cache
+          // files.
+          boolean visibility =
+            (visibilities == null) ? true : Boolean.valueOf(visibilities[i]);
+          if (isLocalDistCacheFile(files[i], user, visibility)) {
+            // local FS based distributed cache file.
+            // Create this file on the pseudo local FS.
+            String fileId = MD5Hash.digest(files[i] + timeStamps[i]).toString();
+            long fileSize = Long.valueOf(fileSizes[i]);
+            Path mappedLocalFilePath =
+                PseudoLocalFs.generateFilePath(fileId, fileSize)
+                    .makeQualified(pseudoLocalFs.getUri(),
+                                   pseudoLocalFs.getWorkingDirectory());
+            pseudoLocalFs.create(mappedLocalFilePath);
+            localCacheFiles.add(mappedLocalFilePath.toUri().toString());
+          } else {
+            // hdfs based distributed cache file.
+            // Get the mapped HDFS path on simulated cluster
+            String mappedPath = mapDistCacheFilePath(files[i], timeStamps[i],
+                                                     visibility, user);
+            cacheFiles.add(mappedPath);
+          }
+        }
+        if (cacheFiles.size() > 0) {
+          // configure hdfs based distributed cache files for simulated job
+          conf.setStrings(DistributedCache.CACHE_FILES,
+                          cacheFiles.toArray(new String[cacheFiles.size()]));
+        }
+        if (localCacheFiles.size() > 0) {
+          // configure local FS based distributed cache files for simulated job
+          conf.setStrings("tmpfiles", localCacheFiles.toArray(
+                                        new String[localCacheFiles.size()]));
+        }
+      }
+    }
+  }
+}
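
The path-mapping scheme documented on mapDistCacheFilePath() above can be
illustrated with a small stand-alone sketch (not part of this patch; the base
directory, file path, timestamp and user names below are hypothetical). It
shows how the same original HDFS file, used as a private distributed cache
file by two different users, maps to two distinct simulated-cluster files:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.MD5Hash;

    public class DistCachePathMappingSketch {
      // Mirrors mapDistCacheFilePath(): id = file + timestamp (+ user if private)
      static String map(Path distCachePath, String file, String timeStamp,
                        boolean isPublic, String user) {
        String id = file + timeStamp;
        if (!isPublic) {
          id = id.concat(user);   // private files also mix in the job-submitter
        }
        return new Path(distCachePath, MD5Hash.digest(id).toString())
                   .toUri().getPath();
      }

      public static void main(String[] args) {
        Path base = new Path("/user/gridmix/io/distributedCache");
        String file = "hdfs://nn:8020/apps/lookup.dat";
        String ts = "1286400000000";
        // same file + timestamp, two submitters => two different mapped paths
        System.out.println(map(base, file, ts, false, "alice"));
        System.out.println(map(base, file, ts, false, "bob"));
      }
    }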

+ 10 - 6
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/EchoUserResolver.java

@@ -19,15 +19,9 @@ package org.apache.hadoop.mapred.gridmix;
 
 import java.io.IOException;
 import java.net.URI;
-import java.util.Collections;
-import java.util.List;
-import java.util.ArrayList;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.security.ShellBasedUnixGroupsMapping;
-import org.apache.hadoop.security.Groups;
-import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
@@ -50,4 +44,14 @@ public class EchoUserResolver implements UserResolver {
       UserGroupInformation ugi) {
     return ugi;
   }
+
+  /**
+   * {@inheritDoc}
+   * <br><br>
+   * Since {@link EchoUserResolver} simply returns the
+   * {@link UserGroupInformation} passed as the argument, it doesn't need a
+   * target list of users.
+   */
+  public boolean needsTargetUsersList() {
+    return false;
+  }
 }

+ 307 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/ExecutionSummarizer.java

@@ -0,0 +1,307 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.IOException;
+
+import org.apache.commons.lang.time.FastDateFormat;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics;
+import org.apache.hadoop.mapred.gridmix.Statistics.JobStats;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Summarizes a {@link Gridmix} run. Statistics that are reported are
+ * <ul>
+ *   <li>Total number of jobs in the input trace</li>
+ *   <li>Trace signature</li>
+ *   <li>Total number of jobs processed from the input trace</li>
+ *   <li>Total number of jobs submitted</li>
+ *   <li>Total number of successful and failed jobs</li>
+ *   <li>Total number of map/reduce tasks launched</li>
+ *   <li>Gridmix start & end time</li>
+ *   <li>Total time for the Gridmix run (data-generation and simulation)</li>
+ *   <li>Gridmix Configuration (i.e job-type, submission-type, resolver)</li>
+ * </ul>
+ */
+class ExecutionSummarizer implements StatListener<JobStats> {
+  static final Log LOG = LogFactory.getLog(ExecutionSummarizer.class);
+  private static final FastDateFormat UTIL = FastDateFormat.getInstance();
+  
+  private int numJobsInInputTrace;
+  private int totalSuccessfulJobs;
+  private int totalFailedJobs;
+  private int totalMapTasksLaunched;
+  private int totalReduceTasksLaunched;
+  private long totalSimulationTime;
+  private long totalRuntime;
+  private final String commandLineArgs;
+  private long startTime;
+  private long endTime;
+  private long simulationStartTime;
+  private String inputTraceLocation;
+  private String inputTraceSignature;
+  private String jobSubmissionPolicy;
+  private String resolver;
+  private DataStatistics dataStats;
+  private String expectedDataSize;
+  
+  /**
+   * Basic constructor initialized with the runtime arguments. 
+   */
+  ExecutionSummarizer(String[] args) {
+    startTime = System.currentTimeMillis();
+    // flatten the args string and store it
+    commandLineArgs = 
+      org.apache.commons.lang.StringUtils.join(args, ' '); 
+  }
+  
+  /**
+   * Default constructor. 
+   */
+  ExecutionSummarizer() {
+    startTime = System.currentTimeMillis();
+    commandLineArgs = Summarizer.NA; 
+  }
+  
+  void start(Configuration conf) {
+    simulationStartTime = System.currentTimeMillis();
+  }
+  
+  private void processJobState(JobStats stats) throws Exception {
+    Job job = stats.getJob();
+    if (job.isSuccessful()) {
+      ++totalSuccessfulJobs;
+    } else {
+      ++totalFailedJobs;
+    }
+  }
+  
+  private void processJobTasks(JobStats stats) throws Exception {
+    totalMapTasksLaunched += stats.getNoOfMaps();
+    Job job = stats.getJob();
+    totalReduceTasksLaunched += job.getNumReduceTasks();
+  }
+  
+  private void process(JobStats stats) {
+    try {
+      // process the job run state
+      processJobState(stats);
+      
+      // process the tasks information
+      processJobTasks(stats);
+    } catch (Exception e) {
+      LOG.info("Error in processing job " + stats.getJob().getJobID() + ".");
+    }
+  }
+  
+  @Override
+  public void update(JobStats item) {
+    // process only if the simulation has started
+    if (simulationStartTime > 0) {
+      process(item);
+      totalSimulationTime = 
+        System.currentTimeMillis() - getSimulationStartTime();
+    }
+  }
+  
+  // Generates a signature for the trace file based on
+  //   - filename
+  //   - modification time
+  //   - file length
+  //   - owner
+  protected static String getTraceSignature(String input) throws IOException {
+    Path inputPath = new Path(input);
+    FileSystem fs = inputPath.getFileSystem(new Configuration());
+    FileStatus status = fs.getFileStatus(inputPath);
+    Path qPath = fs.makeQualified(status.getPath());
+    String traceID = status.getModificationTime() + qPath.toString()
+                     + status.getOwner() + status.getLen();
+    return MD5Hash.digest(traceID).toString();
+  }
+  
+  @SuppressWarnings("unchecked")
+  void finalize(JobFactory factory, String inputPath, long dataSize, 
+                UserResolver userResolver, DataStatistics stats,
+                Configuration conf) 
+  throws IOException {
+    numJobsInInputTrace = factory.numJobsInTrace;
+    endTime = System.currentTimeMillis();
+    Path inputTracePath = new Path(inputPath);
+    FileSystem fs = inputTracePath.getFileSystem(conf);
+    inputTraceLocation = fs.makeQualified(inputTracePath).toString();
+    inputTraceSignature = getTraceSignature(inputTraceLocation);
+    jobSubmissionPolicy = Gridmix.getJobSubmissionPolicy(conf).name();
+    resolver = userResolver.getClass().getName();
+    if (dataSize > 0) {
+      expectedDataSize = StringUtils.humanReadableInt(dataSize);
+    } else {
+      expectedDataSize = Summarizer.NA;
+    }
+    dataStats = stats;
+    totalRuntime = System.currentTimeMillis() - getStartTime();
+  }
+  
+  /**
+   * Summarizes the current {@link Gridmix} run.
+   */
+  @Override
+  public String toString() {
+    StringBuilder builder = new StringBuilder();
+    builder.append("Execution Summary:-");
+    builder.append("\nInput trace: ").append(getInputTraceLocation());
+    builder.append("\nInput trace signature: ")
+           .append(getInputTraceSignature());
+    builder.append("\nTotal number of jobs in trace: ")
+           .append(getNumJobsInTrace());
+    builder.append("\nExpected input data size: ")
+           .append(getExpectedDataSize());
+    builder.append("\nInput data statistics: ")
+           .append(getInputDataStatistics());
+    builder.append("\nTotal number of jobs processed: ")
+           .append(getNumSubmittedJobs());
+    builder.append("\nTotal number of successful jobs: ")
+           .append(getNumSuccessfulJobs());
+    builder.append("\nTotal number of failed jobs: ")
+           .append(getNumFailedJobs());
+    builder.append("\nTotal number of map tasks launched: ")
+           .append(getNumMapTasksLaunched());
+    builder.append("\nTotal number of reduce task launched: ")
+           .append(getNumReduceTasksLaunched());
+    builder.append("\nGridmix start time: ")
+           .append(UTIL.format(getStartTime()));
+    builder.append("\nGridmix end time: ").append(UTIL.format(getEndTime()));
+    builder.append("\nGridmix simulation start time: ")
+           .append(UTIL.format(getStartTime()));
+    builder.append("\nGridmix runtime: ")
+           .append(StringUtils.formatTime(getRuntime()));
+    builder.append("\nTime spent in initialization (data-gen etc): ")
+           .append(StringUtils.formatTime(getInitTime()));
+    builder.append("\nTime spent in simulation: ")
+           .append(StringUtils.formatTime(getSimulationTime()));
+    builder.append("\nGridmix configuration parameters: ")
+           .append(getCommandLineArgsString());
+    builder.append("\nGridmix job submission policy: ")
+           .append(getJobSubmissionPolicy());
+    builder.append("\nGridmix resolver: ").append(getUserResolver());
+    builder.append("\n\n");
+    return builder.toString();
+  }
+  
+  // Gets the stringified version of DataStatistics
+  static String stringifyDataStatistics(DataStatistics stats) {
+    if (stats != null) {
+      StringBuffer buffer = new StringBuffer();
+      String compressionStatus = stats.isDataCompressed() 
+                                 ? "Compressed" 
+                                 : "Uncompressed";
+      buffer.append(compressionStatus).append(" input data size: ");
+      buffer.append(StringUtils.humanReadableInt(stats.getDataSize()));
+      buffer.append(", ");
+      buffer.append("Number of files: ").append(stats.getNumFiles());
+
+      return buffer.toString();
+    } else {
+      return Summarizer.NA;
+    }
+  }
+  
+  // Getters
+  protected String getExpectedDataSize() {
+    return expectedDataSize;
+  }
+  
+  protected String getUserResolver() {
+    return resolver;
+  }
+  
+  protected String getInputDataStatistics() {
+    return stringifyDataStatistics(dataStats);
+  }
+  
+  protected String getInputTraceSignature() {
+    return inputTraceSignature;
+  }
+  
+  protected String getInputTraceLocation() {
+    return inputTraceLocation;
+  }
+  
+  protected int getNumJobsInTrace() {
+    return numJobsInInputTrace;
+  }
+  
+  protected int getNumSuccessfulJobs() {
+    return totalSuccessfulJobs;
+  }
+  
+  protected int getNumFailedJobs() {
+    return totalFailedJobs;
+  }
+  
+  protected int getNumSubmittedJobs() {
+    return totalSuccessfulJobs + totalFailedJobs;
+  }
+  
+  protected int getNumMapTasksLaunched() {
+    return totalMapTasksLaunched;
+  }
+  
+  protected int getNumReduceTasksLaunched() {
+    return totalReduceTasksLaunched;
+  }
+  
+  protected long getStartTime() {
+    return startTime;
+  }
+  
+  protected long getEndTime() {
+    return endTime;
+  }
+  
+  protected long getInitTime() {
+    return simulationStartTime - startTime;
+  }
+  
+  protected long getSimulationStartTime() {
+    return simulationStartTime;
+  }
+  
+  protected long getSimulationTime() {
+    return totalSimulationTime;
+  }
+  
+  protected long getRuntime() {
+    return totalRuntime;
+  }
+  
+  protected String getCommandLineArgsString() {
+    return commandLineArgs;
+  }
+  
+  protected String getJobSubmissionPolicy() {
+    return jobSubmissionPolicy;
+  }
+}
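
The trace signature reported in the summary above is reproducible outside of
gridmix; the following stand-alone sketch (not part of this patch) computes
the same MD5 over modification time, qualified path, owner and length of a
trace file, so an unchanged trace always yields the same signature:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.MD5Hash;

    public class TraceSignatureSketch {
      public static void main(String[] args) throws Exception {
        Path trace = new Path(args[0]);               // path to a rumen trace
        FileSystem fs = trace.getFileSystem(new Configuration());
        FileStatus st = fs.getFileStatus(trace);
        String id = st.getModificationTime()
                    + fs.makeQualified(st.getPath()).toString()
                    + st.getOwner() + st.getLen();
        System.out.println("trace signature: " + MD5Hash.digest(id));
      }
    }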

+ 4 - 6
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/FileQueue.java

@@ -21,8 +21,6 @@ import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
 
@@ -34,7 +32,7 @@ class FileQueue extends InputStream {
 
   private int idx = -1;
   private long curlen = -1L;
-  private FSDataInputStream input;
+  private InputStream input;
   private final byte[] z = new byte[1];
   private final Path[] paths;
   private final long[] lengths;
@@ -64,9 +62,9 @@ class FileQueue extends InputStream {
     idx = (idx + 1) % paths.length;
     curlen = lengths[idx];
     final Path file = paths[idx];
-    final FileSystem fs = file.getFileSystem(conf);
-    input = fs.open(file);
-    input.seek(startoffset[idx]);
+    input = 
+      CompressionEmulationUtil.getPossiblyDecompressedInputStream(file, 
+                                 conf, startoffset[idx]);
   }
 
   @Override
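
FileQueue now defers to CompressionEmulationUtil (added elsewhere in this
patch) to obtain a possibly decompressed stream. The sketch below is an
assumption about what such a helper typically does in Hadoop, not the actual
CompressionEmulationUtil code: pick a codec from the file extension and wrap
the raw stream only when one matches. Note that honouring a non-zero offset
into compressed data needs care in a real implementation; here the raw stream
is simply positioned before wrapping.

    import java.io.IOException;
    import java.io.InputStream;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;

    class PossiblyDecompressedStreamSketch {
      static InputStream open(Path file, Configuration conf, long offset)
          throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream raw = fs.open(file);
        raw.seek(offset);                       // position the raw stream first
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        // no codec registered for this extension => return the plain stream
        return (codec == null) ? raw : codec.createInputStream(raw);
      }
    }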

+ 111 - 3
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java

@@ -30,8 +30,10 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.LongWritable;
@@ -41,6 +43,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.ClusterStatus;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
@@ -52,6 +55,7 @@ import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.util.StringUtils;
 
 // TODO can replace with form of GridmixJob
 class GenerateData extends GridmixJob {
@@ -86,14 +90,103 @@ class GenerateData extends GridmixJob {
    * Replication of generated data.
    */
   public static final String GRIDMIX_GEN_REPLICATION = "gridmix.gen.replicas";
+  static final String JOB_NAME = "GRIDMIX_GENERATE_INPUT_DATA";
 
   public GenerateData(Configuration conf, Path outdir, long genbytes)
       throws IOException {
-    super(conf, 0L, "GRIDMIX_GENDATA");
+    super(conf, 0L, JOB_NAME);
     job.getConfiguration().setLong(GRIDMIX_GEN_BYTES, genbytes);
     FileOutputFormat.setOutputPath(job, outdir);
   }
 
+  /**
+   * Represents the input data characteristics.
+   */
+  static class DataStatistics {
+    private long dataSize;
+    private long numFiles;
+    private boolean isDataCompressed;
+    
+    DataStatistics(long dataSize, long numFiles, boolean isCompressed) {
+      this.dataSize = dataSize;
+      this.numFiles = numFiles;
+      this.isDataCompressed = isCompressed;
+    }
+    
+    long getDataSize() {
+      return dataSize;
+    }
+    
+    long getNumFiles() {
+      return numFiles;
+    }
+    
+    boolean isDataCompressed() {
+      return isDataCompressed;
+    }
+  }
+  
+  /**
+   * Publish the data statistics.
+   */
+  static DataStatistics publishDataStatistics(Path inputDir, long genBytes, 
+                                              Configuration conf) 
+  throws IOException {
+    if (CompressionEmulationUtil.isCompressionEmulationEnabled(conf)) {
+      return CompressionEmulationUtil.publishCompressedDataStatistics(inputDir, 
+                                        conf, genBytes);
+    } else {
+      return publishPlainDataStatistics(conf, inputDir);
+    }
+  }
+
+  /**
+   * List files recursively and get their statuses.
+   * @param path The path of the file/dir for which ls is to be done recursively
+   * @param fs FileSystem of the path
+   * @param filter the user-supplied path filter
+   * @return the list of statuses of all files under the given path
+   */
+  private static List<FileStatus> listFiles(Path path, FileSystem fs,
+      PathFilter filter) throws IOException {
+    List<FileStatus> list = new ArrayList<FileStatus>();
+    FileStatus[] statuses = fs.listStatus(path, filter);
+    if (statuses != null) {
+      for (FileStatus status : statuses) {
+        if (status.isDir()) {
+          list.addAll(listFiles(status.getPath(), fs, filter));
+        } else {
+          list.add(status);
+        }
+      }
+    }
+    return list;
+  }
+
+  static DataStatistics publishPlainDataStatistics(Configuration conf, 
+                                                   Path inputDir) 
+  throws IOException {
+    FileSystem fs = inputDir.getFileSystem(conf);
+
+    // obtain input data file statuses
+    long dataSize = 0;
+    long fileCount = 0;
+    PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
+    List<FileStatus> statuses = listFiles(inputDir, fs, filter);
+
+    for (FileStatus fStat : statuses) {
+      dataSize += fStat.getLen();
+    }
+    fileCount = statuses.size();
+
+    // publish the plain data statistics
+    LOG.info("Total size of input data : " 
+             + StringUtils.humanReadableInt(dataSize));
+    LOG.info("Total number of input data files : " + fileCount);
+    
+    return new DataStatistics(dataSize, fileCount, false);
+  }
+  
   @Override
   public Job call() throws IOException, InterruptedException,
                            ClassNotFoundException {
@@ -101,6 +194,18 @@ class GenerateData extends GridmixJob {
     ugi.doAs( new PrivilegedExceptionAction <Job>() {
        public Job run() throws IOException, ClassNotFoundException,
                                InterruptedException {
+         // check if compression emulation is enabled
+         if (CompressionEmulationUtil
+             .isCompressionEmulationEnabled(job.getConfiguration())) {
+           CompressionEmulationUtil.configure(job);
+         } else {
+           configureRandomBytesDataGenerator();
+         }
+         job.submit();
+         return job;
+       }
+       
+       private void configureRandomBytesDataGenerator() {
         job.setMapperClass(GenDataMapper.class);
         job.setNumReduceTasks(0);
         job.setMapOutputKeyClass(NullWritable.class);
@@ -113,12 +218,15 @@ class GenerateData extends GridmixJob {
         } catch (IOException e) {
           LOG.error("Error  while adding input path ", e);
         }
-        job.submit();
-        return job;
       }
     });
     return job;
   }
+  
+  @Override
+  protected boolean canEmulateCompression() {
+    return false;
+  }
 
   public static class GenDataMapper
       extends Mapper<NullWritable,LongWritable,NullWritable,BytesWritable> {
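
publishPlainDataStatistics() above walks <ioPath>/input recursively and logs
the total byte and file counts. A stand-alone equivalent (not part of this
patch, and without the OutputFilesFilter used above, so bookkeeping files are
not excluded) looks like this:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class InputDirStatsSketch {
      // returns {total bytes, file count} under dir
      static long[] walk(FileSystem fs, Path dir) throws java.io.IOException {
        long bytes = 0, files = 0;
        FileStatus[] statuses = fs.listStatus(dir);
        if (statuses != null) {
          for (FileStatus s : statuses) {
            if (s.isDir()) {
              long[] sub = walk(fs, s.getPath());
              bytes += sub[0];
              files += sub[1];
            } else {
              bytes += s.getLen();
              files++;
            }
          }
        }
        return new long[] { bytes, files };
      }

      public static void main(String[] args) throws Exception {
        Path inputDir = new Path(args[0]);      // e.g. <ioPath>/input
        FileSystem fs = inputDir.getFileSystem(new Configuration());
        long[] r = walk(fs, inputDir);
        System.out.println("Total size: " + r[0] + " bytes in " + r[1] + " files");
      }
    }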

+ 259 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateDistCacheData.java

@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.IOException;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.mapred.ClusterStatus;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileRecordReader;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+import org.apache.hadoop.security.UserGroupInformation;
+
+/**
+ * GridmixJob that generates distributed cache files.
+ * {@link GenerateDistCacheData} expects a list of distributed cache files to be
+ * generated as input. This list is expected to be stored as a sequence file
+ * and the filename is expected to be configured using
+ * {@code gridmix.distcache.file.list}.
+ * This input file contains the list of distributed cache files and their sizes.
+ * For each record (i.e. file size and file path) in this input file,
+ * a file with the specific file size at the specific path is created.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
+class GenerateDistCacheData extends GridmixJob {
+
+  /**
+   * Number of distributed cache files to be created by gridmix
+   */
+  static final String GRIDMIX_DISTCACHE_FILE_COUNT =
+      "gridmix.distcache.file.count";
+  /**
+   * Total number of bytes to be written to the distributed cache files by
+   * gridmix. i.e. Sum of sizes of all unique distributed cache files to be
+   * created by gridmix.
+   */
+  static final String GRIDMIX_DISTCACHE_BYTE_COUNT =
+      "gridmix.distcache.byte.count";
+  /**
+   * The special file created (and used) by gridmix that contains the list of
+   * unique distributed cache files that are to be created and their sizes.
+   */
+  static final String GRIDMIX_DISTCACHE_FILE_LIST =
+      "gridmix.distcache.file.list";
+  static final String JOB_NAME = "GRIDMIX_GENERATE_DISTCACHE_DATA";
+
+  public GenerateDistCacheData(Configuration conf) throws IOException {
+    super(conf, 0L, JOB_NAME);
+  }
+
+  @Override
+  public Job call() throws IOException, InterruptedException,
+                           ClassNotFoundException {
+    UserGroupInformation ugi = UserGroupInformation.getLoginUser();
+    ugi.doAs( new PrivilegedExceptionAction <Job>() {
+       public Job run() throws IOException, ClassNotFoundException,
+                               InterruptedException {
+        job.setMapperClass(GenDCDataMapper.class);
+        job.setNumReduceTasks(0);
+        job.setMapOutputKeyClass(NullWritable.class);
+        job.setMapOutputValueClass(BytesWritable.class);
+        job.setInputFormatClass(GenDCDataFormat.class);
+        job.setOutputFormatClass(NullOutputFormat.class);
+        job.setJarByClass(GenerateDistCacheData.class);
+        try {
+          FileInputFormat.addInputPath(job, new Path("ignored"));
+        } catch (IOException e) {
+          LOG.error("Error while adding input path ", e);
+        }
+        job.submit();
+        return job;
+      }
+    });
+    return job;
+  }
+
+  @Override
+  protected boolean canEmulateCompression() {
+    return false;
+  }
+
+  public static class GenDCDataMapper
+      extends Mapper<LongWritable, BytesWritable, NullWritable, BytesWritable> {
+
+    private BytesWritable val;
+    private final Random r = new Random();
+    private FileSystem fs;
+
+    @Override
+    protected void setup(Context context)
+        throws IOException, InterruptedException {
+      val = new BytesWritable(new byte[context.getConfiguration().getInt(
+              GenerateData.GRIDMIX_VAL_BYTES, 1024 * 1024)]);
+      fs = FileSystem.get(context.getConfiguration());
+    }
+
+    // Create one distributed cache file with the needed file size.
+    // key is distributed cache file size and
+    // value is distributed cache file path.
+    @Override
+    public void map(LongWritable key, BytesWritable value, Context context)
+        throws IOException, InterruptedException {
+
+      String fileName = new String(value.getBytes(), 0, value.getLength());
+      Path path = new Path(fileName);
+
+      /**
+       * Create distributed cache file with the permissions 0755.
+       * Since the private distributed cache directory doesn't have execute
+       * permission for others, it is OK to set read permission for others for
+       * the files under that directory and still they will become 'private'
+       * distributed cache files on the simulated cluster.
+       */
+      FSDataOutputStream dos =
+          FileSystem.create(fs, path, new FsPermission((short)0755));
+
+      for (long bytes = key.get(); bytes > 0; bytes -= val.getLength()) {
+        r.nextBytes(val.getBytes());
+        val.setSize((int)Math.min(val.getLength(), bytes));
+        dos.write(val.getBytes(), 0, val.getLength());// Write to distCache file
+      }
+      dos.close();
+    }
+  }
+
+  /**
+   * InputFormat for GenerateDistCacheData.
+   * Input to GenerateDistCacheData is the special file (in SequenceFile format)
+   * that contains the list of distributed cache files to be generated along
+   * with their file sizes.
+   */
+  static class GenDCDataFormat
+      extends InputFormat<LongWritable, BytesWritable> {
+
+    // Split the special file that contains the list of distributed cache file
+    // paths and their file sizes such that each split corresponds to
+    // approximately the same amount of distributed cache data to be generated.
+    // Consider numTaskTrackers * numMapSlotsPerTracker as the number of maps
+    // for this job, if there is a lot of data to be generated.
+    @Override
+    public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
+      final JobConf jobConf = new JobConf(jobCtxt.getConfiguration());
+      final JobClient client = new JobClient(jobConf);
+      ClusterStatus stat = client.getClusterStatus(true);
+      int numTrackers = stat.getTaskTrackers();
+      final int fileCount = jobConf.getInt(GRIDMIX_DISTCACHE_FILE_COUNT, -1);
+
+      // Total size of distributed cache files to be generated
+      final long totalSize = jobConf.getLong(GRIDMIX_DISTCACHE_BYTE_COUNT, -1);
+      // Get the path of the special file
+      String distCacheFileList = jobConf.get(GRIDMIX_DISTCACHE_FILE_LIST);
+      if (fileCount < 0 || totalSize < 0 || distCacheFileList == null) {
+        throw new RuntimeException("Invalid metadata: #files (" + fileCount
+            + "), total_size (" + totalSize + "), filelisturi ("
+            + distCacheFileList + ")");
+      }
+
+      Path sequenceFile = new Path(distCacheFileList);
+      FileSystem fs = sequenceFile.getFileSystem(jobConf);
+      FileStatus srcst = fs.getFileStatus(sequenceFile);
+      // Consider the number of TTs * mapSlotsPerTracker as number of mappers.
+      int numMapSlotsPerTracker =
+          jobConf.getInt("mapred.tasktracker.map.tasks.maximum", 2);
+      int numSplits = numTrackers * numMapSlotsPerTracker;
+
+      List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
+      LongWritable key = new LongWritable();
+      BytesWritable value = new BytesWritable();
+
+      // Average size of data to be generated by each map task
+      final long targetSize = Math.max(totalSize / numSplits,
+                                DistributedCacheEmulator.AVG_BYTES_PER_MAP);
+      long splitStartPosition = 0L;
+      long splitEndPosition = 0L;
+      long acc = 0L;
+      long bytesRemaining = srcst.getLen();
+      SequenceFile.Reader reader = null;
+      try {
+        reader = new SequenceFile.Reader(fs, sequenceFile, jobConf);
+        while (reader.next(key, value)) {
+
+          // If adding this file would put this split past the target size,
+          // cut the last split and put this file in the next split.
+          if (acc + key.get() > targetSize && acc != 0) {
+            long splitSize = splitEndPosition - splitStartPosition;
+            splits.add(new FileSplit(
+                sequenceFile, splitStartPosition, splitSize, (String[])null));
+            bytesRemaining -= splitSize;
+            splitStartPosition = splitEndPosition;
+            acc = 0L;
+          }
+          acc += key.get();
+          splitEndPosition = reader.getPosition();
+        }
+      } finally {
+        if (reader != null) {
+          reader.close();
+        }
+      }
+      if (bytesRemaining != 0) {
+        splits.add(new FileSplit(
+            sequenceFile, splitStartPosition, bytesRemaining, (String[])null));
+      }
+
+      return splits;
+    }
+
+    /**
+     * Returns a reader for this split of the distributed cache file list.
+     */
+    @Override
+    public RecordReader<LongWritable, BytesWritable> createRecordReader(
+        InputSplit split, final TaskAttemptContext taskContext)
+        throws IOException, InterruptedException {
+      return new SequenceFileRecordReader<LongWritable, BytesWritable>();
+    }
+  }
+}
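
The split-cutting rule in GenDCDataFormat.getSplits() above (start a new split
once adding the next file would push the current one past targetSize) can be
seen in isolation with plain Java; the sizes and target below are made up:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    public class SplitGroupingSketch {
      static List<List<Long>> group(List<Long> sizes, long targetSize) {
        List<List<Long>> groups = new ArrayList<List<Long>>();
        List<Long> current = new ArrayList<Long>();
        long acc = 0L;
        for (long size : sizes) {
          // cut only non-empty groups, mirroring the 'acc != 0' check above
          if (acc + size > targetSize && acc != 0) {
            groups.add(current);
            current = new ArrayList<Long>();
            acc = 0L;
          }
          current.add(size);
          acc += size;
        }
        if (!current.isEmpty()) {
          groups.add(current);                  // remainder becomes the last split
        }
        return groups;
      }

      public static void main(String[] args) {
        // prints [[2, 2], [4, 1], [1, 3]] for a target of 5
        System.out.println(group(Arrays.asList(2L, 2L, 4L, 1L, 1L, 3L), 5L));
      }
    }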

+ 267 - 41
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java

@@ -33,12 +33,14 @@ import org.apache.hadoop.fs.FsShell;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.hadoop.tools.rumen.JobStoryProducer;
 import org.apache.hadoop.tools.rumen.ZombieJobProducer;
 
 import org.apache.commons.logging.Log;
@@ -92,62 +94,143 @@ public class Gridmix extends Configured implements Tool {
    */
   public static final String GRIDMIX_USR_RSV = "gridmix.user.resolve.class";
 
+  /**
+   * Configuration property set in simulated job's configuration whose value is
+   * set to the corresponding original job's name. This is not configurable by
+   * the gridmix user.
+   */
+  public static final String ORIGINAL_JOB_NAME =
+      "gridmix.job.original-job-name";
+  /**
+   * Configuration property set in simulated job's configuration whose value is
+   * set to the corresponding original job's id. This is not configurable by
+   * the gridmix user.
+   */
+  public static final String ORIGINAL_JOB_ID = "gridmix.job.original-job-id";
+
+  private DistributedCacheEmulator distCacheEmulator;
+
   // Submit data structures
   private JobFactory factory;
   private JobSubmitter submitter;
   private JobMonitor monitor;
   private Statistics statistics;
+  private Summarizer summarizer;
 
   // Shutdown hook
   private final Shutdown sdh = new Shutdown();
 
+  Gridmix(String[] args) {
+    summarizer = new Summarizer(args);
+  }
+  
+  Gridmix() {
+    summarizer = new Summarizer();
+  }
+  
+  // Get the input data directory for Gridmix. Input directory is 
+  // <io-path>/input
+  static Path getGridmixInputDataPath(Path ioPath) {
+    return new Path(ioPath, "input");
+  }
+  
   /**
-   * Write random bytes at the path provided.
+   * Write random bytes at the path &lt;inputDir&gt;.
    * @see org.apache.hadoop.mapred.gridmix.GenerateData
    */
-  protected void writeInputData(long genbytes, Path ioPath)
+  protected void writeInputData(long genbytes, Path inputDir)
       throws IOException, InterruptedException {
     final Configuration conf = getConf();
-    final GridmixJob genData = new GenerateData(conf, ioPath, genbytes);
-    submitter.add(genData);
+    
+    // configure the compression ratio if needed
+    CompressionEmulationUtil.setupDataGeneratorConfig(conf);
+    
+    final GenerateData genData = new GenerateData(conf, inputDir, genbytes);
     LOG.info("Generating " + StringUtils.humanReadableInt(genbytes) +
         " of test data...");
+    launchGridmixJob(genData);
+    
+    FsShell shell = new FsShell(conf);
+    try {
+      LOG.info("Changing the permissions for inputPath " + inputDir.toString());
+      shell.run(new String[] {"-chmod","-R","777", inputDir.toString()});
+    } catch (Exception e) {
+      LOG.error("Couldnt change the file permissions " , e);
+      throw new IOException(e);
+    }
+    
+    LOG.info("Input data generation successful.");
+  }
+
+  /**
+   * Write random bytes into the distributed cache files that will be used by
+   * all simulated jobs of the current gridmix run, if such files are to be
+   * generated. This is done as part of the MapReduce job
+   * {@link GenerateDistCacheData#JOB_NAME}.
+   * @see org.apache.hadoop.mapred.gridmix.GenerateDistCacheData
+   */
+  protected void writeDistCacheData(Configuration conf)
+      throws IOException, InterruptedException {
+    int fileCount =
+        conf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1);
+    if (fileCount > 0) {// generate distributed cache files
+      final GridmixJob genDistCacheData = new GenerateDistCacheData(conf);
+      LOG.info("Generating distributed cache data of size " + conf.getLong(
+          GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
+      launchGridmixJob(genDistCacheData);
+    }
+  }
+
+  // Launch Input/DistCache Data Generation job and wait for completion
+  void launchGridmixJob(GridmixJob job)
+      throws IOException, InterruptedException {
+    submitter.add(job);
+
     // TODO add listeners, use for job dependencies
     TimeUnit.SECONDS.sleep(10);
     try {
-      genData.getJob().waitForCompletion(false);
+      job.getJob().waitForCompletion(false);
     } catch (ClassNotFoundException e) {
       throw new IOException("Internal error", e);
     }
-    if (!genData.getJob().isSuccessful()) {
-      throw new IOException("Data generation failed!");
-    }
-
-    FsShell shell = new FsShell(conf);
-    try {
-      LOG.info("Changing the permissions for inputPath " + ioPath.toString());
-      shell.run(new String[] {"-chmod","-R","777", ioPath.toString()});
-    } catch (Exception e) {
-      LOG.error("Couldnt change the file permissions " , e);
-      throw new IOException(e);
+    if (!job.getJob().isSuccessful()) {
+      throw new IOException(job.getJob().getJobName() + " job failed!");
     }
-    LOG.info("Done.");
   }
 
-  protected InputStream createInputStream(String in) throws IOException {
-    if ("-".equals(in)) {
-      return System.in;
+  /**
+   * Create an appropriate {@code JobStoryProducer} object for the
+   * given trace.
+   * 
+   * @param traceIn the path to the trace file. The special path
+   * &quot;-&quot; denotes the standard input stream.
+   *
+   * @param conf the configuration to be used.
+   *
+   * @throws IOException if there was an error.
+   */
+  protected JobStoryProducer createJobStoryProducer(String traceIn,
+      Configuration conf) throws IOException {
+    if ("-".equals(traceIn)) {
+      return new ZombieJobProducer(System.in, null);
     }
-    final Path pin = new Path(in);
-    return pin.getFileSystem(getConf()).open(pin);
+    return new ZombieJobProducer(new Path(traceIn), null, conf);
   }
 
+  // get the gridmix job submission policy
+  protected static GridmixJobSubmissionPolicy getJobSubmissionPolicy(
+                                                Configuration conf) {
+    return GridmixJobSubmissionPolicy.getPolicy(conf, 
+                                        GridmixJobSubmissionPolicy.STRESS);
+  }
+  
   /**
    * Create each component in the pipeline and start it.
    * @param conf Configuration data, no keys specific to this context
    * @param traceIn Either a Path to the trace data or &quot;-&quot; for
    *                stdin
-   * @param ioPath Path from which input data is read
+   * @param ioPath &lt;ioPath&gt;/input/ is the dir from which input data is
+   *               read and &lt;ioPath&gt;/distributedCache/ is the gridmix
+   *               distributed cache directory.
    * @param scratchDir Path into which job output is written
    * @param startFlag Semaphore for starting job trace pipeline
    */
@@ -155,8 +238,8 @@ public class Gridmix extends Configured implements Tool {
       Path scratchDir, CountDownLatch startFlag, UserResolver userResolver)
       throws IOException {
     try {
-      GridmixJobSubmissionPolicy policy = GridmixJobSubmissionPolicy.getPolicy(
-        conf, GridmixJobSubmissionPolicy.STRESS);
+      Path inputDir = getGridmixInputDataPath(ioPath);
+      GridmixJobSubmissionPolicy policy = getJobSubmissionPolicy(conf);
       LOG.info(" Submission policy is " + policy.name());
       statistics = new Statistics(conf, policy.getPollingInterval(), startFlag);
       monitor = createJobMonitor(statistics);
@@ -167,16 +250,24 @@ public class Gridmix extends Configured implements Tool {
         monitor, conf.getInt(
           GRIDMIX_SUB_THR, noOfSubmitterThreads), conf.getInt(
           GRIDMIX_QUE_DEP, 5), new FilePool(
-          conf, ioPath), userResolver,statistics);
+          conf, inputDir), userResolver,statistics);
       
-      factory = createJobFactory(
-        submitter, traceIn, scratchDir, conf, startFlag, userResolver);
+      distCacheEmulator = new DistributedCacheEmulator(conf, ioPath);
+
+      factory = createJobFactory(submitter, traceIn, scratchDir, conf,
+                                 startFlag, userResolver);
+      factory.jobCreator.setDistCacheEmulator(distCacheEmulator);
+
       if (policy==GridmixJobSubmissionPolicy.SERIAL) {
         statistics.addJobStatsListeners(factory);
       } else {
         statistics.addClusterStatsObservers(factory);
       }
-      
+
+      // add the gridmix run summarizer to the statistics
+      statistics.addJobStatsListeners(summarizer.getExecutionSummarizer());
+      statistics.addClusterStatsObservers(summarizer.getClusterSummarizer());
+
       monitor.start();
       submitter.start();
     }catch(Exception e) {
@@ -201,9 +292,8 @@ public class Gridmix extends Configured implements Tool {
     throws IOException {
     return GridmixJobSubmissionPolicy.getPolicy(
       conf, GridmixJobSubmissionPolicy.STRESS).createJobFactory(
-      submitter, new ZombieJobProducer(
-        createInputStream(
-          traceIn), null), scratchDir, conf, startFlag, resolver);
+      submitter, createJobStoryProducer(traceIn, conf), scratchDir, conf,
+      startFlag, resolver);
   }
 
   public int run(final String[] argv) throws IOException, InterruptedException {
@@ -217,6 +307,10 @@ public class Gridmix extends Configured implements Tool {
         return runJob(conf,argv);
       }
     });
+    
+    // print the run summary
+    System.out.print("\n\n");
+    System.out.println(summarizer.toString());
     return val; 
   }
 
@@ -232,6 +326,9 @@ public class Gridmix extends Configured implements Tool {
       printUsage(System.err);
       return 1;
     }
+    
+    // Should gridmix generate distributed cache data ?
+    boolean generate = false;
     long genbytes = -1L;
     String traceIn = null;
     Path ioPath = null;
@@ -243,6 +340,7 @@ public class Gridmix extends Configured implements Tool {
       for (int i = 0; i < argv.length - 2; ++i) {
         if ("-generate".equals(argv[i])) {
           genbytes = StringUtils.TraditionalBinaryPrefix.string2long(argv[++i]);
+          generate = true;
         } else if ("-users".equals(argv[i])) {
           userRsrc = new URI(argv[++i]);
         } else {
@@ -250,9 +348,22 @@ public class Gridmix extends Configured implements Tool {
           return 1;
         }
       }
-      if (!userResolver.setTargetUsers(userRsrc, conf)) {
-        LOG.warn("Resource " + userRsrc + " ignored");
+
+      if (userResolver.needsTargetUsersList()) {
+        if (userRsrc != null) {
+          if (!userResolver.setTargetUsers(userRsrc, conf)) {
+            LOG.warn("Ignoring the user resource '" + userRsrc + "'.");
+          }
+        } else {
+          System.err.println("\n\n" + userResolver.getClass()
+              + " needs target user list. Use -users option." + "\n\n");
+          printUsage(System.err);
+          return 1;
+        }
+      } else if (userRsrc != null) {
+        LOG.warn("Ignoring the user resource '" + userRsrc + "'.");
       }
+
       ioPath = new Path(argv[argv.length - 2]);
       traceIn = argv[argv.length - 1];
     } catch (Exception e) {
@@ -260,17 +371,46 @@ public class Gridmix extends Configured implements Tool {
       printUsage(System.err);
       return 1;
     }
-    return start(conf, traceIn, ioPath, genbytes, userResolver);
+    return start(conf, traceIn, ioPath, genbytes, userResolver, generate);
   }
 
+  /**
+   * 
+   * @param conf gridmix configuration
+   * @param traceIn trace file path (if it is '-', the trace is read from
+   *                standard input)
+   * @param ioPath Working directory for gridmix. GenerateData job
+   *               will generate data in the directory &lt;ioPath&gt;/input/ and
+   *               distributed cache data is generated in the directory
+   *               &lt;ioPath&gt;/distributedCache/, if -generate option is
+   *               specified.
+   * @param genbytes size of input data to be generated under the directory
+   *                 &lt;ioPath&gt;/input/
+   * @param userResolver gridmix user resolver
+   * @param generate true if -generate option was specified
+   * @return exit code
+   * @throws IOException
+   * @throws InterruptedException
+   */
   int start(Configuration conf, String traceIn, Path ioPath, long genbytes,
-      UserResolver userResolver) throws IOException, InterruptedException {
+      UserResolver userResolver, boolean generate)
+      throws IOException, InterruptedException {
+    DataStatistics stats = null;
     InputStream trace = null;
+    final FileSystem inputFs = ioPath.getFileSystem(conf);
+    ioPath = ioPath.makeQualified(inputFs);
+
     try {
+      // Create <ioPath> with 777 permissions
+      boolean succeeded = FileSystem.mkdirs(inputFs, ioPath,
+                                            new FsPermission((short) 0777));
+      if (!succeeded) {
+        throw new IOException("Creation of <ioPath> directory "
+                              + ioPath.toUri().toString() + " failed.");
+      }
+
       Path scratchDir = new Path(ioPath, conf.get(GRIDMIX_OUT_DIR, "gridmix"));
-      final FileSystem scratchFs = scratchDir.getFileSystem(conf);
-      scratchFs.mkdirs(scratchDir, new FsPermission((short) 0777));
-      scratchFs.setPermission(scratchDir, new FsPermission((short) 0777));
+
       // add shutdown hook for SIGINT, etc.
       Runtime.getRuntime().addShutdownHook(sdh);
       CountDownLatch startFlag = new CountDownLatch(1);
@@ -278,12 +418,30 @@ public class Gridmix extends Configured implements Tool {
         // Create, start job submission threads
         startThreads(conf, traceIn, ioPath, scratchDir, startFlag,
             userResolver);
+
+        Path inputDir = getGridmixInputDataPath(ioPath);
+
         // Write input data if specified
         if (genbytes > 0) {
-          writeInputData(genbytes, ioPath);
+          writeInputData(genbytes, inputDir);
         }
+
+        // publish the data statistics
+        stats = GenerateData.publishDataStatistics(inputDir, genbytes, conf);
+
         // scan input dir contents
         submitter.refreshFilePool();
+
+        // set up the needed things for emulation of various loads
+        int exitCode = setupEmulation(conf, traceIn, scratchDir, ioPath,
+                                      generate);
+        if (exitCode != 0) {
+          return exitCode;
+        }
+
+        // start the summarizer
+        summarizer.start(conf);
+        
         factory.start();
         statistics.start();
       } catch (Throwable e) {
@@ -313,11 +471,73 @@ public class Gridmix extends Configured implements Tool {
 
       }
     } finally {
+      if (factory != null) {
+        summarizer.finalize(factory, traceIn, genbytes, userResolver, stats, 
+                            conf);
+      }
       IOUtils.cleanup(LOG, trace);
     }
     return 0;
   }
 
+  /**
+   * Create gridmix output directory. Setup things for emulation of
+   * various loads, if needed.
+   * @param conf gridmix configuration
+   * @param traceIn trace file path (if it is '-', the trace is read from
+   *                standard input)
+   * @param scratchDir gridmix output directory
+   * @param ioPath Working directory for gridmix.
+   * @param generate true if -generate option was specified
+   * @return exit code
+   * @throws IOException
+   * @throws InterruptedException 
+   */
+  private int setupEmulation(Configuration conf, String traceIn,
+      Path scratchDir, Path ioPath, boolean generate)
+      throws IOException, InterruptedException {
+    // create scratch directory(output directory of gridmix)
+    final FileSystem scratchFs = scratchDir.getFileSystem(conf);
+    FileSystem.mkdirs(scratchFs, scratchDir, new FsPermission((short) 0777));
+
+    // Setup things needed for emulation of distributed cache load
+    return setupDistCacheEmulation(conf, traceIn, ioPath, generate);
+    // Setup emulation of other loads like CPU load, Memory load
+  }
+
+  /**
+   * Setup gridmix for emulation of distributed cache load. This includes
+   * generation of distributed cache files, if needed.
+   * @param conf gridmix configuration
+   * @param traceIn trace file path (if it is '-', the trace is read from
+   *                standard input)
+   * @param ioPath &lt;ioPath&gt;/input/ is the dir where input data (a) exists
+   *               or (b) is generated. &lt;ioPath&gt;/distributedCache/ is the
+   *               folder where distributed cache data (a) exists or (b) is to be
+   *               generated by gridmix.
+   * @param generate true if -generate option was specified
+   * @return exit code
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  private int setupDistCacheEmulation(Configuration conf, String traceIn,
+      Path ioPath, boolean generate) throws IOException, InterruptedException {
+    distCacheEmulator.init(traceIn, factory.jobCreator, generate);
+    int exitCode = 0;
+    if (distCacheEmulator.shouldGenerateDistCacheData() ||
+        distCacheEmulator.shouldEmulateDistCacheLoad()) {
+
+      JobStoryProducer jsp = createJobStoryProducer(traceIn, conf);
+      exitCode = distCacheEmulator.setupGenerateDistCacheData(jsp);
+      if (exitCode == 0) {
+        // If there are files to be generated, run a MapReduce job to generate
+        // these distributed cache files of all the simulated jobs of this trace.
+        writeDistCacheData(conf);
+      }
+    }
+    return exitCode;
+  }
+
   /**
    * Handles orderly shutdown by requesting that each component in the
    * pipeline abort its progress, waiting for each to exit and killing
@@ -387,7 +607,7 @@ public class Gridmix extends Configured implements Tool {
   public static void main(String[] argv) throws Exception {
     int res = -1;
     try {
-      res = ToolRunner.run(new Configuration(), new Gridmix(), argv);
+      res = ToolRunner.run(new Configuration(), new Gridmix(argv), argv);
     } finally {
       System.exit(res);
     }
@@ -416,6 +636,11 @@ public class Gridmix extends Configured implements Tool {
     ToolRunner.printGenericCommandUsage(out);
     out.println("Usage: gridmix [-generate <MiB>] [-users URI] [-Dname=value ...] <iopath> <trace>");
     out.println("  e.g. gridmix -generate 100m foo -");
+    out.println("Options:");
+    out.println("   -generate <MiB> : Generate input data of size MiB under "
+        + "<iopath>/input/ and generate\n\t\t     distributed cache data under "
+        + "<iopath>/distributedCache/.");
+    out.println("   -users <usersResourceURI> : URI that contains the users list.");
     out.println("Configuration parameters:");
     out.println("   General parameters:");
     out.printf("       %-48s : Output directory\n", GRIDMIX_OUT_DIR);
@@ -493,3 +718,4 @@ public class Gridmix extends Configured implements Tool {
   }
 
 }
+
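
Tying the new options together, a typical invocation (all paths and sizes here
are placeholders, in the spirit of the "e.g." line printed by printUsage above)
would be:

    gridmix -generate 1g -users file:///tmp/users.lst /user/gridmix/io trace.json.gz

With -generate, roughly 1 GiB of input data is written under
/user/gridmix/io/input and any distributed cache files needed by the trace are
written under /user/gridmix/io/distributedCache; the -users list is consulted
only when the configured user resolver reports needsTargetUsersList() as true,
otherwise it is ignored with a warning, as shown in runJob() above.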

+ 245 - 15
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java

@@ -17,24 +17,27 @@
  */
 package org.apache.hadoop.mapred.gridmix;
 
+import java.io.DataOutputStream;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Formatter;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Delayed;
 import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.security.PrivilegedExceptionAction;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.hadoop.io.RawComparator;
 import org.apache.hadoop.io.WritableComparator;
 import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobTracker;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.JobContext;
@@ -53,16 +56,17 @@ import org.apache.commons.logging.LogFactory;
  */
 abstract class GridmixJob implements Callable<Job>, Delayed {
 
-  public static final String JOBNAME = "GRIDMIX";
-  public static final String ORIGNAME = "gridmix.job.name.original";
+  // Gridmix job name format is GRIDMIX<6 digit sequence number>
+  public static final String JOB_NAME_PREFIX = "GRIDMIX";
   public static final Log LOG = LogFactory.getLog(GridmixJob.class);
 
   private static final ThreadLocal<Formatter> nameFormat =
     new ThreadLocal<Formatter>() {
       @Override
       protected Formatter initialValue() {
-        final StringBuilder sb = new StringBuilder(JOBNAME.length() + 5);
-        sb.append(JOBNAME);
+        final StringBuilder sb =
+            new StringBuilder(JOB_NAME_PREFIX.length() + 6);
+        sb.append(JOB_NAME_PREFIX);
         return new Formatter(sb);
       }
     };
@@ -80,6 +84,14 @@ abstract class GridmixJob implements Callable<Job>, Delayed {
       "gridmix.job-submission.use-queue-in-trace";
   protected static final String GRIDMIX_DEFAULT_QUEUE = 
       "gridmix.job-submission.default-queue";
+  // configuration key to enable/disable High-Ram feature emulation
+  static final String GRIDMIX_HIGHRAM_EMULATION_ENABLE = 
+    "gridmix.highram-emulation.enable";
+  // configuration key to enable/disable task jvm options
+  static final String GRIDMIX_TASK_JVM_OPTIONS_ENABLE = 
+    "gridmix.task.jvm-options.enable";
+  private static final Pattern maxHeapPattern = 
+    Pattern.compile("-Xmx[0-9]+[kKmMgGtT]?+");
 
   private static void setJobQueue(Job job, String queue) {
     if (queue != null)
@@ -93,22 +105,56 @@ abstract class GridmixJob implements Callable<Job>, Delayed {
     this.jobdesc = jobdesc;
     this.seq = seq;
 
-    ((StringBuilder)nameFormat.get().out()).setLength(JOBNAME.length());
+    ((StringBuilder)nameFormat.get().out()).setLength(JOB_NAME_PREFIX.length());
     try {
       job = this.ugi.doAs(new PrivilegedExceptionAction<Job>() {
         public Job run() throws IOException {
-          Job ret = new Job(conf, nameFormat.get().format("%05d", seq)
-              .toString());
+
+          String jobId = null == jobdesc.getJobID()
+                         ? "<unknown>"
+                         : jobdesc.getJobID().toString();
+          Job ret = new Job(conf,
+                            nameFormat.get().format("%06d", seq).toString());
           ret.getConfiguration().setInt(GRIDMIX_JOB_SEQ, seq);
-          ret.getConfiguration().set(ORIGNAME,
-              null == jobdesc.getJobID() ? "<unknown>" : jobdesc.getJobID()
-                  .toString());
+
+          ret.getConfiguration().set(Gridmix.ORIGINAL_JOB_ID, jobId);
+          ret.getConfiguration().set(Gridmix.ORIGINAL_JOB_NAME,
+                                     jobdesc.getName());
           if (conf.getBoolean(GRIDMIX_USE_QUEUE_IN_TRACE, false)) {
             setJobQueue(ret, jobdesc.getQueueName());
           } else {
             setJobQueue(ret, conf.get(GRIDMIX_DEFAULT_QUEUE));
           }
 
+          // check if the job can emulate compression
+          if (canEmulateCompression()) {
+            // set the compression related configs if compression emulation is
+            // enabled
+            if (CompressionEmulationUtil.isCompressionEmulationEnabled(conf)) {
+              CompressionEmulationUtil.configureCompressionEmulation(
+                  jobdesc.getJobConf(), ret.getConfiguration());
+            }
+          }
+          
+          // configure high ram properties if enabled
+          if (conf.getBoolean(GRIDMIX_HIGHRAM_EMULATION_ENABLE, true)) {
+            configureHighRamProperties(jobdesc.getJobConf(), 
+                                       ret.getConfiguration());
+          }
+          
+          // configure task jvm options if enabled
+          // this knob can be turned off if there is a mismatch between the
+          // target (simulation) cluster and the original cluster. Such a 
+          // mismatch can result in job failures (due to memory issues) on the 
+          // target (simulated) cluster.
+          //
+          // TODO If configured, scale the original task's JVM (heap related)
+          //      options to suit the target (simulation) cluster
+          if (conf.getBoolean(GRIDMIX_TASK_JVM_OPTIONS_ENABLE, true)) {
+            configureTaskJVMOptions(jobdesc.getJobConf(), 
+                                    ret.getConfiguration());
+          }
+          
           return ret;
         }
       });
@@ -120,6 +166,185 @@ abstract class GridmixJob implements Callable<Job>, Delayed {
         submissionMillis, TimeUnit.MILLISECONDS);
     outdir = new Path(outRoot, "" + seq);
   }
+  
+  @SuppressWarnings("deprecation")
+  protected static void configureTaskJVMOptions(Configuration originalJobConf,
+                                                Configuration simulatedJobConf){
+    // Get the heap related java opts used for the original job and set the 
+    // same for the simulated job.
+    //  set the generic task heap options
+    configureTaskJVMMaxHeapOptions(originalJobConf, simulatedJobConf, 
+                                   JobConf.MAPRED_TASK_JAVA_OPTS);
+    //  set map task heap options
+    configureTaskJVMMaxHeapOptions(originalJobConf, simulatedJobConf, 
+                                   JobConf.MAPRED_MAP_TASK_JAVA_OPTS);
+
+    //  set reduce task heap options
+    configureTaskJVMMaxHeapOptions(originalJobConf, simulatedJobConf, 
+                                   JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS);
+  }
+  
+  // Configures the task's max heap options using the specified key
+  private static void configureTaskJVMMaxHeapOptions(Configuration srcConf, 
+                                                     Configuration destConf,
+                                                     String key) {
+    String srcHeapOpts = srcConf.get(key);
+    if (srcHeapOpts != null) {
+      List<String> srcMaxOptsList = new ArrayList<String>();
+      // extract the max heap options and ignore the rest
+      extractMaxHeapOpts(srcHeapOpts, srcMaxOptsList, 
+                         new ArrayList<String>());
+      if (srcMaxOptsList.size() > 0) {
+        List<String> destOtherOptsList = new ArrayList<String>();
+        // extract the other heap options and ignore the max options in the 
+        // destination configuration
+        String destHeapOpts = destConf.get(key);
+        if (destHeapOpts != null) {
+          extractMaxHeapOpts(destHeapOpts, new ArrayList<String>(), 
+                             destOtherOptsList);
+        }
+        
+        // the destination configuration may already have max heap opts set;
+        // drop those, keep its other opts, and append the max heap opts
+        // taken from the original (source) configuration
+        StringBuilder newHeapOpts = new StringBuilder();
+        
+        for (String otherOpt : destOtherOptsList) {
+          newHeapOpts.append(otherOpt).append(" ");
+        }
+        
+        for (String opts : srcMaxOptsList) {
+          newHeapOpts.append(opts).append(" ");
+        }
+        
+        // set the final heap opts 
+        destConf.set(key, newHeapOpts.toString().trim());
+      }
+    }
+  }
+  
+  private static void extractMaxHeapOpts(String javaOptions,  
+      List<String> maxOpts,  List<String> others) {
+    for (String opt : javaOptions.split(" ")) {
+      Matcher matcher = maxHeapPattern.matcher(opt);
+      if (matcher.find()) {
+        maxOpts.add(opt);
+      } else {
+        others.add(opt);
+      }
+    }
+  }
+
+  // Scales the desired job-level configuration parameter. This API makes sure 
+  // that the ratio of the job level configuration parameter to the cluster 
+  // level configuration parameter is maintained in the simulated run. Hence 
+  // the values are scaled from the original cluster's configuration to the 
+  // simulated cluster's configuration for higher emulation accuracy.
+  // This kind of scaling is useful for memory parameters.
+  private static void scaleConfigParameter(Configuration sourceConf, 
+                        Configuration destConf, String clusterValueKey, 
+                        String jobValueKey, long defaultValue) {
+    long simulatedClusterDefaultValue = 
+           destConf.getLong(clusterValueKey, defaultValue);
+    
+    long originalClusterDefaultValue = 
+           sourceConf.getLong(clusterValueKey, defaultValue);
+    
+    long originalJobValue = 
+           sourceConf.getLong(jobValueKey, defaultValue);
+    
+    double scaleFactor = (double)originalJobValue/originalClusterDefaultValue;
+    
+    long simulatedJobValue = (long)(scaleFactor * simulatedClusterDefaultValue);
+    
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("For the job configuration parameter '" + jobValueKey 
+                + "' and the cluster configuration parameter '" 
+                + clusterValueKey + "', the original job's configuration value"
+                + " is scaled from '" + originalJobValue + "' to '" 
+                + simulatedJobValue + "' using the default (unit) value of "
+                + "'" + originalClusterDefaultValue + "' for the original "
+                + " cluster and '" + simulatedClusterDefaultValue + "' for the"
+                + " simulated cluster.");
+    }
+    
+    destConf.setLong(jobValueKey, simulatedJobValue);
+  }
+  
+  // Checks if the scaling of original job's memory parameter value is 
+  // valid
+  @SuppressWarnings("deprecation")
+  private static boolean checkMemoryUpperLimits(String jobKey, String limitKey,  
+                                                Configuration conf, 
+                                                boolean convertLimitToMB) {
+    if (conf.get(limitKey) != null) {
+      long limit = conf.getLong(limitKey, JobConf.DISABLED_MEMORY_LIMIT);
+      // scale only if the max memory limit is set.
+      if (limit >= 0) {
+        if (convertLimitToMB) {
+          limit /= (1024 * 1024); //Converting to MB
+        }
+        
+        long scaledConfigValue = 
+               conf.getLong(jobKey, JobConf.DISABLED_MEMORY_LIMIT);
+        
+        // check now
+        if (scaledConfigValue > limit) {
+          throw new RuntimeException("Simulated job's configuration" 
+              + " parameter '" + jobKey + "' got scaled to a value '" 
+              + scaledConfigValue + "' which exceeds the upper limit of '" 
+              + limit + "' defined for the simulated cluster by the key '" 
+              + limitKey + "'. To disable High-Ram feature emulation, set '" 
+              + GRIDMIX_HIGHRAM_EMULATION_ENABLE + "' to 'false'.");
+        }
+        return true;
+      }
+    }
+    return false;
+  }
+  
+  // Check that the scaled parameter value does not exceed the cluster limits.
+  @SuppressWarnings("deprecation")
+  private static void validateTaskMemoryLimits(Configuration conf, 
+                        String jobKey, String clusterMaxKey) {
+    if (!checkMemoryUpperLimits(jobKey, 
+        JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY, conf, true)) {
+      checkMemoryUpperLimits(jobKey, clusterMaxKey, conf, false);
+    }
+  }
+
+  /**
+   * Sets the high ram job properties in the simulated job's configuration.
+   */
+  @SuppressWarnings("deprecation")
+  static void configureHighRamProperties(Configuration sourceConf, 
+                                         Configuration destConf) {
+    // set the memory per map task
+    scaleConfigParameter(sourceConf, destConf, 
+                         JobTracker.MAPRED_CLUSTER_MAP_MEMORY_MB_PROPERTY,
+                         JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, 
+                         JobConf.DISABLED_MEMORY_LIMIT);
+    
+    // validate and fail early
+    validateTaskMemoryLimits(destConf,
+        JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, 
+        JobTracker.MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY);
+    
+    // set the memory per reduce task
+    scaleConfigParameter(sourceConf, destConf, 
+                         JobTracker.MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY,
+                         JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY,
+                         JobConf.DISABLED_MEMORY_LIMIT);
+    // validate and fail early
+    validateTaskMemoryLimits(destConf,
+        JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY, 
+        JobTracker.MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY);
+  }
+
+  /**
+   * Indicates whether this {@link GridmixJob} supports compression emulation.
+   */
+  protected abstract boolean canEmulateCompression();
 
   protected GridmixJob(
     final Configuration conf, long submissionMillis, final String name)
@@ -289,13 +514,18 @@ abstract class GridmixJob implements Callable<Job>, Delayed {
         TaskAttemptContext job) throws IOException {
 
       Path file = getDefaultWorkFile(job, "");
-      FileSystem fs = file.getFileSystem(job.getConfiguration());
-      final FSDataOutputStream fileOut = fs.create(file, false);
+      final DataOutputStream fileOut;
+
+      fileOut = 
+        new DataOutputStream(CompressionEmulationUtil
+            .getPossiblyCompressedOutputStream(file, job.getConfiguration()));
+
       return new RecordWriter<K,GridmixRecord>() {
         @Override
         public void write(K ignored, GridmixRecord value)
             throws IOException {
-          value.writeRandom(fileOut, value.getSize());
+          // Let the Gridmix record fill itself.
+          value.write(fileOut);
         }
         @Override
         public void close(TaskAttemptContext ctxt) throws IOException {

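The -Xmx handling added above can be exercised in isolation. The following standalone sketch (not part of the patch; the opts string is made up) splits a java-opts value on spaces and separates the max-heap settings from the rest using the same regular expression as maxHeapPattern:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class MaxHeapOptsDemo {
      // same expression as GridmixJob.maxHeapPattern
      private static final Pattern MAX_HEAP =
        Pattern.compile("-Xmx[0-9]+[kKmMgGtT]?+");

      public static void main(String[] args) {
        String opts = "-server -Xmx512m -XX:+UseSerialGC";
        List<String> maxOpts = new ArrayList<String>();
        List<String> others = new ArrayList<String>();
        for (String opt : opts.split(" ")) {
          Matcher m = MAX_HEAP.matcher(opt);
          if (m.find()) {
            maxOpts.add(opt);   // -Xmx512m
          } else {
            others.add(opt);    // -server, -XX:+UseSerialGC
          }
        }
        System.out.println("max heap opts: " + maxOpts);
        System.out.println("other opts   : " + others);
      }
    }
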
+ 44 - 1
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixKey.java

@@ -25,6 +25,7 @@ import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableUtils;
 import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
 
 class GridmixKey extends GridmixRecord {
   static final byte REDUCE_SPEC = 0;
@@ -115,6 +116,22 @@ class GridmixKey extends GridmixRecord {
     setSize(origSize);
   }
 
+  /**
+   * Get the {@link ResourceUsageMetrics} stored in the key.
+   */
+  public ResourceUsageMetrics getReduceResourceUsageMetrics() {
+    assert REDUCE_SPEC == getType();
+    return spec.metrics;
+  }
+  
+  /**
+   * Store the {@link ResourceUsageMetrics} in the key.
+   */
+  public void setReduceResourceUsageMetrics(ResourceUsageMetrics metrics) {
+    assert REDUCE_SPEC == getType();
+    spec.setResourceUsageSpecification(metrics);
+  }
+  
   public byte getType() {
     return type;
   }
@@ -195,18 +212,35 @@ class GridmixKey extends GridmixRecord {
     long rec_in;
     long rec_out;
     long bytes_out;
+    private ResourceUsageMetrics metrics = null;
+    private int sizeOfResourceUsageMetrics = 0;
     public Spec() { }
 
     public void set(Spec other) {
       rec_in = other.rec_in;
       bytes_out = other.bytes_out;
       rec_out = other.rec_out;
+      setResourceUsageSpecification(other.metrics);
     }
 
+    /**
+     * Sets the {@link ResourceUsageMetrics} for this {@link Spec}.
+     */
+    public void setResourceUsageSpecification(ResourceUsageMetrics metrics) {
+      this.metrics = metrics;
+      if (metrics != null) {
+        this.sizeOfResourceUsageMetrics = metrics.size();
+      } else {
+        this.sizeOfResourceUsageMetrics = 0;
+      }
+    }
+    
     public int getSize() {
       return WritableUtils.getVIntSize(rec_in) +
              WritableUtils.getVIntSize(rec_out) +
-             WritableUtils.getVIntSize(bytes_out);
+             WritableUtils.getVIntSize(bytes_out) +
+             WritableUtils.getVIntSize(sizeOfResourceUsageMetrics) +
+             sizeOfResourceUsageMetrics;
     }
 
     @Override
@@ -214,6 +248,11 @@ class GridmixKey extends GridmixRecord {
       rec_in = WritableUtils.readVLong(in);
       rec_out = WritableUtils.readVLong(in);
       bytes_out = WritableUtils.readVLong(in);
+      sizeOfResourceUsageMetrics =  WritableUtils.readVInt(in);
+      if (sizeOfResourceUsageMetrics > 0) {
+        metrics = new ResourceUsageMetrics();
+        metrics.readFields(in);
+      }
     }
 
     @Override
@@ -221,6 +260,10 @@ class GridmixKey extends GridmixRecord {
       WritableUtils.writeVLong(out, rec_in);
       WritableUtils.writeVLong(out, rec_out);
       WritableUtils.writeVLong(out, bytes_out);
+      WritableUtils.writeVInt(out, sizeOfResourceUsageMetrics);
+      if (sizeOfResourceUsageMetrics > 0) {
+        metrics.write(out);
+      }
     }
   }
 

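The Spec serialization above uses a size-prefixed layout so that an absent ResourceUsageMetrics costs only a single zero vint on the wire. A minimal sketch of the same idiom with a hypothetical stand-in payload class (not the real rumen type):

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.io.WritableUtils;

    public class OptionalPayloadDemo {
      // stand-in for ResourceUsageMetrics: a payload that knows its own size
      static class Payload implements Writable {
        long value;
        public void write(DataOutput out) throws IOException {
          WritableUtils.writeVLong(out, value);
        }
        public void readFields(DataInput in) throws IOException {
          value = WritableUtils.readVLong(in);
        }
        int size() {
          return WritableUtils.getVIntSize(value);
        }
      }

      static void writeOptional(DataOutput out, Payload p) throws IOException {
        int size = (p == null) ? 0 : p.size();
        WritableUtils.writeVInt(out, size);   // size prefix; 0 means absent
        if (size > 0) {
          p.write(out);
        }
      }

      static Payload readOptional(DataInput in) throws IOException {
        Payload p = null;
        if (WritableUtils.readVInt(in) > 0) { // payload present
          p = new Payload();
          p.readFields(in);
        }
        return p;
      }

      public static void main(String[] args) throws IOException {
        Payload p = new Payload();
        p.value = 42L;
        DataOutputBuffer out = new DataOutputBuffer();
        writeOptional(out, p);
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        System.out.println("round-tripped value = " + readOptional(in).value);
      }
    }
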
+ 57 - 1
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixRecord.java

@@ -28,6 +28,7 @@ import org.apache.hadoop.io.DataOutputBuffer;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.WritableComparator;
 import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
 class GridmixRecord implements WritableComparable<GridmixRecord> {
 
@@ -39,6 +40,10 @@ class GridmixRecord implements WritableComparable<GridmixRecord> {
   private final DataOutputBuffer dob =
     new DataOutputBuffer(Long.SIZE / Byte.SIZE);
   private byte[] literal = dob.getData();
+  private boolean compressible = false;
+  private float compressionRatio = 
+    CompressionEmulationUtil.DEFAULT_COMPRESSION_RATIO;
+  private RandomTextDataGenerator rtg = null;
 
   GridmixRecord() {
     this(1, 0L);
@@ -57,6 +62,19 @@ class GridmixRecord implements WritableComparable<GridmixRecord> {
     setSizeInternal(size);
   }
 
+  void setCompressibility(boolean compressible, float ratio) {
+    this.compressible = compressible;
+    this.compressionRatio = ratio;
+    // Initialize the RandomTextDataGenerator once for every GridMix record
+    // Note that RandomTextDataGenerator is needed only when the GridMix record
+    // is configured to generate compressible text data.
+    if (compressible) {
+      rtg = 
+        CompressionEmulationUtil.getRandomTextDataGenerator(ratio, 
+                                   RandomTextDataGenerator.DEFAULT_SEED);
+    }
+  }
+  
   private void setSizeInternal(int size) {
     this.size = Math.max(1, size);
     try {
@@ -79,6 +97,39 @@ class GridmixRecord implements WritableComparable<GridmixRecord> {
     return (x ^= (x << 17));
   }
 
+  /**
+   * Generate random text data that can be compressed. If the record is marked
+   * compressible (via {@link FileOutputFormat#COMPRESS}), the random data is
+   * generated as text; otherwise
+   * {@link GridmixRecord#writeRandom(DataOutput, int)} is invoked.
+   */
+  private void writeRandomText(DataOutput out, final int size) 
+  throws IOException {
+    long tmp = seed;
+    out.writeLong(tmp);
+    int i = size - (Long.SIZE / Byte.SIZE);
+    //TODO Should we use long for size? What if the data is more than 4G?
+    
+    String randomWord = rtg.getRandomWord();
+    byte[] bytes = randomWord.getBytes("UTF-8");
+    long randomWordSize = bytes.length;
+    while (i >= randomWordSize) {
+      out.write(bytes);
+      i -= randomWordSize;
+      
+      // get the next random word
+      randomWord = rtg.getRandomWord();
+      bytes = randomWord.getBytes("UTF-8");
+      // determine the random word size
+      randomWordSize = bytes.length;
+    }
+    
+    // pad the remaining bytes
+    if (i > 0) {
+      out.write(bytes, 0, i);
+    }
+  }
+  
   public void writeRandom(DataOutput out, final int size) throws IOException {
     long tmp = seed;
     out.writeLong(tmp);
@@ -120,8 +171,13 @@ class GridmixRecord implements WritableComparable<GridmixRecord> {
     WritableUtils.writeVInt(out, size);
     final int payload = size - WritableUtils.getVIntSize(size);
     if (payload > Long.SIZE / Byte.SIZE) {
-      writeRandom(out, payload);
+      if (compressible) {
+        writeRandomText(out, payload);
+      } else {
+        writeRandom(out, payload);
+      }
     } else if (payload > 0) {
+      //TODO What if 'compressible' is turned on? LOG is a bad idea!
       out.write(literal, 0, payload);
     }
   }

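The word-filling loop in writeRandomText() above boils down to: emit whole dictionary words while they fit in the remaining byte budget, then pad the tail with a word prefix. A self-contained sketch of that loop (the dictionary is made up; the real code draws words from RandomTextDataGenerator):

    import java.io.ByteArrayOutputStream;
    import java.io.DataOutput;
    import java.io.DataOutputStream;
    import java.io.IOException;
    import java.util.Random;

    public class WordFillDemo {
      public static void main(String[] args) throws IOException {
        String[] dictionary = {"alpha", "bravo", "charlie", "delta"};
        Random random = new Random(0L);
        int budget = 64;                      // bytes of text to produce

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutput out = new DataOutputStream(bytes);

        int remaining = budget;
        byte[] word =
          dictionary[random.nextInt(dictionary.length)].getBytes("UTF-8");
        while (remaining >= word.length) {
          out.write(word);                    // whole words while they fit
          remaining -= word.length;
          word =
            dictionary[random.nextInt(dictionary.length)].getBytes("UTF-8");
        }
        if (remaining > 0) {
          out.write(word, 0, remaining);      // pad the tail with a word prefix
        }
        System.out.println(bytes.size() + " bytes: " + bytes.toString("UTF-8"));
      }
    }
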
+ 13 - 1
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/InputStriper.java

@@ -25,9 +25,12 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map.Entry;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -42,6 +45,7 @@ class InputStriper {
   long currentStart;
   FileStatus current;
   final List<FileStatus> files = new ArrayList<FileStatus>();
+  final Configuration conf = new Configuration();
 
   /**
    * @param inputDir Pool from which files are requested.
@@ -91,7 +95,15 @@ class InputStriper {
       }
       currentStart += fromFile;
       bytes -= fromFile;
-      if (current.getLen() - currentStart == 0) {
+      // Switch to a new file if
+      //  - the current file is uncompressed and completely used
+      //  - the current file is compressed
+      
+      CompressionCodecFactory compressionCodecs = 
+        new CompressionCodecFactory(conf);
+      CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
+      if (current.getLen() - currentStart == 0
+          || codec != null) {
         current = files.get(++idx % files.size());
         currentStart = 0;
       }

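The new file-switch condition above treats any file with a recognised compression suffix as fully consumed (presumably because byte offsets inside a compressed stream cannot be read independently). A small check of the CompressionCodecFactory lookup it relies on (paths are hypothetical):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.compress.CompressionCodec;
    import org.apache.hadoop.io.compress.CompressionCodecFactory;

    public class CodecCheckDemo {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);

        // getCodec() keys off the suffix: non-null for known compressed
        // extensions such as .gz, null otherwise
        for (String name : new String[] {"part-00000", "part-00000.gz"}) {
          CompressionCodec codec = factory.getCodec(new Path("/input/" + name));
          System.out.println(name + " -> "
              + (codec == null ? "uncompressed"
                               : codec.getClass().getSimpleName()));
        }
      }
    }
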
+ 53 - 7
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobCreator.java

@@ -18,31 +18,42 @@
 
 package org.apache.hadoop.mapred.gridmix;
 
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapred.ClusterStatus;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.gridmix.GenerateData.GenSplit;
-import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.tools.rumen.JobStory;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Random;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
 public enum JobCreator {
 
   LOADJOB {
     @Override
     public GridmixJob createGridmixJob(
-      Configuration conf, long submissionMillis, JobStory jobdesc, Path outRoot,
-      UserGroupInformation ugi, int seq) throws IOException {
+      Configuration gridmixConf, long submissionMillis, JobStory jobdesc,
+      Path outRoot, UserGroupInformation ugi, int seq) throws IOException {
+
+      // Build configuration for this simulated job
+      Configuration conf = new Configuration(gridmixConf);
+      dce.configureDistCacheFiles(conf, jobdesc.getJobConf());
       return new LoadJob(conf, submissionMillis, jobdesc, outRoot, ugi, seq);
-    }},
+    }
+
+    @Override
+    public boolean canEmulateDistCacheLoad() {
+      return true;
+    }
+  },
 
   SLEEPJOB {
     private String[] hosts;
@@ -72,12 +83,30 @@ public enum JobCreator {
       }
       return new SleepJob(conf, submissionMillis, jobdesc, outRoot, ugi, seq,
           numLocations, hosts);
-    }};
+    }
+
+    @Override
+    public boolean canEmulateDistCacheLoad() {
+      return false;
+    }
+  };
 
   public static final String GRIDMIX_JOB_TYPE = "gridmix.job.type";
   public static final String SLEEPJOB_RANDOM_LOCATIONS = 
     "gridmix.sleep.fake-locations";
 
+  /**
+   * Create Gridmix simulated job.
+   * @param conf configuration of simulated job
+   * @param submissionMillis the time (in milliseconds) at which this simulated
+   *                         job is to be submitted
+   * @param jobdesc JobStory obtained from trace
+   * @param outRoot gridmix output directory
+   * @param ugi UGI of job submitter of this simulated job
+   * @param seq job sequence number
+   * @return the created simulated job
+   * @throws IOException
+   */
   public abstract GridmixJob createGridmixJob(
     final Configuration conf, long submissionMillis, final JobStory jobdesc,
     Path outRoot, UserGroupInformation ugi, final int seq) throws IOException;
@@ -86,4 +115,21 @@ public enum JobCreator {
     Configuration conf, JobCreator defaultPolicy) {
     return conf.getEnum(GRIDMIX_JOB_TYPE, defaultPolicy);
   }
+
+  /**
+   * @return true if gridmix simulated jobs of this job type can emulate
+   *         distributed cache load
+   */
+  abstract boolean canEmulateDistCacheLoad();
+
+  DistributedCacheEmulator dce;
+  /**
+   * This method is to be called before calling any other method in JobCreator
+   * except canEmulateDistCacheLoad(), especially if canEmulateDistCacheLoad()
+   * returns true for that job type.
+   * @param e Distributed Cache Emulator
+   */
+  void setDistCacheEmulator(DistributedCacheEmulator e) {
+    this.dce = e;
+  }
 }

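The enum change above uses the constant-specific method idiom: each job type overrides an abstract capability query instead of callers switching on the constant. A compact illustration of the same idiom with made-up type names:

    public class CapabilityEnumDemo {
      enum JobType {
        LOAD {
          @Override
          boolean canEmulateDistCacheLoad() { return true; }
        },
        SLEEP {
          @Override
          boolean canEmulateDistCacheLoad() { return false; }
        };

        // each constant supplies its own answer
        abstract boolean canEmulateDistCacheLoad();
      }

      public static void main(String[] args) {
        for (JobType t : JobType.values()) {
          System.out.println(t + " can emulate distributed cache load: "
              + t.canEmulateDistCacheLoad());
        }
      }
    }
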
+ 28 - 7
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobFactory.java

@@ -63,6 +63,7 @@ abstract class JobFactory<T> implements Gridmix.Component<Void>,StatListener<T>{
   protected final JobStoryProducer jobProducer;
   protected final ReentrantLock lock = new ReentrantLock(true);
   protected final JobCreator jobCreator;
+  protected int numJobsInTrace = 0;
 
   /**
    * Creating a new instance does not start the thread.
@@ -112,7 +113,7 @@ abstract class JobFactory<T> implements Gridmix.Component<Void>,StatListener<T>{
     public MinTaskInfo(TaskInfo info) {
       super(info.getInputBytes(), info.getInputRecords(),
             info.getOutputBytes(), info.getOutputRecords(),
-            info.getTaskMemory());
+            info.getTaskMemory(), info.getResourceUsageMetrics());
     }
     public long getInputBytes() {
       return Math.max(0, super.getInputBytes());
@@ -168,13 +169,33 @@ abstract class JobFactory<T> implements Gridmix.Component<Void>,StatListener<T>{
 
   protected abstract Thread createReaderThread() ;
 
+  //gets the next job from the trace and does some bookkeeping for the same
+  private JobStory getNextJobFromTrace() throws IOException {
+    JobStory story = jobProducer.getNextJob();
+    if (story != null) {
+      ++numJobsInTrace;
+    }
+    return story;
+  }
+
   protected JobStory getNextJobFiltered() throws IOException {
-    JobStory job;
-    do {
-      job = jobProducer.getNextJob();
-    } while (job != null
-        && (job.getOutcome() != Pre21JobHistoryConstants.Values.SUCCESS ||
-            job.getSubmissionTime() < 0));
+    JobStory job = getNextJobFromTrace();
+    while (job != null &&
+           (job.getOutcome() != Pre21JobHistoryConstants.Values.SUCCESS ||
+            job.getSubmissionTime() < 0)) {
+      if (LOG.isDebugEnabled()) {
+        String reason = null;
+        if (job.getOutcome() != Pre21JobHistoryConstants.Values.SUCCESS) {
+          reason = "STATE (" + job.getOutcome().name() + ") ";
+        }
+        if (job.getSubmissionTime() < 0) {
+          reason = (reason == null ? "" : reason)
+                   + "SUBMISSION-TIME (" + job.getSubmissionTime() + ")";
+        }
+        LOG.debug("Ignoring job " + job.getJobID() + " from the input trace."
+                  + " Reason: " + (reason == null ? "N/A" : reason));
+      }
+      job = getNextJobFromTrace();
+    }
     return null == job ? null : new FilterJobStory(job) {
         @Override
         public TaskInfo getTaskInfo(TaskType taskType, int taskNumber) {

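One subtlety in the debug message above: string concatenation binds tighter than the conditional operator, so the ternary has to be parenthesised; without the parentheses the concatenated string itself is compared with null and the message prefix is silently dropped. A two-line illustration:

    public class TernaryPrecedenceDemo {
      public static void main(String[] args) {
        String reason = null;
        // wrong: ("Reason: " + reason) == null is always false, prints "null"
        System.out.println("Reason: " + reason == null ? "N/A" : reason);
        // right: prints "Reason: N/A"
        System.out.println("Reason: " + (reason == null ? "N/A" : reason));
      }
    }
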
+ 2 - 2
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobMonitor.java

@@ -78,13 +78,13 @@ class JobMonitor implements Gridmix.Component<Job> {
   }
 
   /**
-   * Add a submission failed job , such tht it can be communicated
+   * Add a submission-failed job, such that it can be communicated
    * back to serial.
    * TODO: Cleaner solution for this problem
    * @param job
    */
   public void submissionFailed(Job job) {
-    LOG.info(" Job submission failed notify if anyone is waiting " + job);
+    LOG.info("Job submission failed notification for job " + job.getJobID());
     this.statistics.add(job);
   }
 

+ 1 - 1
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/JobSubmitter.java

@@ -127,7 +127,7 @@ class JobSubmitter implements Gridmix.Component<GridmixJob> {
         monitor.submissionFailed(job.getJob());
       } catch(Exception e) {
         //Due to some exception job wasnt submitted.
-        LOG.info(" Job " + job.getJob() + " submission failed " , e);
+        LOG.info("Job " + job.getJob().getJobID() + " submission failed", e);
         monitor.submissionFailed(job.getJob());
       } finally {
         sem.release();

+ 213 - 5
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/LoadJob.java

@@ -22,6 +22,9 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.TaskTracker;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.ResourceUsageMatcher;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
@@ -30,10 +33,13 @@ import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.RecordReader;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
 import org.apache.hadoop.mapreduce.TaskType;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.tools.rumen.JobStory;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
 import org.apache.hadoop.tools.rumen.TaskInfo;
 
 import java.io.IOException;
@@ -83,6 +89,106 @@ class LoadJob extends GridmixJob {
     return job;
   }
 
+  @Override
+  protected boolean canEmulateCompression() {
+    return true;
+  }
+  
+  /**
+   * This is a progress based resource usage matcher.
+   */
+  @SuppressWarnings("unchecked")
+  static class ResourceUsageMatcherRunner extends Thread {
+    private final ResourceUsageMatcher matcher;
+    private final Progressive progress;
+    private final long sleepTime;
+    private static final String SLEEP_CONFIG = 
+      "gridmix.emulators.resource-usage.sleep-duration";
+    private static final long DEFAULT_SLEEP_TIME = 100; // 100ms
+    
+    ResourceUsageMatcherRunner(final TaskInputOutputContext context, 
+                               ResourceUsageMetrics metrics) {
+      Configuration conf = context.getConfiguration();
+      
+      // set the resource calculator plugin
+      Class<? extends ResourceCalculatorPlugin> clazz =
+        conf.getClass(TaskTracker.TT_RESOURCE_CALCULATOR_PLUGIN,
+                      null, ResourceCalculatorPlugin.class);
+      ResourceCalculatorPlugin plugin = 
+        ResourceCalculatorPlugin.getResourceCalculatorPlugin(clazz, conf);
+      
+      // set the other parameters
+      this.sleepTime = conf.getLong(SLEEP_CONFIG, DEFAULT_SLEEP_TIME);
+      progress = new Progressive() {
+        @Override
+        public float getProgress() {
+          return context.getProgress();
+        }
+      };
+      
+      // instantiate a resource-usage-matcher
+      matcher = new ResourceUsageMatcher();
+      matcher.configure(conf, plugin, metrics, progress);
+    }
+    
+    protected void match() throws Exception {
+      // match the resource usage
+      matcher.matchResourceUsage();
+    }
+    
+    @Override
+    public void run() {
+      LOG.info("Resource usage matcher thread started.");
+      try {
+        while (progress.getProgress() < 1) {
+          // match
+          match();
+          
+          // sleep for some time
+          try {
+            Thread.sleep(sleepTime);
+          } catch (Exception e) {}
+        }
+        
+        // match for progress = 1
+        match();
+        LOG.info("Resource usage emulation complete! Matcher exiting");
+      } catch (Exception e) {
+        LOG.info("Exception while running the resource-usage-emulation matcher"
+                 + " thread! Exiting.", e);
+      }
+    }
+  }
+  
+  // Makes sure that the TaskTracker doesn't kill the map/reduce tasks while
+  // they are emulating resource usage
+  private static class StatusReporter extends Thread {
+    private TaskInputOutputContext context;
+    StatusReporter(TaskInputOutputContext context) {
+      this.context = context;
+    }
+
+    @Override
+    public void run() {
+      LOG.info("Status reporter thread started.");
+      try {
+        while (context.getProgress() < 1) {
+          // report progress
+          context.progress();
+
+          // sleep for some time
+          try {
+            Thread.sleep(100); // sleep for 100ms
+          } catch (Exception e) {}
+        }
+        
+        LOG.info("Status reporter thread exiting");
+      } catch (Exception e) {
+        LOG.info("Exception while running the status reporter thread!", e);
+      }
+    }
+  }
+  
   public static class LoadMapper
       extends Mapper<NullWritable,GridmixRecord,GridmixKey,GridmixRecord> {
 
@@ -95,6 +201,9 @@ class LoadJob extends GridmixJob {
     private final GridmixKey key = new GridmixKey();
     private final GridmixRecord val = new GridmixRecord();
 
+    private ResourceUsageMatcherRunner matcher = null;
+    private StatusReporter reporter = null;
+    
     @Override
     protected void setup(Context ctxt)
         throws IOException, InterruptedException {
@@ -104,6 +213,20 @@ class LoadJob extends GridmixJob {
       final long[] reduceBytes = split.getOutputBytes();
       final long[] reduceRecords = split.getOutputRecords();
 
+      // enable compression emulation for gridmix map output records
+      final boolean emulateMapOutputCompression = 
+        CompressionEmulationUtil.isCompressionEmulationEnabled(conf)
+        && conf.getBoolean("mapred.compress.map.output", false);
+      float compressionRatio = 1.0f;
+      if (emulateMapOutputCompression) {
+        compressionRatio = 
+          CompressionEmulationUtil.getMapOutputCompressionEmulationRatio(conf);
+        LOG.info("GridMix is configured to use a compression ratio of " 
+                 + compressionRatio + " for the map output data.");
+        key.setCompressibility(true, compressionRatio);
+        val.setCompressibility(true, compressionRatio);
+      }
+      
       long totalRecords = 0L;
       final int nReduces = ctxt.getNumReduceTasks();
       if (nReduces > 0) {
@@ -114,17 +237,30 @@ class LoadJob extends GridmixJob {
           if (i == id) {
             spec.bytes_out = split.getReduceBytes(idx);
             spec.rec_out = split.getReduceRecords(idx);
+            spec.setResourceUsageSpecification(
+                   split.getReduceResourceUsageMetrics(idx));
             ++idx;
             id += maps;
           }
+          // set the map output bytes such that the final reduce input bytes 
+          // match the expected value obtained from the original job
+          long mapOutputBytes = reduceBytes[i];
+          if (emulateMapOutputCompression) {
+            mapOutputBytes /= compressionRatio;
+          }
           reduces.add(new IntermediateRecordFactory(
-              new AvgRecordFactory(reduceBytes[i], reduceRecords[i], conf),
+              new AvgRecordFactory(mapOutputBytes, reduceRecords[i], conf, 
+                                   5*1024),
               i, reduceRecords[i], spec, conf));
           totalRecords += reduceRecords[i];
         }
       } else {
-        reduces.add(new AvgRecordFactory(reduceBytes[0], reduceRecords[0],
-              conf));
+        long mapOutputBytes = reduceBytes[0];
+        if (emulateMapOutputCompression) {
+          mapOutputBytes /= compressionRatio;
+        }
+        reduces.add(new AvgRecordFactory(mapOutputBytes, reduceRecords[0],
+                    conf, 5*1024));
         totalRecords = reduceRecords[0];
       }
       final long splitRecords = split.getInputRecords();
@@ -134,6 +270,13 @@ class LoadJob extends GridmixJob {
         : splitRecords;
       ratio = totalRecords / (1.0 * inputRecords);
       acc = 0.0;
+      
+      matcher = new ResourceUsageMatcherRunner(ctxt, 
+                      split.getMapResourceUsageMetrics());
+      
+      // start the status reporter thread
+      reporter = new StatusReporter(ctxt);
+      reporter.start();
     }
 
     @Override
@@ -151,6 +294,13 @@ class LoadJob extends GridmixJob {
         }
         context.write(key, val);
         acc -= 1.0;
+        
+        // match inline
+        try {
+          matcher.match();
+        } catch (Exception e) {
+          LOG.debug("Error in resource usage emulation! Message: ", e);
+        }
       }
     }
 
@@ -162,8 +312,18 @@ class LoadJob extends GridmixJob {
         while (factory.next(key, val)) {
           context.write(key, val);
           key.setSeed(r.nextLong());
+          
+          // match inline
+          try {
+            matcher.match();
+          } catch (Exception e) {
+            LOG.debug("Error in resource usage emulation! Message: ", e);
+          }
         }
       }
+      
+      // start the matcher thread since the map phase ends here
+      matcher.start();
     }
   }
 
@@ -177,6 +337,9 @@ class LoadJob extends GridmixJob {
     private double ratio;
     private RecordFactory factory;
 
+    private ResourceUsageMatcherRunner matcher = null;
+    private StatusReporter reporter = null;
+    
     @Override
     protected void setup(Context context)
         throws IOException, InterruptedException {
@@ -187,20 +350,48 @@ class LoadJob extends GridmixJob {
       long outBytes = 0L;
       long outRecords = 0L;
       long inRecords = 0L;
+      ResourceUsageMetrics metrics = new ResourceUsageMetrics();
       for (GridmixRecord ignored : context.getValues()) {
         final GridmixKey spec = context.getCurrentKey();
         inRecords += spec.getReduceInputRecords();
         outBytes += spec.getReduceOutputBytes();
         outRecords += spec.getReduceOutputRecords();
+        if (spec.getReduceResourceUsageMetrics() != null) {
+          metrics = spec.getReduceResourceUsageMetrics();
+        }
       }
       if (0 == outRecords && inRecords > 0) {
         LOG.info("Spec output bytes w/o records. Using input record count");
         outRecords = inRecords;
       }
+      
+      // enable compression emulation for gridmix reduce output records
+      Configuration conf = context.getConfiguration();
+      if (CompressionEmulationUtil.isCompressionEmulationEnabled(conf)
+          && FileOutputFormat.getCompressOutput(context)) {
+        float compressionRatio = 
+          CompressionEmulationUtil
+            .getReduceOutputCompressionEmulationRatio(conf);
+        LOG.info("GridMix is configured to use a compression ratio of " 
+                 + compressionRatio + " for the reduce output data.");
+        val.setCompressibility(true, compressionRatio);
+        
+        // Scale the output data size so that the size after compression
+        // matches the expected (trace) output size
+        outBytes /= compressionRatio;
+      }
+      
       factory =
-        new AvgRecordFactory(outBytes, outRecords, context.getConfiguration());
+        new AvgRecordFactory(outBytes, outRecords, 
+                             context.getConfiguration(), 5*1024);
       ratio = outRecords / (1.0 * inRecords);
       acc = 0.0;
+      
+      matcher = new ResourceUsageMatcherRunner(context, metrics);
+      
+      // start the status reporter thread
+      reporter = new StatusReporter(context);
+      reporter.start();
     }
     @Override
     protected void reduce(GridmixKey key, Iterable<GridmixRecord> values,
@@ -210,6 +401,13 @@ class LoadJob extends GridmixJob {
         while (acc >= 1.0 && factory.next(null, val)) {
           context.write(NullWritable.get(), val);
           acc -= 1.0;
+          
+          // match inline
+          try {
+            matcher.match();
+          } catch (Exception e) {
+            LOG.debug("Error in resource usage emulation! Message: ", e);
+          }
         }
       }
     }
@@ -220,6 +418,13 @@ class LoadJob extends GridmixJob {
       while (factory.next(null, val)) {
         context.write(NullWritable.get(), val);
         val.setSeed(r.nextLong());
+        
+        // match inline
+        try {
+          matcher.match();
+        } catch (Exception e) {
+          LOG.debug("Error in resource usage emulation! Message: ", e);
+        }
       }
     }
   }
@@ -311,11 +516,13 @@ class LoadJob extends GridmixJob {
       final int nSpec = reds / maps + ((reds % maps) > i ? 1 : 0);
       final long[] specBytes = new long[nSpec];
       final long[] specRecords = new long[nSpec];
+      final ResourceUsageMetrics[] metrics = new ResourceUsageMetrics[nSpec];
       for (int j = 0; j < nSpec; ++j) {
         final TaskInfo info =
           jobdesc.getTaskInfo(TaskType.REDUCE, i + j * maps);
         specBytes[j] = info.getOutputBytes();
         specRecords[j] = info.getOutputRecords();
+        metrics[j] = info.getResourceUsageMetrics();
         if (LOG.isDebugEnabled()) {
           LOG.debug(String.format("SPEC(%d) %d -> %d %d %d", id(), i,
               i + j * maps, info.getOutputRecords(), info.getOutputBytes()));
@@ -326,7 +533,8 @@ class LoadJob extends GridmixJob {
               info.getInputBytes(), 3), maps, i,
             info.getInputBytes(), info.getInputRecords(),
             info.getOutputBytes(), info.getOutputRecords(),
-            reduceByteRatio, reduceRecordRatio, specBytes, specRecords));
+            reduceByteRatio, reduceRecordRatio, specBytes, specRecords,
+            info.getResourceUsageMetrics(), metrics));
     }
     pushDescription(id(), splits);
   }

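ResourceUsageMatcherRunner and StatusReporter above share one shape: a thread that polls task progress, does a small unit of work, sleeps briefly, and exits once progress reaches 1.0. A stripped-down, self-contained sketch of that pattern (the "work" is just a print, and the local Progressive interface mirrors the one added later in this patch):

    import java.util.concurrent.atomic.AtomicLong;

    public class ProgressPollerDemo {
      interface Progressive {               // same shape as gridmix's interface
        float getProgress();
      }

      public static void main(String[] args) throws InterruptedException {
        final AtomicLong processed = new AtomicLong(0);
        final long total = 50;

        final Progressive progress = new Progressive() {
          public float getProgress() {
            return processed.get() / (float) total;
          }
        };

        Thread poller = new Thread() {
          @Override
          public void run() {
            try {
              while (progress.getProgress() < 1f) {
                // stand-in for matcher.match() / context.progress()
                System.out.println("progress = " + progress.getProgress());
                Thread.sleep(100);
              }
              System.out.println("progress reached 1.0, poller exiting");
            } catch (InterruptedException e) {
              // exit quietly, mirroring the matcher's catch-and-log behaviour
            }
          }
        };
        poller.start();

        // simulate the task doing its record-level work
        for (long i = 0; i < total; i++) {
          processed.incrementAndGet();
          Thread.sleep(10);
        }
        poller.join();
      }
    }
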
+ 32 - 1
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/LoadSplit.java

@@ -22,6 +22,7 @@ import java.io.DataOutput;
 import java.io.IOException;
 
 import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
 
 class LoadSplit extends CombineFileSplit {
   private int id;
@@ -39,6 +40,9 @@ class LoadSplit extends CombineFileSplit {
   private long[] reduceOutputBytes = new long[0];
   private long[] reduceOutputRecords = new long[0];
 
+  private ResourceUsageMetrics mapMetrics;
+  private ResourceUsageMetrics[] reduceMetrics;
+
   LoadSplit() {
     super();
   }
@@ -46,7 +50,9 @@ class LoadSplit extends CombineFileSplit {
   public LoadSplit(CombineFileSplit cfsplit, int maps, int id,
       long inputBytes, long inputRecords, long outputBytes,
       long outputRecords, double[] reduceBytes, double[] reduceRecords,
-      long[] reduceOutputBytes, long[] reduceOutputRecords)
+      long[] reduceOutputBytes, long[] reduceOutputRecords,
+      ResourceUsageMetrics metrics,
+      ResourceUsageMetrics[] rMetrics)
       throws IOException {
     super(cfsplit);
     this.id = id;
@@ -60,6 +66,8 @@ class LoadSplit extends CombineFileSplit {
     nSpec = reduceOutputBytes.length;
     this.reduceOutputBytes = reduceOutputBytes;
     this.reduceOutputRecords = reduceOutputRecords;
+    this.mapMetrics = metrics;
+    this.reduceMetrics = rMetrics;
   }
 
   public int getId() {
@@ -97,6 +105,15 @@ class LoadSplit extends CombineFileSplit {
   public long getReduceRecords(int i) {
     return reduceOutputRecords[i];
   }
+  
+  public ResourceUsageMetrics getMapResourceUsageMetrics() {
+    return mapMetrics;
+  }
+  
+  public ResourceUsageMetrics getReduceResourceUsageMetrics(int i) {
+    return reduceMetrics[i];
+  }
+  
   @Override
   public void write(DataOutput out) throws IOException {
     super.write(out);
@@ -116,6 +133,12 @@ class LoadSplit extends CombineFileSplit {
       WritableUtils.writeVLong(out, reduceOutputBytes[i]);
       WritableUtils.writeVLong(out, reduceOutputRecords[i]);
     }
+    mapMetrics.write(out);
+    int numReduceMetrics = (reduceMetrics == null) ? 0 : reduceMetrics.length;
+    WritableUtils.writeVInt(out, numReduceMetrics);
+    for (int i = 0; i < numReduceMetrics; ++i) {
+      reduceMetrics[i].write(out);
+    }
   }
   @Override
   public void readFields(DataInput in) throws IOException {
@@ -144,5 +167,13 @@ class LoadSplit extends CombineFileSplit {
       reduceOutputBytes[i] = WritableUtils.readVLong(in);
       reduceOutputRecords[i] = WritableUtils.readVLong(in);
     }
+    mapMetrics = new ResourceUsageMetrics();
+    mapMetrics.readFields(in);
+    int numReduceMetrics = WritableUtils.readVInt(in);
+    reduceMetrics = new ResourceUsageMetrics[numReduceMetrics];
+    for (int i = 0; i < numReduceMetrics; ++i) {
+      reduceMetrics[i] = new ResourceUsageMetrics();
+      reduceMetrics[i].readFields(in);
+    }
   }
 }

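The write()/readFields() additions above serialize the reduce-side metrics as a vint element count followed by the elements, so a null or empty array round-trips as a single zero. A minimal sketch of that null-safe idiom, using org.apache.hadoop.io.IntWritable as a stand-in element type:

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;

    import org.apache.hadoop.io.DataInputBuffer;
    import org.apache.hadoop.io.DataOutputBuffer;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.WritableUtils;

    public class WritableArrayDemo {
      static void writeArray(DataOutput out, IntWritable[] arr)
          throws IOException {
        int n = (arr == null) ? 0 : arr.length;
        WritableUtils.writeVInt(out, n);      // count prefix, 0 for null/empty
        for (int i = 0; i < n; i++) {
          arr[i].write(out);
        }
      }

      static IntWritable[] readArray(DataInput in) throws IOException {
        int n = WritableUtils.readVInt(in);
        IntWritable[] arr = new IntWritable[n];
        for (int i = 0; i < n; i++) {
          arr[i] = new IntWritable();
          arr[i].readFields(in);
        }
        return arr;
      }

      public static void main(String[] args) throws IOException {
        DataOutputBuffer out = new DataOutputBuffer();
        writeArray(out,
            new IntWritable[] {new IntWritable(7), new IntWritable(11)});

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        for (IntWritable w : readArray(in)) {
          System.out.println(w.get());
        }
      }
    }
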
+ 25 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Progressive.java

@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+/**
+ * Used to track progress of tasks.
+ */
+public interface Progressive {
+  public float getProgress();
+}

+ 337 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/PseudoLocalFs.java

@@ -0,0 +1,337 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Random;
+import java.net.URI;
+
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PositionedReadable;
+import org.apache.hadoop.fs.Seekable;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * Pseudo local file system that generates random data for any file on the fly
+ * instead of storing files on disk. So opening the same file multiple times
+ * will not give the same content. There are no directories in this file system
+ * other than the root, and all files live under the root, i.e. "/". All file URIs
+ * on pseudo local file system should be of the format <code>
+ * pseudo:///&lt;name&gt;.&lt;fileSize&gt;</code> where name is a unique name
+ * and &lt;fileSize&gt; is a number representing the size of the file in bytes.
+ */
+class PseudoLocalFs extends FileSystem {
+  Path home;
+  /**
+   * The creation time and modification time of all files in
+   * {@link PseudoLocalFs} is same.
+   */
+  private static final long TIME = System.currentTimeMillis();
+  private static final String HOME_DIR = "/";
+  private static final long BLOCK_SIZE  = 4 * 1024 * 1024L; // 4 MB
+  private static final int DEFAULT_BUFFER_SIZE = 1024  * 1024; // 1MB
+
+  static final URI NAME = URI.create("pseudo:///");
+
+  PseudoLocalFs() {
+    this(new Path(HOME_DIR));
+  }
+
+  PseudoLocalFs(Path home) {
+    super();
+    this.home = home;
+  }
+
+  @Override
+  public URI getUri() {
+    return NAME;
+  }
+
+  @Override
+  public Path getHomeDirectory() {
+    return home;
+  }
+
+  @Override
+  public Path getWorkingDirectory() {
+    return getHomeDirectory();
+  }
+
+  /**
+   * Generates a valid pseudo local file path from the given <code>fileId</code>
+   * and <code>fileSize</code>.
+   * @param fileId unique file id string
+   * @param fileSize file size
+   * @return the generated relative path
+   */
+  static Path generateFilePath(String fileId, long fileSize) {
+    return new Path(fileId + "." + fileSize);
+  }
+
+  /**
+   * Creating a pseudo local file is nothing but validating the file path.
+   * Actual data of the file is generated on the fly when client tries to open
+   * the file for reading.
+   * @param path file path to be created
+   */
+  @Override
+  public FSDataOutputStream create(Path path) throws IOException {
+    try {
+      validateFileNameFormat(path);
+    } catch (FileNotFoundException e) {
+      throw new IOException("File creation failed for " + path);
+    }
+    return null;
+  }
+
+  /**
+   * Validate if the path provided is of expected format of Pseudo Local File
+   * System based files.
+   * @param path file path
+   * @return the file size
+   * @throws FileNotFoundException
+   */
+  long validateFileNameFormat(Path path) throws FileNotFoundException {
+    path = path.makeQualified(this);
+    boolean valid = true;
+    long fileSize = 0;
+    if (!path.toUri().getScheme().equals(getUri().getScheme())) {
+      valid = false;
+    } else {
+      String[] parts = path.toUri().getPath().split("\\.");
+      try {
+        fileSize = Long.valueOf(parts[parts.length - 1]);
+        valid = (fileSize >= 0);
+      } catch (NumberFormatException e) {
+        valid = false;
+      }
+    }
+    if (!valid) {
+      throw new FileNotFoundException("File " + path
+          + " does not exist in pseudo local file system");
+    }
+    return fileSize;
+  }
+
+  /**
+   * @see #create(Path) for details
+   */
+  @Override
+  public FSDataInputStream open(Path path, int bufferSize) throws IOException {
+    long fileSize = validateFileNameFormat(path);
+    InputStream in = new RandomInputStream(fileSize, bufferSize);
+    return new FSDataInputStream(in);
+  }
+
+  /**
+   * @see #create(Path) for details
+   */
+  @Override
+  public FSDataInputStream open(Path path) throws IOException {
+    return open(path, DEFAULT_BUFFER_SIZE);
+  }
+
+  @Override
+  public FileStatus getFileStatus(Path path) throws IOException {
+    long fileSize = validateFileNameFormat(path);
+    return new FileStatus(fileSize, false, 1, BLOCK_SIZE, TIME, path);
+  }
+
+  @Override
+  public boolean exists(Path path) {
+    try{
+      validateFileNameFormat(path);
+    } catch (FileNotFoundException e) {
+      return false;
+    }
+    return true;
+  }
+
+  @Override
+  public FSDataOutputStream create(Path path, FsPermission permission,
+      boolean overwrite, int bufferSize, short replication, long blockSize,
+      Progressable progress) throws IOException {
+    return create(path);
+  }
+
+  @Override
+  public FileStatus[] listStatus(Path path) throws FileNotFoundException,
+      IOException {
+    return new FileStatus[] {getFileStatus(path)};
+  }
+
+  /**
+   * Input Stream that generates specified number of random bytes.
+   */
+  static class RandomInputStream extends InputStream
+      implements Seekable, PositionedReadable {
+
+    private final Random r = new Random();
+    private BytesWritable val = null;
+    private int positionInVal = 0;// current position in the buffer 'val'
+
+    private long totalSize = 0;// total number of random bytes to be generated
+    private long curPos = 0;// current position in this stream
+
+    /**
+     * @param size total number of random bytes to be generated in this stream
+     * @param bufferSize the buffer size. An internal buffer array of length
+     * <code>bufferSize</code> is created. If <code>bufferSize</code> is not a
+     * positive number, then a default value of 1MB is used.
+     */
+    RandomInputStream(long size, int bufferSize) {
+      totalSize = size;
+      if (bufferSize <= 0) {
+        bufferSize = DEFAULT_BUFFER_SIZE;
+      }
+      val = new BytesWritable(new byte[bufferSize]);
+    }
+
+    @Override
+    public int read() throws IOException {
+      byte[] b = new byte[1];
+      if (curPos < totalSize) {
+        if (positionInVal < val.getLength()) {// use buffered byte
+          b[0] = val.getBytes()[positionInVal++];
+          ++curPos;
+        } else {// generate data
+          int num = read(b);
+          if (num < 0) {
+            return num;
+          }
+        }
+      } else {
+        return -1;
+      }
+      return b[0];
+    }
+
+    @Override
+    public int read(byte[] bytes) throws IOException {
+      return read(bytes, 0, bytes.length);
+    }
+
+    @Override
+    public int read(byte[] bytes, int off, int len) throws IOException {
+      if (curPos == totalSize) {
+        return -1;// EOF
+      }
+      int numBytes = len;
+      if (numBytes > (totalSize - curPos)) {// position in file is close to EOF
+        numBytes = (int)(totalSize - curPos);
+      }
+      if (numBytes > (val.getLength() - positionInVal)) {
+        // need to generate data into val
+        r.nextBytes(val.getBytes());
+        positionInVal = 0;
+      }
+
+      System.arraycopy(val.getBytes(), positionInVal, bytes, off, numBytes);
+      curPos += numBytes;
+      positionInVal += numBytes;
+      return numBytes;
+    }
+
+    @Override
+    public int available() {
+      return (int)(val.getLength() - positionInVal);
+    }
+
+    @Override
+    public int read(long position, byte[] buffer, int offset, int length)
+        throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void readFully(long position, byte[] buffer) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void readFully(long position, byte[] buffer, int offset, int length)
+        throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Get the current position in this stream/pseudo-file
+     * @return the position in this stream/pseudo-file
+     * @throws IOException
+     */
+    @Override
+    public long getPos() throws IOException {
+      return curPos;
+    }
+
+    @Override
+    public void seek(long pos) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean seekToNewSource(long targetPos) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+  }
+
+  @Override
+  public FSDataOutputStream append(Path path, int bufferSize,
+      Progressable progress) throws IOException {
+    throw new UnsupportedOperationException("Append is not supported"
+        + " in pseudo local file system.");
+  }
+
+  @Override
+  public boolean mkdirs(Path f, FsPermission permission) throws IOException {
+    throw new UnsupportedOperationException("Mkdirs is not supported"
+        + " in pseudo local file system.");
+  }
+
+  @Override
+  public boolean rename(Path src, Path dst) throws IOException {
+    throw new UnsupportedOperationException("Rename is not supported"
+        + " in pseudo local file system.");
+  }
+
+  @Override
+  public boolean delete(Path path, boolean recursive) {
+    throw new UnsupportedOperationException("File deletion is not supported "
+        + "in pseudo local file system.");
+  }
+
+  @Override
+  public void setWorkingDirectory(Path newDir) {
+    throw new UnsupportedOperationException("SetWorkingDirectory "
+        + "is not supported in pseudo local file system.");
+  }
+
+  @Override
+  public boolean delete(Path f) throws IOException {//dummy implementation
+    return true;
+  }
+}

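The pseudo:///<name>.<size> convention above can be exercised without a running cluster. The sketch below builds a path the same way generateFilePath() does and recovers the size the same way validateFileNameFormat() does (the file name is made up):

    import org.apache.hadoop.fs.Path;

    public class PseudoPathDemo {
      public static void main(String[] args) {
        String fileId = "distcache-file-3";
        long fileSize = 1048576L;             // 1 MiB of on-the-fly data

        // same layout as PseudoLocalFs.generateFilePath(): <name>.<size>
        Path path = new Path("pseudo:///" + fileId + "." + fileSize);

        // recover the size as validateFileNameFormat() does: last '.' part
        String[] parts = path.toUri().getPath().split("\\.");
        long parsedSize = Long.valueOf(parts[parts.length - 1]);

        System.out.println(path + " -> " + parsedSize + " bytes");
      }
    }
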
+ 147 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java

@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * A random text generator. The words are simply sequences of alphabetic characters.
+ */
+class RandomTextDataGenerator {
+  static final Log LOG = LogFactory.getLog(RandomTextDataGenerator.class);
+  
+  /**
+   * Configuration key for random text data generator's list size.
+   */
+  static final String GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE = 
+    "gridmix.datagenerator.randomtext.listsize";
+  
+  /**
+   * Configuration key for random text data generator's word size.
+   */
+  static final String GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE = 
+    "gridmix.datagenerator.randomtext.wordsize";
+  
+  /**
+   * Default random text data generator's list size.
+   */
+  static final int DEFAULT_LIST_SIZE = 200;
+  
+  /**
+   * Default random text data generator's word size.
+   */
+  static final int DEFAULT_WORD_SIZE = 10;
+  
+  /**
+   * Default random text data generator's seed.
+   */
+  static final long DEFAULT_SEED = 0L;
+  
+  /**
+   * A list of random words
+   */
+  private String[] words;
+  private Random random;
+  
+  /**
+   * Constructor for {@link RandomTextDataGenerator} with default seed.
+   * @param size the total number of words to consider.
+   * @param wordSize Size of each word
+   */
+  RandomTextDataGenerator(int size, int wordSize) {
+    this(size, DEFAULT_SEED, wordSize);
+  }
+  
+  /**
+   * Constructor for {@link RandomTextDataGenerator}.
+   * @param size the total number of words to consider.
+   * @param seed Random number generator seed for repeatability
+   * @param wordSize Size of each word
+   */
+  RandomTextDataGenerator(int size, Long seed, int wordSize) {
+    random = new Random(seed);
+    words = new String[size];
+    
+    //TODO change the default based on the actual stats
+    //TODO do we need varied-sized words?
+    for (int i = 0; i < size; ++i) {
+      words[i] = 
+        RandomStringUtils.random(wordSize, 0, 0, true, false, null, random);
+    }
+  }
+  
+  /**
+   * Get the configured random text data generator's list size.
+   */
+  static int getRandomTextDataGeneratorListSize(Configuration conf) {
+    return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, DEFAULT_LIST_SIZE);
+  }
+  
+  /**
+   * Set the random text data generator's list size.
+   */
+  static void setRandomTextDataGeneratorListSize(Configuration conf, 
+                                                 int listSize) {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Random text data generator is configured to use a dictionary " 
+                + "with " + listSize + " words");
+    }
+    conf.setInt(GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, listSize);
+  }
+  
+  /**
+   * Get the configured random text data generator word size.
+   */
+  static int getRandomTextDataGeneratorWordSize(Configuration conf) {
+    return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, DEFAULT_WORD_SIZE);
+  }
+  
+  /**
+   * Set the random text data generator word size.
+   */
+  static void setRandomTextDataGeneratorWordSize(Configuration conf, 
+                                                 int wordSize) {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Random text data generator is configured to use a dictionary " 
+                + "with words of length " + wordSize);
+    }
+    conf.setInt(GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, wordSize);
+  }
+  
+  /**
+   * Returns a randomly selected word from a list of random words.
+   */
+  String getRandomWord() {
+    int index = random.nextInt(words.length);
+    return words[index];
+  }
+  
+  /**
+   * This is mainly for testing.
+   */
+  List<String> getRandomWords() {
+    return Arrays.asList(words);
+  }
+}
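A short usage sketch of the generator above: it builds a fixed dictionary of random, fixed-length words from a seed, and getRandomWord() samples from it. The sketch assumes the caller sits in the same org.apache.hadoop.mapred.gridmix package (the class is package-private); the sizes and seed are arbitrary illustrative values.

package org.apache.hadoop.mapred.gridmix;

import org.apache.hadoop.conf.Configuration;

public class RandomTextDataGeneratorSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Record the dictionary shape in the configuration using the keys above.
    RandomTextDataGenerator.setRandomTextDataGeneratorListSize(conf, 20);
    RandomTextDataGenerator.setRandomTextDataGeneratorWordSize(conf, 5);

    int listSize = RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf);
    int wordSize = RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf);

    // The same (list size, seed, word size) triple always yields the same
    // dictionary, which keeps the generated data reproducible.
    RandomTextDataGenerator generator =
        new RandomTextDataGenerator(listSize, 0L, wordSize);
    for (int i = 0; i < 5; ++i) {
      System.out.println(generator.getRandomWord());
    }
  }
}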

+ 51 - 31
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RoundRobinUserResolver.java

@@ -38,12 +38,18 @@ public class RoundRobinUserResolver implements UserResolver {
 
   private int uidx = 0;
   private List<UserGroupInformation> users = Collections.emptyList();
-  private final HashMap<UserGroupInformation,UserGroupInformation> usercache =
-    new HashMap<UserGroupInformation,UserGroupInformation>();
+
+  /**
+   *  Mapping between user names of original cluster and UGIs of proxy users of
+   *  simulated cluster
+   */
+  private final HashMap<String,UserGroupInformation> usercache =
+      new HashMap<String,UserGroupInformation>();
   
   /**
-   * Userlist assumes one UGI per line, each UGI matching
-   * &lt;username&gt;,&lt;group&gt;[,group]*
+   * Userlist assumes one user per line.
+   * Each line in the users-list file is of the form &lt;username&gt;[,group]*.
+   * <br> Group names are ignored (they are not parsed at all).
    */
   private List<UserGroupInformation> parseUserList(
       URI userUri, Configuration conf) throws IOException {
@@ -54,64 +60,78 @@ public class RoundRobinUserResolver implements UserResolver {
     final Path userloc = new Path(userUri.toString());
     final Text rawUgi = new Text();
     final FileSystem fs = userloc.getFileSystem(conf);
-    final ArrayList<UserGroupInformation> ret = new ArrayList();
+    final ArrayList<UserGroupInformation> ugiList =
+        new ArrayList<UserGroupInformation>();
 
     LineReader in = null;
     try {
-      final ArrayList<String> groups = new ArrayList();
       in = new LineReader(fs.open(userloc));
-      while (in.readLine(rawUgi) > 0) {
+      while (in.readLine(rawUgi) > 0) {//line is of the form username[,group]*
+        // e is end position of user name in this line
         int e = rawUgi.find(",");
-        if (e <= 0) {
+        if (rawUgi.getLength() == 0 || e == 0) {
           throw new IOException("Missing username: " + rawUgi);
         }
+        if (e == -1) {
+          e = rawUgi.getLength();
+        }
         final String username = Text.decode(rawUgi.getBytes(), 0, e);
-        int s = e;
-        while ((e = rawUgi.find(",", ++s)) != -1) {
-          groups.add(Text.decode(rawUgi.getBytes(), s, e - s));
-          s = e;
+        UserGroupInformation ugi = null;
+        try {
+          ugi = UserGroupInformation.createProxyUser(username,
+                    UserGroupInformation.getLoginUser());
+        } catch (IOException ioe) {
+          LOG.error("Error while creating a proxy user", ioe);
         }
-        groups.add(Text.decode(rawUgi.getBytes(), s, rawUgi.getLength() - s));
-        if (groups.size() == 0) {
-          throw new IOException("Missing groups: " + rawUgi);
+        if (ugi != null) {
+          ugiList.add(ugi);
         }
-        ret.add(UserGroupInformation.createRemoteUser(username));
+        // No need to parse groups, even if they exist. Go to next line
       }
     } finally {
       if (in != null) {
         in.close();
       }
     }
-    return ret;
+    return ugiList;
   }
 
   @Override
   public synchronized boolean setTargetUsers(URI userloc, Configuration conf)
       throws IOException {
+    uidx = 0;
     users = parseUserList(userloc, conf);
     if (users.size() == 0) {
-      throw new IOException("Empty user list");
+      throw new IOException(buildEmptyUsersErrorMsg(userloc));
     }
-    usercache.keySet().retainAll(users);
+    usercache.clear();
     return true;
   }
 
+  static String buildEmptyUsersErrorMsg(URI userloc) {
+    return "Empty user list is not allowed for RoundRobinUserResolver. Provided"
+    + " user resource URI '" + userloc + "' resulted in an empty user list.";
+  }
+
   @Override
   public synchronized UserGroupInformation getTargetUgi(
       UserGroupInformation ugi) {
-    UserGroupInformation ret = usercache.get(ugi);
-    if (null == ret) {
-      ret = users.get(uidx++ % users.size());
-      usercache.put(ugi, ret);
+    // UGI of proxy user
+    UserGroupInformation targetUGI = usercache.get(ugi.getUserName());
+    if (targetUGI == null) {
+      targetUGI = users.get(uidx++ % users.size());
+      usercache.put(ugi.getUserName(), targetUGI);
     }
-    UserGroupInformation val = null;
-    try {
-      val = UserGroupInformation.createProxyUser(
-        ret.getUserName(), UserGroupInformation.getLoginUser());
-    } catch (IOException e) {
-      LOG.error("Error while creating the proxy user " ,e);
-    }
-    return val;
+    return targetUGI;
   }
 
+  /**
+   * {@inheritDoc}
+   * <p>
+   * {@link RoundRobinUserResolver} needs to map the users in the
+   * trace to the provided list of target users. So user list is needed.
+   */
+  public boolean needsTargetUsersList() {
+    return true;
+  }
 }
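A hedged sketch of how the resolver above is meant to be driven: the users-file location and trace user names below are invented for illustration, and creating the proxy users requires appropriate proxy-user settings on a real cluster.

package org.apache.hadoop.mapred.gridmix;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

public class RoundRobinUserResolverSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical users file, one user per line; trailing groups are ignored:
    //   alice,groupA
    //   bob
    //   carol,groupB,groupC
    Configuration conf = new Configuration();
    UserResolver resolver = new RoundRobinUserResolver();

    // This resolver requires a target user list (needsTargetUsersList() == true).
    if (resolver.needsTargetUsersList()) {
      resolver.setTargetUsers(new URI("file:///tmp/users.txt"), conf);
    }

    // Distinct trace users get proxy UGIs in round-robin order; repeated
    // lookups for the same trace user are served from the cache.
    for (String traceUser : new String[] {"u1", "u2", "u1", "u3"}) {
      UserGroupInformation traceUgi =
          UserGroupInformation.createRemoteUser(traceUser);
      System.out.println(traceUser + " -> "
          + resolver.getTargetUgi(traceUgi).getUserName());
    }
  }
}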

+ 5 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/SleepJob.java

@@ -93,6 +93,11 @@ public class SleepJob extends GridmixJob {
         Long.MAX_VALUE);
   }
 
+  @Override
+  protected boolean canEmulateCompression() {
+    return false;
+  }
+  
   @Override
   public Job call()
     throws IOException, InterruptedException, ClassNotFoundException {

+ 4 - 3
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Statistics.java

@@ -189,9 +189,10 @@ public class Statistics implements Component<Job> {
         try {
           jobCompleted.await(jtPollingInterval, TimeUnit.MILLISECONDS);
         } catch (InterruptedException ie) {
-          LOG.error(
-            "Statistics interrupt while waiting for polling " + ie.getCause(),
-            ie);
+          if (!shutdown) {
+            LOG.error("Statistics thread interrupted while waiting for "
+                + "completion of a job.", ie);
+          }
           return;
         } finally {
           lock.unlock();

+ 11 - 2
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/SubmitterUserResolver.java

@@ -32,13 +32,13 @@ public class SubmitterUserResolver implements UserResolver {
   
   private UserGroupInformation ugi = null;
 
-  public SubmitterUserResolver() {
+  public SubmitterUserResolver() throws IOException {
     LOG.info(" Current user resolver is SubmitterUserResolver ");
+    ugi = UserGroupInformation.getLoginUser();
   }
 
   public synchronized boolean setTargetUsers(URI userdesc, Configuration conf)
       throws IOException {
-    ugi = UserGroupInformation.getLoginUser();
     return false;
   }
 
@@ -47,4 +47,13 @@ public class SubmitterUserResolver implements UserResolver {
     return this.ugi;
   }
 
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Since {@link SubmitterUserResolver} returns the user name who is running
+   * gridmix, it doesn't need a target list of users.
+   */
+  public boolean needsTargetUsersList() {
+    return false;
+  }
 }

+ 75 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Summarizer.java

@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics;
+
+/**
+ * Summarizes various aspects of a {@link Gridmix} run.
+ */
+class Summarizer {
+  private ExecutionSummarizer executionSummarizer;
+  private ClusterSummarizer clusterSummarizer;
+  protected static final String NA = "N/A";
+  
+  Summarizer() {
+    this(new String[]{NA});
+  }
+  
+  Summarizer(String[] args) {
+    executionSummarizer = new ExecutionSummarizer(args);
+    clusterSummarizer = new ClusterSummarizer();
+  }
+  
+  ExecutionSummarizer getExecutionSummarizer() {
+    return executionSummarizer;
+  }
+  
+  ClusterSummarizer getClusterSummarizer() {
+    return clusterSummarizer;
+  }
+  
+  void start(Configuration conf) {
+    executionSummarizer.start(conf);
+    clusterSummarizer.start(conf);
+  }
+  
+  /**
+   * This finalizes the summarizer.
+   */
+  @SuppressWarnings("unchecked")
+  void finalize(JobFactory factory, String path, long size, 
+                UserResolver resolver, DataStatistics stats, Configuration conf)
+  throws IOException {
+    executionSummarizer.finalize(factory, path, size, resolver, stats, conf);
+  }
+  
+  /**
+   * Summarizes the current {@link Gridmix} run and the cluster used. 
+   */
+  @Override
+  public String toString() {
+    StringBuilder builder = new StringBuilder();
+    builder.append(executionSummarizer.toString());
+    builder.append(clusterSummarizer.toString());
+    return builder.toString();
+  }
+}
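The summarizer above is driven from the Gridmix driver; the following is a hedged sketch of that lifecycle. The command-line argument values are illustrative, and the finalize(...) arguments are only indicated in a comment, since building a JobFactory and DataStatistics is beyond this sketch.

package org.apache.hadoop.mapred.gridmix;

import org.apache.hadoop.conf.Configuration;

public class SummarizerSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Seeded with the Gridmix command line and started before job submission.
    Summarizer summarizer = new Summarizer(new String[] {"-generate", "1g"});
    summarizer.start(conf);

    // ... jobs are generated, submitted and monitored here ...

    // At the end of the run the driver calls
    //   summarizer.finalize(factory, inputPath, dataSize, resolver, stats, conf)
    // and then logs the combined execution + cluster summary:
    System.out.println(summarizer);
  }
}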

+ 18 - 11
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/UserResolver.java

@@ -19,29 +19,27 @@ package org.apache.hadoop.mapred.gridmix;
 
 import java.io.IOException;
 import java.net.URI;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.util.LineReader;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
 
 /**
  * Maps users in the trace to a set of valid target users on the test cluster.
  */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving
 public interface UserResolver {
 
   /**
    * Configure the user map given the URI and configuration. The resolver's
    * contract will define how the resource will be interpreted, but the default
    * will typically interpret the URI as a {@link org.apache.hadoop.fs.Path}
-   * listing target users. 
-   * @param userdesc URI (possibly null) from which user information may be
-   * loaded per the subclass contract.
+   * listing target users.
+   * This method should be called only if {@link #needsTargetUsersList()}
+   * returns true.
+   * @param userdesc URI from which user information may be loaded per the
+   * subclass contract.
    * @param conf The tool configuration.
    * @return true if the resource provided was used in building the list of
    * target users
@@ -55,4 +53,13 @@ public interface UserResolver {
    */
   public UserGroupInformation getTargetUgi(UserGroupInformation ugi);
 
+  /**
+   * Indicates whether this user resolver needs a list of target users to be
+   * provided.
+   *
+   * @return true if a list of target users is to be provided for this
+   * user resolver
+   */
+  public boolean needsTargetUsersList();
+
 }
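Any implementation now has to answer needsTargetUsersList() in addition to the two existing methods. A hypothetical resolver that maps every trace user to one fixed proxy user (the user name is invented for illustration) might look like this:

package org.apache.hadoop.mapred.gridmix;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

/** Hypothetical resolver: maps every trace user to one fixed proxy user. */
public class FixedUserResolver implements UserResolver {
  private UserGroupInformation target;

  public synchronized boolean setTargetUsers(URI userdesc, Configuration conf)
      throws IOException {
    // No user list is consumed; the fixed proxy user is resolved lazily.
    return false;
  }

  public synchronized UserGroupInformation getTargetUgi(
      UserGroupInformation ugi) {
    if (target == null) {
      try {
        target = UserGroupInformation.createProxyUser(
            "gridmix-proxy", UserGroupInformation.getLoginUser());
      } catch (IOException ioe) {
        throw new RuntimeException("Could not create the proxy user", ioe);
      }
    }
    return target;
  }

  public boolean needsTargetUsersList() {
    // A fixed mapping does not need the target user list.
    return false;
  }
}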

+ 315 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/CumulativeCpuUsageEmulatorPlugin.java

@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.gridmix.Progressive;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
+
+/**
+ * <p>A {@link ResourceUsageEmulatorPlugin} that emulates the cumulative CPU 
+ * usage by performing certain CPU intensive operations. Performing such CPU 
+ * intensive operations essentially uses up some CPU. Every 
+ * {@link ResourceUsageEmulatorPlugin} is configured with a feedback module i.e 
+ * a {@link ResourceCalculatorPlugin}, to monitor the resource usage.</p>
+ * 
+ * <p>{@link CumulativeCpuUsageEmulatorPlugin} emulates the CPU usage in steps. 
+ * The frequency of emulation can be configured via 
+ * {@link #CPU_EMULATION_PROGRESS_INTERVAL}.
+ * CPU usage values are matched via emulation only on the interval boundaries.
+ * </p>
+ *  
+ * {@link CumulativeCpuUsageEmulatorPlugin} is a wrapper program for managing 
+ * the CPU usage emulation feature. It internally uses an emulation algorithm 
+ * (called as core and described using {@link CpuUsageEmulatorCore}) for 
+ * performing the actual emulation. Multiple calls to this core engine should 
+ * use up some amount of CPU.<br>
+ * 
+ * <p>{@link CumulativeCpuUsageEmulatorPlugin} provides a calibration feature 
+ * via {@link #initialize(Configuration, ResourceUsageMetrics, 
+ *                        ResourceCalculatorPlugin, Progressive)} to calibrate 
+ *  the plugin and its core for the underlying hardware. As a result of 
+ *  calibration, every call to the emulation engine's core should roughly use up
+ *  1% of the total usage value to be emulated. This makes sure that the 
+ *  underlying hardware is profiled before use and that the plugin doesn't 
+ *  accidentally overuse the CPU. With 1% as the unit emulation target value for 
+ *  the core engine, there will be roughly 100 calls to the engine resulting in 
+ *  roughly 100 calls to the feedback (resource usage monitor) module. 
+ *  Excessive use of the feedback module is discouraged because the feedback
+ *  calls themselves consume CPU, leaving no room for real CPU emulation.
+ *  </p>
+ */
+public class CumulativeCpuUsageEmulatorPlugin 
+implements ResourceUsageEmulatorPlugin {
+  protected CpuUsageEmulatorCore emulatorCore;
+  private ResourceCalculatorPlugin monitor;
+  private Progressive progress;
+  private boolean enabled = true;
+  private float emulationInterval; // emulation interval
+  private long targetCpuUsage = 0;
+  private float lastSeenProgress = 0;
+  private long lastSeenCpuUsageCpuUsage = 0;
+  
+  // Configuration parameters
+  public static final String CPU_EMULATION_PROGRESS_INTERVAL = 
+    "gridmix.emulators.resource-usage.cpu.emulation-interval";
+  private static final float DEFAULT_EMULATION_FREQUENCY = 0.1F; // 10 times
+
+  /**
+   * This is the core CPU usage emulation algorithm. This is the core engine
+   * which actually performs some CPU intensive operations to consume some
+   * amount of CPU. Multiple calls of {@link #compute()} should help the 
+   * plugin emulate the desired level of CPU usage. This core engine can be
+   * calibrated using the {@link #calibrate(ResourceCalculatorPlugin, long)}
+   * API to suit the underlying hardware better. It also can be used to optimize
+   * the emulation cycle.
+   */
+  public interface CpuUsageEmulatorCore {
+    /**
+     * Performs some computation to use up some CPU.
+     */
+    public void compute();
+    
+    /**
+     * Allows the core to calibrate itself.
+     */
+    public void calibrate(ResourceCalculatorPlugin monitor, 
+                          long totalCpuUsage);
+  }
+  
+  /**
+   * This is the core engine to emulate the CPU usage. The only responsibility 
+   * of this class is to perform certain math intensive operations to make sure 
+   * that some desired value of CPU is used.
+   */
+  public static class DefaultCpuUsageEmulator implements CpuUsageEmulatorCore {
+    // number of times to loop for performing the basic unit computation
+    private int numIterations;
+    private final Random random;
+    
+    /**
+     * This is to fool the JVM and make it think that we need the value 
+     * stored in the unit computation i.e {@link #compute()}. This will prevent
+     * the JVM from optimizing the code.
+     */
+    protected double returnValue;
+    
+    /**
+     * Initializes the {@link DefaultCpuUsageEmulator} with default values. 
+     * Note that the {@link DefaultCpuUsageEmulator} should be calibrated 
+     * (see {@link #calibrate(ResourceCalculatorPlugin, long)}) when initialized
+     * using this constructor.
+     */
+    public DefaultCpuUsageEmulator() {
+      this(-1);
+    }
+    
+    DefaultCpuUsageEmulator(int numIterations) {
+      this.numIterations = numIterations;
+      random = new Random();
+    }
+    
+    /**
+     * This will consume some desired level of CPU. This API will try to use up
+     * 'X' percent of the target cumulative CPU usage. Currently X is set to 
+     * 10%.
+     */
+    public void compute() {
+      for (int i = 0; i < numIterations; ++i) {
+        performUnitComputation();
+      }
+    }
+    
+    // Perform unit computation. The complete CPU emulation will be based on 
+    // multiple invocations to this unit computation module.
+    protected void performUnitComputation() {
+      //TODO could this be made configurable too? Users/emulators should be able to 
+      // pick and choose what MATH operations to run.
+      // Example :
+      //           BASIC : ADD, SUB, MUL, DIV
+      //           ADV   : SQRT, SIN, COSIN..
+      //           COMPO : (BASIC/ADV)*
+      // Also define input generator. For now we can use the random number 
+      // generator. Later this can be changed to accept multiple sources.
+      
+      int randomData = random.nextInt();
+      int randomDataCube = randomData * randomData * randomData;
+      double randomDataCubeRoot = Math.cbrt(randomData);
+      returnValue = Math.log(Math.tan(randomDataCubeRoot 
+                                      * Math.exp(randomDataCube)) 
+                             * Math.sqrt(randomData));
+    }
+    
+    /**
+     * This will calibrate the algorithm such that a single invocation of
+     * {@link #compute()} emulates roughly 1% of the total desired resource 
+     * usage value.
+     */
+    public void calibrate(ResourceCalculatorPlugin monitor, 
+                          long totalCpuUsage) {
+      long initTime = monitor.getProcResourceValues().getCumulativeCpuTime();
+      
+      long defaultLoopSize = 0;
+      long finalTime = initTime;
+      
+      //TODO Make this configurable
+      while (finalTime - initTime < 100) { // 100 ms
+        ++defaultLoopSize;
+        performUnitComputation(); //perform unit computation
+        finalTime = monitor.getProcResourceValues().getCumulativeCpuTime();
+      }
+      
+      long referenceRuntime = finalTime - initTime;
+      
+      // time for one loop = (final-time - init-time) / total-loops
+      float timePerLoop = ((float)referenceRuntime) / defaultLoopSize;
+      
+      // compute the 1% of the total CPU usage desired
+      //TODO Make this configurable
+      long onePercent = totalCpuUsage / 100;
+      
+      // num-iterations for 1% = (total-desired-usage / 100) / time-for-one-loop
+      numIterations = Math.max(1, (int)((float)onePercent/timePerLoop));
+      
+      System.out.println("Calibration done. Basic computation runtime : " 
+          + timePerLoop + " milliseconds. Optimal number of iterations (1%): " 
+          + numIterations);
+    }
+  }
+  
+  public CumulativeCpuUsageEmulatorPlugin() {
+    this(new DefaultCpuUsageEmulator());
+  }
+  
+  /**
+   * For testing.
+   */
+  public CumulativeCpuUsageEmulatorPlugin(CpuUsageEmulatorCore core) {
+    emulatorCore = core;
+  }
+  
+  // Note that this weighting function uses only the current progress. In the future,
+  // this might depend on progress, emulation-interval and expected target.
+  private float getWeightForProgressInterval(float progress) {
+    // we want some kind of exponential growth function that gives less weight
+    // on lower progress boundaries but high (exact emulation) near progress 
+    // value of 1.
+    // so here is how the current growth function looks like
+    //    progress    weight
+    //      0.1       0.0001
+    //      0.2       0.0016
+    //      0.3       0.0081
+    //      0.4       0.0256
+    //      0.5       0.0625
+    //      0.6       0.1296
+    //      0.7       0.2401
+    //      0.8       0.4096
+    //      0.9       0.6561
+    //      1.0       1.000
+    
+    return progress * progress * progress * progress;
+  }
+  
+  @Override
+  //TODO Multi-threading for speedup?
+  public void emulate() throws IOException, InterruptedException {
+    if (enabled) {
+      float currentProgress = progress.getProgress();
+      if (lastSeenProgress < currentProgress 
+          && ((currentProgress - lastSeenProgress) >= emulationInterval
+              || currentProgress == 1)) {
+        // Estimate the final cpu usage
+        //
+        //   Consider the following
+        //     Cl/Cc/Cp : Last/Current/Projected Cpu usage
+        //     Pl/Pc/Pp : Last/Current/Projected progress
+        //   Then
+        //     (Cp-Cc)/(Pp-Pc) = (Cc-Cl)/(Pc-Pl)
+        //   Solving this for Cp, we get
+        //     Cp = Cc + (1-Pc)*(Cc-Cl)/(Pc-Pl)
+        //   Note that (Cc-Cl)/(Pc-Pl) is termed as 'rate' in the following 
+        //   section
+        
+        long currentCpuUsage = 
+          monitor.getProcResourceValues().getCumulativeCpuTime();
+        // estimate the cpu usage rate
+        float rate = (currentCpuUsage - lastSeenCpuUsageCpuUsage)
+                     / (currentProgress - lastSeenProgress);
+        long projectedUsage = 
+          currentCpuUsage + (long)((1 - currentProgress) * rate);
+        
+        if (projectedUsage < targetCpuUsage) {
+          // determine the correction factor between the current usage and the
+          // expected usage and add some weight to the target
+          long currentWeighedTarget = 
+            (long)(targetCpuUsage 
+                   * getWeightForProgressInterval(currentProgress));
+          
+          while (monitor.getProcResourceValues().getCumulativeCpuTime() 
+                 < currentWeighedTarget) {
+            emulatorCore.compute();
+            // sleep for 100ms
+            try {
+              Thread.sleep(100);
+            } catch (InterruptedException ie) {
+              String message = 
+                "CumulativeCpuUsageEmulatorPlugin got interrupted. Exiting.";
+              throw new RuntimeException(message);
+            }
+          }
+        }
+        
+        // set the last seen progress
+        lastSeenProgress = progress.getProgress();
+        // set the last seen usage
+        lastSeenCpuUsageCpuUsage = 
+          monitor.getProcResourceValues().getCumulativeCpuTime();
+      }
+    }
+  }
+
+  @Override
+  public void initialize(Configuration conf, ResourceUsageMetrics metrics,
+                         ResourceCalculatorPlugin monitor,
+                         Progressive progress) {
+    // get the target CPU usage
+    targetCpuUsage = metrics.getCumulativeCpuUsage();
+    if (targetCpuUsage <= 0 ) {
+      enabled = false;
+      return;
+    } else {
+      enabled = true;
+    }
+    
+    this.monitor = monitor;
+    this.progress = progress;
+    emulationInterval =  conf.getFloat(CPU_EMULATION_PROGRESS_INTERVAL, 
+                                       DEFAULT_EMULATION_FREQUENCY);
+    
+    // calibrate the core cpu-usage utility
+    emulatorCore.calibrate(monitor, targetCpuUsage);
+    
+    // initialize the states
+    lastSeenProgress = 0;
+    lastSeenCpuUsageCpuUsage = 0;
+  }
+}
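Since the plugin above exposes a constructor that accepts an alternate core (marked "For testing"), a hypothetical replacement core could look like the sketch below. The spin counts are arbitrary; a real core would calibrate against the monitor's cumulative CPU time the way DefaultCpuUsageEmulator does.

package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;

import org.apache.hadoop.util.ResourceCalculatorPlugin;

/**
 * Hypothetical core: burns CPU with a plain busy loop instead of the math
 * kernel used by DefaultCpuUsageEmulator.
 */
public class BusyLoopCpuCore
    implements CumulativeCpuUsageEmulatorPlugin.CpuUsageEmulatorCore {
  private long spins = 1000000L;  // arbitrary starting value
  private volatile long sink;     // defeat dead-code elimination

  public void compute() {
    long v = 0;
    for (long i = 0; i < spins; ++i) {
      v += i * 31 + 7;
    }
    sink = v;
  }

  public void calibrate(ResourceCalculatorPlugin monitor, long totalCpuUsage) {
    // A real core would time compute() against the monitor's cumulative CPU
    // time and size 'spins' to roughly 1% of totalCpuUsage; kept trivial here.
    spins = Math.max(1, totalCpuUsage) * 1000;
  }
}

The plugin would pick such a core up through its test constructor, i.e. new CumulativeCpuUsageEmulatorPlugin(new BusyLoopCpuCore()).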

+ 63 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/ResourceUsageEmulatorPlugin.java

@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapred.gridmix.Progressive;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * <p>Each resource to be emulated should have a corresponding implementation 
+ * class that implements {@link ResourceUsageEmulatorPlugin}.</p>
+ * <br><br>
+ * {@link ResourceUsageEmulatorPlugin} will be configured using the 
+ * {@link #initialize(Configuration, ResourceUsageMetrics, 
+ *                    ResourceCalculatorPlugin, Progressive)} call.
+ * Every 
+ * {@link ResourceUsageEmulatorPlugin} is also configured with a feedback module
+ * i.e a {@link ResourceCalculatorPlugin}, to monitor the current resource 
+ * usage. {@link ResourceUsageMetrics} decides the final resource usage value to
+ * emulate. {@link Progressive} keeps track of the task's progress.</p>
+ * 
+ * <br><br>
+ * 
+ * For configuring GridMix to load and use a resource usage emulator, 
+ * see {@link ResourceUsageMatcher}. 
+ */
+public interface ResourceUsageEmulatorPlugin {
+  /**
+   * Initialize the plugin. This might involve
+   *   - initializing the variables
+   *   - calibrating the plugin
+   */
+  void initialize(Configuration conf, ResourceUsageMetrics metrics, 
+                  ResourceCalculatorPlugin monitor,
+                  Progressive progress);
+
+  /**
+   * Emulate the resource usage to match the usage target. The plugin can use
+   * the given {@link ResourceCalculatorPlugin} to query for the current 
+   * resource usage.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  void emulate() throws IOException, InterruptedException;
+}
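A hypothetical plugin implementing this contract, which emulates nothing and only reports what the feedback module and progress handle return on each emulate() call:

package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.gridmix.Progressive;
import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
import org.apache.hadoop.util.ResourceCalculatorPlugin;

/**
 * Hypothetical plugin: emulates nothing, it only reports the task's progress
 * and the currently observed cumulative CPU time on every emulate() call.
 */
public class LoggingUsagePlugin implements ResourceUsageEmulatorPlugin {
  private ResourceCalculatorPlugin monitor;
  private Progressive progress;

  public void initialize(Configuration conf, ResourceUsageMetrics metrics,
                         ResourceCalculatorPlugin monitor,
                         Progressive progress) {
    this.monitor = monitor;
    this.progress = progress;
  }

  public void emulate() throws IOException, InterruptedException {
    System.out.println("progress=" + progress.getProgress()
        + " cpuTime="
        + monitor.getProcResourceValues().getCumulativeCpuTime());
  }
}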

+ 80 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/ResourceUsageMatcher.java

@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.gridmix.Progressive;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/**
+ * <p>This is the driver class for managing all the resource usage emulators.
+ * {@link ResourceUsageMatcher} expects a comma separated list of 
+ * {@link ResourceUsageEmulatorPlugin} implementations specified using 
+ * {@link #RESOURCE_USAGE_EMULATION_PLUGINS} as the configuration parameter.</p>
+ * 
+ * <p>Note that the order in which the emulators are invoked is the same as
+ * the order in which they are configured.</p>
+ */
+public class ResourceUsageMatcher {
+  /**
+   * Configuration key to set resource usage emulators.
+   */
+  public static final String RESOURCE_USAGE_EMULATION_PLUGINS =
+    "gridmix.emulators.resource-usage.plugins";
+  
+  private List<ResourceUsageEmulatorPlugin> emulationPlugins = 
+    new ArrayList<ResourceUsageEmulatorPlugin>();
+  
+  /**
+   * Configure the {@link ResourceUsageMatcher} to load the configured plugins
+   * and initialize them.
+   */
+  @SuppressWarnings("unchecked")
+  public void configure(Configuration conf, ResourceCalculatorPlugin monitor, 
+                        ResourceUsageMetrics metrics, Progressive progress) {
+    Class[] plugins = conf.getClasses(RESOURCE_USAGE_EMULATION_PLUGINS);
+//, null, ResourceUsageEmulatorPlugin.class);
+    if (plugins == null) {
+      System.out.println("No resource usage emulator plugins configured.");
+    } else {
+      for (Class<? extends ResourceUsageEmulatorPlugin> plugin : plugins) {
+        if (plugin != null) {
+          emulationPlugins.add(ReflectionUtils.newInstance(plugin, conf));
+        }
+      }
+    }
+
+    // initialize the emulators once all the configured emulator plugins are
+    // loaded
+    for (ResourceUsageEmulatorPlugin emulator : emulationPlugins) {
+      emulator.initialize(conf, metrics, monitor, progress);
+    }
+  }
+  
+  public void matchResourceUsage() throws Exception {
+    for (ResourceUsageEmulatorPlugin emulator : emulationPlugins) {
+      // match the resource usage
+      emulator.emulate();
+    }
+  }
+}
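A hedged sketch of wiring the matcher with the two plugins added in this patch; the monitor, metrics and progress handle are assumed to be supplied by the caller.

package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.gridmix.Progressive;
import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
import org.apache.hadoop.util.ResourceCalculatorPlugin;

public class ResourceUsageMatcherSketch {
  /** Wires the matcher to the CPU and heap plugins added in this patch. */
  public static void emulate(Configuration conf,
                             ResourceCalculatorPlugin monitor,
                             ResourceUsageMetrics metrics,
                             Progressive progress) throws Exception {
    // Comma-separated list; the plugins run in the configured order.
    conf.set(ResourceUsageMatcher.RESOURCE_USAGE_EMULATION_PLUGINS,
        CumulativeCpuUsageEmulatorPlugin.class.getName() + ","
            + TotalHeapUsageEmulatorPlugin.class.getName());

    ResourceUsageMatcher matcher = new ResourceUsageMatcher();
    matcher.configure(conf, monitor, metrics, progress);

    // Typically invoked repeatedly as the task makes progress.
    matcher.matchResourceUsage();
  }
}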

+ 258 - 0
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/emulators/resourceusage/TotalHeapUsageEmulatorPlugin.java

@@ -0,0 +1,258 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.gridmix.Progressive;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
+
+/**
+ * <p>A {@link ResourceUsageEmulatorPlugin} that emulates the total heap 
+ * usage by loading the JVM heap memory. Adding smaller chunks of data to the 
+ * heap will essentially use up some heap space, thus forcing the JVM to expand 
+ * its heap, resulting in an increase in heap usage.</p>
+ * 
+ * <p>{@link TotalHeapUsageEmulatorPlugin} emulates the heap usage in steps. 
+ * The frequency of emulation can be configured via 
+ * {@link #HEAP_EMULATION_PROGRESS_INTERVAL}.
+ * Heap usage values are matched via emulation only at specific interval 
+ * boundaries.
+ * </p>
+ *  
+ * {@link TotalHeapUsageEmulatorPlugin} is a wrapper program for managing 
+ * the heap usage emulation feature. It internally uses an emulation algorithm 
+ * (called the core and described using {@link HeapUsageEmulatorCore}) for 
+ * performing the actual emulation. Multiple calls to this core engine should 
+ * use up some amount of heap.
+ */
+public class TotalHeapUsageEmulatorPlugin 
+implements ResourceUsageEmulatorPlugin {
+  // Configuration parameters
+  //  the core engine to emulate heap usage
+  protected HeapUsageEmulatorCore emulatorCore;
+  //  the progress bar
+  private Progressive progress;
+  //  decides if this plugin can emulate heap usage or not
+  private boolean enabled = true;
+  //  the progress boundaries/interval where emulation should be done
+  private float emulationInterval;
+  //  target heap usage to emulate
+  private long targetHeapUsageInMB = 0;
+  
+  /**
+   * The frequency (based on task progress) with which memory-emulation code is
+   * run. If the value is set to 0.1 then the emulation will happen at every 
+   * 10% of the task's progress. The default value of this parameter is 
+   * {@link #DEFAULT_EMULATION_PROGRESS_INTERVAL}.
+   */
+  public static final String HEAP_EMULATION_PROGRESS_INTERVAL = 
+    "gridmix.emulators.resource-usage.heap.emulation-interval";
+  
+  // Default value for emulation interval
+  private static final float DEFAULT_EMULATION_PROGRESS_INTERVAL = 0.1F; // 10 %
+
+  private float prevEmulationProgress = 0F;
+  
+  /**
+   * The minimum buffer reserved for other non-emulation activities.
+   */
+  public static final String MIN_HEAP_FREE_RATIO = 
+    "gridmix.emulators.resource-usage.heap.min-free-ratio";
+  
+  private float minFreeHeapRatio;
+  
+  private static final float DEFAULT_MIN_FREE_HEAP_RATIO = 0.3F;
+  
+  /**
+   * Determines the unit increase per call to the core engine's load API. This
+   * is expressed as a percentage of the difference between the expected total 
+   * heap usage and the current usage. 
+   */
+  public static final String HEAP_LOAD_RATIO = 
+    "gridmix.emulators.resource-usage.heap.load-ratio";
+  
+  private float heapLoadRatio;
+  
+  private static final float DEFAULT_HEAP_LOAD_RATIO = 0.1F;
+  
+  public static int ONE_MB = 1024 * 1024;
+  
+  /**
+   * Defines the core heap usage emulation algorithm. This engine is expected
+   * to perform certain memory intensive operations to consume some
+   * amount of heap. {@link #load(long)} should load the current heap and 
+   * increase the heap usage by the specified value. This core engine can be 
+   * initialized using the {@link #initialize(ResourceCalculatorPlugin, long)} 
+   * API to suit the underlying hardware better.
+   */
+  public interface HeapUsageEmulatorCore {
+    /**
+     * Performs some memory intensive operations to use up some heap.
+     */
+    public void load(long sizeInMB);
+    
+    /**
+     * Initialize the core.
+     */
+    public void initialize(ResourceCalculatorPlugin monitor, 
+                           long totalHeapUsageInMB);
+    
+    /**
+     * Reset the resource usage
+     */
+    public void reset();
+  }
+  
+  /**
+   * This is the core engine to emulate the heap usage. The only responsibility 
+   * of this class is to perform certain memory intensive operations to make 
+   * sure that some desired value of heap is used.
+   */
+  public static class DefaultHeapUsageEmulator 
+  implements HeapUsageEmulatorCore {
+    // store the unit loads in a list
+    protected static ArrayList<Object> heapSpace = new ArrayList<Object>();
+    
+    /**
+     * Increase the heap usage of the current process by the given amount.
+     * This is done by allocating byte arrays, each of size 1MB.
+     */
+    public void load(long sizeInMB) {
+      for (long i = 0; i < sizeInMB; ++i) {
+        // Allocate another byte array of size 1MB
+        heapSpace.add((Object)new byte[ONE_MB]);
+      }
+    }
+    
+    /**
+     * This will initialize the core and check if the core can emulate the 
+     * desired target on the underlying hardware.
+     */
+    public void initialize(ResourceCalculatorPlugin monitor, 
+                           long totalHeapUsageInMB) {
+      long maxPhysicalMemoryInMB = monitor.getPhysicalMemorySize() / ONE_MB;
+      if (maxPhysicalMemoryInMB < totalHeapUsageInMB) {
+        throw new RuntimeException("Total heap that can be used is " 
+            + maxPhysicalMemoryInMB 
+            + " MB while the emulator is configured to emulate a total of " 
+            + totalHeapUsageInMB + " MB");
+      }
+    }
+    
+    /**
+     * Clear references to all the GridMix-allocated special objects so that 
+     * heap usage is reduced.
+     */
+    @Override
+    public void reset() {
+      heapSpace.clear();
+    }
+  }
+  
+  public TotalHeapUsageEmulatorPlugin() {
+    this(new DefaultHeapUsageEmulator());
+  }
+  
+  /**
+   * For testing.
+   */
+  public TotalHeapUsageEmulatorPlugin(HeapUsageEmulatorCore core) {
+    emulatorCore = core;
+  }
+  
+  protected long getTotalHeapUsageInMB() {
+    return Runtime.getRuntime().totalMemory() / ONE_MB;
+  }
+  
+  protected long getMaxHeapUsageInMB() {
+    return Runtime.getRuntime().maxMemory() / ONE_MB;
+  }
+  
+  @Override
+  public void emulate() throws IOException, InterruptedException {
+    if (enabled) {
+      float currentProgress = progress.getProgress();
+      if (prevEmulationProgress < currentProgress 
+          && ((currentProgress - prevEmulationProgress) >= emulationInterval
+              || currentProgress == 1)) {
+
+        long maxHeapSizeInMB = getMaxHeapUsageInMB();
+        long committedHeapSizeInMB = getTotalHeapUsageInMB();
+        
+        // Increase committed heap usage, if needed
+        // Using a linear weighing function for computing the expected usage
+        long expectedHeapUsageInMB = 
+          Math.min(maxHeapSizeInMB,
+                   (long) (targetHeapUsageInMB * currentProgress));
+        if (expectedHeapUsageInMB < maxHeapSizeInMB
+            && committedHeapSizeInMB < expectedHeapUsageInMB) {
+          long bufferInMB = (long)(minFreeHeapRatio * expectedHeapUsageInMB);
+          long currentDifferenceInMB = 
+            expectedHeapUsageInMB - committedHeapSizeInMB;
+          long currentIncrementLoadSizeInMB = 
+                (long)(currentDifferenceInMB * heapLoadRatio);
+          // Make sure that at least 1 MB is incremented.
+          currentIncrementLoadSizeInMB = 
+            Math.max(1, currentIncrementLoadSizeInMB);
+          while (committedHeapSizeInMB + bufferInMB < expectedHeapUsageInMB) {
+            // add blocks in order of X% of the difference, X = 10% by default
+            emulatorCore.load(currentIncrementLoadSizeInMB);
+            committedHeapSizeInMB = getTotalHeapUsageInMB();
+          }
+        }
+        
+        // store the emulation progress boundary
+        prevEmulationProgress = currentProgress;
+      }
+      
+      // reset the core so that the garbage is reclaimed
+      emulatorCore.reset();
+    }
+  }
+
+  @Override
+  public void initialize(Configuration conf, ResourceUsageMetrics metrics,
+                         ResourceCalculatorPlugin monitor,
+                         Progressive progress) {
+    // get the target heap usage
+    targetHeapUsageInMB = metrics.getHeapUsage() / ONE_MB;
+    if (targetHeapUsageInMB <= 0 ) {
+      enabled = false;
+      return;
+    } else {
+      // calibrate the core heap-usage utility
+      emulatorCore.initialize(monitor, targetHeapUsageInMB);
+      enabled = true;
+    }
+    
+    this.progress = progress;
+    emulationInterval = 
+      conf.getFloat(HEAP_EMULATION_PROGRESS_INTERVAL, 
+                    DEFAULT_EMULATION_PROGRESS_INTERVAL);
+    
+    minFreeHeapRatio = conf.getFloat(MIN_HEAP_FREE_RATIO, 
+                                     DEFAULT_MIN_FREE_HEAP_RATIO);
+    
+    heapLoadRatio = conf.getFloat(HEAP_LOAD_RATIO, DEFAULT_HEAP_LOAD_RATIO);
+    
+    prevEmulationProgress = 0;
+  }
+}
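The heap plugin's behavior is controlled by the three configuration keys defined above; the sketch below overrides their defaults, with values chosen purely for illustration.

package org.apache.hadoop.mapred.gridmix.emulators.resourceusage;

import org.apache.hadoop.conf.Configuration;

public class HeapEmulationConfigSketch {
  public static Configuration tune(Configuration conf) {
    // Emulate at every 20% of task progress instead of the default 10%.
    conf.setFloat(
        TotalHeapUsageEmulatorPlugin.HEAP_EMULATION_PROGRESS_INTERVAL, 0.2F);
    // Keep 25% of the expected heap free for non-emulation work (default 30%).
    conf.setFloat(TotalHeapUsageEmulatorPlugin.MIN_HEAP_FREE_RATIO, 0.25F);
    // Grow the heap in chunks of 20% of the remaining gap (default 10%).
    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_LOAD_RATIO, 0.2F);
    return conf;
  }
}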

BIN
src/contrib/gridmix/src/test/data/wordcount.json.gz


+ 10 - 5
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/DebugJobProducer.java

@@ -146,7 +146,7 @@ public class DebugJobProducer implements JobStoryProducer {
       final long seed = r.nextLong();
       r.setSeed(seed);
       id = seq.getAndIncrement();
-      name = String.format("MOCKJOB%05d", id);
+      name = String.format("MOCKJOB%06d", id);
       this.conf = conf;
       LOG.info(name + " (" + seed + ")");
       submitTime = timestamp.addAndGet(
@@ -209,9 +209,14 @@ public class DebugJobProducer implements JobStoryProducer {
 
    @Override
    public String getUser() {
-     String s = String.format("foobar%d", id);
-     GridmixTestUtils.createHomeAndStagingDirectory(s,(JobConf)conf);
-     return s;
+     // Obtain user name from job configuration, if available.
+     // Otherwise use dummy user names.
+     String user = conf.get("user.name");
+     if (user == null) {
+       user = String.format("foobar%d", id);
+     }
+     GridmixTestUtils.createHomeAndStagingDirectory(user, (JobConf)conf);
+     return user;
    }
 
    @Override
@@ -285,7 +290,7 @@ public class DebugJobProducer implements JobStoryProducer {
 
     @Override
     public org.apache.hadoop.mapred.JobConf getJobConf() {
-      throw new UnsupportedOperationException();
+      return new JobConf(conf);
     }
 
     @Override

+ 563 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestCompressionEmulationUtils.java

@@ -0,0 +1,563 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.ClusterStatus;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
+import org.apache.hadoop.mapred.gridmix.CompressionEmulationUtil.RandomTextDataMapper;
+import org.apache.hadoop.mapred.gridmix.GenerateData.GenSplit;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+
+/**
+ * Test {@link CompressionEmulationUtil}
+ */
+public class TestCompressionEmulationUtils {
+  //TODO Remove this once LocalJobRunner can run Gridmix.
+  static class CustomInputFormat extends GenerateData.GenDataFormat {
+    @Override
+    public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
+      // get the total data to be generated
+      long toGen =
+        jobCtxt.getConfiguration().getLong(GenerateData.GRIDMIX_GEN_BYTES, -1);
+      if (toGen < 0) {
+        throw new IOException("Invalid/missing generation bytes: " + toGen);
+      }
+      // get the total number of mappers configured
+      int totalMappersConfigured =
+        jobCtxt.getConfiguration().getInt("mapred.map.tasks", -1);
+      if (totalMappersConfigured < 0) {
+        throw new IOException("Invalid/missing num mappers: " 
+                              + totalMappersConfigured);
+      }
+      
+      final long bytesPerTracker = toGen / totalMappersConfigured;
+      final ArrayList<InputSplit> splits = 
+        new ArrayList<InputSplit>(totalMappersConfigured);
+      for (int i = 0; i < totalMappersConfigured; ++i) {
+        splits.add(new GenSplit(bytesPerTracker, 
+                   new String[] { "tracker_local" }));
+      }
+      return splits;
+    }
+  }
+  
+  /**
+   * Test {@link RandomTextDataMapper} via {@link CompressionEmulationUtil}.
+   */
+  @Test
+  public void testRandomCompressedTextDataGenerator() throws Exception {
+    int wordSize = 10;
+    int listSize = 20;
+    long dataSize = 10*1024*1024;
+    
+    Configuration conf = new Configuration();
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
+    
+    // configure the RandomTextDataGenerator to generate desired sized data
+    conf.setInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, 
+                listSize);
+    conf.setInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, 
+                wordSize);
+    conf.setLong(GenerateData.GRIDMIX_GEN_BYTES, dataSize);
+    
+    FileSystem lfs = FileSystem.getLocal(conf);
+    
+    // define the test's root temp directory
+    Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
+            lfs.getUri(), lfs.getWorkingDirectory());
+
+    Path tempDir = new Path(rootTempDir, "TestRandomCompressedTextDataGenr");
+    lfs.delete(tempDir, true);
+    
+    runDataGenJob(conf, tempDir);
+    
+    // validate the output data
+    FileStatus[] files = 
+      lfs.listStatus(tempDir, new Utils.OutputFileUtils.OutputFilesFilter());
+    long size = 0;
+    long maxLineSize = 0;
+    
+    for (FileStatus status : files) {
+      InputStream in = 
+        CompressionEmulationUtil
+          .getPossiblyDecompressedInputStream(status.getPath(), conf, 0);
+      BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+      String line = reader.readLine();
+      if (line != null) {
+        long lineSize = line.getBytes().length;
+        if (lineSize > maxLineSize) {
+          maxLineSize = lineSize;
+        }
+        while (line != null) {
+          for (String word : line.split("\\s")) {
+            size += word.getBytes().length;
+          }
+          line = reader.readLine();
+        }
+      }
+      reader.close();
+    }
+
+    assertTrue(size >= dataSize);
+    assertTrue(size <= dataSize + maxLineSize);
+  }
+  
+  /**
+   * Runs a GridMix data-generation job.
+   */
+  private static void runDataGenJob(Configuration conf, Path tempDir) 
+  throws IOException, ClassNotFoundException, InterruptedException {
+    JobConf jobConf = new JobConf(conf);
+    JobClient client = new JobClient(jobConf);
+    
+    // get the local job runner
+    jobConf.setInt("mapred.map.tasks", 1);
+    
+    Job job = new Job(jobConf);
+
+    CompressionEmulationUtil.configure(job);
+    job.setInputFormatClass(CustomInputFormat.class);
+    
+    // set the output path
+    FileOutputFormat.setOutputPath(job, tempDir);
+    
+    // submit and wait for completion
+    job.submit();
+    int ret = job.waitForCompletion(true) ? 0 : 1;
+
+    assertEquals("Job Failed", 0, ret);
+  }
+  
+  /**
+   * Test if {@link RandomTextDataGenerator} can generate random text data 
+   * with the desired compression ratio. This involves
+   *   - using {@link CompressionEmulationUtil} to configure the MR job for 
+   *     generating the random text data with the desired compression ratio
+   *   - running the MR job
+   *   - test {@link RandomTextDataGenerator}'s output and match the output size
+   *     (compressed) with the expected compression ratio.
+   */
+  private void testCompressionRatioConfigure(float ratio)
+  throws Exception {
+    long dataSize = 10*1024*1024;
+    
+    Configuration conf = new Configuration();
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
+    
+    conf.setLong(GenerateData.GRIDMIX_GEN_BYTES, dataSize);
+    
+    float expectedRatio = CompressionEmulationUtil.DEFAULT_COMPRESSION_RATIO;
+    if (ratio > 0) {
+      // set the compression ratio in the conf
+      CompressionEmulationUtil.setMapInputCompressionEmulationRatio(conf, ratio);
+      expectedRatio = 
+        CompressionEmulationUtil.standardizeCompressionRatio(ratio);
+    }
+    
+    // invoke the utility to map from ratio to word-size
+    CompressionEmulationUtil.setupDataGeneratorConfig(conf);
+    
+    FileSystem lfs = FileSystem.getLocal(conf);
+    
+    // define the test's root temp directory
+    Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
+            lfs.getUri(), lfs.getWorkingDirectory());
+
+    Path tempDir = 
+      new Path(rootTempDir, "TestCustomRandomCompressedTextDataGenr");
+    lfs.delete(tempDir, true);
+    
+    runDataGenJob(conf, tempDir);
+    
+    // validate the output data
+    FileStatus[] files = 
+      lfs.listStatus(tempDir, new Utils.OutputFileUtils.OutputFilesFilter());
+    long size = 0;
+    
+    for (FileStatus status : files) {
+      size += status.getLen();
+    }
+
+    float compressionRatio = ((float)size)/dataSize;
+    float stdRatio = 
+      CompressionEmulationUtil.standardizeCompressionRatio(compressionRatio);
+    
+    assertEquals(expectedRatio, stdRatio, 0.0D);
+  }
+  
+  /**
+   * Test compression ratio with multiple compression ratios.
+   */
+  @Test
+  public void testCompressionRatios() throws Exception {
+    // test default compression ratio i.e 0.5
+    testCompressionRatioConfigure(0F);
+    // test for a sample compression ratio of 0.2
+    testCompressionRatioConfigure(0.2F);
+    // test for a sample compression ratio of 0.4
+    testCompressionRatioConfigure(0.4F);
+    // test for a sample compression ratio of 0.65
+    testCompressionRatioConfigure(0.65F);
+    // test for a compression ratio of 0.682 which should be standardized
+    // to round(0.682) i.e 0.68
+    testCompressionRatioConfigure(0.682F);
+    // test for a compression ratio of 0.567 which should be standardized
+    // to round(0.567) i.e 0.57
+    testCompressionRatioConfigure(0.567F);
+    
+    // test with a compression ratio of 0.01 which is less than the min supported
+    // value of 0.07
+    boolean failed = false;
+    try {
+      testCompressionRatioConfigure(0.01F);
+    } catch (RuntimeException re) {
+      failed = true;
+    }
+    assertTrue("Compression ratio min value (0.07) check failed!", failed);
+    
+    // test with a compression ratio of 0.7 which is more than the max supported
+    // value of 0.68
+    failed = false;
+    try {
+      testCompressionRatioConfigure(0.7F);
+    } catch (RuntimeException re) {
+      failed = true;
+    }
+    assertTrue("Compression ratio max value (0.68) check failed!", failed);
+  }
+  
+  /**
+   * Test compression ratio standardization.
+   */
+  @Test
+  public void testCompressionRatioStandardization() throws Exception {
+    assertEquals(0.55F, 
+        CompressionEmulationUtil.standardizeCompressionRatio(0.55F), 0.0D);
+    assertEquals(0.65F, 
+        CompressionEmulationUtil.standardizeCompressionRatio(0.652F), 0.0D);
+    assertEquals(0.78F, 
+        CompressionEmulationUtil.standardizeCompressionRatio(0.777F), 0.0D);
+    assertEquals(0.86F, 
+        CompressionEmulationUtil.standardizeCompressionRatio(0.855F), 0.0D);
+  }
+  
+  /**
+   * Test map input compression ratio configuration utilities.
+   */
+  @Test
+  public void testInputCompressionRatioConfiguration() throws Exception {
+    Configuration conf = new Configuration();
+    float ratio = 0.567F;
+    CompressionEmulationUtil.setMapInputCompressionEmulationRatio(conf, ratio);
+    assertEquals(ratio, 
+        CompressionEmulationUtil.getMapInputCompressionEmulationRatio(conf), 
+        0.0D);
+  }
+  
+  /**
+   * Test map output compression ratio configuration utilities.
+   */
+  @Test
+  public void testIntermediateCompressionRatioConfiguration() 
+  throws Exception {
+    Configuration conf = new Configuration();
+    float ratio = 0.567F;
+    CompressionEmulationUtil.setMapOutputCompressionEmulationRatio(conf, ratio);
+    assertEquals(ratio, 
+        CompressionEmulationUtil.getMapOutputCompressionEmulationRatio(conf), 
+        0.0D);
+  }
+  
+  /**
+   * Test reduce output compression ratio configuration utilities.
+   */
+  @Test
+  public void testOutputCompressionRatioConfiguration() throws Exception {
+    Configuration conf = new Configuration();
+    float ratio = 0.567F;
+    CompressionEmulationUtil.setReduceOutputCompressionEmulationRatio(conf, 
+                                                                      ratio);
+    assertEquals(ratio, 
+        CompressionEmulationUtil.getReduceOutputCompressionEmulationRatio(conf),
+        0.0D);
+  }
+  
+  /**
+   * Test compressible {@link GridmixRecord}.
+   */
+  @Test
+  public void testCompressibleGridmixRecord() throws IOException {
+    JobConf conf = new JobConf();
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
+    
+    FileSystem lfs = FileSystem.getLocal(conf);
+    int dataSize = 1024 * 1024 * 10; // 10 MB
+    float ratio = 0.357F;
+    
+    // define the test's root temp directory
+    Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
+            lfs.getUri(), lfs.getWorkingDirectory());
+
+    Path tempDir = new Path(rootTempDir, 
+                            "TestPossiblyCompressibleGridmixRecord");
+    lfs.delete(tempDir, true);
+    
+    // define a compressible GridmixRecord
+    GridmixRecord record = new GridmixRecord(dataSize, 0);
+    record.setCompressibility(true, ratio); // enable compression
+    
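+    // enable gzip output compression so that the possibly-compressed output
+    // stream helper returns a compressed (.gz) stream for the record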
+    conf.setClass("mapred.output.compression.codec", GzipCodec.class, 
+                  CompressionCodec.class);
+    org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
+    
+    // write the record to a file
+    Path recordFile = new Path(tempDir, "record");
+    OutputStream outStream = CompressionEmulationUtil
+                               .getPossiblyCompressedOutputStream(recordFile, 
+                                                                  conf);    
+    DataOutputStream out = new DataOutputStream(outStream);
+    record.write(out);
+    out.close();
+    outStream.close();
+    
+    // open the compressed stream for reading
+    Path actualRecordFile = recordFile.suffix(".gz");
+    InputStream in = 
+      CompressionEmulationUtil
+        .getPossiblyDecompressedInputStream(actualRecordFile, conf, 0);
+    
+    // get the compressed file size
+    long compressedFileSize = lfs.listStatus(actualRecordFile)[0].getLen();
+    
+    GridmixRecord recordRead = new GridmixRecord();
+    recordRead.readFields(new DataInputStream(in));
+    
+    assertEquals("Record size mismatch in a compressible GridmixRecord",
+                 dataSize, recordRead.getSize());
+    assertTrue("Failed to generate a compressible GridmixRecord",
+               recordRead.getSize() > compressedFileSize);
+    
+    // check if the record can generate data with the desired compression ratio
+    float seenRatio = ((float)compressedFileSize)/dataSize;
+    assertEquals(CompressionEmulationUtil.standardizeCompressionRatio(ratio), 
+        CompressionEmulationUtil.standardizeCompressionRatio(seenRatio), 1.0D);
+  }
+  
+  /**
+   * Test 
+   * {@link CompressionEmulationUtil#isCompressionEmulationEnabled(
+   *          org.apache.hadoop.conf.Configuration)}.
+   */
+  @Test
+  public void testIsCompressionEmulationEnabled() {
+    Configuration conf = new Configuration();
+    // Check default values
+    assertTrue(CompressionEmulationUtil.isCompressionEmulationEnabled(conf));
+    
+    // Check disabled
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
+    assertFalse(CompressionEmulationUtil.isCompressionEmulationEnabled(conf));
+    
+    // Check enabled
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    assertTrue(CompressionEmulationUtil.isCompressionEmulationEnabled(conf));
+  }
+  
+  /**
+   * Test 
+   * {@link CompressionEmulationUtil#getPossiblyDecompressedInputStream(Path, 
+   *                                   Configuration, long)}
+   *  and
+   *  {@link CompressionEmulationUtil#getPossiblyCompressedOutputStream(Path, 
+   *                                    Configuration)}.
+   */
+  @Test
+  public void testPossiblyCompressedDecompressedStreams() throws IOException {
+    JobConf conf = new JobConf();
+    FileSystem lfs = FileSystem.getLocal(conf);
+    String inputLine = "Hi Hello!";
+
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
+    conf.setBoolean("mapred.output.compress", true);
+    conf.setClass("mapred.output.compression.codec", GzipCodec.class, 
+                  CompressionCodec.class);
+
+    // define the test's root temp directory
+    Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
+            lfs.getUri(), lfs.getWorkingDirectory());
+
+    Path tempDir =
+      new Path(rootTempDir, "TestPossiblyCompressedDecompressedStreams");
+    lfs.delete(tempDir, true);
+
+    // create a compressed file
+    Path compressedFile = new Path(tempDir, "test");
+    OutputStream out = 
+      CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, 
+                                                                 conf);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
+    writer.write(inputLine);
+    writer.close();
+    
+    // now read back the data from the compressed stream
+    compressedFile = compressedFile.suffix(".gz");
+    InputStream in = 
+      CompressionEmulationUtil
+        .getPossiblyDecompressedInputStream(compressedFile, conf, 0);
+    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
+    String readLine = reader.readLine();
+    assertEquals("Compression/Decompression error", inputLine, readLine);
+    reader.close();
+  }
+  
+  /**
+   * Test if 
+   * {@link CompressionEmulationUtil#configureCompressionEmulation(
+   *        org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.JobConf)}
+   *  can extract compression related configuration parameters.
+   */
+  @Test
+  public void testExtractCompressionConfigs() {
+    JobConf source = new JobConf();
+    JobConf target = new JobConf();
+    
+    // set the default values
+    source.setBoolean("mapred.output.compress", false);
+    source.set("mapred.output.compression.codec", "MyDefaultCodec");
+    source.set("mapred.output.compression.type", "MyDefaultType");
+    source.setBoolean("mapred.compress.map.output", false); 
+    source.set("mapred.map.output.compression.codec", "MyDefaultCodec2");
+    
+    CompressionEmulationUtil.configureCompressionEmulation(source, target);
+    
+    // check default values
+    assertFalse(target.getBoolean("mapred.output.compress", true));
+    assertEquals("MyDefaultCodec",
+                 target.get("mapred.output.compression.codec"));
+    assertEquals("MyDefaultType", target.get("mapred.output.compression.type"));
+    assertFalse(target.getBoolean("mapred.compress.map.output", true));
+    assertEquals("MyDefaultCodec2", 
+                 target.get("mapred.map.output.compression.codec"));
+    assertFalse(CompressionEmulationUtil
+                .isInputCompressionEmulationEnabled(target));
+    
+    // set new values
+    source.setBoolean("mapred.output.compress", true);
+    source.set("mapred.output.compression.codec", "MyCodec");
+    source.set("mapred.output.compression.type", "MyType");
+    source.setBoolean("mapred.compress.map.output", true);
+    source.set("mapred.map.output.compression.codec", "MyCodec2");
+    org.apache.hadoop.mapred.FileInputFormat.setInputPaths(source, "file.gz");
+    
+    target = new JobConf(); // reset
+    CompressionEmulationUtil.configureCompressionEmulation(source, target);
+    
+    // check new values
+    assertTrue(target.getBoolean("mapred.output.compress", false));
+    assertEquals("MyCodec",
+                 target.get("mapred.output.compression.codec"));
+    assertEquals("MyType", target.get("mapred.output.compression.type"));
+    assertTrue(target.getBoolean("mapred.compress.map.output", false));
+    assertEquals("MyCodec2", 
+                 target.get("mapred.map.output.compression.codec"));
+    assertTrue(CompressionEmulationUtil
+               .isInputCompressionEmulationEnabled(target));
+  }
+  
+  /**
+   * Test whether {@link FileQueue} can identify a compressed file and provide
+   * readers that extract uncompressed data only if input-compression is enabled.
+   */
+  @Test
+  public void testFileQueueDecompression() throws IOException {
+    JobConf conf = new JobConf();
+    FileSystem lfs = FileSystem.getLocal(conf);
+    String inputLine = "Hi Hello!";
+    
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    CompressionEmulationUtil.setInputCompressionEmulationEnabled(conf, true);
+    org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
+    org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, 
+                                                GzipCodec.class);
+
+    // define the test's root temp directory
+    Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
+            lfs.getUri(), lfs.getWorkingDirectory());
+
+    Path tempDir = new Path(rootTempDir, "TestFileQueueDecompression");
+    lfs.delete(tempDir, true);
+
+    // create a compressed file
+    Path compressedFile = new Path(tempDir, "test");
+    OutputStream out = 
+      CompressionEmulationUtil.getPossiblyCompressedOutputStream(compressedFile, 
+                                                                 conf);
+    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out));
+    writer.write(inputLine);
+    writer.close();
+    
+    compressedFile = compressedFile.suffix(".gz");
+    // now read back the data from the compressed stream using FileQueue
+    long fileSize = lfs.listStatus(compressedFile)[0].getLen();
+    CombineFileSplit split = 
+      new CombineFileSplit(new Path[] {compressedFile}, new long[] {fileSize});
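+    // FileQueue should transparently decompress the gzip data since
+    // input-compression emulation is enabled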
+    FileQueue queue = new FileQueue(split, conf);
+    byte[] bytes = new byte[inputLine.getBytes().length];
+    queue.read(bytes);
+    queue.close();
+    String readLine = new String(bytes);
+    assertEquals("Compression/Decompression error", inputLine, readLine);
+  }
+}

+ 498 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestDistCacheEmulation.java

@@ -0,0 +1,498 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import static org.junit.Assert.*;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.MapContext;
+import org.apache.hadoop.mapreduce.MapReduceTestUtil;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Validate emulation of distributed cache load in gridmix simulated jobs.
+ *
+ */
+public class TestDistCacheEmulation {
+
+  private DistributedCacheEmulator dce = null;
+
+  @BeforeClass
+  public static void init() throws IOException {
+    GridmixTestUtils.initCluster();
+  }
+
+  @AfterClass
+  public static void shutDown() throws IOException {
+    GridmixTestUtils.shutdownCluster();
+  }
+
+  /**
+   * Validate the dist cache files generated by GenerateDistCacheData job.
+   * @param jobConf configuration of GenerateDistCacheData job.
+   * @param sortedFileSizes array of sorted distributed cache file sizes 
+   * @throws IOException 
+   * @throws FileNotFoundException 
+   */
+  private void validateDistCacheData(JobConf jobConf, long[] sortedFileSizes)
+      throws FileNotFoundException, IOException {
+    Path distCachePath = dce.getDistributedCacheDir();
+    String filesListFile =
+        jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST);
+    FileSystem fs = FileSystem.get(jobConf);
+
+    // Validate the existence of Distributed Cache files list file directly
+    // under distributed cache directory
+    Path listFile = new Path(filesListFile);
+    assertTrue("Path of Distributed Cache files list file is wrong.",
+        distCachePath.equals(listFile.getParent().makeQualified(fs)));
+
+    // Delete the dist cache files list file
+    assertTrue("Failed to delete distributed Cache files list file " + listFile,
+               fs.delete(listFile));
+
+    List<Long> fileSizes = new ArrayList<Long>();
+    for (long size : sortedFileSizes) {
+      fileSizes.add(size);
+    }
+    // validate dist cache files after deleting the 'files list file'
+    validateDistCacheFiles(fileSizes, distCachePath);
+  }
+
+  /**
+   * Validate private/public distributed cache files.
+   * @param filesSizesExpected list of sizes of expected dist cache files
+   * @param distCacheDir the distributed cache dir to be validated
+   * @throws IOException 
+   * @throws FileNotFoundException 
+   */
+  private void validateDistCacheFiles(List filesSizesExpected,
+      Path distCacheDir) throws FileNotFoundException, IOException {
+    //RemoteIterator<LocatedFileStatus> iter =
+    FileStatus[] statuses = GridmixTestUtils.dfs.listStatus(distCacheDir);
+    int numFiles = filesSizesExpected.size();
+    assertEquals("Number of files under distributed cache dir is wrong.",
+                 numFiles, statuses.length);
+    for (int i = 0; i < numFiles; i++) {
+      FileStatus stat = statuses[i];
+      assertTrue("File size of distributed cache file "
+          + stat.getPath().toUri().getPath() + " is wrong.",
+          filesSizesExpected.remove(stat.getLen()));
+
+      FsPermission perm = stat.getPermission();
+      assertEquals("Wrong permissions for distributed cache file "
+          + stat.getPath().toUri().getPath(),
+          new FsPermission((short)0644), perm);
+    }
+  }
+
+  /**
+   * Configures 5 HDFS-based dist cache files and 1 local-FS-based dist cache
+   * file in the given Configuration object <code>conf</code>.
+   * @param conf configuration where dist cache config properties are to be set
+   * @return array of sorted HDFS-based distributed cache file sizes
+   * @throws IOException
+   */
+  private long[] configureDummyDistCacheFiles(Configuration conf)
+      throws IOException {
+    String user = UserGroupInformation.getCurrentUser().getShortUserName();
+    conf.set("user.name", user);
+    // Set some dummy dist cache files in gridmix configuration so that they go
+    // into the configuration of JobStory objects.
+    String[] distCacheFiles = {"hdfs:///tmp/file1.txt",
+                               "/tmp/" + user + "/.staging/job_1/file2.txt",
+                               "hdfs:///user/user1/file3.txt",
+                               "/home/user2/file4.txt",
+                               "subdir1/file5.txt",
+                               "subdir2/file6.gz"};
+    String[] fileSizes = {"400", "2500", "700", "1200", "1500", "500"};
+
+    String[] visibilities = {"true", "false", "false", "true", "true", "false"};
+    String[] timeStamps = {"1234", "2345", "34567", "5434", "125", "134"};
+
+    conf.setStrings(DistributedCache.CACHE_FILES, distCacheFiles);
+    conf.setStrings(DistributedCache.CACHE_FILES_SIZES, fileSizes);
+    conf.setStrings(JobContext.CACHE_FILE_VISIBILITIES, visibilities);
+    conf.setStrings(DistributedCache.CACHE_FILES_TIMESTAMPS, timeStamps);
+
+    // local FS based dist cache file whose path contains <user>/.staging is
+    // not created on HDFS. So file size 2500 is not added to sortedFileSizes.
+    long[] sortedFileSizes = new long[] {1500, 1200, 700, 500, 400};
+    return sortedFileSizes;
+  }
+
+  /**
+   * Runs setupGenerateDistCacheData() on a new DistributedCacheEmulator and
+   * returns the jobConf. Fills the array <code>sortedFileSizes</code> that
+   * can be used for validation, and validates the exit code returned by
+   * setupGenerateDistCacheData().
+   * @param generate true if -generate option is specified
+   * @param sortedFileSizes sorted HDFS-based distributed cache file sizes
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  private JobConf runSetupGenerateDistCacheData(boolean generate,
+      long[] sortedFileSizes) throws IOException, InterruptedException {
+    Configuration conf = new Configuration();
+    long[] fileSizes = configureDummyDistCacheFiles(conf);
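+    // copy the HDFS-based dist cache file sizes into the caller-supplied
+    // array so that they can be used later for validation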
+    System.arraycopy(fileSizes, 0, sortedFileSizes, 0, fileSizes.length);
+
+    // Job stories of all 3 jobs will have same dist cache files in their
+    // configurations
+    final int numJobs = 3;
+    DebugJobProducer jobProducer = new DebugJobProducer(numJobs, conf);
+
+    JobConf jobConf =
+        GridmixTestUtils.mrCluster.createJobConf(new JobConf(conf));
+    Path ioPath = new Path("testSetupGenerateDistCacheData")
+                    .makeQualified(GridmixTestUtils.dfs);
+    FileSystem fs = FileSystem.get(jobConf);
+    if (fs.exists(ioPath)) {
+      fs.delete(ioPath, true);
+    }
+    FileSystem.mkdirs(fs, ioPath, new FsPermission((short)0777));
+
+    dce = createDistributedCacheEmulator(jobConf, ioPath, generate);
+    int exitCode = dce.setupGenerateDistCacheData(jobProducer);
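+    // without the -generate option, missing dist cache files are expected to
+    // be reported through the MISSING_DIST_CACHE_FILES_ERROR exit code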
+    int expectedExitCode = generate ? 0 : dce.MISSING_DIST_CACHE_FILES_ERROR;
+    assertEquals("setupGenerateDistCacheData failed.",
+                 expectedExitCode, exitCode);
+
+    // reset back
+    resetDistCacheConfigProperties(jobConf);
+    return jobConf;
+  }
+
+  /**
+   * Reset the config properties related to Distributed Cache in the given
+   * job configuration <code>jobConf</code>.
+   * @param jobConf job configuration
+   */
+  private void resetDistCacheConfigProperties(JobConf jobConf) {
+    // reset current/latest property names
+    jobConf.setStrings(DistributedCache.CACHE_FILES, "");
+    jobConf.setStrings(DistributedCache.CACHE_FILES_SIZES, "");
+    jobConf.setStrings(DistributedCache.CACHE_FILES_TIMESTAMPS, "");
+    jobConf.setStrings(JobContext.CACHE_FILE_VISIBILITIES, "");
+    // reset old property names
+    jobConf.setStrings("mapred.cache.files", "");
+    jobConf.setStrings("mapred.cache.files.filesizes", "");
+    jobConf.setStrings("mapred.cache.files.visibilities", "");
+    jobConf.setStrings("mapred.cache.files.timestamps", "");
+  }
+
+  /**
+   * Validate that the GenerateDistCacheData job creates dist cache files
+   * properly.
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateDistCacheData() throws Exception {
+    long[] sortedFileSizes = new long[5];
+    JobConf jobConf =
+        runSetupGenerateDistCacheData(true, sortedFileSizes);
+    GridmixJob gridmixJob = new GenerateDistCacheData(jobConf);
+    Job job = gridmixJob.call();
+    assertEquals("Number of reduce tasks in GenerateDistCacheData is not 0.",
+        0, job.getNumReduceTasks());
+    assertTrue("GenerateDistCacheData job failed.",
+        job.waitForCompletion(false));
+    validateDistCacheData(jobConf, sortedFileSizes);
+  }
+
+  /**
+   *  Validate setupGenerateDistCacheData by validating
+   *  <li> permissions of the distributed cache directories and
+   *  <li> content of the generated sequence file. This includes validation of
+   *       dist cache file paths and their file sizes.
+   */
+  private void validateSetupGenDC(JobConf jobConf, long[] sortedFileSizes)
+      throws IOException, InterruptedException {
+    // build things needed for validation
+    long sumOfFileSizes = 0;
+    for (int i = 0; i < sortedFileSizes.length; i++) {
+      sumOfFileSizes += sortedFileSizes[i];
+    }
+
+    FileSystem fs = FileSystem.get(jobConf);
+    assertEquals("Number of distributed cache files to be generated is wrong.",
+        sortedFileSizes.length,
+        jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
+    assertEquals("Total size of dist cache files to be generated is wrong.",
+        sumOfFileSizes, jobConf.getLong(
+        GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
+    Path filesListFile = new Path(jobConf.get(
+        GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
+    FileStatus stat = fs.getFileStatus(filesListFile);
+    assertEquals("Wrong permissions of dist Cache files list file "
+        + filesListFile, new FsPermission((short)0644), stat.getPermission());
+
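+    // read back the generated 'files list' sequence file through the record
+    // reader of GenDCDataFormat, wrapped in a dummy map task context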
+    InputSplit split =
+        new FileSplit(filesListFile, 0, stat.getLen(), (String[])null);
+    TaskAttemptContext taskContext =
+        MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
+    RecordReader<LongWritable, BytesWritable> reader =
+      new GenerateDistCacheData.GenDCDataFormat().createRecordReader(
+      split, taskContext);
+    MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable>
+        mapContext = new MapContext<LongWritable, BytesWritable,
+        NullWritable, BytesWritable>(jobConf, taskContext.getTaskAttemptID(),
+        reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
+    reader.initialize(split, mapContext);
+
+    // start validating setupGenerateDistCacheData
+    doValidateSetupGenDC(reader, fs, sortedFileSizes);
+  }
+
+  /**
+   *  Validate setupGenerateDistCacheData by validating
+   *  <li> permissions of the distributed cache directory and
+   *  <li> content of the generated sequence file. This includes validation of
+   *       dist cache file paths and their file sizes.
+   */
+  private void doValidateSetupGenDC(RecordReader<LongWritable, BytesWritable>
+      reader, FileSystem fs, long[] sortedFileSizes)
+      throws IOException, InterruptedException {
+
+    // Validate permissions of dist cache directory
+    Path distCacheDir = dce.getDistributedCacheDir();
+    assertEquals("Wrong permissions for distributed cache dir " + distCacheDir,
+        fs.getFileStatus(distCacheDir).getPermission()
+        .getOtherAction().and(FsAction.EXECUTE), FsAction.EXECUTE);
+
+    // Validate the content of the sequence file generated by
+    // dce.setupGenerateDistCacheData().
+    LongWritable key = new LongWritable();
+    BytesWritable val = new BytesWritable();
+    for (int i = 0; i < sortedFileSizes.length; i++) {
+      assertTrue("Number of files written to the sequence file by "
+          + "setupGenerateDistCacheData is less than the expected.",
+          reader.nextKeyValue());
+      key = reader.getCurrentKey();
+      val = reader.getCurrentValue();
+      long fileSize = key.get();
+      String file = new String(val.getBytes(), 0, val.getLength());
+
+      // Dist Cache files should be sorted based on file size.
+      assertEquals("Dist cache file size is wrong.",
+          sortedFileSizes[i], fileSize);
+
+      // Validate dist cache file path.
+
+      // parent dir of dist cache file
+      Path parent = new Path(file).getParent().makeQualified(fs);
+      // should exist in dist cache dir
+      assertTrue("Public dist cache file path is wrong.",
+          distCacheDir.equals(parent));
+    }
+  }
+
+  /**
+   *  Test if DistributedCacheEmulator's setup of GenerateDistCacheData is
+   *  working as expected.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Test
+  public void testSetupGenerateDistCacheData()
+      throws IOException, InterruptedException {
+    long[] sortedFileSizes = new long[5];
+    JobConf jobConf = runSetupGenerateDistCacheData(true, sortedFileSizes);
+    validateSetupGenDC(jobConf, sortedFileSizes);
+
+    // Verify if correct exit code is seen when -generate option is missing and
+    // distributed cache files are missing in the expected path.
+    runSetupGenerateDistCacheData(false, sortedFileSizes);
+  }
+
+  /**
+   *  Create a DistributedCacheEmulator object and initialize it by calling
+   *  init() on it with a dummy trace. Also configures the pseudo-local FS.
+   */
+  private DistributedCacheEmulator createDistributedCacheEmulator(
+      Configuration conf, Path ioPath, boolean generate) throws IOException {
+    DistributedCacheEmulator dce =
+        new DistributedCacheEmulator(conf, ioPath);
+    JobCreator jobCreator = JobCreator.getPolicy(conf, JobCreator.LOADJOB);
+    jobCreator.setDistCacheEmulator(dce);
+    dce.init("dummytrace", jobCreator, generate);
+    return dce;
+  }
+
+  /**
+   *  Test the configuration property for disabling/enabling emulation of
+   *  distributed cache load.
+   */
+  @Test
+  public void testDistCacheEmulationConfigurability() throws IOException {
+    Configuration conf = new Configuration();
+    JobConf jobConf = GridmixTestUtils.mrCluster.createJobConf(
+        new JobConf(conf));
+    Path ioPath = new Path("testDistCacheEmulationConfigurability")
+        .makeQualified(GridmixTestUtils.dfs);
+    FileSystem fs = FileSystem.get(jobConf);
+    FileSystem.mkdirs(fs, ioPath, new FsPermission((short)0777));
+
+    // default config
+    dce = createDistributedCacheEmulator(jobConf, ioPath, false);
+    assertTrue("Default configuration of "
+        + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+        + " is wrong.", dce.shouldEmulateDistCacheLoad());
+
+    // config property set to false
+    jobConf.setBoolean(
+        DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE, false);
+    dce = createDistributedCacheEmulator(jobConf, ioPath, false);
+    assertFalse("Disabling of emulation of distributed cache load by setting "
+        + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+        + " to false is not working.", dce.shouldEmulateDistCacheLoad());
+  }
+
+  /**
+   * Verify whether DistributedCacheEmulator can configure distributed cache
+   * files for a simulated job when the job conf from the trace has no dist
+   * cache files.
+   * @param conf configuration for the simulated job to be run
+   * @param jobConf job configuration of original cluster's job, obtained from
+   *                trace
+   * @throws IOException
+   */
+  private void validateJobConfWithOutDCFiles(Configuration conf,
+      JobConf jobConf) throws IOException {
+    // Validate that Gridmix configures dist cache files properly when the
+    // trace for a job has neither HDFS-based nor local-FS-based dist cache
+    // files.
+    dce.configureDistCacheFiles(conf, jobConf);
+    assertNull("Distributed cache files configured by GridMix is wrong.",
+               conf.get(DistributedCache.CACHE_FILES));
+    assertNull("Distributed cache files configured by Gridmix through -files "
+               + "option is wrong.", conf.get("tmpfiles"));
+  }
+
+  /**
+   * Verify whether DistributedCacheEmulator can configure distributed cache
+   * files for a simulated job when the job conf from the trace has both
+   * HDFS-based and local-FS-based dist cache files.
+   * @param conf configuration for the simulated job to be run
+   * @param jobConf job configuration of original cluster's job, obtained from
+   *                trace
+   * @throws IOException
+   */
+  private void validateJobConfWithDCFiles(Configuration conf,
+      JobConf jobConf) throws IOException {
+    long[] sortedFileSizes = configureDummyDistCacheFiles(jobConf);
+
+    // 1 local FS based dist cache file and 5 HDFS based dist cache files. So
+    // total expected dist cache files count is 6.
+    assertEquals("Gridmix is not able to extract dist cache file sizes.",
+                 6, jobConf.getStrings(DistributedCache.CACHE_FILES_SIZES).length);
+    assertEquals("Gridmix is not able to extract dist cache file visibilities.",
+                 6, jobConf.getStrings(
+                      JobContext.CACHE_FILE_VISIBILITIES).length);
+
+    dce.configureDistCacheFiles(conf, jobConf);
+
+    assertEquals("Configuring of HDFS-based dist cache files by gridmix is "
+                 + "wrong.", sortedFileSizes.length,
+                 conf.getStrings(DistributedCache.CACHE_FILES).length);
+    assertEquals("Configuring of local-FS-based dist cache files by gridmix is "
+                 + "wrong.", 1, conf.getStrings("tmpfiles").length);
+  }
+
+  /**
+   * Verify that configureDistCacheFiles() works when distributed cache files
+   * are set but their visibilities are not. This handles history traces from
+   * older Hadoop versions, which had no notion of private/public Distributed
+   * Caches.
+   * @throws IOException
+   */
+  private void validateWithOutVisibilities() throws IOException {
+    Configuration conf = new Configuration();// configuration for simulated job
+    JobConf jobConf = new JobConf();
+    String user = "user1";
+    jobConf.setUser(user);
+    String[] files = {"/tmp/hdfs1.txt", "/tmp/"+ user + "/.staging/file1"};
+    jobConf.setStrings(DistributedCache.CACHE_FILES, files);
+    jobConf.setStrings(DistributedCache.CACHE_FILES_SIZES, "12,200");
+    jobConf.setStrings(DistributedCache.CACHE_FILES_TIMESTAMPS, "56789,98345");
+    dce.configureDistCacheFiles(conf, jobConf);
+    assertEquals("Configuring of HDFS-based dist cache files by gridmix is "
+                 + "wrong.", files.length,
+                 conf.getStrings(DistributedCache.CACHE_FILES).length);
+    assertNull("Configuring of local-FS-based dist cache files by gridmix is "
+               + "wrong.", conf.get("tmpfiles"));
+  }
+
+  /**
+   * Test if Gridmix can configure config properties related to Distributed
+   * Cache properly.
+   * @throws IOException
+   */
+  @Test
+  public void testDistCacheFilesConfiguration() throws IOException {
+    Configuration conf = new Configuration();
+    JobConf jobConf = GridmixTestUtils.mrCluster.createJobConf(
+                        new JobConf(conf));
+    Path ioPath = new Path("testDistCacheEmulationConfigurability")
+                    .makeQualified(GridmixTestUtils.dfs);
+    FileSystem fs = FileSystem.get(jobConf);
+    FileSystem.mkdirs(fs, ioPath, new FsPermission((short)0777));
+
+    // default config
+    dce = createDistributedCacheEmulator(jobConf, ioPath, false);
+    assertTrue("Default configuration of "
+               + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+               + " is wrong.", dce.shouldEmulateDistCacheLoad());
+
+    // Validate that DistributedCacheEmulator properly handles a JobStory
+    // without Distributed Cache files.
+    validateJobConfWithOutDCFiles(conf, jobConf);
+
+    // Validate that Gridmix configures dist cache files properly when the
+    // trace for a job has both HDFS-based and local-FS-based dist cache files.
+    validateJobConfWithDCFiles(conf, jobConf);
+    
+    // Use new JobConf as JobStory conf and check if configureDistCacheFiles()
+    // doesn't throw NPE when there are dist cache files set but visibilities
+    // are not set.
+    validateWithOutVisibilities();
+  }
+}

+ 453 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixMemoryEmulation.java

@@ -0,0 +1,453 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.util.DummyResourceCalculatorPlugin;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.gridmix.DebugJobProducer.MockJob;
+import org.apache.hadoop.mapred.gridmix.TestHighRamJob.DummyGridmixJob;
+import org.apache.hadoop.mapred.gridmix.TestResourceUsageEmulators.FakeProgressive;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.TotalHeapUsageEmulatorPlugin;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.TotalHeapUsageEmulatorPlugin.DefaultHeapUsageEmulator;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
+
+/**
+ * Test Gridmix memory emulation.
+ */
+public class TestGridmixMemoryEmulation {
+  /**
+   * This is a dummy class that fakes heap usage.
+   */
+  private static class FakeHeapUsageEmulatorCore 
+  extends DefaultHeapUsageEmulator {
+    private int numCalls = 0;
+    
+    @Override
+    public void load(long sizeInMB) {
+      ++numCalls;
+      super.load(sizeInMB);
+    }
+    
+    // Get the total number of times load() was invoked
+    int getNumCalls() {
+      return numCalls;
+    }
+    
+    // Get the total number of 1mb objects stored within
+    long getHeapUsageInMB() {
+      return heapSpace.size();
+    }
+    
+    @Override
+    public void reset() {
+      // no op to stop emulate() from resetting
+    }
+    
+    /**
+     * For re-testing purposes.
+     */
+    void resetFake() {
+      numCalls = 0;
+      super.reset();
+    }
+  }
+
+  /**
+   * This is a dummy class that fakes the heap usage emulator plugin.
+   */
+  private static class FakeHeapUsageEmulatorPlugin 
+  extends TotalHeapUsageEmulatorPlugin {
+    private FakeHeapUsageEmulatorCore core;
+    
+    public FakeHeapUsageEmulatorPlugin(FakeHeapUsageEmulatorCore core) {
+      super(core);
+      this.core = core;
+    }
+    
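+    // report an effectively unbounded maximum heap so that the test is never
+    // limited by the JVM's actual heap size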
+    @Override
+    protected long getMaxHeapUsageInMB() {
+      return Long.MAX_VALUE / ONE_MB;
+    }
+    
+    @Override
+    protected long getTotalHeapUsageInMB() {
+      return core.getHeapUsageInMB();
+    }
+  }
+  
+  /**
+   * Test {@link TotalHeapUsageEmulatorPlugin}'s core heap usage emulation 
+   * engine.
+   */
+  @Test
+  public void testHeapUsageEmulator() throws IOException {
+    FakeHeapUsageEmulatorCore heapEmulator = new FakeHeapUsageEmulatorCore();
+    
+    long testSizeInMB = 10; // 10 mb
+    long previousHeap = heapEmulator.getHeapUsageInMB();
+    heapEmulator.load(testSizeInMB);
+    long currentHeap = heapEmulator.getHeapUsageInMB();
+    
+    // check if the heap has increased by expected value
+    assertEquals("Default heap emulator failed to load 10mb", 
+                 previousHeap + testSizeInMB, currentHeap);
+    
+    // test reset
+    heapEmulator.resetFake();
+    assertEquals("Default heap emulator failed to reset", 
+                 0, heapEmulator.getHeapUsageInMB());
+  }
+
+  /**
+   * Test {@link TotalHeapUsageEmulatorPlugin}.
+   */
+  @Test
+  public void testTotalHeapUsageEmulatorPlugin() throws Exception {
+    Configuration conf = new Configuration();
+    // set the dummy resource calculator for testing
+    ResourceCalculatorPlugin monitor = new DummyResourceCalculatorPlugin();
+    long maxHeapUsage = 1024 * TotalHeapUsageEmulatorPlugin.ONE_MB; // 1GB
+    conf.setLong(DummyResourceCalculatorPlugin.MAXPMEM_TESTING_PROPERTY, 
+                 maxHeapUsage);
+    monitor.setConf(conf);
+    
+    // no buffer to be reserved
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.MIN_HEAP_FREE_RATIO, 0F);
+    // only 1 call to be made per cycle
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_LOAD_RATIO, 1F);
+    long targetHeapUsageInMB = 200; // 200mb
+    
+    // fake progress indicator
+    FakeProgressive fakeProgress = new FakeProgressive();
+    
+    // fake heap usage generator
+    FakeHeapUsageEmulatorCore fakeCore = new FakeHeapUsageEmulatorCore();
+    
+    // a heap usage emulator with fake core
+    FakeHeapUsageEmulatorPlugin heapPlugin = 
+      new FakeHeapUsageEmulatorPlugin(fakeCore);
+    
+    // test with invalid or missing resource usage value
+    ResourceUsageMetrics invalidUsage = 
+      TestResourceUsageEmulators.createMetrics(0);
+    heapPlugin.initialize(conf, invalidUsage, null, null);
+    
+    // test that a disabled heap emulation plugin's emulate() call is a no-op;
+    // this verifies that the plugin disables itself for invalid usage values
+    int numCallsPre = fakeCore.getNumCalls();
+    long heapUsagePre = fakeCore.getHeapUsageInMB();
+    heapPlugin.emulate();
+    int numCallsPost = fakeCore.getNumCalls();
+    long heapUsagePost = fakeCore.getHeapUsageInMB();
+    
+    //  verify that no calls were made to the heap usage emulator core
+    assertEquals("Disabled heap usage emulation plugin works!", 
+                 numCallsPre, numCallsPost);
+    //  verify that no heap was loaded by the heap usage emulator core
+    assertEquals("Disabled heap usage emulation plugin works!", 
+                 heapUsagePre, heapUsagePost);
+    
+    // test with wrong/invalid configuration
+    Boolean failed = null;
+    invalidUsage = 
+      TestResourceUsageEmulators.createMetrics(maxHeapUsage 
+                                   + TotalHeapUsageEmulatorPlugin.ONE_MB);
+    try {
+      heapPlugin.initialize(conf, invalidUsage, monitor, null);
+      failed = false;
+    } catch (Exception e) {
+      failed = true;
+    }
+    assertNotNull("Fail case failure!", failed);
+    assertTrue("Expected failure!", failed); 
+    
+    // test with valid resource usage value
+    ResourceUsageMetrics metrics = 
+      TestResourceUsageEmulators.createMetrics(targetHeapUsageInMB 
+                                   * TotalHeapUsageEmulatorPlugin.ONE_MB);
+    
+    // test with default emulation interval
+    // in every interval, the emulator will add 100% of the expected usage 
+    // (since gridmix.emulators.resource-usage.heap.load-ratio=1)
+    // so at 10%, emulator will add 10% (difference), at 20% it will add 10% ...
+    // So to emulate 200MB, it will add
+    //   20mb + 20mb + 20mb + 20mb + .. = 200mb 
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, heapPlugin, 200, 
+                          10);
+    
+    // test with custom value for emulation interval of 20%
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_EMULATION_PROGRESS_INTERVAL,
+                  0.2F);
+    //  40mb + 40mb + 40mb + 40mb + 40mb = 200mb
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, heapPlugin, 200, 5);
+    
+    // test with custom value of free heap ratio and load ratio = 1
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_LOAD_RATIO, 1F);
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.MIN_HEAP_FREE_RATIO, 0.5F);
+    //  40mb + 0mb + 80mb + 0mb + 0mb = 120mb
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, heapPlugin, 120, 2);
+    
+    // test with custom value of heap load ratio and min free heap ratio = 0
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_LOAD_RATIO, 0.5F);
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.MIN_HEAP_FREE_RATIO, 0F);
+    // 20mb (call#1) + 20mb (call#1) + 20mb (call#2) + 20mb (call#2) +.. = 200mb
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, heapPlugin, 200, 
+                          10);
+    
+    // test with custom values of min free heap ratio = 0.25 and load ratio = 0.5
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.MIN_HEAP_FREE_RATIO, 0.25F);
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_LOAD_RATIO, 0.5F);
+    // 20mb (call#1) + 20mb (call#1) + 30mb (call#2) + 0mb (call#2) 
+    // + 30mb (call#3) + 0mb (call#3) + 35mb (call#4) + 0mb (call#4)
+    // + 37mb (call#5) + 0mb (call#5) = 162mb
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, heapPlugin, 162, 6);
+    
+    // test if emulation interval boundary is respected
+    fakeProgress = new FakeProgressive(); // initialize
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.MIN_HEAP_FREE_RATIO, 0F);
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_LOAD_RATIO, 1F);
+    conf.setFloat(TotalHeapUsageEmulatorPlugin.HEAP_EMULATION_PROGRESS_INTERVAL,
+                  0.25F);
+    heapPlugin.initialize(conf, metrics, monitor, fakeProgress);
+    fakeCore.resetFake();
+    // take a snapshot after the initialization
+    long initHeapUsage = fakeCore.getHeapUsageInMB();
+    long initNumCallsUsage = fakeCore.getNumCalls();
+    // test with 0 progress
+    testEmulationBoundary(0F, fakeCore, fakeProgress, heapPlugin, initHeapUsage, 
+                          initNumCallsUsage, "[no-op, 0 progress]");
+    // test with 24% progress
+    testEmulationBoundary(0.24F, fakeCore, fakeProgress, heapPlugin, 
+                          initHeapUsage, initNumCallsUsage, 
+                          "[no-op, 24% progress]");
+    // test with 25% progress
+    testEmulationBoundary(0.25F, fakeCore, fakeProgress, heapPlugin, 
+        targetHeapUsageInMB / 4, 1, "[op, 25% progress]");
+    // test with 80% progress
+    testEmulationBoundary(0.80F, fakeCore, fakeProgress, heapPlugin, 
+        (targetHeapUsageInMB * 4) / 5, 2, "[op, 80% progress]");
+    
+    // now test if the final call with 100% progress ramps up the heap usage
+    testEmulationBoundary(1F, fakeCore, fakeProgress, heapPlugin, 
+        targetHeapUsageInMB, 3, "[op, 100% progress]");
+  }
+
+  // test whether the heap usage emulator achieves the desired target using
+  // the expected number of calls to the underlying core engine.
+  private static void testEmulationAccuracy(Configuration conf, 
+                        FakeHeapUsageEmulatorCore fakeCore,
+                        ResourceCalculatorPlugin monitor,
+                        ResourceUsageMetrics metrics,
+                        TotalHeapUsageEmulatorPlugin heapPlugin,
+                        long expectedTotalHeapUsageInMB,
+                        long expectedTotalNumCalls)
+  throws Exception {
+    FakeProgressive fakeProgress = new FakeProgressive();
+    fakeCore.resetFake();
+    heapPlugin.initialize(conf, metrics, monitor, fakeProgress);
+    int numLoops = 0;
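+    // drive the fake progress from 0 to 100% in 1% steps, letting the plugin
+    // emulate at every step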
+    while (fakeProgress.getProgress() < 1) {
+      ++numLoops;
+      float progress = numLoops / 100.0F;
+      fakeProgress.setProgress(progress);
+      heapPlugin.emulate();
+    }
+    
+    // test if the resource plugin shows the expected usage
+    assertEquals("Cumulative heap usage emulator plugin failed (total usage)!", 
+                 expectedTotalHeapUsageInMB, fakeCore.getHeapUsageInMB(), 1L);
+    // test if the resource plugin shows the expected num calls
+    assertEquals("Cumulative heap usage emulator plugin failed (num calls)!", 
+                 expectedTotalNumCalls, fakeCore.getNumCalls(), 0L);
+  }
+
+  // tests if the heap usage emulation plugin emulates only at the expected
+  // progress gaps
+  private static void testEmulationBoundary(float progress, 
+      FakeHeapUsageEmulatorCore fakeCore, FakeProgressive fakeProgress, 
+      TotalHeapUsageEmulatorPlugin heapPlugin, long expectedTotalHeapUsageInMB, 
+      long expectedTotalNumCalls, String info) throws Exception {
+    fakeProgress.setProgress(progress);
+    heapPlugin.emulate();
+    // test heap usage
+    assertEquals("Emulation interval test for heap usage failed " + info + "!", 
+                 expectedTotalHeapUsageInMB, fakeCore.getHeapUsageInMB(), 0L);
+    // test num calls
+    assertEquals("Emulation interval test for heap usage failed " + info + "!", 
+                 expectedTotalNumCalls, fakeCore.getNumCalls(), 0L);
+  }
+  
+  /**
+   * Test the specified task java heap options.
+   */
+  @SuppressWarnings("deprecation")
+  private void testJavaHeapOptions(String mapOptions, 
+      String reduceOptions, String taskOptions, String defaultMapOptions, 
+      String defaultReduceOptions, String defaultTaskOptions, 
+      String expectedMapOptions, String expectedReduceOptions, 
+      String expectedTaskOptions) throws Exception {
+    Configuration simulatedConf = new Configuration(false);
+//    // reset the configuration parameters
+//    simulatedConf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, "");
+//    simulatedConf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, "");
+//    simulatedConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, "");
+    
+    // set the default map task options
+    if (defaultMapOptions != null) {
+      simulatedConf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, defaultMapOptions);
+    }
+    // set the default reduce task options
+    if (defaultReduceOptions != null) {
+      simulatedConf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS,
+                        defaultReduceOptions);
+    }
+    // set the default task options
+    if (defaultTaskOptions != null) {
+      simulatedConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, defaultTaskOptions);
+    }
+    
+    Configuration originalConf = new Configuration(false);
+//    // reset the configuration parameters
+//    originalConf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, "");
+//    originalConf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, "");
+//    originalConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, "");
+    
+    // set the map task options
+    if (mapOptions != null) {
+      originalConf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, mapOptions);
+    }
+    // set the reduce task options
+    if (reduceOptions != null) {
+      originalConf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, reduceOptions);
+    }
+    // set the task options
+    if (taskOptions != null) {
+      originalConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, taskOptions);
+    }
+    
+    // configure the task jvm's heap options
+    GridmixJob.configureTaskJVMOptions(originalConf, simulatedConf);
+    
+    assertEquals("Map heap options mismatch!", expectedMapOptions, 
+                 simulatedConf.get(JobConf.MAPRED_MAP_TASK_JAVA_OPTS));
+    assertEquals("Reduce heap options mismatch!", expectedReduceOptions, 
+                 simulatedConf.get(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS));
+    assertEquals("Task heap options mismatch!", expectedTaskOptions, 
+                 simulatedConf.get(JobConf.MAPRED_TASK_JAVA_OPTS));
+  }
+  
+  /**
+   * Test task-level java heap options configuration in {@link GridmixJob}.
+   */
+  @Test
+  public void testJavaHeapOptions() throws Exception {
+    // test missing opts
+    testJavaHeapOptions(null, null, null, null, null, null, null, null, 
+                        null);
+    
+    // test original heap opts and missing default opts
+    testJavaHeapOptions("-Xms10m", "-Xms20m", "-Xms30m", null, null, null,
+                        null, null, null);
+    
+    // test missing opts with default opts
+    testJavaHeapOptions(null, null, null, "-Xms10m", "-Xms20m", "-Xms30m",
+                        "-Xms10m", "-Xms20m", "-Xms30m");
+    
+    // test empty option
+    testJavaHeapOptions("", "", "", null, null, null, null, null, null);
+    
+    // test empty default option and no original heap options
+    testJavaHeapOptions(null, null, null, "", "", "", "", "", "");
+    
+    // test empty opts and default opts
+    testJavaHeapOptions("", "", "", "-Xmx10m -Xms1m", "-Xmx50m -Xms2m", 
+                        "-Xms2m -Xmx100m", "-Xmx10m -Xms1m", "-Xmx50m -Xms2m", 
+                        "-Xms2m -Xmx100m");
+    
+    // test custom heap opts with no default opts
+    testJavaHeapOptions("-Xmx10m", "-Xmx20m", "-Xmx30m", null, null, null,
+                        "-Xmx10m", "-Xmx20m", "-Xmx30m");
+    
+    // test heap opts with default opts (multiple value)
+    testJavaHeapOptions("-Xms5m -Xmx200m", "-Xms15m -Xmx300m", 
+                        "-Xms25m -Xmx50m", "-XXabc", "-XXxyz", "-XXdef", 
+                        "-XXabc -Xmx200m", "-XXxyz -Xmx300m", "-XXdef -Xmx50m");
+    
+    // test heap opts with default opts (duplication of -Xmx)
+    testJavaHeapOptions("-Xms5m -Xmx200m", "-Xms15m -Xmx300m", 
+                        "-Xms25m -Xmx50m", "-XXabc -Xmx500m", "-XXxyz -Xmx600m",
+                        "-XXdef -Xmx700m", "-XXabc -Xmx200m", "-XXxyz -Xmx300m",
+                        "-XXdef -Xmx50m");
+    
+    // test heap opts with default opts (single value)
+    testJavaHeapOptions("-Xmx10m", "-Xmx20m", "-Xmx50m", "-Xms2m", 
+                        "-Xms3m", "-Xms5m", "-Xms2m -Xmx10m", "-Xms3m -Xmx20m",
+                        "-Xms5m -Xmx50m");
+    
+    // test heap opts with default opts (duplication of -Xmx)
+    testJavaHeapOptions("-Xmx10m", "-Xmx20m", "-Xmx50m", "-Xmx2m", 
+                        "-Xmx3m", "-Xmx5m", "-Xmx10m", "-Xmx20m", "-Xmx50m");
+  }
+  
+  /**
+   * Test disabled task heap options configuration in {@link GridmixJob}.
+   */
+  @Test
+  @SuppressWarnings("deprecation")
+  public void testJavaHeapOptionsDisabled() throws Exception {
+    Configuration gridmixConf = new Configuration();
+    gridmixConf.setBoolean(GridmixJob.GRIDMIX_TASK_JVM_OPTIONS_ENABLE, false);
+    
+    // set the default values of simulated job
+    gridmixConf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, "-Xmx1m");
+    gridmixConf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, "-Xmx2m");
+    gridmixConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, "-Xmx3m");
+    
+    // set the default map and reduce task options for original job
+    final JobConf originalConf = new JobConf();
+    originalConf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, "-Xmx10m");
+    originalConf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, "-Xmx20m");
+    originalConf.set(JobConf.MAPRED_TASK_JAVA_OPTS, "-Xmx30m");
+    
+    // define a mock job
+    MockJob story = new MockJob(originalConf) {
+      public JobConf getJobConf() {
+        return originalConf;
+      }
+    };
+    
+    GridmixJob job = new DummyGridmixJob(gridmixConf, story);
+    Job simulatedJob = job.getJob();
+    Configuration simulatedConf = simulatedJob.getConfiguration();
+    
+    assertEquals("Map heap options works when disabled!", "-Xmx1m", 
+                 simulatedConf.get(JobConf.MAPRED_MAP_TASK_JAVA_OPTS));
+    assertEquals("Reduce heap options works when disabled!", "-Xmx2m", 
+                 simulatedConf.get(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS));
+    assertEquals("Task heap options works when disabled!", "-Xmx3m", 
+                 simulatedConf.get(JobConf.MAPRED_TASK_JAVA_OPTS));
+  }
+}

+ 3 - 2
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixRecord.java

@@ -176,7 +176,8 @@ public class TestGridmixRecord {
       a.setReduceOutputBytes(out_bytes);
       final int min = WritableUtils.getVIntSize(in_rec)
                     + WritableUtils.getVIntSize(out_rec)
-                    + WritableUtils.getVIntSize(out_bytes);
+                    + WritableUtils.getVIntSize(out_bytes)
+                    + WritableUtils.getVIntSize(0);
       assertEquals(min + 2, a.fixedBytes()); // meta + vint min
       final int size = r.nextInt(1024) + a.fixedBytes() + 1;
       setSerialize(a, r.nextLong(), size, out);
@@ -207,7 +208,7 @@ public class TestGridmixRecord {
 
   @Test
   public void testKeySpec() throws Exception {
-    final int min = 5;
+    final int min = 6;
     final int max = 300;
     final GridmixKey a = new GridmixKey(GridmixKey.REDUCE_SPEC, 1, 0L);
     final GridmixKey b = new GridmixKey(GridmixKey.REDUCE_SPEC, 1, 0L);

+ 195 - 54
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java

@@ -23,6 +23,8 @@ import org.apache.commons.logging.impl.Log4JLogger;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.mapred.Counters;
@@ -34,6 +36,7 @@ import org.apache.hadoop.mapred.TaskReport;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.TaskType;
 import org.apache.hadoop.tools.rumen.JobStory;
+import org.apache.hadoop.tools.rumen.JobStoryProducer;
 import org.apache.hadoop.tools.rumen.TaskInfo;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.log4j.Level;
@@ -41,13 +44,16 @@ import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import java.io.InputStream;
 import java.io.IOException;
+import java.text.DecimalFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.zip.GZIPInputStream;
 
 import static org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS;
 import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_BYTES;
@@ -103,17 +109,10 @@ public class TestGridmixSubmission {
         GridmixTestUtils.mrCluster.createJobConf());
       for (Job job : succeeded) {
         final String jobname = job.getJobName();
-        if ("GRIDMIX_GENDATA".equals(jobname)) {
-          if (!job.getConfiguration().getBoolean(
-            GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
-            assertEquals(
-              " Improper queue for " + job.getJobName(),
-              job.getConfiguration().get("mapred.job.queue.name"), "q1");
-          } else {
-            assertEquals(
-              " Improper queue for " + job.getJobName(),
-              job.getConfiguration().get("mapred.job.queue.name"), "default");
-          }
+        final String jobName = job.getJobName();
+        Configuration conf = job.getConfiguration();
+        if (GenerateData.JOB_NAME.equals(jobName)) {
+          verifyQueue(conf, jobName);
           final Path in = new Path("foo").makeQualified(GridmixTestUtils.dfs);
           final Path out = new Path("/gridmix").makeQualified(GridmixTestUtils.dfs);
           final ContentSummary generated = GridmixTestUtils.dfs.getContentSummary(in);
@@ -123,37 +122,55 @@ public class TestGridmixSubmission {
           FileStatus[] outstat = GridmixTestUtils.dfs.listStatus(out);
           assertEquals("Mismatched job count", NJOBS, outstat.length);
           continue;
+        } else if (GenerateDistCacheData.JOB_NAME.equals(jobName)) {
+          verifyQueue(conf, jobName);
+          continue;
         }
-        
-        if (!job.getConfiguration().getBoolean(
-          GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
-          assertEquals(" Improper queue for  " + job.getJobName() + " " ,
-          job.getConfiguration().get("mapred.job.queue.name"),"q1" );
+
+        if (!conf.getBoolean(
+            GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
+          assertEquals(" Improper queue for  " + jobName + " " ,
+              conf.get("mapred.queue.name"), "q1" );
         } else {
-          assertEquals(
-            " Improper queue for  " + job.getJobName() + " ",
-            job.getConfiguration().get("mapred.job.queue.name"), sub.get(
-              job.getConfiguration().get(GridmixJob.ORIGNAME)).getQueueName());
+          assertEquals(" Improper queue for  " + jobName + " ",
+              conf.get("mapred.queue.name"),
+              sub.get(conf.get(Gridmix.ORIGINAL_JOB_ID)).getQueueName());
         }
 
-        final JobStory spec =
-          sub.get(job.getConfiguration().get(GridmixJob.ORIGNAME));
-        assertNotNull("No spec for " + job.getJobName(), spec);
-        assertNotNull("No counters for " + job.getJobName(), job.getCounters());
-        final String specname = spec.getName();
-        final FileStatus stat = GridmixTestUtils.dfs.getFileStatus(new Path(
-          GridmixTestUtils.DEST, "" +
-              Integer.valueOf(specname.substring(specname.length() - 5))));
-        assertEquals("Wrong owner for " + job.getJobName(), spec.getUser(),
-            stat.getOwner());
-
+        final String originalJobId = conf.get(Gridmix.ORIGINAL_JOB_ID);
+        final JobStory spec = sub.get(originalJobId);
+        assertNotNull("No spec for " + jobName, spec);
+        assertNotNull("No counters for " + jobName, job.getCounters());
+        final String originalJobName = spec.getName();
+        System.out.println("originalJobName=" + originalJobName
+            + ";GridmixJobName=" + jobName + ";originalJobID=" + originalJobId);
+        assertTrue("Original job name is wrong.", originalJobName.equals(
+            conf.get(Gridmix.ORIGINAL_JOB_NAME)));
+
+        // Gridmix job seqNum contains 6 digits
+        int seqNumLength = 6;
+        String jobSeqNum = new DecimalFormat("000000").format(
+            conf.getInt(GridmixJob.GRIDMIX_JOB_SEQ, -1));
+        // Original job name is of the format MOCKJOB<6 digit sequence number>
+        // because MockJob jobNames are of this format.
+        assertTrue(originalJobName.substring(
+            originalJobName.length() - seqNumLength).equals(jobSeqNum));
+
+        assertTrue("Gridmix job name is not in the expected format.",
+            jobName.equals(
+                GridmixJob.JOB_NAME_PREFIX + jobSeqNum));
+        final FileStatus stat =
+          GridmixTestUtils.dfs.getFileStatus(
+            new Path(GridmixTestUtils.DEST, "" + Integer.valueOf(jobSeqNum)));
+        assertEquals("Wrong owner for " + jobName, spec.getUser(),
+                     stat.getOwner());
         final int nMaps = spec.getNumberMaps();
         final int nReds = spec.getNumberReduces();
 
         // TODO Blocked by MAPREDUCE-118
         if (true) return;
         // TODO
-        System.out.println(jobname + ": " + nMaps + "/" + nReds);
+        System.out.println(jobName + ": " + nMaps + "/" + nReds);
         final TaskReport[] mReports =
           client.getMapTaskReports(JobID.downgrade(job.getJobID()));
         assertEquals("Mismatched map count", nMaps, mReports.length);
@@ -168,6 +185,18 @@ public class TestGridmixSubmission {
       }
     }
 
+    // Verify if correct job queue is used
+    private void verifyQueue(Configuration conf, String jobName) {
+      if (!conf.getBoolean(
+          GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
+        assertEquals(" Improper queue for " + jobName,
+            conf.get("mapred.job.queue.name"), "q1");
+      } else {
+        assertEquals(" Improper queue for " + jobName,
+            conf.get("mapred.job.queue.name"), "default");
+      }
+    }
+
     public void check(final TaskType type, Job job, JobStory spec,
           final TaskReport[] runTasks,
           long extraInputBytes, int extraInputRecords,
@@ -325,19 +354,118 @@ public class TestGridmixSubmission {
     }
   }
 
+  /**
+   * Verifies that the given {@code JobStory} corresponds to the checked-in
+   * WordCount {@code JobStory}. The verification is effected via JUnit
+   * assertions.
+   *
+   * @param js the candidate JobStory.
+   */
+  private void verifyWordCountJobStory(JobStory js) {
+    assertNotNull("Null JobStory", js);
+    String expectedJobStory = "WordCount:johndoe:default:1285322645148:3:1";
+    String actualJobStory = js.getName() + ":" + js.getUser() + ":"
+      + js.getQueueName() + ":" + js.getSubmissionTime() + ":"
+      + js.getNumberMaps() + ":" + js.getNumberReduces();
+    assertEquals("Unexpected JobStory", expectedJobStory, actualJobStory);
+  }
+
+  /**
+   * Expands a file compressed using {@code gzip}.
+   *
+   * @param fs the {@code FileSystem} corresponding to the given
+   * file.
+   *
+   * @param in the path to the compressed file.
+   *
+   * @param out the path to the uncompressed output.
+   *
+   * @throws Exception if there was an error during the operation.
+   */
+  private void expandGzippedTrace(FileSystem fs, Path in, Path out)
+    throws Exception {
+    byte[] buff = new byte[4096];
+    GZIPInputStream gis = new GZIPInputStream(fs.open(in));
+    FSDataOutputStream fsdos = fs.create(out);
+    int numRead;
+    while ((numRead = gis.read(buff, 0, buff.length)) != -1) {
+      fsdos.write(buff, 0, numRead);
+    }
+    gis.close();
+    fsdos.close();
+  }
+
+  /**
+   * Tests the reading of traces in GridMix3. These traces are generated
+   * by Rumen and are in the JSON format. The traces can optionally be
+   * gzip-compressed, and an uncompressed trace can also be passed to
+   * GridMix3 via its standard input stream. The testing is effected via
+   * JUnit assertions.
+   *
+   * @throws Exception if there was an error.
+   */
+  @Test
+  public void testTraceReader() throws Exception {
+    Configuration conf = new Configuration();
+    FileSystem lfs = FileSystem.getLocal(conf);
+    Path rootInputDir = new Path(System.getProperty("src.test.data"));
+    rootInputDir
+      = rootInputDir.makeQualified(lfs.getUri(), lfs.getWorkingDirectory());
+    Path rootTempDir
+      = new Path(System.getProperty("test.build.data",
+        System.getProperty("java.io.tmpdir")), "testTraceReader");
+    rootTempDir
+      = rootTempDir.makeQualified(lfs.getUri(), lfs.getWorkingDirectory());
+    Path inputFile = new Path(rootInputDir, "wordcount.json.gz");
+    Path tempFile = new Path(rootTempDir, "gridmix3-wc.json");
+
+    InputStream origStdIn = System.in;
+    InputStream tmpIs = null;
+    try {
+      DebugGridmix dgm = new DebugGridmix();
+      JobStoryProducer jsp
+        = dgm.createJobStoryProducer(inputFile.toString(), conf);
+
+      System.out.println("Verifying JobStory from compressed trace...");
+      verifyWordCountJobStory(jsp.getNextJob());
+
+      expandGzippedTrace(lfs, inputFile, tempFile);
+      jsp = dgm.createJobStoryProducer(tempFile.toString(), conf);
+      System.out.println("Verifying JobStory from uncompressed trace...");
+      verifyWordCountJobStory(jsp.getNextJob());
+
+      tmpIs = lfs.open(tempFile);
+      System.setIn(tmpIs);
+      System.out.println("Verifying JobStory from trace in standard input...");
+      jsp = dgm.createJobStoryProducer("-", conf);
+      verifyWordCountJobStory(jsp.getNextJob());
+    } finally {
+      System.setIn(origStdIn);
+      if (tmpIs != null) {
+        tmpIs.close();
+      }
+      lfs.delete(rootTempDir, true);
+    }
+  }
+
   @Test
   public void testReplaySubmit() throws Exception {
     policy = GridmixJobSubmissionPolicy.REPLAY;
     System.out.println(" Replay started at " + System.currentTimeMillis());
-    doSubmission(false);
+    doSubmission(false, false);
     System.out.println(" Replay ended at " + System.currentTimeMillis());
+
+    System.out.println(" Replay started with default output path at time "
+        + System.currentTimeMillis());
+    doSubmission(false, true);
+    System.out.println(" Replay ended with default output path at time "
+        + System.currentTimeMillis());
   }
   
   @Test
   public void testStressSubmit() throws Exception {
     policy = GridmixJobSubmissionPolicy.STRESS;
     System.out.println(" Stress started at " + System.currentTimeMillis());
-    doSubmission(false);
+    doSubmission(false, false);
     System.out.println(" Stress ended at " + System.currentTimeMillis());
   }
 
@@ -346,7 +474,7 @@ public class TestGridmixSubmission {
     policy = GridmixJobSubmissionPolicy.STRESS;
     System.out.println(
       " Stress with default q started at " + System.currentTimeMillis());
-    doSubmission(true);
+    doSubmission(true, false);
     System.out.println(
       " Stress with default q ended at " + System.currentTimeMillis());
   }
@@ -355,26 +483,39 @@ public class TestGridmixSubmission {
   public void testSerialSubmit() throws Exception {
     policy = GridmixJobSubmissionPolicy.SERIAL;
     System.out.println("Serial started at " + System.currentTimeMillis());
-    doSubmission(false);
+    doSubmission(false, false);
     System.out.println("Serial ended at " + System.currentTimeMillis());
   }
 
-  private void doSubmission(boolean useDefaultQueue) throws Exception {
+  private void doSubmission(boolean useDefaultQueue,
+      boolean defaultOutputPath) throws Exception {
     final Path in = new Path("foo").makeQualified(GridmixTestUtils.dfs);
     final Path out = GridmixTestUtils.DEST.makeQualified(GridmixTestUtils.dfs);
     final Path root = new Path("/user");
     Configuration conf = null;
+
     try{
-    final String[] argv = {
-      "-D" + FilePool.GRIDMIX_MIN_FILE + "=0",
-      "-D" + Gridmix.GRIDMIX_OUT_DIR + "=" + out,
-      "-D" + Gridmix.GRIDMIX_USR_RSV + "=" + EchoUserResolver.class.getName(),
-      "-generate", String.valueOf(GENDATA) + "m",
-      in.toString(),
-      "-" // ignored by DebugGridmix
-    };
-    DebugGridmix client = new DebugGridmix();
-    conf = new Configuration();
+      ArrayList<String> argsList = new ArrayList<String>();
+
+      argsList.add("-D" + FilePool.GRIDMIX_MIN_FILE + "=0");
+      argsList.add("-D" + Gridmix.GRIDMIX_USR_RSV + "="
+          + EchoUserResolver.class.getName());
+
+      // Set the config property gridmix.output.directory only if
+      // defaultOutputPath is false. If defaultOutputPath is true, gridmix
+      // falls back to the default path foo/gridmix/ as the output dir.
+      if (!defaultOutputPath) {
+        argsList.add("-D" + Gridmix.GRIDMIX_OUT_DIR + "=" + out);
+      }
+      argsList.add("-generate");
+      argsList.add(String.valueOf(GENDATA) + "m");
+      argsList.add(in.toString());
+      argsList.add("-"); // ignored by DebugGridmix
+
+      String[] argv = argsList.toArray(new String[argsList.size()]);
+
+      DebugGridmix client = new DebugGridmix();
+      conf = new Configuration();
       conf.setEnum(GridmixJobSubmissionPolicy.JOB_SUBMISSION_POLICY,policy);
       if (useDefaultQueue) {
         conf.setBoolean(GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, false);
@@ -382,13 +523,13 @@ public class TestGridmixSubmission {
       } else {
         conf.setBoolean(GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true);
       }
-    conf = GridmixTestUtils.mrCluster.createJobConf(new JobConf(conf));
-    // allow synthetic users to create home directories
-    GridmixTestUtils.dfs.mkdirs(root, new FsPermission((short)0777));
-    GridmixTestUtils.dfs.setPermission(root, new FsPermission((short)0777));
-    int res = ToolRunner.run(conf, client, argv);
-    assertEquals("Client exited with nonzero status", 0, res);
-    client.checkMonitor();
+      conf = GridmixTestUtils.mrCluster.createJobConf(new JobConf(conf));
+      // allow synthetic users to create home directories
+      GridmixTestUtils.dfs.mkdirs(root, new FsPermission((short)0777));
+      GridmixTestUtils.dfs.setPermission(root, new FsPermission((short)0777));
+      int res = ToolRunner.run(conf, client, argv);
+      assertEquals("Client exited with nonzero status", 0, res);
+      client.checkMonitor();
      } catch (Exception e) {
        e.printStackTrace();
      } finally {

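A note on the trace-reading test above: testTraceReader() verifies that DebugGridmix.createJobStoryProducer() accepts a gzip-compressed Rumen trace, the same trace expanded to plain JSON on disk, and "-" for a trace piped through standard input. The gunzip step itself is ordinary java.util.zip work; the standalone sketch below mirrors the expandGzippedTrace() helper without depending on a Hadoop FileSystem (the file names in main() are placeholders, not part of this patch).

    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.zip.GZIPInputStream;

    public class GunzipTraceSketch {
      // Expand a gzipped Rumen trace into a plain JSON file on local disk.
      // Same copy loop as expandGzippedTrace(), but with java.io streams
      // instead of Hadoop FileSystem streams.
      public static void expand(String gzPath, String outPath) throws IOException {
        byte[] buf = new byte[4096];
        GZIPInputStream in = new GZIPInputStream(new FileInputStream(gzPath));
        FileOutputStream out = new FileOutputStream(outPath);
        try {
          int n;
          while ((n = in.read(buf)) != -1) {
            out.write(buf, 0, n);
          }
        } finally {
          in.close();
          out.close();
        }
      }

      public static void main(String[] args) throws IOException {
        // Placeholder file names for illustration only.
        expand("wordcount.json.gz", "wordcount.json");
      }
    }
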
+ 371 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSummary.java

@@ -0,0 +1,371 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.UtilsForTests;
+import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics;
+import org.apache.hadoop.mapred.gridmix.Statistics.ClusterStats;
+import org.apache.hadoop.mapred.gridmix.Statistics.JobStats;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.tools.rumen.JobStory;
+import org.apache.hadoop.tools.rumen.JobStoryProducer;
+import org.junit.Test;
+
+/**
+ * Test {@link ExecutionSummarizer} and {@link ClusterSummarizer}.
+ */
+public class TestGridmixSummary {
+  
+  /**
+   * Test {@link DataStatistics}.
+   */
+  @Test
+  public void testDataStatistics() throws Exception {
+    // test data-statistics getters with compression enabled
+    DataStatistics stats = new DataStatistics(10, 2, true);
+    assertEquals("Data size mismatch", 10, stats.getDataSize());
+    assertEquals("Num files mismatch", 2, stats.getNumFiles());
+    assertTrue("Compression configuration mismatch", stats.isDataCompressed());
+    
+    // test data-statistics getters with compression disabled
+    stats = new DataStatistics(100, 5, false);
+    assertEquals("Data size mismatch", 100, stats.getDataSize());
+    assertEquals("Num files mismatch", 5, stats.getNumFiles());
+    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
+    
+    // test publish data stats
+    Configuration conf = new Configuration();
+    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
+    Path testDir = new Path(rootTempDir, "testDataStatistics");
+    FileSystem fs = testDir.getFileSystem(conf);
+    fs.delete(testDir, true);
+    Path testInputDir = new Path(testDir, "test");
+    fs.mkdirs(testInputDir);
+    
+    // test empty folder (compression = true)
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    Boolean failed = null;
+    try {
+      GenerateData.publishDataStatistics(testInputDir, 1024L, conf);
+      failed = false;
+    } catch (RuntimeException e) {
+      failed = true;
+    }
+    assertNotNull("Expected failure!", failed);
+    assertTrue("Compression data publishing error", failed);
+    
+    // test with empty folder (compression = off)
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
+    stats = GenerateData.publishDataStatistics(testInputDir, 1024L, conf);
+    assertEquals("Data size mismatch", 0, stats.getDataSize());
+    assertEquals("Num files mismatch", 0, stats.getNumFiles());
+    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
+    
+    // test with some plain input data (compression = off)
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
+    Path inputDataFile = new Path(testInputDir, "test");
+    long size = 
+      UtilsForTests.createTmpFileDFS(fs, inputDataFile, 
+          FsPermission.createImmutable((short)777), "hi hello bye").size();
+    stats = GenerateData.publishDataStatistics(testInputDir, -1, conf);
+    assertEquals("Data size mismatch", size, stats.getDataSize());
+    assertEquals("Num files mismatch", 1, stats.getNumFiles());
+    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
+    
+    // test with some plain input data (compression = on)
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    failed = null;
+    try {
+      GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
+      failed = false;
+    } catch (RuntimeException e) {
+      failed = true;
+    }
+    assertNotNull("Expected failure!", failed);
+    assertTrue("Compression data publishing error", failed);
+    
+    // test with some compressed input data (compression = off)
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, false);
+    fs.delete(inputDataFile, false);
+    inputDataFile = new Path(testInputDir, "test.gz");
+    size = 
+      UtilsForTests.createTmpFileDFS(fs, inputDataFile, 
+          FsPermission.createImmutable((short)777), "hi hello").size();
+    stats =  GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
+    assertEquals("Data size mismatch", size, stats.getDataSize());
+    assertEquals("Num files mismatch", 1, stats.getNumFiles());
+    assertFalse("Compression configuration mismatch", stats.isDataCompressed());
+    
+    // test with some compressed input data (compression = on)
+    CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
+    stats = GenerateData.publishDataStatistics(testInputDir, 1234L, conf);
+    assertEquals("Data size mismatch", size, stats.getDataSize());
+    assertEquals("Num files mismatch", 1, stats.getNumFiles());
+    assertTrue("Compression configuration mismatch", stats.isDataCompressed());
+  }
+  
+  /**
+   * A fake {@link JobFactory}.
+   */
+  @SuppressWarnings("unchecked")
+  private static class FakeJobFactory extends JobFactory {
+    /**
+     * A fake {@link JobStoryProducer} for {@link FakeJobFactory}.
+     */
+    private static class FakeJobStoryProducer implements JobStoryProducer {
+      @Override
+      public void close() throws IOException {
+      }
+
+      @Override
+      public JobStory getNextJob() throws IOException {
+        return null;
+      }
+    }
+    
+    FakeJobFactory(Configuration conf) {
+      super(null, new FakeJobStoryProducer(), null, conf, null, null);
+    }
+    
+    @Override
+    public void update(Object item) {
+    }
+    
+    @Override
+    protected Thread createReaderThread() {
+      return null;
+    }
+  }
+  
+  /**
+   * Test {@link ExecutionSummarizer}.
+   */
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testExecutionSummarizer() throws IOException {
+    Configuration conf = new Configuration();
+    
+    ExecutionSummarizer es = new ExecutionSummarizer();
+    assertEquals("ExecutionSummarizer init failed", 
+                 Summarizer.NA, es.getCommandLineArgsString());
+    
+    long startTime = System.currentTimeMillis();
+    // test configuration parameters
+    String[] initArgs = new String[] {"-Xmx20m", "-Dtest.args='test'"};
+    es = new ExecutionSummarizer(initArgs);
+    
+    assertEquals("ExecutionSummarizer init failed", 
+                 "-Xmx20m -Dtest.args='test'", 
+                 es.getCommandLineArgsString());
+    
+    // test start time
+    assertTrue("Start time mismatch", es.getStartTime() >= startTime);
+    assertTrue("Start time mismatch", 
+               es.getStartTime() <= System.currentTimeMillis());
+    
+    // test start() of ExecutionSummarizer
+    es.update(null);
+    assertEquals("ExecutionSummarizer init failed", 0, 
+                 es.getSimulationStartTime());
+    testExecutionSummarizer(0, 0, 0, 0, 0, 0, es);
+    
+    long simStartTime = System.currentTimeMillis();
+    es.start(null);
+    assertTrue("Simulation start time mismatch", 
+               es.getSimulationStartTime() >= simStartTime);
+    assertTrue("Simulation start time mismatch", 
+               es.getSimulationStartTime() <= System.currentTimeMillis());
+    
+    // test with job stats
+    JobStats stats = generateFakeJobStats(1, 10, true);
+    es.update(stats);
+    testExecutionSummarizer(1, 10, 0, 1, 1, 0, es);
+    
+    // test with failed job 
+    stats = generateFakeJobStats(5, 1, false);
+    es.update(stats);
+    testExecutionSummarizer(6, 11, 0, 2, 1, 1, es);
+    
+    // test finalize
+    //  define a fake job factory
+    JobFactory factory = new FakeJobFactory(conf);
+    
+    // fake the num jobs in trace
+    factory.numJobsInTrace = 3;
+    
+    Path rootTempDir = new Path(System.getProperty("test.build.data", "/tmp"));
+    Path testDir = new Path(rootTempDir, "testGridmixSummary");
+    Path testTraceFile = new Path(testDir, "test-trace.json");
+    FileSystem fs = FileSystem.getLocal(conf);
+    fs.create(testTraceFile).close();
+    
+    // finalize the summarizer
+    UserResolver resolver = new RoundRobinUserResolver();
+    DataStatistics dataStats = new DataStatistics(100, 2, true);
+    String policy = GridmixJobSubmissionPolicy.REPLAY.name();
+    conf.set(GridmixJobSubmissionPolicy.JOB_SUBMISSION_POLICY, policy);
+    es.finalize(factory, testTraceFile.toString(), 1024L, resolver, dataStats, 
+                conf);
+    
+    // test num jobs in trace
+    assertEquals("Mismtach in num jobs in trace", 3, es.getNumJobsInTrace());
+    
+    // test trace signature
+    String tid = 
+      ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
+    assertEquals("Mismatch in trace signature", 
+                 tid, es.getInputTraceSignature());
+    // test trace location
+    Path qPath = fs.makeQualified(testTraceFile);
+    assertEquals("Mismatch in trace signature", 
+                 qPath.toString(), es.getInputTraceLocation());
+    // test expected data size
+    assertEquals("Mismatch in expected data size", 
+                 "1.0k", es.getExpectedDataSize());
+    // test input data statistics
+    assertEquals("Mismatch in input data statistics", 
+                 ExecutionSummarizer.stringifyDataStatistics(dataStats), 
+                 es.getInputDataStatistics());
+    // test user resolver
+    assertEquals("Mismatch in user resolver", 
+                 resolver.getClass().getName(), es.getUserResolver());
+    // test policy
+    assertEquals("Mismatch in policy", policy, es.getJobSubmissionPolicy());
+    
+    // test data stringification using large data
+    es.finalize(factory, testTraceFile.toString(), 1024*1024*1024*10L, resolver,
+                dataStats, conf);
+    assertEquals("Mismatch in expected data size", 
+                 "10.0g", es.getExpectedDataSize());
+    
+    // test trace signature uniqueness
+    //  touch the trace file
+    fs.delete(testTraceFile, false);
+    //  sleep for 1 sec
+    try {
+      Thread.sleep(1000);
+    } catch (InterruptedException ie) {}
+    fs.create(testTraceFile).close();
+    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, 
+                conf);
+    // test missing expected data size
+    assertEquals("Mismatch in trace signature", 
+                 Summarizer.NA, es.getExpectedDataSize());
+    assertFalse("Mismatch in trace signature", 
+                tid.equals(es.getInputTraceSignature()));
+    // get the new identifier
+    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
+    assertEquals("Mismatch in trace signature", 
+                 tid, es.getInputTraceSignature());
+    
+    testTraceFile = new Path(testDir, "test-trace2.json");
+    fs.create(testTraceFile).close();
+    es.finalize(factory, testTraceFile.toString(), 0L, resolver, dataStats, 
+                conf);
+    assertFalse("Mismatch in trace signature", 
+                tid.equals(es.getInputTraceSignature()));
+    // get the new identifier
+    tid = ExecutionSummarizer.getTraceSignature(testTraceFile.toString());
+    assertEquals("Mismatch in trace signature", 
+                 tid, es.getInputTraceSignature());
+    
+  }
+  
+  // test the ExecutionSummarizer
+  private static void testExecutionSummarizer(int numMaps, int numReds,
+      int totalJobsInTrace, int totalJobSubmitted, int numSuccessfulJob, 
+      int numFailedJobs, ExecutionSummarizer es) {
+    assertEquals("ExecutionSummarizer test failed [num-maps]", 
+                 numMaps, es.getNumMapTasksLaunched());
+    assertEquals("ExecutionSummarizer test failed [num-reducers]", 
+                 numReds, es.getNumReduceTasksLaunched());
+    assertEquals("ExecutionSummarizer test failed [num-jobs-in-trace]", 
+                 totalJobsInTrace, es.getNumJobsInTrace());
+    assertEquals("ExecutionSummarizer test failed [num-submitted jobs]", 
+                 totalJobSubmitted, es.getNumSubmittedJobs());
+    assertEquals("ExecutionSummarizer test failed [num-successful-jobs]", 
+                 numSuccessfulJob, es.getNumSuccessfulJobs());
+    assertEquals("ExecutionSummarizer test failed [num-failed jobs]", 
+                 numFailedJobs, es.getNumFailedJobs());
+  }
+  
+  // generate fake job stats
+  @SuppressWarnings("deprecation")
+  private static JobStats generateFakeJobStats(final int numMaps, 
+      final int numReds, final boolean isSuccessful) 
+  throws IOException {
+    // A fake job 
+    Job fakeJob = new Job() {
+      @Override
+      public int getNumReduceTasks() {
+        return numReds;
+      };
+      
+      @Override
+      public boolean isSuccessful() throws IOException {
+        return isSuccessful;
+      };
+    };
+    return new JobStats(numMaps, fakeJob);
+  }
+  
+  /**
+   * Test {@link ClusterSummarizer}.
+   */
+  @Test
+  @SuppressWarnings("deprecation")
+  public void testClusterSummarizer() throws IOException {
+    ClusterSummarizer cs = new ClusterSummarizer();
+    Configuration conf = new Configuration();
+    
+    String jt = "test-jt:1234";
+    String nn = "test-nn:5678";
+    conf.set("mapred.job.tracker", jt);
+    conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, nn);
+    cs.start(conf);
+    
+    assertEquals("JT name mismatch", jt, cs.getJobTrackerInfo());
+    assertEquals("NN name mismatch", nn, cs.getNamenodeInfo());
+    
+    ClusterStats cstats = ClusterStats.getClusterStats();
+    conf.set("mapred.job.tracker", "local");
+    conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, "local");
+    JobClient jc = new JobClient(new JobConf(conf));
+    cstats.setClusterMetric(jc.getClusterStatus());
+    
+    cs.update(cstats);
+    
+    // test
+    assertEquals("Cluster summary test failed!", 1, cs.getMaxMapTasks());
+    assertEquals("Cluster summary test failed!", 1, cs.getMaxReduceTasks());
+    assertEquals("Cluster summary test failed!", 1, cs.getNumActiveTrackers());
+    assertEquals("Cluster summary test failed!", 0, 
+                 cs.getNumBlacklistedTrackers());
+  }
+}
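
A note on generateFakeJobStats() above: it stubs a Hadoop Job with an anonymous subclass so that ExecutionSummarizer can be fed deterministic map/reduce counts and success flags without running a real job. The same test-double idea, reduced to plain Java with illustrative types (SimpleJob and SimpleSummarizer are not part of this patch), looks like this:

    // Minimal sketch of the anonymous test-double pattern used by
    // generateFakeJobStats(). The types below are made up for illustration.
    interface SimpleJob {
      int getNumReduceTasks();
      boolean isSuccessful();
    }

    class SimpleSummarizer {
      private int reducersLaunched;
      private int failedJobs;

      void update(SimpleJob job) {
        reducersLaunched += job.getNumReduceTasks();
        if (!job.isSuccessful()) {
          failedJobs++;
        }
      }

      int getReducersLaunched() { return reducersLaunched; }
      int getFailedJobs() { return failedJobs; }
    }

    public class FakeJobStatsSketch {
      public static void main(String[] args) {
        SimpleSummarizer es = new SimpleSummarizer();
        // Anonymous stubs stand in for real (expensive) job objects.
        es.update(new SimpleJob() {
          public int getNumReduceTasks() { return 10; }
          public boolean isSuccessful() { return true; }
        });
        es.update(new SimpleJob() {
          public int getNumReduceTasks() { return 1; }
          public boolean isSuccessful() { return false; }
        });
        System.out.println(es.getReducersLaunched() + " reducers, "
            + es.getFailedJobs() + " failed job(s)");
      }
    }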

+ 202 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestHighRamJob.java

@@ -0,0 +1,202 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.JobTracker;
+import org.apache.hadoop.mapred.gridmix.DebugJobProducer.MockJob;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.tools.rumen.JobStory;
+import org.junit.Test;
+
+/**
+ * Test if Gridmix correctly configures the simulated job's configuration for
+ * high ram job properties.
+ */
+public class TestHighRamJob {
+  /**
+   * A dummy {@link GridmixJob} that opens up the simulated job for testing.
+   */
+  protected static class DummyGridmixJob extends GridmixJob {
+    public DummyGridmixJob(Configuration conf, JobStory desc) 
+    throws IOException {
+      super(conf, System.currentTimeMillis(), desc, new Path("test"), 
+            UserGroupInformation.getCurrentUser(), -1);
+    }
+    
+    /**
+     * Do nothing since this is a dummy gridmix job.
+     */
+    @Override
+    public Job call() throws Exception {
+      return null;
+    }
+    
+    @Override
+    protected boolean canEmulateCompression() {
+      // return false as we don't need compression
+      return false;
+    }
+    
+    protected Job getJob() {
+      // open the simulated job for testing
+      return job;
+    }
+  }
+  
+  private static void testHighRamConfig(long jobMapMB, long jobReduceMB, 
+      long clusterMapMB, long clusterReduceMB, long simulatedClusterMapMB, 
+      long simulatedClusterReduceMB, long expectedMapMB, long expectedReduceMB, 
+      Configuration gConf) 
+  throws IOException {
+    Configuration simulatedJobConf = new Configuration(gConf);
+    simulatedJobConf.setLong(JobTracker.MAPRED_CLUSTER_MAP_MEMORY_MB_PROPERTY,
+                             simulatedClusterMapMB);
+    simulatedJobConf.setLong(
+        JobTracker.MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY,
+        simulatedClusterReduceMB);
+    
+    // define a source conf
+    Configuration sourceConf = new Configuration();
+    
+    // configure the original job
+    sourceConf.setLong(JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY, jobMapMB);
+    sourceConf.setLong(JobTracker.MAPRED_CLUSTER_MAP_MEMORY_MB_PROPERTY,
+                       clusterMapMB);
+    sourceConf.setLong(JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY,
+                       jobReduceMB);
+    sourceConf.setLong(JobTracker.MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY,
+                       clusterReduceMB);
+    
+    // define a mock job
+    MockJob story = new MockJob(sourceConf);
+    
+    GridmixJob job = new DummyGridmixJob(simulatedJobConf, story);
+    Job simulatedJob = job.getJob();
+    Configuration simulatedConf = simulatedJob.getConfiguration();
+    
+    // check that the high ram properties are set to the expected values
+    assertEquals(expectedMapMB, simulatedConf.getLong(
+        JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY,
+        JobConf.DISABLED_MEMORY_LIMIT));
+    assertEquals(expectedReduceMB,
+        simulatedConf.getLong(JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY,
+        JobConf.DISABLED_MEMORY_LIMIT));
+  }
+  
+  /**
+   * Tests high ram job properties configuration.
+   */
+  @SuppressWarnings("deprecation")
+  @Test
+  public void testHighRamFeatureEmulation() throws IOException {
+    // define the gridmix conf
+    Configuration gridmixConf = new Configuration();
+    
+    // test : check high ram emulation disabled
+    gridmixConf.setBoolean(GridmixJob.GRIDMIX_HIGHRAM_EMULATION_ENABLE, false);
+    testHighRamConfig(10, 20, 5, 10, JobConf.DISABLED_MEMORY_LIMIT, 
+                      JobConf.DISABLED_MEMORY_LIMIT, 
+                      JobConf.DISABLED_MEMORY_LIMIT, 
+                      JobConf.DISABLED_MEMORY_LIMIT, gridmixConf);
+    
+    // test : check with high ram enabled (default) and no scaling
+    gridmixConf = new Configuration();
+    // set the deprecated max memory limit
+    gridmixConf.setLong(JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY, 
+                        20*1024*1024);
+    testHighRamConfig(10, 20, 5, 10, 5, 10, 10, 20, gridmixConf);
+    
+    // test : check with high ram enabled and scaling
+    gridmixConf = new Configuration();
+    // set the new max map/reduce memory limits
+    gridmixConf.setLong(JobTracker.MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY,
+                        100);
+    gridmixConf.setLong(JobTracker.MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY,
+                        300);
+    testHighRamConfig(10, 45, 5, 15, 50, 100, 100, 300, gridmixConf);
+    
+    // test : check with high ram enabled and map memory scaling mismatch 
+    //        (deprecated)
+    gridmixConf = new Configuration();
+    gridmixConf.setLong(JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY, 
+                        70*1024*1024);
+    Boolean failed = null;
+    try {
+      testHighRamConfig(10, 45, 5, 15, 50, 100, 100, 300, gridmixConf);
+      failed = false;
+    } catch (Exception e) {
+      failed = true;
+    }
+    assertNotNull(failed);
+    assertTrue("Exception expected for exceeding map memory limit "
+               + "(deprecation)!", failed);
+    
+    // test : check with high ram enabled and reduce memory scaling mismatch 
+    //        (deprecated)
+    gridmixConf = new Configuration();
+    gridmixConf.setLong(JobConf.UPPER_LIMIT_ON_TASK_VMEM_PROPERTY, 
+                        150*1024*1024);
+    failed = null;
+    try {
+      testHighRamConfig(10, 45, 5, 15, 50, 100, 100, 300, gridmixConf);
+      failed = false;
+    } catch (Exception e) {
+      failed = true;
+    }
+    assertNotNull(failed);
+    assertTrue("Exception expected for exceeding reduce memory limit "
+               + "(deprecation)!", failed);
+    
+    // test : check with high ram enabled and scaling mismatch on map limits
+    gridmixConf = new Configuration();
+    gridmixConf.setLong(JobTracker.MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY,
+                        70);
+    failed = null;
+    try {
+      testHighRamConfig(10, 45, 5, 15, 50, 100, 100, 300, gridmixConf);
+      failed = false;
+    } catch (Exception e) {
+      failed = true;
+    }
+    assertNotNull(failed);
+    assertTrue("Exception expected for exceeding map memory limit!", failed);
+    
+    // test : check with high ram enabled and scaling mismatch on reduce 
+    //        limits
+    gridmixConf = new Configuration();
+    gridmixConf.setLong(JobTracker.MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY,
+                        200);
+    failed = null;
+    try {
+      testHighRamConfig(10, 45, 5, 15, 50, 100, 100, 300, gridmixConf);
+      failed = false;
+    } catch (Exception e) {
+      failed = true;
+    }
+    assertNotNull(failed);
+    assertTrue("Exception expected for exceeding reduce memory limit!", failed);
+  }
+}
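
The expected values in testHighRamFeatureEmulation() above encode a simple scaling rule: the ratio of the original job's per-task memory to the original cluster's slot memory is preserved on the simulated cluster (for example, 10 MB map memory against 5 MB map slots becomes 100 MB against 50 MB slots). The arithmetic sketch below illustrates that expectation using only numbers taken from the test cases; it is not the Gridmix implementation itself.

    public class HighRamScalingSketch {
      // Scale a job's per-task memory from the source cluster to the
      // simulated cluster while preserving the job-to-slot memory ratio.
      // This mirrors the expectations encoded in testHighRamConfig() above.
      static long scale(long jobMB, long sourceSlotMB, long simulatedSlotMB) {
        double ratio = (double) jobMB / sourceSlotMB;
        return (long) (ratio * simulatedSlotMB);
      }

      public static void main(String[] args) {
        // jobMapMB=10, clusterMapMB=5, simulatedClusterMapMB=50 -> 100
        System.out.println(scale(10, 5, 50));
        // jobReduceMB=45, clusterReduceMB=15, simulatedClusterReduceMB=100 -> 300
        System.out.println(scale(45, 15, 100));
      }
    }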

+ 233 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestPseudoLocalFs.java

@@ -0,0 +1,233 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import static org.junit.Assert.*;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Test;
+
+/**
+ * Test the basic functionality of PseudoLocalFs
+ */
+public class TestPseudoLocalFs {
+
+  /**
+   * Test if a file on PseudoLocalFs of a specific size can be opened and read.
+   * Validate the size of the data read.
+   * Test the read methods of {@link PseudoLocalFs.RandomInputStream}.
+   * @throws Exception
+   */
+  @Test
+  public void testPseudoLocalFsFileSize() throws Exception {
+    long fileSize = 10000;
+    Path path = PseudoLocalFs.generateFilePath("myPsedoFile", fileSize);
+    PseudoLocalFs pfs = new PseudoLocalFs();
+    pfs.create(path);
+
+    // Read 1 byte at a time and validate file size.
+    InputStream in = pfs.open(path, 0);
+    long totalSize = 0;
+
+    while (in.read() >= 0) {
+      ++totalSize;
+    }
+    in.close();
+    assertEquals("File size mismatch with read().", fileSize, totalSize);
+
+    // Read data from PseudoLocalFs-based file into buffer to
+    // validate read(byte[]) and file size.
+    in = pfs.open(path, 0);
+    totalSize = 0;
+    byte[] b = new byte[1024];
+    int bytesRead = in.read(b);
+    while (bytesRead >= 0) {
+      totalSize += bytesRead;
+      bytesRead = in.read(b);
+    }
+    assertEquals("File size mismatch with read(byte[]).", fileSize, totalSize);
+  }
+
+  /**
+   * Validate if file status is obtained for correctly formed file paths on
+   * PseudoLocalFs and also verify if appropriate exception is thrown for
+   * invalid file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which getFileStatus() is to be called
+   * @param shouldSucceed <code>true</code> if getFileStatus() should succeed
+   * @throws IOException
+   */
+  private void validateGetFileStatus(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean expectedExceptionSeen = false;
+    FileStatus stat = null;
+    try {
+      stat = pfs.getFileStatus(path);
+    } catch(FileNotFoundException e) {
+      expectedExceptionSeen = true;
+    }
+    if (shouldSucceed) {
+      assertFalse("getFileStatus() has thrown Exception for valid file name "
+                  + path, expectedExceptionSeen);
+      assertNotNull("Missing file status for a valid file.", stat);
+
+      // validate fileSize
+      String[] parts = path.toUri().getPath().split("\\.");
+      long expectedFileSize = Long.valueOf(parts[parts.length - 1]);
+      assertEquals("Invalid file size.", expectedFileSize, stat.getLen());
+    } else {
+      assertTrue("getFileStatus() did not throw Exception for invalid file "
+                 + " name " + path, expectedExceptionSeen);
+    }
+  }
+
+  /**
+   * Validate if file creation succeeds for correctly formed file paths on
+   * PseudoLocalFs and also verify if appropriate exception is thrown for
+   * invalid file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which create() is to be called
+   * @param shouldSucceed <code>true</code> if create() should succeed
+   * @throws IOException
+   */
+  private void validateCreate(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean expectedExceptionSeen = false;
+    try {
+      pfs.create(path);
+    } catch(IOException e) {
+      expectedExceptionSeen = true;
+    }
+    if (shouldSucceed) {
+      assertFalse("create() has thrown Exception for valid file name "
+                  + path, expectedExceptionSeen);
+    } else {
+      assertTrue("create() did not throw Exception for invalid file name "
+                 + path, expectedExceptionSeen);
+    }
+  }
+
+  /**
+   * Validate if opening of file succeeds for correctly formed file paths on
+   * PseudoLocalFs and also verify if appropriate exception is thrown for
+   * invalid file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which open() is to be called
+   * @param shouldSucceed <code>true</code> if open() should succeed
+   * @throws IOException
+   */
+  private void validateOpen(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean expectedExceptionSeen = false;
+    try {
+      pfs.open(path);
+    } catch(IOException e) {
+      expectedExceptionSeen = true;
+    }
+    if (shouldSucceed) {
+      assertFalse("open() has thrown Exception for valid file name "
+                  + path, expectedExceptionSeen);
+    } else {
+      assertTrue("open() did not throw Exception for invalid file name "
+                 + path, expectedExceptionSeen);
+    }
+  }
+
+  /**
+   * Validate if exists() returns <code>true</code> for correctly formed file
+   * paths on PseudoLocalFs and returns <code>false</code> for improperly
+   * formed file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which exists() is to be called
+   * @param shouldSucceed expected return value of exists(&lt;path&gt;)
+   * @throws IOException
+   */
+  private void validateExists(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean ret = pfs.exists(path);
+    if (shouldSucceed) {
+      assertTrue("exists() returned false for valid file name " + path, ret);
+    } else {
+      assertFalse("exists() returned true for invalid file name " + path, ret);
+    }
+  }
+
+  /**
+   *  Test Pseudo Local File System methods like getFileStatus(), create(),
+   *  open(), and exists() for both valid and invalid file paths.
+   * @throws IOException
+   */
+  @Test
+  public void testPseudoLocalFsFileNames() throws IOException {
+    PseudoLocalFs pfs = new PseudoLocalFs();
+    Configuration conf = new Configuration();
+    conf.setClass("fs.pseudo.impl", PseudoLocalFs.class, FileSystem.class);
+
+    Path path = new Path("pseudo:///myPsedoFile.1234");
+    FileSystem testFs = path.getFileSystem(conf);
+    assertEquals("Failed to obtain a pseudo local file system object from path",
+                 pfs.getUri().getScheme(), testFs.getUri().getScheme());
+
+    // Validate PseudoLocalFS operations on URI of some other file system
+    path = new Path("file:///myPsedoFile.12345");
+    validateGetFileStatus(pfs, path, false);
+    validateCreate(pfs, path, false);
+    validateOpen(pfs, path, false);
+    validateExists(pfs, path, false);
+
+    path = new Path("pseudo:///myPsedoFile");//.<fileSize> missing
+    validateGetFileStatus(pfs, path, false);
+    validateCreate(pfs, path, false);
+    validateOpen(pfs, path, false);
+    validateExists(pfs, path, false);
+
+    // thing after final '.' is not a number
+    path = new Path("pseudo:///myPsedoFile.txt");
+    validateGetFileStatus(pfs, path, false);
+    validateCreate(pfs, path, false);
+    validateOpen(pfs, path, false);
+    validateExists(pfs, path, false);
+
+    // Generate a valid file name (relative path) and validate operations on it
+    long fileSize = 231456;
+    path = PseudoLocalFs.generateFilePath("my.Psedo.File", fileSize);
+    // Validate the above generateFilePath()
+    assertEquals("generateFilePath() failed.", fileSize,
+                 pfs.validateFileNameFormat(path));
+
+    validateGetFileStatus(pfs, path, true);
+    validateCreate(pfs, path, true);
+    validateOpen(pfs, path, true);
+    validateExists(pfs, path, true);
+
+    // Validate operations on valid qualified path
+    path = new Path("myPsedoFile.1237");
+    path = path.makeQualified(pfs);
+    validateGetFileStatus(pfs, path, true);
+    validateCreate(pfs, path, true);
+    validateOpen(pfs, path, true);
+    validateExists(pfs, path, true);
+  }
+}
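
As the assertions above show, PseudoLocalFs encodes a file's size in its name: the suffix after the final '.' must be a number, and that number is the length reported by getFileStatus() and produced by open(). The standalone sketch below parses that naming convention the same way validateGetFileStatus() does; it is an illustration, not part of the patch.

    public class PseudoFsNameSketch {
      // Extract the size encoded in a PseudoLocalFs-style file name, where
      // the suffix after the last '.' is the file length in bytes.
      // Returns -1 for names that do not follow the convention.
      static long sizeFromName(String name) {
        String[] parts = name.split("\\.");
        if (parts.length < 2) {
          return -1;
        }
        try {
          return Long.parseLong(parts[parts.length - 1]);
        } catch (NumberFormatException e) {
          return -1;
        }
      }

      public static void main(String[] args) {
        System.out.println(sizeFromName("myPsedoFile.1234"));     // 1234
        System.out.println(sizeFromName("my.Psedo.File.231456")); // 231456
        System.out.println(sizeFromName("myPsedoFile.txt"));      // -1
        System.out.println(sizeFromName("myPsedoFile"));          // -1
      }
    }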

+ 84 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestRandomTextDataGenerator.java

@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.hadoop.mapred.gridmix.RandomTextDataGenerator;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+
+/**
+ * Test {@link RandomTextDataGenerator}.
+ */
+public class TestRandomTextDataGenerator {
+  /**
+   * Test if {@link RandomTextDataGenerator} can generate random words of 
+   * desired size.
+   */
+  @Test
+  public void testRandomTextDataGenerator() {
+    RandomTextDataGenerator rtdg = new RandomTextDataGenerator(10, 0L, 5);
+    List<String> words = rtdg.getRandomWords();
+
+    // check the size
+    assertEquals("List size mismatch", 10, words.size());
+
+    // check the words
+    Set<String> wordsSet = new HashSet<String>(words);
+    assertEquals("List size mismatch due to duplicates", 10, wordsSet.size());
+
+    // check the word lengths
+    for (String word : wordsSet) {
+      assertEquals("Word size mismatch", 5, word.length());
+    }
+  }
+  
+  /**
+   * Test if {@link RandomTextDataGenerator} generates the same words given
+   * the same list-size, word-length and seed.
+   */
+  @Test
+  public void testRandomTextDataGeneratorRepeatability() {
+    RandomTextDataGenerator rtdg1 = new RandomTextDataGenerator(10, 0L, 5);
+    List<String> words1 = rtdg1.getRandomWords();
+
+    RandomTextDataGenerator rtdg2 = new RandomTextDataGenerator(10, 0L, 5);
+    List<String> words2 = rtdg2.getRandomWords();
+    
+    assertTrue("List mismatch", words1.equals(words2));
+  }
+  
+  /**
+   * Test if {@link RandomTextDataGenerator} can generate different words given 
+   * different seeds.
+   */
+  @Test
+  public void testRandomTextDataGeneratorUniqueness() {
+    RandomTextDataGenerator rtdg1 = new RandomTextDataGenerator(10, 1L, 5);
+    Set<String> words1 = new HashSet<String>(rtdg1.getRandomWords());
+
+    RandomTextDataGenerator rtdg2 = new RandomTextDataGenerator(10, 0L, 5);
+    Set<String> words2 = new HashSet<String>(rtdg2.getRandomWords());
+
+    assertFalse("Word lists should be different for different seeds",
+                words1.equals(words2));
+  }
+}
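
The repeatability and uniqueness tests above both hinge on seeding: the same seed reproduces the same word list, while different seeds are expected to produce different lists. The self-contained sketch below shows seeded word generation with java.util.Random; the generation scheme is illustrative only and is not the RandomTextDataGenerator implementation (in particular, it does not guarantee unique words).

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Random;

    public class SeededWordsSketch {
      // Generate 'count' pseudo-random lowercase words of length 'wordLen'
      // from the given seed. The same seed always reproduces the same list.
      static List<String> words(int count, long seed, int wordLen) {
        Random rnd = new Random(seed);
        List<String> out = new ArrayList<String>(count);
        for (int i = 0; i < count; i++) {
          StringBuilder sb = new StringBuilder(wordLen);
          for (int j = 0; j < wordLen; j++) {
            sb.append((char) ('a' + rnd.nextInt(26)));
          }
          out.add(sb.toString());
        }
        return out;
      }

      public static void main(String[] args) {
        // Same seed -> identical lists; different seeds -> different lists.
        System.out.println(words(10, 0L, 5).equals(words(10, 0L, 5)));
        System.out.println(words(10, 1L, 5).equals(words(10, 0L, 5)));
      }
    }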

+ 612 - 0
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestResourceUsageEmulators.java

@@ -0,0 +1,612 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import java.io.IOException;
+
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.MapContext;
+import org.apache.hadoop.mapreduce.StatusReporter;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.TaskInputOutputContext;
+import org.apache.hadoop.util.ResourceCalculatorPlugin;
+import org.apache.hadoop.util.ResourceCalculatorPlugin.ProcResourceValues;
+import org.apache.hadoop.tools.rumen.ResourceUsageMetrics;
+import org.apache.hadoop.util.DummyResourceCalculatorPlugin;
+import org.apache.hadoop.mapred.TaskTracker;
+import org.apache.hadoop.mapred.gridmix.LoadJob.ResourceUsageMatcherRunner;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.CumulativeCpuUsageEmulatorPlugin;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.ResourceUsageEmulatorPlugin;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.ResourceUsageMatcher;
+import org.apache.hadoop.mapred.gridmix.emulators.resourceusage.CumulativeCpuUsageEmulatorPlugin.DefaultCpuUsageEmulator;
+
+/**
+ * Test Gridmix's resource emulator framework and supported plugins.
+ */
+public class TestResourceUsageEmulators {
+  /**
+   * A {@link ResourceUsageEmulatorPlugin} implementation for testing purpose.
+   * It essentially creates a file named 'test' in the test directory.
+   */
+  static class TestResourceUsageEmulatorPlugin 
+  implements ResourceUsageEmulatorPlugin {
+    static final Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp"));
+    static final Path tempDir = 
+      new Path(rootTempDir, "TestResourceUsageEmulatorPlugin");
+    static final String DEFAULT_IDENTIFIER = "test";
+    
+    private Path touchPath = null;
+    private FileSystem fs = null;
+    
+    @Override
+    public void emulate() throws IOException, InterruptedException {
+      // add some time between 2 calls to emulate()
+      try {
+        Thread.sleep(1000); // sleep for 1s
+      } catch (Exception e){}
+      
+      try {
+        fs.delete(touchPath, false); // delete the touch file
+        //TODO Search for a better touch utility
+        fs.create(touchPath).close(); // recreate it
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
+    
+    protected String getIdentifier() {
+      return DEFAULT_IDENTIFIER;
+    }
+    
+    private static Path getFilePath(String id) {
+      return new Path(tempDir, id);
+    }
+    
+    private static Path getInitFilePath(String id) {
+      return new Path(tempDir, id + ".init");
+    }
+    
+    @Override
+    public void initialize(Configuration conf, ResourceUsageMetrics metrics,
+        ResourceCalculatorPlugin monitor, Progressive progress) {
+      // add some time between 2 calls to initialize()
+      try {
+        Thread.sleep(1000); // sleep for 1s
+      } catch (Exception e){}
+      
+      try {
+        fs = FileSystem.getLocal(conf);
+        
+        Path initPath = getInitFilePath(getIdentifier());
+        fs.delete(initPath, false); // delete the old file
+        fs.create(initPath).close(); // create a new one
+        
+        touchPath = getFilePath(getIdentifier());
+        fs.delete(touchPath, false);
+      } catch (Exception e) {
+        
+      } finally {
+        if (fs != null) {
+          try {
+            fs.deleteOnExit(tempDir);
+          } catch (IOException ioe){}
+        }
+      }
+    }
+    
+    // test if the emulation framework successfully initialized this plugin
+    static long testInitialization(String id, Configuration conf) 
+    throws IOException {
+      Path testPath = getInitFilePath(id);
+      FileSystem fs = FileSystem.getLocal(conf);
+      return fs.exists(testPath) 
+             ? fs.getFileStatus(testPath).getModificationTime() 
+             : 0;
+    }
+    
+    // test if the emulation framework successfully ran this plugin's emulate()
+    static long testEmulation(String id, Configuration conf) 
+    throws IOException {
+      Path testPath = getFilePath(id);
+      FileSystem fs = FileSystem.getLocal(conf);
+      return fs.exists(testPath) 
+             ? fs.getFileStatus(testPath).getModificationTime() 
+             : 0;
+    }
+  }
+  
+  /**
+   * Test implementation of {@link ResourceUsageEmulatorPlugin} which creates
+   * a file named 'others' in the test directory.
+   */
+  static class TestOthers extends TestResourceUsageEmulatorPlugin {
+    static final String ID = "others";
+    
+    @Override
+    protected String getIdentifier() {
+      return ID;
+    }
+  }
+  
+  /**
+   * Test implementation of {@link ResourceUsageEmulatorPlugin} which creates
+   * a file named 'cpu' in the test directory.
+   */
+  static class TestCpu extends TestResourceUsageEmulatorPlugin {
+    static final String ID = "cpu";
+    
+    @Override
+    protected String getIdentifier() {
+      return ID;
+    }
+  }
+  
+  /**
+   * Test {@link ResourceUsageMatcher}.
+   */
+  @Test
+  public void testResourceUsageMatcher() throws Exception {
+    ResourceUsageMatcher matcher = new ResourceUsageMatcher();
+    Configuration conf = new Configuration();
+    conf.setClass(ResourceUsageMatcher.RESOURCE_USAGE_EMULATION_PLUGINS, 
+                  TestResourceUsageEmulatorPlugin.class, 
+                  ResourceUsageEmulatorPlugin.class);
+    long currentTime = System.currentTimeMillis();
+    
+    matcher.configure(conf, null, null, null);
+    
+    matcher.matchResourceUsage();
+    
+    String id = TestResourceUsageEmulatorPlugin.DEFAULT_IDENTIFIER;
+    long result = 
+      TestResourceUsageEmulatorPlugin.testInitialization(id, conf);
+    assertTrue("Resource usage matcher failed to initialize the configured"
+               + " plugin", result > currentTime);
+    result = TestResourceUsageEmulatorPlugin.testEmulation(id, conf);
+    assertTrue("Resource usage matcher failed to load and emulate the"
+               + " configured plugin", result > currentTime);
+    
+    // test plugin order to first emulate cpu and then others
+    conf.setStrings(ResourceUsageMatcher.RESOURCE_USAGE_EMULATION_PLUGINS, 
+                    TestCpu.class.getName() + "," + TestOthers.class.getName());
+    
+    matcher.configure(conf, null, null, null);
+
+    // test the initialization order
+    long time1 = 
+           TestResourceUsageEmulatorPlugin.testInitialization(TestCpu.ID, conf);
+    long time2 = 
+           TestResourceUsageEmulatorPlugin.testInitialization(TestOthers.ID, 
+                                                              conf);
+    assertTrue("Resource usage matcher failed to initialize the configured"
+               + " plugins in order", time1 < time2);
+    
+    matcher.matchResourceUsage();
+
+    // Note that the cpu usage emulator plugin is configured 1st and then the
+    // others plugin.
+    time1 = 
+      TestResourceUsageEmulatorPlugin.testInitialization(TestCpu.ID, conf);
+    time2 = 
+      TestResourceUsageEmulatorPlugin.testInitialization(TestOthers.ID, 
+                                                         conf);
+    assertTrue("Resource usage matcher failed to load the configured plugins", 
+               time1 < time2);
+  }
+  
+  /**
+   * Fakes the cumulative usage using {@link FakeCpuUsageEmulatorCore}.
+   */
+  static class FakeResourceUsageMonitor extends DummyResourceCalculatorPlugin {
+    private FakeCpuUsageEmulatorCore core;
+    
+    public FakeResourceUsageMonitor(FakeCpuUsageEmulatorCore core) {
+      this.core = core;
+    }
+    
+    /**
+     * A dummy CPU usage monitor. Every call to 
+     * {@link ResourceCalculatorPlugin#getCumulativeCpuTime()} will return the 
+     * value of {@link FakeCpuUsageEmulatorCore#getCpuUsage()}.
+     */
+    @Override
+    public long getCumulativeCpuTime() {
+      return core.getCpuUsage();
+    }
+
+    /**
+     * Returns a {@link ProcResourceValues} with cumulative cpu usage  
+     * computed using {@link #getCumulativeCpuTime()}.
+     */
+    @Override
+    public ProcResourceValues getProcResourceValues() {
+      long usageValue = getCumulativeCpuTime();
+      return new ProcResourceValues(usageValue, -1, -1);
+    }
+  }
+  
+  /**
+   * A dummy {@link Progressive} implementation that allows users to set the
+   * progress for testing. The {@link Progressive#getProgress()} call will 
+   * return the last progress value set using 
+   * {@link FakeProgressive#setProgress(float)}.
+   */
+  static class FakeProgressive implements Progressive {
+    private float progress = 0F;
+    @Override
+    public float getProgress() {
+      return progress;
+    }
+    
+    void setProgress(float progress) {
+      this.progress = progress;
+    }
+  }
+  
+  /**
+   * A dummy reporter for {@link LoadJob.ResourceUsageMatcherRunner}.
+   */
+  private static class DummyReporter extends StatusReporter {
+    private Progressive progress;
+    
+    DummyReporter(Progressive progress) {
+      this.progress = progress;
+    }
+    
+    @Override
+    public org.apache.hadoop.mapreduce.Counter getCounter(Enum<?> name) {
+      return null;
+    }
+    
+    @Override
+    public org.apache.hadoop.mapreduce.Counter getCounter(String group,
+                                                          String name) {
+      return null;
+    }
+    
+    @Override
+    public void progress() {
+    }
+    
+    @Override
+    public float getProgress() {
+      return progress.getProgress();
+    }
+    
+    @Override
+    public void setStatus(String status) {
+    }
+  }
+  
+  // Extends ResourceUsageMatcherRunner for testing.
+  @SuppressWarnings("unchecked")
+  private static class FakeResourceUsageMatcherRunner 
+  extends ResourceUsageMatcherRunner {
+    FakeResourceUsageMatcherRunner(TaskInputOutputContext context, 
+                                   ResourceUsageMetrics metrics) {
+      super(context, metrics);
+    }
+    
+    // test ResourceUsageMatcherRunner
+    void test() throws Exception {
+      super.match();
+    }
+  }
+  
+  /**
+   * Test {@link LoadJob.ResourceUsageMatcherRunner}.
+   */
+  @Test
+  @SuppressWarnings("unchecked")
+  public void testResourceUsageMatcherRunner() throws Exception {
+    Configuration conf = new Configuration();
+    FakeProgressive progress = new FakeProgressive();
+    
+    // set the resource calculator plugin
+    conf.setClass(TaskTracker.TT_RESOURCE_CALCULATOR_PLUGIN,
+                  DummyResourceCalculatorPlugin.class, 
+                  ResourceCalculatorPlugin.class);
+    // set the resources
+    // set the resource implementation class
+    conf.setClass(ResourceUsageMatcher.RESOURCE_USAGE_EMULATION_PLUGINS, 
+                  TestResourceUsageEmulatorPlugin.class, 
+                  ResourceUsageEmulatorPlugin.class);
+    
+    long currentTime = System.currentTimeMillis();
+    
+    // initialize the matcher class
+    TaskAttemptID id = new TaskAttemptID("test", 1, true, 1, 1);
+    StatusReporter reporter = new DummyReporter(progress);
+    TaskInputOutputContext context = 
+      new MapContext(conf, id, null, null, null, reporter, null);
+    FakeResourceUsageMatcherRunner matcher = 
+      new FakeResourceUsageMatcherRunner(context, null);
+    
+    // check if the matcher initialized the plugin
+    String identifier = TestResourceUsageEmulatorPlugin.DEFAULT_IDENTIFIER;
+    long initTime = 
+      TestResourceUsageEmulatorPlugin.testInitialization(identifier, conf);
+    assertTrue("ResourceUsageMatcherRunner failed to initialize the"
+               + " configured plugin", initTime > currentTime);
+    
+    // check the progress
+    assertEquals("Progress mismatch in ResourceUsageMatcherRunner", 
+                 0, progress.getProgress(), 0D);
+    
+    // call match() and check progress
+    progress.setProgress(0.01f);
+    currentTime = System.currentTimeMillis();
+    matcher.test();
+    long emulateTime = 
+      TestResourceUsageEmulatorPlugin.testEmulation(identifier, conf);
+    assertTrue("ProgressBasedResourceUsageMatcher failed to load and emulate"
+               + " the configured plugin", emulateTime > currentTime);
+  }
+  
+  /**
+   * Test {@link CumulativeCpuUsageEmulatorPlugin}'s core CPU usage emulation 
+   * engine.
+   */
+  @Test
+  public void testCpuUsageEmulator() throws IOException {
+    // test CpuUsageEmulator calibration with fake resource calculator plugin
+    long target = 100000L; // 100 secs
+    int unitUsage = 50;
+    FakeCpuUsageEmulatorCore fakeCpuEmulator = new FakeCpuUsageEmulatorCore();
+    fakeCpuEmulator.setUnitUsage(unitUsage);
+    FakeResourceUsageMonitor fakeMonitor = 
+      new FakeResourceUsageMonitor(fakeCpuEmulator);
+    
+    // calibrate for 100ms
+    fakeCpuEmulator.calibrate(fakeMonitor, target);
+    
+    // by default, CpuUsageEmulator.calibrate() will consume 100ms of CPU usage
+    assertEquals("Fake calibration failed", 
+                 100, fakeMonitor.getCumulativeCpuTime());
+    assertEquals("Fake calibration failed", 
+                 100, fakeCpuEmulator.getCpuUsage());
+    // by default, CpuUsageEmulator.performUnitComputation() will be called 
+    // twice
+    assertEquals("Fake calibration failed", 
+                 2, fakeCpuEmulator.getNumCalls());
+  }
+  
+  /**
+   * This is a dummy class that fakes CPU usage.
+   */
+  private static class FakeCpuUsageEmulatorCore 
+  extends DefaultCpuUsageEmulator {
+    private int numCalls = 0;
+    private int unitUsage = 1;
+    private int cpuUsage = 0;
+    
+    @Override
+    protected void performUnitComputation() {
+      ++numCalls;
+      cpuUsage += unitUsage;
+    }
+    
+    int getNumCalls() {
+      return numCalls;
+    }
+    
+    int getCpuUsage() {
+      return cpuUsage;
+    }
+    
+    void reset() {
+      numCalls = 0;
+      cpuUsage = 0;
+    }
+    
+    void setUnitUsage(int unitUsage) {
+      this.unitUsage = unitUsage;
+    }
+  }
+  
+  // Creates a ResourceUsageMetrics object from the target usage
+  static ResourceUsageMetrics createMetrics(long target) {
+    ResourceUsageMetrics metrics = new ResourceUsageMetrics();
+    metrics.setCumulativeCpuUsage(target);
+    metrics.setVirtualMemoryUsage(target);
+    metrics.setPhysicalMemoryUsage(target);
+    metrics.setHeapUsage(target);
+    return metrics;
+  }
+  
+  /**
+   * Test {@link CumulativeCpuUsageEmulatorPlugin}.
+   */
+  @Test
+  public void testCumulativeCpuUsageEmulatorPlugin() throws Exception {
+    Configuration conf = new Configuration();
+    long targetCpuUsage = 1000L;
+    int unitCpuUsage = 50;
+    
+    // fake progress indicator
+    FakeProgressive fakeProgress = new FakeProgressive();
+    
+    // fake cpu usage generator
+    FakeCpuUsageEmulatorCore fakeCore = new FakeCpuUsageEmulatorCore();
+    fakeCore.setUnitUsage(unitCpuUsage);
+    
+    // a cumulative cpu usage emulator with fake core
+    CumulativeCpuUsageEmulatorPlugin cpuPlugin = 
+      new CumulativeCpuUsageEmulatorPlugin(fakeCore);
+    
+    // test with invalid or missing resource usage value
+    ResourceUsageMetrics invalidUsage = createMetrics(0);
+    cpuPlugin.initialize(conf, invalidUsage, null, null);
+    
+    // test if disabled cpu emulation plugin's emulate() call is a no-operation
+    // this will test if the emulation plugin is disabled or not
+    int numCallsPre = fakeCore.getNumCalls();
+    long cpuUsagePre = fakeCore.getCpuUsage();
+    cpuPlugin.emulate();
+    int numCallsPost = fakeCore.getNumCalls();
+    long cpuUsagePost = fakeCore.getCpuUsage();
+    
+    //  test that no calls are made to the cpu usage emulator core
+    assertEquals("Disabled cumulative CPU usage emulation plugin works!", 
+                 numCallsPre, numCallsPost);
+    
+    //  test that no cpu usage is accumulated by the emulator core
+    assertEquals("Disabled cumulative CPU usage emulation plugin works!", 
+                 cpuUsagePre, cpuUsagePost);
+    
+    // test with valid resource usage value
+    ResourceUsageMetrics metrics = createMetrics(targetCpuUsage);
+    
+    // fake monitor
+    ResourceCalculatorPlugin monitor = new FakeResourceUsageMonitor(fakeCore);
+    
+    // test with default emulation interval
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, cpuPlugin, 
+                          targetCpuUsage, targetCpuUsage / unitCpuUsage);
+    
+    // test with custom value for emulation interval of 20%
+    conf.setFloat(CumulativeCpuUsageEmulatorPlugin.CPU_EMULATION_PROGRESS_INTERVAL,
+                  0.2F);
+    testEmulationAccuracy(conf, fakeCore, monitor, metrics, cpuPlugin, 
+                          targetCpuUsage, targetCpuUsage / unitCpuUsage);
+    
+    // test if emulation interval boundary is respected (unit usage = 1)
+    //  test the case where the current progress is less than threshold
+    fakeProgress = new FakeProgressive(); // initialize
+    fakeCore.reset();
+    fakeCore.setUnitUsage(1);
+    conf.setFloat(CumulativeCpuUsageEmulatorPlugin.CPU_EMULATION_PROGRESS_INTERVAL,
+                  0.25F);
+    cpuPlugin.initialize(conf, metrics, monitor, fakeProgress);
+    // take a snapshot after the initialization
+    long initCpuUsage = monitor.getCumulativeCpuTime();
+    long initNumCalls = fakeCore.getNumCalls();
+    // test with 0 progress
+    testEmulationBoundary(0F, fakeCore, fakeProgress, cpuPlugin, initCpuUsage, 
+                          initNumCalls, "[no-op, 0 progress]");
+    // test with 24% progress
+    testEmulationBoundary(0.24F, fakeCore, fakeProgress, cpuPlugin, 
+                          initCpuUsage, initNumCalls, "[no-op, 24% progress]");
+    // test with 25% progress
+    //  target = 1000ms, target emulation at 25% = 250ms, 
+    //  weighted target = 1000 * 0.25^4 (we are using progress^4 as the weight)
+    //                 ~ 4
+    //  but current usage = init-usage = 100, hence expected = 100
+    testEmulationBoundary(0.25F, fakeCore, fakeProgress, cpuPlugin, 
+                          initCpuUsage, initNumCalls, "[op, 25% progress]");
+    
+    // test with 80% progress
+    //  target = 1000ms, target emulation at 80% = 800ms, 
+    //  weighted target = 1000 * 0.80^4 (we are using progress^4 as the weight)
+    //                 ~ 410
+    //  current-usage = init-usage = 100, hence expected-usage = 410
+    testEmulationBoundary(0.80F, fakeCore, fakeProgress, cpuPlugin, 410, 410, 
+                          "[op, 80% progress]");
+    
+    // now test if the final call with 100% progress ramps up the CPU usage
+    testEmulationBoundary(1F, fakeCore, fakeProgress, cpuPlugin, targetCpuUsage,
+                          targetCpuUsage, "[op, 100% progress]");
+    
+    // test if emulation interval boundary is respected (unit usage = 50)
+    //  test the case where the current progress is less than threshold
+    fakeProgress = new FakeProgressive(); // initialize
+    fakeCore.reset();
+    fakeCore.setUnitUsage(unitCpuUsage);
+    conf.setFloat(CumulativeCpuUsageEmulatorPlugin.CPU_EMULATION_PROGRESS_INTERVAL,
+                  0.40F);
+    cpuPlugin.initialize(conf, metrics, monitor, fakeProgress);
+    // take a snapshot after the initialization
+    initCpuUsage = monitor.getCumulativeCpuTime();
+    initNumCalls = fakeCore.getNumCalls();
+    // test with 0 progress
+    testEmulationBoundary(0F, fakeCore, fakeProgress, cpuPlugin, initCpuUsage, 
+                          initNumCalls, "[no-op, 0 progress]");
+    // test with 39% progress
+    testEmulationBoundary(0.39F, fakeCore, fakeProgress, cpuPlugin, 
+                          initCpuUsage, initNumCalls, "[no-op, 39% progress]");
+    // test with 40% progress
+    //  target = 1000ms, target emulation at 40% = 400ms, 
+    //  weighted target = 1000 * 0.40^4 (we are using progress^4 as the weight)
+    //                 ~ 26
+    // current-usage = init-usage = 100, hence expected-usage = 100
+    testEmulationBoundary(0.40F, fakeCore, fakeProgress, cpuPlugin, 
+                          initCpuUsage, initNumCalls, "[op, 40% progress]");
+    
+    // test with 90% progress
+    //  target = 1000ms, target emulation at 90% = 900ms, 
+    //  weighted target = 1000 * 0.90^4 (we are using progress^4 as the weight)
+    //                 ~ 657
+    //  current-usage = init-usage = 100, hence expected-usage = 657 but 
+    //  the fake-core increases in steps of 50, hence final target = 700
+    testEmulationBoundary(0.90F, fakeCore, fakeProgress, cpuPlugin, 700, 
+                          700 / unitCpuUsage, "[op, 90% progress]");
+    
+    // now test if the final call with 100% progress ramps up the CPU usage
+    testEmulationBoundary(1F, fakeCore, fakeProgress, cpuPlugin, targetCpuUsage,
+                          targetCpuUsage / unitCpuUsage, "[op, 100% progress]");
+  }
+  
+  // test whether the CPU usage emulator achieves the desired target using
+  // the expected number of calls to the underlying core engine.
+  private static void testEmulationAccuracy(Configuration conf, 
+                        FakeCpuUsageEmulatorCore fakeCore,
+                        ResourceCalculatorPlugin monitor,
+                        ResourceUsageMetrics metrics,
+                        CumulativeCpuUsageEmulatorPlugin cpuPlugin,
+                        long expectedTotalCpuUsage, long expectedTotalNumCalls) 
+  throws Exception {
+    FakeProgressive fakeProgress = new FakeProgressive();
+    fakeCore.reset();
+    cpuPlugin.initialize(conf, metrics, monitor, fakeProgress);
+    int numLoops = 0;
+    while (fakeProgress.getProgress() < 1) {
+      ++numLoops;
+      float progress = (float)numLoops / 100;
+      fakeProgress.setProgress(progress);
+      cpuPlugin.emulate();
+    }
+    
+    // test if the resource plugin shows the expected invocations
+    assertEquals("Cumulative cpu usage emulator plugin failed (num calls)!", 
+                 expectedTotalNumCalls, fakeCore.getNumCalls(), 0L);
+    // test if the resource plugin shows the expected usage
+    assertEquals("Cumulative cpu usage emulator plugin failed (total usage)!", 
+                 expectedTotalCpuUsage, fakeCore.getCpuUsage(), 0L);
+  }
+  
+  // tests if the CPU usage emulation plugin emulates only at the expected
+  // progress gaps
+  private static void testEmulationBoundary(float progress, 
+      FakeCpuUsageEmulatorCore fakeCore, FakeProgressive fakeProgress, 
+      CumulativeCpuUsageEmulatorPlugin cpuPlugin, long expectedTotalCpuUsage, 
+      long expectedTotalNumCalls, String info) throws Exception {
+    fakeProgress.setProgress(progress);
+    cpuPlugin.emulate();
+    
+    assertEquals("Emulation interval test for cpu usage failed " + info + "!", 
+                 expectedTotalCpuUsage, fakeCore.getCpuUsage(), 0L);
+    assertEquals("Emulation interval test for num calls failed " + info + "!", 
+                 expectedTotalNumCalls, fakeCore.getNumCalls(), 0L);
+  }
+}

+ 109 - 33
src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestUserResolve.java

@@ -33,23 +33,31 @@ import org.apache.hadoop.security.UserGroupInformation;
 
 public class TestUserResolve {
 
-  static Path userlist;
+  private static Path rootDir = null;
+  private static Configuration conf = null;
+  private static FileSystem fs = null;
 
   @BeforeClass
-  public static void writeUserList() throws IOException {
-    final Configuration conf = new Configuration();
-    final FileSystem fs = FileSystem.getLocal(conf);
-    final Path wd = new Path(new Path(
-          System.getProperty("test.build.data", "/tmp")).makeQualified(fs),
-        "gridmixUserResolve");
-    userlist = new Path(wd, "users");
+  public static void createRootDir() throws IOException {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    rootDir = new Path(new Path(System.getProperty("test.build.data", "/tmp"))
+                  .makeQualified(fs), "gridmixUserResolve");
+  }
+
+  /**
+   * Creates users file with the content as the String usersFileContent.
+   * @param usersFilePath    the path to the file that is to be created
+   * @param usersFileContent Content of users file
+   * @throws IOException
+   */
+  private static void writeUserList(Path usersFilePath, String usersFileContent)
+  throws IOException {
+
     FSDataOutputStream out = null;
     try {
-      out = fs.create(userlist, true);
-      out.writeBytes("user0,groupA,groupB,groupC\n");
-      out.writeBytes("user1,groupA,groupC\n");
-      out.writeBytes("user2,groupB\n");
-      out.writeBytes("user3,groupA,groupB,groupC\n");
+      out = fs.create(usersFilePath, true);
+      out.writeBytes(usersFileContent);
     } finally {
       if (out != null) {
         out.close();
@@ -57,42 +65,110 @@ public class TestUserResolve {
     }
   }
 
-  @Test
-  public void testRoundRobinResolver() throws Exception {
-    final Configuration conf = new Configuration();
-    final UserResolver rslv = new RoundRobinUserResolver();
-
+  /**
+   * Validate RoundRobinUserResolver's behavior for a bad user resource file.
+   * RoundRobinUserResolver.setTargetUsers() should throw a proper exception
+   * for cases like
+   * <li> a non-existent user resource file and
+   * <li> an empty user resource file
+   *
+   * @param rslv              The RoundRobinUserResolver object
+   * @param userRsrc          users file
+   * @param expectedErrorMsg  expected error message
+   */
+  private void validateBadUsersFile(UserResolver rslv, URI userRsrc,
+      String expectedErrorMsg) {
     boolean fail = false;
     try {
-      rslv.setTargetUsers(null, conf);
+      rslv.setTargetUsers(userRsrc, conf);
     } catch (IOException e) {
+      assertTrue("Exception message from RoundRobinUserResolver is wrong",
+          e.getMessage().equals(expectedErrorMsg));
       fail = true;
     }
     assertTrue("User list required for RoundRobinUserResolver", fail);
+  }
+
+  /**
+   * Validate the behavior of {@link RoundRobinUserResolver} for different
+   * user resource files like
+   * <li> Empty user resource file
+   * <li> Non existent user resource file
+   * <li> User resource file with valid content
+   * @throws Exception
+   */
+  @Test
+  public void testRoundRobinResolver() throws Exception {
+
+    final UserResolver rslv = new RoundRobinUserResolver();
+    Path usersFilePath = new Path(rootDir, "users");
+    URI userRsrc = new URI(usersFilePath.toString());
+
+    // Check if the error message is as expected for a non-existent
+    // user resource file.
+    fs.delete(usersFilePath, false);
+    String expectedErrorMsg = "File " + userRsrc + " does not exist.";
+    validateBadUsersFile(rslv, userRsrc, expectedErrorMsg);
+
+    // Check if the error message is as expected for empty user resource file
+    writeUserList(usersFilePath, "");// creates empty users file
+    expectedErrorMsg =
+      RoundRobinUserResolver.buildEmptyUsersErrorMsg(userRsrc);
+    validateBadUsersFile(rslv, userRsrc, expectedErrorMsg);
 
-    rslv.setTargetUsers(new URI(userlist.toString()), conf);
-    UserGroupInformation ugi1;
-    assertEquals("user0", 
-        rslv.getTargetUgi((ugi1 = 
-          UserGroupInformation.createRemoteUser("hfre0"))).getUserName());
-    assertEquals("user1", rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre1")).getUserName());
-    assertEquals("user2", rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre2")).getUserName());
+    // Create user resource file with valid content in the older users-list
+    // file format, i.e. usernames with groups
+    writeUserList(usersFilePath,
+    "user0,groupA,groupB,groupC\nuser1,groupA,groupC\n");
+    validateValidUsersFile(rslv, userRsrc);
+
+    // Create user resource file with valid content with
+    // usernames with groups and without groups
+    writeUserList(usersFilePath, "user0,groupA,groupB\nuser1,");
+    validateValidUsersFile(rslv, userRsrc);
+
+    // Create user resource file with valid content with
+    // usernames without groups
+    writeUserList(usersFilePath, "user0\nuser1");
+    validateValidUsersFile(rslv, userRsrc);
+  }
+
+  // Validate RoundRobinUserResolver for the case of
+  // user resource file with valid content.
+  private void validateValidUsersFile(UserResolver rslv, URI userRsrc)
+      throws IOException {
+    assertTrue(rslv.setTargetUsers(userRsrc, conf));
+    UserGroupInformation ugi1 = UserGroupInformation.createRemoteUser("hfre0");
     assertEquals("user0", rslv.getTargetUgi(ugi1).getUserName());
-    assertEquals("user3", rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre3")).getUserName());
+    assertEquals("user1",
+        rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre1"))
+            .getUserName());
+    assertEquals("user0",
+        rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre2"))
+            .getUserName());
     assertEquals("user0", rslv.getTargetUgi(ugi1).getUserName());
+    assertEquals("user1",
+        rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre3"))
+            .getUserName());
+
+    // Verify that if the same user comes again, its mapped user name is
+    // still correct even though the UGI is constructed again.
+    assertEquals("user0", rslv.getTargetUgi(
+        UserGroupInformation.createRemoteUser("hfre0")).getUserName());
+    assertEquals("user0",
+        rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre5"))
+        .getUserName());
+    assertEquals("user0",
+        rslv.getTargetUgi(UserGroupInformation.createRemoteUser("hfre0"))
+        .getUserName());
   }
 
   @Test
   public void testSubmitterResolver() throws Exception {
-    final Configuration conf = new Configuration();
     final UserResolver rslv = new SubmitterUserResolver();
-    rslv.setTargetUsers(null, conf);
+    assertFalse(rslv.needsTargetUsersList());
     UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
     assertEquals(ugi, rslv.getTargetUgi((UserGroupInformation)null));
-    System.out.println(" Submitter current user " + ugi);
-    System.out.println(
-      " Target ugi " + rslv.getTargetUgi(
-        (UserGroupInformation) null));
   }
 
 }

+ 9 - 0
src/core/org/apache/hadoop/util/Progress.java

@@ -96,6 +96,7 @@ public class Progress {
     return node.getInternal();
   }
 
+  
   /** Computes progress in this node. */
   private synchronized float getInternal() {
     int phaseCount = phases.size();
@@ -108,6 +109,14 @@ public class Progress {
     }
   }
 
+  /**
+   * Returns the progress in this node. Note that get() gives the overall
+   * progress of the root node (not just the current node).
+   */
+  public synchronized float getProgress() {
+    return getInternal();
+  }
+
   public synchronized void setStatus(String status) {
     this.status = status;
   }

+ 757 - 139
src/docs/src/documentation/content/xdocs/gridmix.xml

@@ -15,150 +15,768 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 -->
-
 <!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
-
 <document>
+  <header>
+    <title>GridMix</title>
+  </header>
+  <body>
+    <section id="overview">
+      <title>Overview</title>
+      <p>GridMix is a benchmark for Hadoop clusters. It submits a mix of
+      synthetic jobs, modeling a profile mined from production loads.</p>
+      <p>There exist three versions of the GridMix tool. This document
+      discusses the third (checked into <code>src/contrib</code>), distinct
+      from the two checked into the <code>src/benchmarks</code> sub-directory.
+      While the first two versions of the tool included stripped-down versions
+      of common jobs, both were principally saturation tools for stressing the
+      framework at scale. In support of a broader range of deployments and
+      finer-tuned job mixes, this version of the tool will attempt to model
+      the resource profiles of production jobs to identify bottlenecks, guide
+      development, and serve as a replacement for the existing GridMix
+      benchmarks.</p>
+      <p>To run GridMix, you need a MapReduce job trace describing the job mix
+      for a given cluster. Such traces are typically generated by Rumen (see
+      Rumen documentation). GridMix also requires input data from which the
+      synthetic jobs will be reading bytes. The input data need not be in any
+      particular format, as the synthetic jobs are currently binary readers.
+      If you are running on a new cluster, an optional step generating input
+      data may precede the run.</p>
+      <p>In order to emulate the load of production jobs from a given cluster
+      on the same or another cluster, follow these steps:</p>
+      <ol>
+	<li>Locate the job history files on the production cluster. This
+	location is specified by the
+	<code>mapred.job.tracker.history.completed.location</code>
+	configuration property of the cluster.</li>
+	<li>Run Rumen to build a job trace in JSON format for all or select
+	jobs.</li>
+	<li>Use GridMix with the job trace on the benchmark cluster.</li>
+      </ol>
+      <p>Jobs submitted by GridMix have names of the form
+      &quot;<code>GRIDMIXnnnnnn</code>&quot;, where
+      &quot;<code>nnnnnn</code>&quot; is a sequence number padded with leading
+      zeroes.</p>
+    </section>
+    <section id="usage">
+      <title>Usage</title>
+      <p>Basic command-line usage without configuration parameters:</p>
+      <source>
+org.apache.hadoop.mapred.gridmix.Gridmix [-generate &lt;size&gt;] [-users &lt;users-list&gt;] &lt;iopath&gt; &lt;trace&gt;
+      </source>
+      <p>Basic command-line usage with configuration parameters:</p>
+      <source>
+org.apache.hadoop.mapred.gridmix.Gridmix \
+  -Dgridmix.client.submit.threads=10 -Dgridmix.output.directory=foo \
+  [-generate &lt;size&gt;] [-users &lt;users-list&gt;] &lt;iopath&gt; &lt;trace&gt;
+      </source>
+      <note>
+	Configuration parameters like
+	<code>-Dgridmix.client.submit.threads=10</code> and
+	<code>-Dgridmix.output.directory=foo</code> as given above should
+	be used <em>before</em> other GridMix parameters.
+      </note>
+      <p>The <code>&lt;iopath&gt;</code> parameter is the working directory for
+      GridMix. Note that this can either be on the local file-system
+      or on HDFS, but it is highly recommended that it be the same as that for
+      the original job mix so that GridMix puts the same load on the local
+      file-system and HDFS respectively.</p>
+      <p>The <code>-generate</code> option is used to generate input data and
+      Distributed Cache files for the synthetic jobs. It accepts standard units
+      of size suffixes, e.g. <code>100g</code> will generate
+      100 * 2<sup>30</sup> bytes as input data.
+      <code>&lt;iopath&gt;/input</code> is the destination directory for
+      generated input data and/or the directory from which input data will be
+      read. HDFS-based Distributed Cache files are generated under the
+      distributed cache directory <code>&lt;iopath&gt;/distributedCache</code>.
+      If some of the needed Distributed Cache files already exist in the
+      distributed cache directory, then only the remaining missing
+      Distributed Cache files are generated when the <code>-generate</code>
+      option is specified.</p>
+      <p>The <code>-users</code> option is used to point to a users-list
+      file (see <a href="#usersqueues">Emulating Users and Queues</a>).</p>
+      <p>The <code>&lt;trace&gt;</code> parameter is a path to a job trace
+      generated by Rumen. This trace can be compressed (it must be readable
+      using one of the compression codecs supported by the cluster) or
+      uncompressed. Use &quot;-&quot; as the value of this parameter if you
+      want to pass an <em>uncompressed</em> trace via the standard
+      input-stream of GridMix.</p>
+      <p>The class <code>org.apache.hadoop.mapred.gridmix.Gridmix</code> can
+      be found in the JAR
+      <code>contrib/gridmix/hadoop-gridmix-$VERSION.jar</code> inside your
+      Hadoop installation, where <code>$VERSION</code> corresponds to the
+      version of Hadoop installed. A simple way of ensuring that this class
+      and all its dependencies are loaded correctly is to use the
+      <code>hadoop</code> wrapper script in Hadoop:</p>
+      <source>
+hadoop jar &lt;gridmix-jar&gt; org.apache.hadoop.mapred.gridmix.Gridmix \
+  [-generate &lt;size&gt;] [-users &lt;users-list&gt;] &lt;iopath&gt; &lt;trace&gt;
+      </source>
+      <p>The supported configuration parameters are explained in the
+      following sections.</p>
+    </section>
+    <section id="cfgparams">
+      <title>General Configuration Parameters</title>
+      <p/>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.output.directory</code>
+          </td>
+          <td>The directory into which output will be written. If specified,
+	  <code>iopath</code> will be relative to this parameter. The
+	  submitting user must have read/write access to this directory. The
+	  user should also be mindful of any quota issues that may arise
+	  during a run. The default is &quot;<code>gridmix</code>&quot;.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.client.submit.threads</code>
+          </td>
+          <td>The number of threads submitting jobs to the cluster. This
+	  also controls how many splits will be loaded into memory at a given
+	  time, pending the submit time in the trace. Splits are pre-generated
+	  to hit submission deadlines, so particularly dense traces may want
+	  more submitting threads. However, storing splits in memory is
+	  reasonably expensive, so you should raise this cautiously. The
+	  default is 1 for the SERIAL job-submission policy (see
+	  <a href="#policies">Job Submission Policies</a>) and one more than
+	  the number of processors on the client machine for the other
+	  policies.</td>
+        </tr>
+	<tr>
+	  <td>
+	    <code>gridmix.submit.multiplier</code>
+	  </td>
+	  <td>The multiplier to accelerate or decelerate the submission of
+	  jobs. The time separating two jobs is multiplied by this factor.
+	  The default value is 1.0. This is a crude mechanism to size
+	  a job trace to a cluster.</td>
+	</tr>
+        <tr>
+          <td>
+            <code>gridmix.client.pending.queue.depth</code>
+          </td>
+          <td>The depth of the queue of job descriptions awaiting split
+	  generation. The jobs read from the trace occupy a queue of this
+	  depth before being processed by the submission threads. It is
+	  unusual to configure this. The default is 5.</td>
+        </tr>
+	<tr>
+	  <td>
+	    <code>gridmix.gen.blocksize</code>
+	  </td>
+	  <td>The block-size of generated data. The default value is 256
+	  MiB.</td>
+	</tr>
+	<tr>
+	  <td>
+	    <code>gridmix.gen.bytes.per.file</code>
+	  </td>
+	  <td>The maximum bytes written per file. The default value is 1
+	  GiB.</td>
+	</tr>
+        <tr>
+          <td>
+            <code>gridmix.min.file.size</code>
+          </td>
+          <td>The minimum size of the input files. The default limit is 128
+	  MiB. Tweak this parameter if you see an error-message like
+	  &quot;Found no satisfactory file&quot; while testing GridMix with
+	  a relatively-small input data-set.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.max.total.scan</code>
+          </td>
+          <td>The maximum size of the input files. The default limit is 100
+	  TiB.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.task.jvm-options.enable</code>
+          </td>
+          <td>Enables Gridmix to configure the simulated task's max heap 
+              options using the values obtained from the original task (i.e.
+              via the trace).
+          </td>
+        </tr>
+      </table>
+    </section>
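
As a quick, non-authoritative illustration of how the parameters in the table above can be supplied, here is a minimal Java sketch that sets a few of them on a Configuration and launches GridMix through Hadoop's ToolRunner. It assumes that Gridmix implements the Tool interface (as its command-line usage suggests); the class name GridmixLauncher, the property values and the iopath/trace paths are placeholders, not part of GridMix itself.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.gridmix.Gridmix;
import org.apache.hadoop.util.ToolRunner;

public class GridmixLauncher {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Equivalent to passing -D options before the other GridMix parameters.
    conf.set("gridmix.output.directory", "gridmix-out");   // default: "gridmix"
    conf.setInt("gridmix.client.submit.threads", 10);      // job-submission threads
    conf.setFloat("gridmix.submit.multiplier", 1.0f);      // scales inter-job gaps
    // <iopath> and <trace> arguments as described in the Usage section
    // (placeholder paths).
    String[] gridmixArgs = {"-generate", "100g", "gridmix-io", "job-trace.json"};
    System.exit(ToolRunner.run(conf, new Gridmix(), gridmixArgs));
  }
}

In practice the command-line form shown in the Usage section is the normal way to run GridMix; the sketch only makes explicit which knobs the -D options are setting.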
+    <section id="jobtypes">
+      <title>Job Types</title>
+      <p>GridMix takes as input a job trace, essentially a stream of
+      JSON-encoded job descriptions. For each job description, the submission
+      client obtains the original job submission time and for each task in
+      that job, the byte and record counts read and written. Given this data,
+      it constructs a synthetic job with the same byte and record patterns as
+      recorded in the trace. It constructs jobs of two types:</p>
+      <table>
+        <tr>
+          <th>Job Type</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>LOADJOB</code>
+          </td>
+          <td>A synthetic job that emulates the workload recorded in the Rumen
+          trace. The current version supports I/O emulation only: it reproduces
+          the I/O workload on the benchmark cluster by embedding
+	  the detailed I/O information for every map and reduce task, such as
+	  the number of bytes and records read and written, into each
+	  job's input splits. The map tasks further relay the I/O patterns of
+	  reduce tasks through the intermediate map output data.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>SLEEPJOB</code>
+          </td>
+	  <td>A synthetic job where each task does <em>nothing</em> but sleep
+	  for a certain duration as observed in the production trace. The
+	  scalability of the Job Tracker is often limited by how many
+	  heartbeats it can handle every second. (Heartbeats are periodic
+	  messages sent from Task Trackers to update their status and grab new
+	  tasks from the Job Tracker.) Since a benchmark cluster is typically
+	  a fraction in size of a production cluster, the heartbeat traffic
+	  generated by the slave nodes is well below the level of the
+	  production cluster. One possible solution is to run multiple Task
+	  Trackers on each slave node. This leads to the obvious problem that
+	  the I/O workload generated by the synthetic jobs would thrash the
+	  slave nodes. Hence the need for such a job.</td>
+        </tr>
+      </table>
+      <p>The following configuration parameters affect the job type:</p>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job.type</code>
+          </td>
+          <td>The value for this key can be one of LOADJOB or SLEEPJOB. The
+	  default value is LOADJOB.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.key.fraction</code>
+          </td>
+          <td>For a LOADJOB type of job, the fraction of a record used for
+	  the data for the key. The default value is 0.1.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.sleep.maptask-only</code>
+          </td>
+          <td>For a SLEEPJOB type of job, whether to ignore the reduce
+	  tasks for the job. The default is <code>false</code>.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.sleep.fake-locations</code>
+          </td>
+          <td>For a SLEEPJOB type of job, the number of fake locations
+	  for map tasks for the job. The default is 0.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.sleep.max-map-time</code>
+          </td>
+          <td>For a SLEEPJOB type of job, the maximum runtime for map
+	  tasks for the job in milliseconds. The default is unlimited.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.sleep.max-reduce-time</code>
+          </td>
+          <td>For a SLEEPJOB type of job, the maximum runtime for reduce
+	  tasks for the job in milliseconds. The default is unlimited.</td>
+        </tr>
+      </table>
+    </section>
+    <section id="policies">
+      <title>Job Submission Policies</title>
+      <p>GridMix controls the rate of job submission. This control can be
+      based on the trace information or can be based on statistics it gathers
+      from the Job Tracker. Based on the submission policies users define,
+      GridMix uses the respective algorithm to control the job submission.
+      There are currently three types of policies:</p>
+      <table>
+        <tr>
+          <th>Job Submission Policy</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>STRESS</code>
+          </td>
+          <td>Keep submitting jobs so that the cluster remains under stress.
+	  In this mode we control the rate of job submission by monitoring
+	  the real-time load of the cluster so that we can maintain a stable
+	  stress level of workload on the cluster. Based on the statistics we
+	  gather we define if a cluster is <em>underloaded</em> or
+	  <em>overloaded</em>. We consider a cluster <em>underloaded</em> if
+	  and only if the following three conditions are true:
+	  <ol>
+	    <li>the number of pending and running jobs are under a threshold
+	    TJ</li>
+	    <li>the number of pending and running maps are under threshold
+	    TM</li>
+	    <li>the number of pending and running reduces are under threshold
+	    TR</li>
+	  </ol>
+          The thresholds TJ, TM and TR are proportional to the size of the
+	  cluster and map, reduce slots capacities respectively. In case of a
+	  cluster being <em>overloaded</em>, we throttle the job submission.
+	  In the actual calculation we also weigh each running task with its
+	  remaining work - namely, a 90% complete task is only counted as 0.1
+	  in calculation. Finally, to avoid a very large job blocking other
+	  jobs, we limit the number of pending/waiting tasks each job can
+	  contribute.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>REPLAY</code>
+          </td>
+          <td>In this mode we replay the job traces faithfully. This mode
+	  exactly follows the time-intervals given in the actual job
+	  trace.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>SERIAL</code>
+          </td>
+          <td>In this mode we submit the next job only once the job submitted
+	  earlier is completed.</td>
+        </tr>
+      </table>
+      <p>The following configuration parameters affect the job submission
+      policy:</p>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job-submission.policy</code>
+          </td>
+          <td>The value for this key can be one of the three: STRESS, REPLAY
+          or SERIAL. In most cases the value would be STRESS or
+          REPLAY. The default value is STRESS.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.throttle.jobs-to-tracker-ratio</code>
+          </td>
+          <td>In STRESS mode, the minimum ratio of running jobs to Task
+	  Trackers in a cluster for the cluster to be considered
+	  <em>overloaded</em>. This is the threshold TJ referred to earlier.
+	  The default is 1.0.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.throttle.maps.task-to-slot-ratio</code>
+          </td>
+          <td>In STRESS mode, the minimum ratio of pending and running map
+	  tasks (i.e. incomplete map tasks) to the number of map slots for
+	  a cluster for the cluster to be considered <em>overloaded</em>.
+	  This is the threshold TM referred to earlier. Running map tasks are
+	  counted partially. For example, a 40% complete map task is counted
+	  as 0.6 map tasks. The default is 2.0.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.throttle.reduces.task-to-slot-ratio</code>
+          </td>
+          <td>In STRESS mode, the minimum ratio of pending and running reduce
+	  tasks (i.e. incomplete reduce tasks) to the number of reduce slots
+	  for a cluster for the cluster to be considered <em>overloaded</em>.
+	  This is the threshold TR referred to earlier. Running reduce tasks
+	  are counted partially. For example, a 30% complete reduce task is
+	  counted as 0.7 reduce tasks. The default is 2.5.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.throttle.maps.max-slot-share-per-job</code>
+          </td>
+          <td>In STRESS mode, the maximum share of a cluster's map-slots
+	  capacity that can be counted toward a job's incomplete map tasks in
+	  overload calculation. The default is 0.1.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.throttle.reducess.max-slot-share-per-job</code>
+          </td>
+          <td>In STRESS mode, the maximum share of a cluster's reduce-slots
+	  capacity that can be counted toward a job's incomplete reduce tasks
+	  in overload calculation. The default is 0.1.</td>
+        </tr>
+      </table>
+    </section>
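
To make the STRESS overload check described above easier to follow, here is a small illustrative sketch of the three threshold conditions (TJ, TM, TR) and the partial weighting of running tasks by their remaining work. This is a hedged rendering of the description in the table, not GridMix's actual implementation; all names are made up for the example.

/** Illustrative only: approximates the STRESS overload test described above. */
class OverloadCheckSketch {
  // Default ratios from the parameter table above.
  static final double JOBS_TO_TRACKER_RATIO = 1.0;       // TJ factor
  static final double MAPS_TASK_TO_SLOT_RATIO = 2.0;     // TM factor
  static final double REDUCES_TASK_TO_SLOT_RATIO = 2.5;  // TR factor

  /**
   * @param pendingAndRunningJobs     number of pending plus running jobs
   * @param weightedIncompleteMaps    incomplete maps; each running map counts
   *                                  only as its remaining fraction of work
   *                                  (a 90% complete map counts as 0.1)
   * @param weightedIncompleteReduces incomplete reduces, weighted the same way
   */
  static boolean isOverloaded(int taskTrackers, int mapSlots, int reduceSlots,
                              int pendingAndRunningJobs,
                              double weightedIncompleteMaps,
                              double weightedIncompleteReduces) {
    boolean underloaded =
        pendingAndRunningJobs < JOBS_TO_TRACKER_RATIO * taskTrackers
        && weightedIncompleteMaps < MAPS_TASK_TO_SLOT_RATIO * mapSlots
        && weightedIncompleteReduces < REDUCES_TASK_TO_SLOT_RATIO * reduceSlots;
    return !underloaded;  // GridMix throttles submission while overloaded
  }
}

The per-job cap (gridmix.throttle.maps.max-slot-share-per-job and its reduce counterpart) would additionally limit how much any single job contributes to the weighted counts.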
+    <section id="usersqueues">
+      <title>Emulating Users and Queues</title>
+      <p>Typical production clusters are often shared with different users and
+      the cluster capacity is divided among different departments through job
+      queues. Ensuring fairness among jobs from all users, honoring queue
+      capacity allocation policies and avoiding an ill-behaving job from
+      taking over the cluster adds significant complexity in Hadoop software.
+      To be able to sufficiently test and discover bugs in these areas,
+      GridMix must emulate the contentions of jobs from different users and/or
+      submitted to different queues.</p>
+      <p>Emulating multiple queues is easy - we simply set up the benchmark
+      cluster with the same queue configuration as the production cluster and
+      we configure synthetic jobs so that they get submitted to the same queue
+      as recorded in the trace. However, not all users shown in the trace have
+      accounts on the benchmark cluster. Instead, we set up a number of testing
+      user accounts and associate each unique user in the trace to testing
+      users in a round-robin fashion.</p>
+      <p>The following configuration parameters affect the emulation of users
+      and queues:</p>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job-submission.use-queue-in-trace</code>
+          </td>
+          <td>When set to <code>true</code> it uses exactly the same set of
+	  queues as those mentioned in the trace. The default value is
+	  <code>false</code>.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job-submission.default-queue</code>
+          </td>
+          <td>Specifies the default queue to which all the jobs would be
+	  submitted. If this parameter is not specified, GridMix uses the
+	  default queue defined for the submitting user on the cluster.</td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.user.resolve.class</code>
+          </td>
+          <td>Specifies which <code>UserResolver</code> implementation to use.
+	  We currently have three implementations:
+	  <ol>
+	    <li><code>org.apache.hadoop.mapred.gridmix.EchoUserResolver</code>
+	    - submits a job as the user who submitted the original job. All
+	    the users of the production cluster identified in the job trace
+	    must also have accounts on the benchmark cluster in this case.</li>
+	    <li><code>org.apache.hadoop.mapred.gridmix.SubmitterUserResolver</code>
+	    - submits all the jobs as current GridMix user. In this case we
+	    simply map all the users in the trace to the current GridMix user
+	    and submit the job.</li>
+	    <li><code>org.apache.hadoop.mapred.gridmix.RoundRobinUserResolver</code>
+	    - maps trace users to test users in a round-robin fashion. In
+	    this case we set up a number of testing user accounts and
+	    associate each unique user in the trace to testing users in a
+	    round-robin fashion.</li>
+	  </ol>
+	  The default is
+	  <code>org.apache.hadoop.mapred.gridmix.SubmitterUserResolver</code>.</td>
+        </tr>
+      </table>
+      <p>If the parameter <code>gridmix.user.resolve.class</code> is set to
+      <code>org.apache.hadoop.mapred.gridmix.RoundRobinUserResolver</code>,
+      we need to define a users-list file with a list of test users.
+      This is specified using the <code>-users</code> option to GridMix.</p>
+      <note>
+      Specifying a users-list file using the <code>-users</code> option is
+      mandatory when using the round-robin user-resolver. Other user-resolvers
+      ignore this option.
+      </note>
+      <p>A users-list file has one user per line, each line of the format:</p>
+      <source>
+      &lt;username&gt;
+      </source>
+      <p>For example:</p>
+      <source>
+      user1
+      user2
+      user3
+      </source>
+      <p>In the above example we have defined three users <code>user1</code>,
+      <code>user2</code> and <code>user3</code>.
+      Now we would associate each unique user in the trace to the above users
+      defined in round-robin fashion. For example, if trace's users are
+      <code>tuser1</code>, <code>tuser2</code>, <code>tuser3</code>,
+      <code>tuser4</code> and <code>tuser5</code>, then the mappings would
+      be:</p>
+      <source>
+      tuser1 -&gt; user1
+      tuser2 -&gt; user2
+      tuser3 -&gt; user3
+      tuser4 -&gt; user1
+      tuser5 -&gt; user2
+      </source>
+      <p>For backward compatibility reasons, each line of the users-list file
+      can contain a username followed by group names in the form
+      username[,group]*. The group names will be ignored by Gridmix.
+      </p>
+    </section>
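
The round-robin association described above can also be pictured with the following small sketch. It only illustrates the mapping behavior (trace users assigned to test users in order, with a repeated trace user reusing its earlier assignment); it is not the RoundRobinUserResolver code itself, and all names here are invented for the example.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Illustrative only: mimics the round-robin trace-user to test-user mapping. */
class RoundRobinMappingSketch {
  private final List<String> testUsers;                    // e.g. from the -users file
  private final Map<String, String> assigned = new HashMap<String, String>();
  private int next = 0;

  RoundRobinMappingSketch(List<String> testUsers) {
    this.testUsers = testUsers;
  }

  String targetUserFor(String traceUser) {
    String user = assigned.get(traceUser);
    if (user == null) {                                    // first time we see this user
      user = testUsers.get(next++ % testUsers.size());
      assigned.put(traceUser, user);                       // remember for later jobs
    }
    return user;
  }

  public static void main(String[] args) {
    List<String> users = new ArrayList<String>();
    users.add("user1"); users.add("user2"); users.add("user3");
    RoundRobinMappingSketch m = new RoundRobinMappingSketch(users);
    // tuser1 -> user1, tuser2 -> user2, tuser3 -> user3, tuser4 -> user1, ...
    System.out.println(m.targetUserFor("tuser1") + " " + m.targetUserFor("tuser4"));
  }
}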
 
-<header>
-  <title>Gridmix</title>
-</header>
-
-<body>
-
-  <section>
-  <title>Overview</title>
-
-  <p>Gridmix is a benchmark for live clusters. It submits a mix of synthetic
-  jobs, modeling a profile mined from production loads.</p>
-
-  <p>There exist three versions of the Gridmix tool. This document discusses
-  the third (checked into contrib), distinct from the two checked into the
-  benchmarks subdirectory. While the first two versions of the tool included
-  stripped-down versions of common jobs, both were principally saturation
-  tools for stressing the framework at scale. In support of a broader range of
-  deployments and finer-tuned job mixes, this version of the tool will attempt
-  to model the resource profiles of production jobs to identify bottlenecks,
-  guide development, and serve as a replacement for the existing gridmix
-  benchmarks.</p>
-
-  </section>
-
-  <section id="usage">
-
-  <title>Usage</title>
-
-  <p>To run Gridmix, one requires a job trace describing the job mix for a
-  given cluster. Such traces are typically genenerated by Rumen (see related
-  documentation). Gridmix also requires input data from which the synthetic
-  jobs will draw bytes. The input data need not be in any particular format,
-  as the synthetic jobs are currently binary readers. If one is running on a
-  new cluster, an optional step generating input data may precede the run.</p>
-
-  <p>Basic command line usage:</p>
-<source>
-
-bin/mapred org.apache.hadoop.mapred.gridmix.Gridmix [-generate &lt;MiB&gt;] &lt;iopath&gt; &lt;trace&gt;
-</source>
-
-  <p>The <code>-generate</code> parameter accepts standard units, e.g.
-  <code>100g</code> will generate 100 * 2<sup>30</sup> bytes. The
-  &lt;iopath&gt; parameter is the destination directory for generated and/or
-  the directory from which input data will be read. The &lt;trace&gt;
-  parameter is a path to a job trace. The following configuration parameters
-  are also accepted in the standard idiom, before other Gridmix
-  parameters.</p>
-
-  <section>
-  <title>Configuration parameters</title>
-  <p></p>
-  <table>
-    <tr><th> Parameter </th><th> Description </th><th> Notes </th></tr>
-    <tr><td><code>gridmix.output.directory</code></td>
-        <td>The directory into which output will be written. If specified, the
-        <code>iopath</code> will be relative to this parameter.</td>
-        <td>The submitting user must have read/write access to this
-        directory. The user should also be mindful of any quota issues that
-        may arise during a run.</td></tr>
-    <tr><td><code>gridmix.client.submit.threads</code></td>
-        <td>The number of threads submitting jobs to the cluster. This also
-        controls how many splits will be loaded into memory at a given time,
-        pending the submit time in the trace.</td>
-        <td>Splits are pregenerated to hit submission deadlines, so
-        particularly dense traces may want more submitting threads. However,
-        storing splits in memory is reasonably expensive, so one should raise
-        this cautiously.</td></tr>
-    <tr><td><code>gridmix.client.pending.queue.depth</code></td>
-        <td>The depth of the queue of job descriptions awaiting split
-        generation.</td>
-        <td>The jobs read from the trace occupy a queue of this depth before
-        being processed by the submission threads. It is unusual to configure
-        this.</td></tr>
-    <tr><td><code>gridmix.min.key.length</code></td>
-        <td>The key size for jobs submitted to the cluster.</td>
-        <td>While this is clearly a job-specific, even task-specific property,
-        no data on key length is currently available. Since the intermediate
-        data are random, memcomparable data, not even the sort is likely
-        affected. It exists as a tunable as no default value is appropriate,
-        but future versions will likely replace it with trace data.</td></tr>
-  </table>
-
+  <section id="distributedcacheload">
+  <title>Emulating Distributed Cache Load</title>
+    <p>Gridmix emulates Distributed Cache load by default for LOADJOB type of
+    jobs. This is done by precreating the needed Distributed Cache files for all
+    the simulated jobs as part of a separate MapReduce job.</p>
+    <p>Emulation of Distributed Cache load in gridmix simulated jobs can be
+    disabled by configuring the property
+    <code>gridmix.distributed-cache-emulation.enable</code> to
+    <code>false</code>.
+    However, generation of Distributed Cache data by GridMix is driven by the
+    <code>-generate</code> option and is independent of this configuration
+    property.</p>
+    <p>Both generation of Distributed Cache files and emulation of
+    Distributed Cache load are disabled if:</p>
+    <ul>
+    <li>input trace comes from the standard input-stream instead of file, or</li>
+    <li><code>&lt;iopath&gt;</code> specified is on local file-system, or</li>
+    <li>any of the ancestor directories of the distributed cache directory
+    i.e. <code>&lt;iopath&gt;/distributedCache</code> (including the distributed
+    cache directory itself) doesn't have execute permission for others.</li>
+    </ul>
   </section>
-</section>
-
-<section id="assumptions">
-
-  <title>Simplifying Assumptions</title>
-
-  <p>Gridmix will be developed in stages, incorporating feedback and patches
-  from the community. Currently, its intent is to evaluate Map/Reduce and HDFS
-  performance and not the layers on top of them (i.e. the extensive lib and
-  subproject space). Given these two limitations, the following
-  characteristics of job load are not currently captured in job traces and
-  cannot be accurately reproduced in Gridmix.</p>
-
-  <table>
-  <tr><th>Property</th><th>Notes</th></tr>
-  <tr><td>CPU usage</td><td>We have no data for per-task CPU usage, so we
-  cannot attempt even an approximation. Gridmix tasks are never CPU bound
-  independent of I/O, though this surely happens in practice.</td></tr>
-  <tr><td>Filesystem properties</td><td>No attempt is made to match block
-  sizes, namespace hierarchies, or any property of input, intermediate, or
-  output data other than the bytes/records consumed and emitted from a given
-  task. This implies that some of the most heavily used parts of the system-
-  the compression libraries, text processing, streaming, etc.- cannot be
-  meaningfully tested with the current implementation.</td></tr>
-  <tr><td>I/O rates</td><td>The rate at which records are consumed/emitted is
-  assumed to be limited only by the speed of the reader/writer and constant
-  throughout the task.</td></tr>
-  <tr><td>Memory profile</td><td>No data on tasks' memory usage over time is
-  available, though the max heap size is retained.</td></tr>
-  <tr><td>Skew</td><td>The records consumed and emitted to/from a given task
-  are assumed to follow observed averages, i.e. records will be more regular
-  than may be seen in the wild. Each map also generates a proportional
-  percentage of data for each reduce, so a job with unbalanced input will be
-  flattened.</td></tr>
-  <tr><td>Job failure</td><td>User code is assumed to be correct.</td></tr>
-  <tr><td>Job independence</td><td>The output or outcome of one job does not
-  affect when or whether a subsequent job will run.</td></tr>
-  </table>
-
-</section>
-
-<section>
-
-  <title>Appendix</title>
-
-  <p>Issues tracking the implementations of <a
-  href="https://issues.apache.org/jira/browse/HADOOP-2369">gridmix1</a>, <a
-  href="https://issues.apache.org/jira/browse/HADOOP-3770">gridmix2</a>, and
-  <a href="https://issues.apache.org/jira/browse/MAPREDUCE-776">gridmix3</a>.
-  Other issues tracking the development of Gridmix can be found by searching
-  the Map/Reduce <a
-  href="https://issues.apache.org/jira/browse/MAPREDUCE">JIRA</a></p>
 
-</section>
+    <section id="simulatedjobconf">
+      <title>Configuration of Simulated Jobs</title>
+      <p> Gridmix3 sets some configuration properties in the simulated Jobs
+      submitted by it so that they can be mapped back to the corresponding Job
+      in the input Job trace. These configuration parameters include:
+      </p>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job.original-job-id</code>
+          </td>
+          <td> The job id of the original cluster's job corresponding to this
+          simulated job.
+          </td>
+        </tr>
+        <tr>
+          <td>
+            <code>gridmix.job.original-job-name</code>
+          </td>
+          <td> The job name of the original cluster's job corresponding to this
+          simulated job.
+          </td>
+        </tr>
+      </table>
+    </section>
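
As a hedged illustration of how these two properties might be used, the sketch below reads them back from a simulated job's configuration to correlate it with the original trace job. The property names come from the table above; the class and method names are invented for the example.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

class TraceCorrelationSketch {
  /** Prints which original (trace) job a simulated GridMix job corresponds to. */
  static void printOriginalJobInfo(Job simulatedJob) {
    Configuration conf = simulatedJob.getConfiguration();
    String originalId = conf.get("gridmix.job.original-job-id");
    String originalName = conf.get("gridmix.job.original-job-name");
    System.out.println("Simulated job " + simulatedJob.getJobName()
        + " maps back to trace job " + originalId + " (" + originalName + ")");
  }
}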
 
-</body>
+  <section id="compression-emulation">
+      <title>Emulating Compression/Decompression</title>
+      <p>MapReduce supports data compression and decompression. 
+         Input to a MapReduce job can be compressed. Similarly, output of Map
+         and Reduce tasks can also be compressed. Compression/Decompression 
+         emulation in GridMix is important because emulating 
+         compression/decompression will affect the CPU and memory usage of the 
+         task. A task emulating compression/decompression will affect other 
+         tasks and daemons running on the same node.
+       </p>
+       <p>Compression emulation is enabled if 
+         <code>gridmix.compression-emulation.enable</code> is set to
+         <code>true</code>. By default compression emulation is enabled for 
+         jobs of type <em>LOADJOB</em>. With compression emulation enabled, 
+         GridMix will now generate compressed text data with a constant 
+         compression ratio. Hence a simulated GridMix job will now emulate 
+         compression/decompression using compressible text data (having a 
+         constant compression ratio), irrespective of the compression ratio 
+         observed in the actual job.
+      </p>
+      <p>A typical MapReduce Job deals with data compression/decompression in 
+         the following phases </p>
+      <ul>
+        <li><code>Job input data decompression: </code> GridMix generates 
+            compressible input data when compression emulation is enabled. 
+            Based on the original job's configuration, a simulated GridMix job 
+            will use a decompressor to read the compressed input data. 
+            Currently, GridMix uses
+            <code>mapreduce.input.fileinputformat.inputdir</code> to determine 
+            if the original job used compressed input data or 
+            not. If the original job's input files are uncompressed then the 
+            simulated job will read the compressed input file without using a 
+            decompressor. 
+        </li>
+        <li><code>Intermediate data compression and decompression: </code>
+            If the original job has map output compression enabled then GridMix 
+            too will enable map output compression for the simulated job. 
+            Accordingly, the reducers will use a decompressor to read the map 
+            output data.
+        </li>
+        <li><code>Job output data compression: </code>
+            If the original job's output is compressed then GridMix 
+            too will enable job output compression for the simulated job. 
+        </li>
+      </ul>
+       
+      <p>The following configuration parameters affect compression emulation
+      </p>
+      <table>
+        <tr>
+          <th>Parameter</th>
+          <th>Description</th>
+        </tr>
+        <tr>
+          <td><code>gridmix.compression-emulation.enable</code></td>
+          <td>Enables compression emulation in simulated GridMix jobs. 
+              Default is <code>true</code>.</td>
+        </tr>
+      </table>
+      
+      <p>With compression emulation turned on, GridMix will generate compressed
+         input data. Hence the total size of the input 
+         data will be less than the expected size. Set 
+         <code>gridmix.min.file.size</code> to a smaller value (roughly 10% of
+         <code>gridmix.gen.bytes.per.file</code>) to enable GridMix to 
+         correctly emulate compression.
+      </p>
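+      <p>As a rough sketch (assuming the standard Gridmix invocation described
+         earlier in this document; <code>&lt;iopath&gt;</code> and 
+         <code>&lt;trace&gt;</code> are placeholders, and the 10 MiB value is 
+         only an illustration), these parameters can be set on the command 
+         line:
+      </p>
+      <source>java org.apache.hadoop.mapred.gridmix.Gridmix -Dgridmix.compression-emulation.enable=true -Dgridmix.min.file.size=10485760 &lt;iopath&gt; &lt;trace&gt;</source>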
+    </section>
 
+    <section id="highram-emulation">
+      <title>Emulating High-Ram jobs</title>
+      <p>MapReduce allows users to define a job as a High-Ram job. Tasks from a 
+         High-Ram job can occupy multiple slots on the task-trackers. 
+         Since the task-tracker assigns a fixed amount of virtual memory to each 
+         slot, a High-Ram task that occupies multiple slots can use more 
+         virtual memory than a default task.
+      </p>
+      <p>Emulating this behavior is important for the following reasons:
+      </p>
+     <ul>
+       <li>Impact on the scheduler: Scheduling of tasks from High-Ram jobs 
+           affects the scheduling behavior as it might result in slot 
+           reservation and changed slot/resource utilization.
+       </li>
+       <li>Impact on the node: Since High-Ram tasks occupy multiple slots,
+           trackers do some bookkeeping to allocate extra resources for 
+           these tasks. This also makes High-Ram emulation a precursor to 
+           memory emulation, where tasks with high memory requirements need 
+           to be treated as High-Ram tasks.
+       </li>
+     </ul>
+     <p>High-Ram feature emulation can be disabled by setting  
+        <code>gridmix.highram-emulation.enable</code> to
+        <code>false</code>.
+     </p>
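+     <p>A minimal sketch (placeholders as in the usage examples earlier in 
+        this document) that turns High-Ram emulation off for a run:
+     </p>
+     <source>java org.apache.hadoop.mapred.gridmix.Gridmix -Dgridmix.highram-emulation.enable=false &lt;iopath&gt; &lt;trace&gt;</source>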
+    </section>
+    
+    <section id="resource-usage-emulation">
+      <title>Emulating resource usages</title>
+      <p>Usage of resources such as CPU, physical memory, virtual memory and 
+         JVM heap is recorded by MapReduce in its task counters. This 
+         information is used by GridMix to emulate the resource usage in the 
+         simulated tasks. Emulating resource usage helps GridMix exert a load 
+         on the test cluster similar to that seen on the actual cluster.
+      </p>
+      <p>MapReduce tasks use resources throughout their entire lifetime. GridMix
+         tries to mimic this behavior by spanning resource usage emulation
+         across the entire lifetime of the simulated task. Each resource to be
+         emulated should have an <em>emulator</em> associated with it.
+         Each such <em>emulator</em> should implement the 
+         <code>org.apache.hadoop.mapred.gridmix.emulators.resourceusage
+         .ResourceUsageEmulatorPlugin</code> interface. Resource 
+         <em>emulators</em> in GridMix are <em>plugins</em> that can be 
+         configured (plugged in or out) before every run. GridMix users can 
+         configure multiple emulator <em>plugins</em> by passing a comma 
+         separated list of <em>emulators</em> as a value for the 
+         <code>gridmix.emulators.resource-usage.plugins</code> parameter. 
+      </p>
+      <p>List of <em>emulators</em> shipped with GridMix:
+      </p>
+     <ul>
+       <li>Cumulative CPU usage <em>emulator</em>: 
+           GridMix uses the cumulative CPU usage value published by Rumen 
+           and makes sure that the total cumulative CPU usage of the simulated 
+           task is close to the value published by Rumen. GridMix can be 
+           configured to emulate cumulative CPU usage by adding 
+           <code>org.apache.hadoop.mapred.gridmix.emulators.resourceusage
+           .CumulativeCpuUsageEmulatorPlugin</code> to the list of emulator 
+           <em>plugins</em> configured for the 
+           <code>gridmix.emulators.resource-usage.plugins</code> parameter.
+           The CPU usage emulator is designed to emulate only at specific 
+           progress boundaries of the task. This 
+           interval can be configured using 
+           <code>gridmix.emulators.resource-usage.cpu.emulation-interval</code>.
+           The default value for this parameter is <code>0.1</code>, i.e. 
+           <code>10%</code>.
+       </li>
+       <li>Total heap usage <em>emulator</em>: 
+           GridMix uses the total heap usage value published by Rumen 
+           and makes sure that the total heap usage of the simulated 
+           task is close to the value published by Rumen. GridMix can be 
+           configured to emulate total heap usage by adding 
+           <code>org.apache.hadoop.mapred.gridmix.emulators.resourceusage
+           .TotalHeapUsageEmulatorPlugin</code> to the list of emulator 
+           <em>plugins</em> configured for the 
+           <code>gridmix.emulators.resource-usage.plugins</code> parameter.
+           The heap usage emulator is designed to emulate only at specific 
+           progress boundaries of the task. This 
+           interval can be configured using 
+           <code>gridmix.emulators.resource-usage.heap.emulation-interval
+           </code>. The default value for this parameter is <code>0.1</code>, 
+           i.e. a <code>10%</code> progress interval.
+       </li>
+     </ul>
+     <p>Note that GridMix will emulate resource usage only for jobs of type 
+        <em>LOADJOB</em>.
+     </p>
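+     <p>A configuration sketch (placeholders as above; the 0.2 interval is 
+        only an illustration) that plugs in both shipped emulator 
+        <em>plugins</em> and coarsens the CPU emulation interval:
+     </p>
+     <source>java org.apache.hadoop.mapred.gridmix.Gridmix \
+  -Dgridmix.emulators.resource-usage.plugins=org.apache.hadoop.mapred.gridmix.emulators.resourceusage.CumulativeCpuUsageEmulatorPlugin,org.apache.hadoop.mapred.gridmix.emulators.resourceusage.TotalHeapUsageEmulatorPlugin \
+  -Dgridmix.emulators.resource-usage.cpu.emulation-interval=0.2 \
+  &lt;iopath&gt; &lt;trace&gt;</source>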
+    </section>
+    
+    <section id="assumptions">
+      <title>Simplifying Assumptions</title>
+      <p>GridMix will be developed in stages, incorporating feedback and
+      patches from the community. Currently its intent is to evaluate
+      MapReduce and HDFS performance and not the layers on top of them (i.e.
+      the extensive lib and sub-project space). Given these two limitations,
+      the following characteristics of job load are not currently captured in
+      job traces and cannot be accurately reproduced in GridMix:</p>
+      <ul>
+	<li><em>Filesystem Properties</em> - No attempt is made to match block
+	sizes, namespace hierarchies, or any property of input, intermediate
+	or output data other than the bytes/records consumed and emitted from
+	a given task. This implies that some of the most heavily-used parts of
+	the system - text processing, streaming, etc. - cannot be meaningfully tested 
+	with the current implementation.</li>
+	<li><em>I/O Rates</em> - The rate at which records are
+	consumed/emitted is assumed to be limited only by the speed of the
+	reader/writer and constant throughout the task.</li>
+	<li><em>Memory Profile</em> - No data on tasks' memory usage over time
+	is available, though the max heap-size is retained.</li>
+	<li><em>Skew</em> - The records consumed and emitted to/from a given
+	task are assumed to follow observed averages, i.e. records will be
+	more regular than may be seen in the wild. Each map also generates
+	a proportional percentage of data for each reduce, so a job with
+	unbalanced input will be flattened.</li>
+	<li><em>Job Failure</em> - User code is assumed to be correct.</li>
+	<li><em>Job Independence</em> - The output or outcome of one job does
+	not affect when or whether a subsequent job will run.</li>
+      </ul>
+    </section>
+    <section id="appendix">
+      <title>Appendix</title>
+      <p>Issues tracking the original implementations of <a
+      href="https://issues.apache.org/jira/browse/HADOOP-2369">GridMix1</a>,
+      <a href="https://issues.apache.org/jira/browse/HADOOP-3770">GridMix2</a>,
+      and <a
+      href="https://issues.apache.org/jira/browse/MAPREDUCE-776">GridMix3</a>
+      can be found on the Apache Hadoop MapReduce JIRA. Other issues tracking
+      the current development of GridMix can be found by searching <a
+      href="https://issues.apache.org/jira/browse/MAPREDUCE/component/12313086">the
+      Apache Hadoop MapReduce JIRA</a>.</p>
+    </section>
+  </body>
 </document>

+ 454 - 0
src/docs/src/documentation/content/xdocs/rumen.xml

@@ -0,0 +1,454 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+<header>
+  <title>Rumen</title>
+</header>
+
+<body>
+  <!--
+    Overview [What is Rumen and why is it needed?]
+  -->
+  <section id="overview">
+    <title>Overview</title>
+    
+    <p><em>Rumen</em> is a data extraction and analysis tool built for
+       <em>Apache Hadoop</em>. <em>Rumen</em> mines <em>JobHistory</em> logs to 
+       extract meaningful data and stores it in an easily-parsed, condensed 
+       format or <em>digest</em>. The raw trace data from MapReduce logs are 
+       often insufficient for simulation, emulation, and benchmarking, as these 
+       tools often attempt to measure conditions that did not occur in the 
+       source data. For example, if a task ran locally in the raw trace data 
+       but a simulation of the scheduler elects to run that task on a remote 
+       rack, the simulator requires a runtime its input cannot provide. 
+       To fill in these gaps, Rumen performs a statistical analysis of the 
+       digest to estimate the variables the trace doesn't supply. Rumen traces 
+       drive both Gridmix (a benchmark of Hadoop MapReduce clusters) and Mumak 
+       (a simulator for the JobTracker).
+    </p>
+
+    <!--
+      Why is Rumen needed?
+    --> 
+    <section>     
+      <title>Motivation</title>
+      
+      <ul>
+        <li>Extracting meaningful data from <em>JobHistory</em> logs is a common
+            task for any tool built to work on <em>MapReduce</em>. It 
+            is tedious to write a custom tool which is so tightly coupled with 
+            the <em>MapReduce</em> framework. Hence there is a need for a 
+            built-in tool for performing framework level task of log parsing and
+            analysis. Such a tool would insulate external systems depending on 
+            job history against the changes made to the job history format.
+        </li>
+        <li>Performing statistical analysis of various attributes of a 
+            <em>MapReduce Job</em> such as <em>task runtimes, task failures 
+            etc</em> is another common task that the benchmarking 
+            and simulation tools might need. <em>Rumen</em> generates 
+            <a href="http://en.wikipedia.org/wiki/Cumulative_distribution_function">
+              <em>Cumulative Distribution Functions (CDF)</em>
+            </a> for the Map/Reduce task runtimes. 
+            The runtime CDF can be used to extrapolate the task runtime of 
+            incomplete, missing and synthetic tasks. Similarly, a CDF is also 
+            computed for the total number of successful tasks for every attempt.
+            
+        </li>
+      </ul>
+    </section>
+
+    <!--
+      Basic high level view of components
+    -->
+    <section>  
+      <title>Components</title>
+      
+      <p><em>Rumen</em> consists of two components:</p>
+      
+      <ul>
+        <li><em>Trace Builder</em> : 
+            Converts <em>JobHistory</em> logs into an easily-parsed format.
+            Currently <code>TraceBuilder</code> outputs the trace in 
+            <a href="http://www.json.org/"><em>JSON</em></a> 
+            format.   
+        </li>
+        <li><em>Folder </em>: 
+            A utility to scale the input trace. A trace obtained from
+            <em>TraceBuilder</em> simply summarizes the jobs in the 
+            input folders and files. The time-span within which all the jobs in 
+            a given trace finish can be considered as the trace runtime. 
+            <em>Folder</em> can be used to scale the runtime of a trace.
+            Decreasing the trace runtime might involve dropping some jobs from 
+            the input trace and scaling down the runtime of remaining jobs. 
+            Increasing the trace runtime might involve adding some dummy jobs to
+            the resulting trace and scaling up the runtime of individual jobs.
+       </li>
+                 
+      </ul>
+      <p></p><p></p><p></p>
+    </section>
+  </section>    
+
+  <!--
+    Usage [How to run Rumen? What are the various configuration parameters?]
+  -->
+  <section id="usage">
+    <title>How to use <em>Rumen</em>?</title>
+    
+    <p>Converting <em>JobHistory</em> logs into a desired job-trace consists of 
+       two steps:</p>
+    <ol>
+      <li>Extracting information into an intermediate format</li>
+      <li>Adjusting the job-trace obtained from the intermediate trace to 
+          have the desired properties.</li>
+    </ol>
+       
+    <note>Extracting information from <em>JobHistory</em> logs is a one-time
+          operation. The resulting trace, the so-called <em>Gold Trace</em>, can
+          be reused to generate traces with desired values of properties such as
+          <code>output-duration</code>, <code>concentration</code>, etc.
+    </note>
+       
+    <p><em>Rumen</em> provides two basic commands:</p>
+     <ul>
+       <li><code>TraceBuilder</code></li>
+       <li><code>Folder</code></li>
+     </ul>
+       
+    <p>First, we need to generate the <em>Gold Trace</em>, so the first 
+       step is to run <code>TraceBuilder</code> on a job-history folder. 
+       The output of <code>TraceBuilder</code> is a job-trace file (and an 
+       optional cluster-topology file). If we want to scale the output, we 
+       can use the <code>Folder</code> utility to fold the current trace to the 
+       desired length. The rest of this section explains these 
+       utilities in detail.
+    </p>
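+    <p>As a quick sketch of the whole workflow (the detailed examples later in 
+       this section explain each option and path), a typical run first builds 
+       the trace and then folds it:
+    </p>
+    <source>java org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done
+java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>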
+    
+    <note>Examples in this section assume that certain libraries are present 
+          in the Java CLASSPATH. See <em>Section-3.2</em> for more details.
+    </note>
+    <!--
+     TraceBuilder command
+    -->
+    <section>
+      <title>Trace Builder</title>
+      
+      <p><code>Command:</code></p>
+      <source>java org.apache.hadoop.tools.rumen.TraceBuilder [options] &lt;jobtrace-output&gt; &lt;topology-output&gt; &lt;inputs&gt;</source>
+
+      <p>This command invokes the <code>TraceBuilder</code> utility of
+         <em>Rumen</em>. It converts the JobHistory files into a series of JSON
+         objects and writes them into the <code>&lt;jobtrace-output&gt;</code>
+         file. It also extracts the cluster layout (topology) and writes it in
+         the <code>&lt;topology-output&gt;</code> file.
+         <code>&lt;inputs&gt;</code> represents a space-separated list of
+         JobHistory files and folders.
+      </p>
+         
+         <note>1) Input and output to <code>TraceBuilder</code> are expected to
+               be fully qualified FileSystem paths. So use '<em>file://</em>' 
+               to specify files on the <code>local</code> FileSystem and 
+               '<em>hdfs://</em>' to specify files on HDFS. Since the input 
+               files or folders are FileSystem paths, they can be globbed, 
+               which is useful when specifying multiple file paths using 
+               wildcard patterns.
+         </note>
+         <note>
+               2) By default, TraceBuilder does not recursively scan the input
+               folder for job history files. Only the files that are directly
+               placed under the input folder will be considered for generating
+               the trace. To add all the files under the input directory by
+               recursively scanning it, use the <code>-recursive</code>
+               option (see the recursive-scan sketch after the example below).
+         </note>
+      
+      <p>Cluster topology is used as follows:</p>
+      <ul>
+        <li>To reconstruct the splits and make sure that the 
+            distances/latencies seen in the actual run are modeled correctly.
+        </li>
+        <li>To extrapolate splits information for tasks with missing splits
+            details or synthetically generated tasks.
+        </li>
+      </ul>
+      
+      <p><code>Options :</code></p>
+      <table>
+        <tr>
+          <th> Parameter</th>
+          <th> Description</th>
+          <th> Notes </th>
+        </tr>
+        <tr>
+          <td><code>-demuxer</code></td>
+          <td>Used to read the jobhistory files. The default is 
+              <code>DefaultInputDemuxer</code>.</td>
+          <td>Demuxer decides how the input file maps to jobhistory file(s). 
+              Job history logs and job configuration files are typically small 
+              files, and can be more effectively stored when embedded in some
+              container file format like SequenceFile or TFile. To support such 
+              usage cases, one can specify a customized Demuxer class that can 
+              extract individual job history logs and job configuration files 
+              from the source files.
+          </td>
+        </tr>
+        <tr>
+           <td><code>-recursive</code></td>
+           <td>Recursively traverse input paths for job history logs.</td>
+           <td>This option should be used to inform the TraceBuilder to
+           recursively scan the input paths and process all the files under it.
+           Note that, by default, only the history logs that are directly under
+           the input folder are considered for generating the trace.
+           </td>
+        </tr>
+      </table>
+      
+      <section>
+        <title>Example</title>
+        <source>java org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done</source>
+        <p></p>
+        <p>This will analyze all the jobs in 
+         <code>/home/user/logs/history/done</code> stored on the 
+         <code>local</code> FileSystem and output the jobtraces in 
+         <code>/home/user/job-trace.json</code> along with topology 
+         information in <code>/home/user/topology.output</code>.
+        </p>
+      </section>
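+      <section>
+        <title>Example with recursive scanning</title>
+        <p>A sketch reusing the paths from the example above and adding the 
+           <code>-recursive</code> option:
+        </p>
+        <source>java org.apache.hadoop.tools.rumen.TraceBuilder -recursive file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done</source>
+        <p></p>
+        <p>This additionally picks up history files placed in subdirectories 
+         of <code>/home/user/logs/history/done</code>, since 
+         <code>-recursive</code> makes <code>TraceBuilder</code> scan the 
+         input path recursively.
+        </p>
+      </section>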
+      <p></p><p></p><p></p><p></p><p></p><p></p>
+    </section>
+
+  <!--
+   Folder command
+  -->
+  <section>
+      <title>Folder</title>
+      
+      <p><code>Command</code>:</p>
+      <source>java org.apache.hadoop.tools.rumen.Folder [options] [input] [output]</source>
+      
+      <note>Input and output to <code>Folder</code> are expected to be fully 
+            qualified FileSystem paths. So use '<em>file://</em>' to specify 
+            files on the <code>local</code> FileSystem and '<em>hdfs://</em>' to
+            specify files on HDFS.
+         </note>
+      
+      <p>This command invokes the <code>Folder</code> utility of 
+         <em>Rumen</em>. Folding essentially means that the output duration of 
+         the resulting trace is fixed and job timelines are adjusted 
+         to respect the final output duration. 
+      </p>
+      
+      <p></p>
+      <p><code>Options :</code></p>
+      <table>
+        <tr>
+          <th> Parameter</th>
+          <th> Description</th>
+          <th> Notes </th>
+        </tr>
+        <tr>
+          <td><code>-input-cycle</code></td>
+          <td>Defines the basic unit of time for the folding operation. There is
+              no default value for <code>input-cycle</code>. 
+              <strong>Input cycle must be provided</strong>.
+          </td>
+          <td>'<code>-input-cycle 10m</code>' 
+              implies that the whole trace run will now be sliced at 10-minute 
+              intervals. Basic operations will be done on these 10m chunks. Note 
+              that <em>Rumen</em> understands various time units like 
+              <em>m (minutes), h (hours), d (days) etc</em>.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-output-duration</code></td>
+          <td>This parameter defines the final runtime of the trace. 
+              Default value is <strong>1 hour</strong>.
+          </td>
+          <td>'<code>-output-duration 30m</code>' 
+              implies that the resulting trace will have a max runtime of 
+              30mins. All the jobs in the input trace file will be folded and 
+              scaled to fit this window.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-concentration</code></td>
+          <td>Set the concentration of the resulting trace. Default value is 
+              <strong>1</strong>.
+          </td>
+          <td>If the total runtime of the resulting trace is less than the total
+              runtime of the input trace, then the resulting trace would contain
+              fewer jobs than the input trace. This 
+              essentially means that the output is diluted. To increase the 
+              density of jobs, set the concentration to a higher value.</td>
+        </tr>
+        <tr>
+          <td><code>-debug</code></td>
+          <td>Run the Folder in debug mode. By default it is set to 
+              <strong>false</strong>.</td>
+          <td>In debug mode, the Folder will print additional statements for 
+              debugging. Also the intermediate files generated in the scratch 
+              directory will not be cleaned up.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-seed</code></td>
+          <td>Initial seed to the Random Number Generator. By default, a Random 
+              Number Generator is used to generate a seed and the seed value is
+              reported back to the user for future use.
+          </td>
+          <td>If an initial seed is passed, then the <code>Random Number 
+              Generator</code> will generate the random numbers in the same 
+              sequence, i.e. the sequence of random numbers remains the same if 
+              the same seed is used. Folder uses the Random Number Generator to 
+              decide whether or not to emit a job. 
+          </td>
+        </tr>
+        <tr>
+          <td><code>-temp-directory</code></td>
+          <td>Temporary directory for the Folder. By default the <strong>output
+              folder's parent directory</strong> is used as the scratch space.
+          </td>
+          <td>This is the scratch space used by Folder.  All the 
+              temporary files are cleaned up in the end unless the Folder is run
+              in <code>debug</code> mode.</td>
+        </tr>
+        <tr>
+          <td><code>-skew-buffer-length</code></td>
+          <td>Enables <em>Folder</em> to tolerate skewed jobs.
+              The default buffer length is <strong>0</strong>.</td>
+          <td>'<code>-skew-buffer-length 100</code>' 
+              indicates that if the jobs appear out of order within a window 
+              size of 100, then they will be emitted in-order by the folder. 
+              If a job appears out-of-order outside this window, then the Folder
+              will bail out provided <code>-allow-missorting</code> is not set.
+              <em>Folder</em> reports the maximum skew size seen in the 
+              input trace for future use.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-allow-missorting</code></td>
+          <td>Enables <em>Folder</em> to tolerate out-of-order jobs. By default 
+              mis-sorting is not allowed.
+          </td>
+          <td>If mis-sorting is allowed, then the <em>Folder</em> will ignore 
+              out-of-order jobs that cannot be deskewed using a skew buffer of
+              size specified using <code>-skew-buffer-length</code>. If 
+              mis-sorting is not allowed, then the Folder will bail out if the
+              skew buffer is incapable of tolerating the skew.
+          </td>
+        </tr>
+      </table>
+      
+      <section>
+      <title>Examples</title>
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime</title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h  -input-cycle 20m  file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>If the folded jobs are out of order then the command
+          will bail out. 
+        </p>
+        <p>
+        
+        </p>
+      </section>
+      
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime and 
+               tolerate some skewness
+        </title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m  -allow-missorting -skew-buffer-length 100 file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>If the folded jobs are out of order, then at most
+          100 jobs will be de-skewed. If the 101<sup>st</sup> job is 
+          <em>out-of-order</em>, then the command will bail out.
+        </p>
+      </section>
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime in debug 
+               mode
+        </title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m  -debug -temp-directory file:///tmp/debug file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>This will fold the 10hr job-trace file 
+           <code>file:///home/user/job-trace.json</code> to finish within 1hr 
+           and use <code>file:///tmp/debug</code> as the temporary directory. 
+           The intermediate files in the temporary directory will not be cleaned
+           up.
+        </p>
+      </section>
+      
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime with custom
+               concentration.
+        </title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m  -concentration 2  file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>This will fold the 10hr job-trace file 
+           <code>file:///home/user/job-trace.json</code> to finish within 1hr 
+           with a concentration of 2. <code>Example-2.3.2</code> retains 10% 
+           of the jobs; with a <em>concentration</em> of 2, 20% of the total 
+           input jobs will be retained.
+        </p>
+      </section>
+    </section>
+    </section>
+    <p></p><p></p><p></p>
+  </section>
+  
+  <!--
+    Appendix [Resources i.e ppts, jiras, definition etc]
+  -->
+  <section>
+    <title>Appendix</title>
+    
+    <section>
+      <title>Resources</title>
+      <p><a href="https://issues.apache.org/jira/browse/MAPREDUCE-751">MAPREDUCE-751</a> is the main JIRA that introduced <em>Rumen</em> to <em>MapReduce</em>. 
+         Look at the MapReduce <a href="https://issues.apache.org/jira/browse/MAPREDUCE/component/12313617">rumen-component</a> for further details.</p>
+    </section>
+    
+    <section>
+     <title>Dependencies</title>
+    <p><em>Rumen</em> expects certain library <em>JARs</em> to be present in 
+       the <em>CLASSPATH</em>. The required libraries are:</p>
+      <ul>
+        <li><code>Hadoop MapReduce Tools</code> (<code>hadoop-mapred-tools-{hadoop-version}.jar</code>)</li>
+        <li><code>Hadoop Common</code> (<code>hadoop-common-{hadoop-version}.jar</code>)</li>
+        <li><code>Apache Commons Logging</code> (<code>commons-logging-1.1.1.jar</code>)</li>
+        <li><code>Apache Commons CLI</code> (<code>commons-cli-1.2.jar</code>)</li>
+        <li><code>Jackson Mapper</code> (<code>jackson-mapper-asl-1.4.2.jar</code>)</li>
+        <li><code>Jackson Core</code> (<code>jackson-core-asl-1.4.2.jar</code>)</li>
+      </ul>
+      
+      <note>One simple way to run Rumen is via the 
+            '$HADOOP_HOME/bin/hadoop jar' command.
+      </note>
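+      <p>A sketch of such an invocation (the exact jar name and location 
+         depend on your build or installation, so adjust them accordingly):
+      </p>
+      <source>$HADOOP_HOME/bin/hadoop jar hadoop-mapred-tools-{hadoop-version}.jar org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done</source>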
+    </section>
+  </section>
+</body>
+</document>

+ 1 - 0
src/docs/src/documentation/content/xdocs/site.xml

@@ -49,6 +49,7 @@ See http://forrest.apache.org/docs/linking.html for more info.
     <vaidya         label="Vaidya"  href="vaidya.html"/>
     <archives     label="Hadoop Archives" href="hadoop_archives.html"/>
     <gridmix       label="Gridmix"  href="gridmix.html"/>
+    <Rumen          label="Rumen"     href="rumen.html"/>
     <cap_scheduler  label="Capacity Scheduler" href="capacity_scheduler.html"/>
     <fair_scheduler    label="Fair Scheduler"  href="fair_scheduler.html"/>
     <cap_scheduler  label="Hod Scheduler"  href="hod_scheduler.html"/>

+ 2 - 2
src/mapred/org/apache/hadoop/mapred/JobConf.java

@@ -156,10 +156,10 @@ public class JobConf extends Configuration {
    */
   public static final String DEFAULT_QUEUE_NAME = "default";
   
-  static final String MAPRED_JOB_MAP_MEMORY_MB_PROPERTY =
+  public static final String MAPRED_JOB_MAP_MEMORY_MB_PROPERTY =
       "mapred.job.map.memory.mb";
 
-  static final String MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY =
+  public static final String MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY =
       "mapred.job.reduce.memory.mb";
 
   static final String MR_ACLS_ENABLED = "mapred.acls.enabled";

+ 2 - 2
src/mapred/org/apache/hadoop/mapred/JobTracker.java

@@ -4480,9 +4480,9 @@ public class JobTracker implements MRConstants, InterTrackerProtocol,
   public static final String MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY =
       "mapred.cluster.reduce.memory.mb";
 
-  static final String MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY =
+  public static final String MAPRED_CLUSTER_MAX_MAP_MEMORY_MB_PROPERTY =
       "mapred.cluster.max.map.memory.mb";
-  static final String MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY =
+  public static final String MAPRED_CLUSTER_MAX_REDUCE_MEMORY_MB_PROPERTY =
       "mapred.cluster.max.reduce.memory.mb";
 
   /* 

+ 10 - 0
src/mapred/org/apache/hadoop/mapred/Reporter.java

@@ -61,6 +61,10 @@ public interface Reporter extends Progressable {
       public InputSplit getInputSplit() throws UnsupportedOperationException {
         throw new UnsupportedOperationException("NULL reporter has no input");
       }
+      @Override
+      public float getProgress() {
+        return 0;
+      }
     };
 
   /**
@@ -117,4 +121,10 @@ public interface Reporter extends Progressable {
    */
   public abstract InputSplit getInputSplit() 
     throws UnsupportedOperationException;
+  
+  /**
+   * Get the progress of the task. Progress is represented as a number between
+   * 0 and 1 (inclusive).
+   */
+  public float getProgress();
 }

+ 5 - 0
src/mapred/org/apache/hadoop/mapred/Task.java

@@ -586,6 +586,11 @@ abstract public class Task implements Writable, Configurable {
       // indicate that progress update needs to be sent
       setProgressFlag();
     }
+    
+    public float getProgress() {
+      return taskProgress.getProgress();
+    };
+    
     public void progress() {
       // indicate that progress update needs to be sent
       setProgressFlag();

+ 6 - 0
src/mapred/org/apache/hadoop/mapreduce/StatusReporter.java

@@ -21,5 +21,11 @@ public abstract class StatusReporter {
   public abstract Counter getCounter(Enum<?> name);
   public abstract Counter getCounter(String group, String name);
   public abstract void progress();
+  /**
+   * Get the current progress.
+   * @return a number between 0.0 and 1.0 (inclusive) indicating the attempt's 
+   * progress.
+   */
+  public abstract float getProgress();
   public abstract void setStatus(String status);
 }

+ 4 - 0
src/mapred/org/apache/hadoop/mapreduce/TaskInputOutputContext.java

@@ -93,6 +93,10 @@ public abstract class TaskInputOutputContext<KEYIN,VALUEIN,KEYOUT,VALUEOUT>
     reporter.progress();
   }
 
+  public float getProgress() {
+    return reporter.getProgress();
+  }
+
   @Override
   public void setStatus(String status) {
     reporter.setStatus(status);

+ 4 - 0
src/mapred/org/apache/hadoop/mapreduce/lib/map/MultithreadedMapper.java

@@ -235,6 +235,10 @@ public class MultithreadedMapper<K1, V1, K2, V2>
       outer.setStatus(status);
     }
     
+    @Override
+    public float getProgress() {
+      return outer.getProgress();
+    }
   }
 
   private class MapRunner extends Thread {

+ 11 - 2
src/test/org/apache/hadoop/mapred/UtilsForTests.java

@@ -608,6 +608,16 @@ public class UtilsForTests {
   public static RunningJob runJob(JobConf conf, Path inDir, Path outDir, 
                                   int numMaps, int numReds) throws IOException {
 
+    String input = "The quick brown fox\n" + "has many silly\n"
+                   + "red fox sox\n";
+    
+    // submit the job and wait for it to complete
+    return runJob(conf, inDir, outDir, numMaps, numReds, input);
+  }
+  
+  // Start a job with the specified input and return its RunningJob object
+  static RunningJob runJob(JobConf conf, Path inDir, Path outDir, int numMaps, 
+                           int numReds, String input) throws IOException {
     FileSystem fs = FileSystem.get(conf);
     if (fs.exists(outDir)) {
       fs.delete(outDir, true);
@@ -615,8 +625,7 @@ public class UtilsForTests {
     if (!fs.exists(inDir)) {
       fs.mkdirs(inDir);
     }
-    String input = "The quick brown fox\n" + "has many silly\n"
-        + "red fox sox\n";
+    
     for (int i = 0; i < numMaps; ++i) {
       DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
       file.writeBytes(input);

+ 4 - 0
src/test/org/apache/hadoop/mapreduce/MapReduceTestUtil.java

@@ -285,6 +285,10 @@ public class MapReduceTestUtil {
       }
       public void progress() {
       }
+      @Override
+      public float getProgress() {
+        return 0;
+      }
       public Counter getCounter(Enum<?> name) {
         return new Counters().findCounter(name);
       }

+ 404 - 52
src/test/org/apache/hadoop/tools/rumen/TestRumenJobTraces.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.tools.rumen;
 
 import java.io.BufferedInputStream;
+import java.io.DataOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
@@ -46,6 +47,7 @@ import org.apache.hadoop.mapreduce.JobID;
 import org.apache.hadoop.mapreduce.TaskAttemptID;
 import org.apache.hadoop.mapreduce.TaskID;
 import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.tools.rumen.TraceBuilder.MyOptions;
 import org.apache.hadoop.util.LineReader;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
@@ -363,6 +365,160 @@ public class TestRumenJobTraces {
   }
 
   /**
+   * Check if processing of input arguments is as expected by passing globbed
+   * input path
+   * <li> without -recursive option and
+   * <li> with -recursive option.
+   */
+  @Test
+  public void testProcessInputArgument() throws Exception {
+    final Configuration conf = new Configuration();
+    final FileSystem lfs = FileSystem.getLocal(conf);
+
+    // define the test's root temporary directory
+    final Path rootTempDir =
+      new Path(System.getProperty("test.build.data", "/tmp"))
+          .makeQualified(lfs.getUri(), lfs.getWorkingDirectory());
+    // define the test's root input directory
+    Path testRootInputDir = new Path(rootTempDir, "TestProcessInputArgument");
+    // define the nested input directory
+    Path nestedInputDir = new Path(testRootInputDir, "1/2/3/4");
+    // define the globbed version of the nested input directory
+    Path globbedInputNestedDir =
+      lfs.makeQualified(new Path(testRootInputDir, "*/*/*/*/*"));
+    try {
+      lfs.delete(nestedInputDir, true);
+
+      List<String> recursiveInputPaths = new ArrayList<String>();
+      List<String> nonRecursiveInputPaths = new ArrayList<String>();
+      // Create input files under the given path with multiple levels of
+      // sub directories
+      createHistoryLogsHierarchy(nestedInputDir, lfs, recursiveInputPaths,
+          nonRecursiveInputPaths);
+
+      // Check the case of globbed input path and without -recursive option
+      List<Path> inputs = MyOptions.processInputArgument(
+                              globbedInputNestedDir.toString(), conf, false);
+      validateHistoryLogPaths(inputs, nonRecursiveInputPaths);
+   // Check the case of globbed input path and with -recursive option
+      inputs = MyOptions.processInputArgument(
+                   globbedInputNestedDir.toString(), conf, true);
+      validateHistoryLogPaths(inputs, recursiveInputPaths);
+
+    } finally {
+      lfs.delete(testRootInputDir, true);
+    }
+  }
+
+  /**
+   * Validate if the input history log paths are as expected.
+   * @param inputs  the resultant input paths to be validated
+   * @param expectedHistoryFileNames  the expected input history logs
+   * @throws IOException
+   */
+  private void validateHistoryLogPaths(List<Path> inputs,
+      List<String> expectedHistoryFileNames) throws IOException {
+
+    System.out.println("\nExpected history files are:");
+    for (String historyFile : expectedHistoryFileNames) {
+      System.out.println(historyFile);
+    }
+    System.out.println("\nResultant history files are:");
+    List<String> historyLogs = new ArrayList<String>();
+    for (Path p : inputs) {
+      historyLogs.add(p.toUri().getPath());
+      System.out.println(p.toUri().getPath());
+    }
+
+    assertEquals("Number of history logs found is different from the expected.",
+        expectedHistoryFileNames.size(), inputs.size());
+
+    // Verify if all the history logs are expected ones and they are in the
+    // expected order
+    assertTrue("Some of the history log files do not match the expected.",
+        historyLogs.equals(expectedHistoryFileNames));
+  }
+
+  /**
+   * Create history logs under the given path with multiple levels of
+   * sub directories as shown below.
+   * <br>
+   * Create a file, an empty subdirectory and a nonempty subdirectory
+   * &lt;historyDir&gt; under the given input path.
+   * <br>
+   * The subdirectory &lt;historyDir&gt; contains the following dir structure:
+   * <br>
+   * <br>&lt;historyDir&gt;/historyFile1.txt
+   * <br>&lt;historyDir&gt;/historyFile1.gz
+   * <br>&lt;historyDir&gt;/subDir1/historyFile2.txt
+   * <br>&lt;historyDir&gt;/subDir1/historyFile2.gz
+   * <br>&lt;historyDir&gt;/subDir2/historyFile3.txt
+   * <br>&lt;historyDir&gt;/subDir2/historyFile3.gz
+   * <br>&lt;historyDir&gt;/subDir1/subDir11/historyFile4.txt
+   * <br>&lt;historyDir&gt;/subDir1/subDir11/historyFile4.gz
+   * <br>&lt;historyDir&gt;/subDir2/subDir21/
+   * <br>
+   * Create the lists of input paths that should be processed by TraceBuilder
+   * for recursive case and non-recursive case.
+   * @param nestedInputDir the input history logs directory where history files
+   *                       with nested subdirectories are created
+   * @param fs         FileSystem of the input paths
+   * @param recursiveInputPaths input paths for recursive case
+   * @param nonRecursiveInputPaths input paths for non-recursive case
+   * @throws IOException
+   */
+  private void createHistoryLogsHierarchy(Path nestedInputDir, FileSystem fs,
+      List<String> recursiveInputPaths, List<String> nonRecursiveInputPaths)
+  throws IOException {
+    List<Path> dirs = new ArrayList<Path>();
+    // define a file in the nested test input directory
+    Path inputPath1 = new Path(nestedInputDir, "historyFile.txt");
+    // define an empty sub-folder in the nested test input directory
+    Path emptyDir = new Path(nestedInputDir, "emptyDir");
+    // define a nonempty sub-folder in the nested test input directory
+    Path historyDir = new Path(nestedInputDir, "historyDir");
+
+    fs.mkdirs(nestedInputDir);
+    // Create an empty input file
+    fs.createNewFile(inputPath1);
+    // Create empty subdir
+    fs.mkdirs(emptyDir);// let us not create any files under this dir
+
+    fs.mkdirs(historyDir);
+    dirs.add(historyDir);
+
+    Path subDir1 = new Path(historyDir, "subDir1");
+    fs.mkdirs(subDir1);
+    dirs.add(subDir1);
+    Path subDir2 = new Path(historyDir, "subDir2");
+    fs.mkdirs(subDir2);
+    dirs.add(subDir2);
+
+    Path subDir11 = new Path(subDir1, "subDir11");
+    fs.mkdirs(subDir11);
+    dirs.add(subDir11);
+    Path subDir21 = new Path(subDir2, "subDir21");
+    fs.mkdirs(subDir21);// let us not create any files under this dir
+
+    int i = 0;
+    for (Path dir : dirs) {
+      i++;
+      Path gzPath = new Path(dir, "historyFile" + i + ".gz");
+      Path txtPath = new Path(dir, "historyFile" + i + ".txt");
+      fs.createNewFile(txtPath);
+      fs.createNewFile(gzPath);
+      recursiveInputPaths.add(gzPath.toUri().getPath());
+      recursiveInputPaths.add(txtPath.toUri().getPath());
+      if (i == 1) {
+        nonRecursiveInputPaths.add(gzPath.toUri().getPath());
+        nonRecursiveInputPaths.add(txtPath.toUri().getPath());
+      }
+    }
+    recursiveInputPaths.add(inputPath1.toUri().getPath());
+    nonRecursiveInputPaths.add(inputPath1.toUri().getPath());
+  }
+
+    /**
    * Test if {@link CurrentJHParser} can read events from current JH files.
    */
   @Test
@@ -426,7 +582,7 @@ public class TestRumenJobTraces {
 
       // Test if the JobHistoryParserFactory can detect the parser correctly
       parser = JobHistoryParserFactory.getParser(ris);
-        
+
       HistoryEvent e;
       while ((e = parser.nextEvent()) != null) {
         String eventString = e.getEventType().toString();
@@ -470,71 +626,267 @@ public class TestRumenJobTraces {
     }
   }
   
-  @Test
-  public void testJobConfigurationParser() throws Exception {
-    String[] list1 =
-        { "mapred.job.queue.name", "mapreduce.job.name",
-            "mapred.child.java.opts" };
+    /**
+     * Test if the {@link JobConfigurationParser} can correctly extract out 
+     * key-value pairs from the job configuration.
+     */
+    @Test
+    public void testJobConfigurationParsing() throws Exception {
+      final FileSystem lfs = FileSystem.getLocal(new Configuration());
+  
+      final Path rootTempDir =
+          new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(
+              lfs.getUri(), lfs.getWorkingDirectory());
+  
+      final Path tempDir = new Path(rootTempDir, "TestJobConfigurationParser");
+      lfs.delete(tempDir, true);
+  
+      // Add some configuration parameters to the conf
+      JobConf jConf = new JobConf(false);
+      String key = "test.data";
+      String value = "hello world";
+      jConf.set(key, value);
+      
+      // create the job conf file
+      Path jobConfPath = new Path(tempDir.toString(), "job.xml");
+      lfs.delete(jobConfPath, false);
+      DataOutputStream jobConfStream = lfs.create(jobConfPath);
+      jConf.writeXml(jobConfStream);
+      jobConfStream.close();
+      
+      // now read the job conf file using the job configuration parser
+      Properties properties = 
+        JobConfigurationParser.parse(lfs.open(jobConfPath));
+      
+      // check if the required parameter is loaded
+      assertEquals("Total number of extracted properties (" + properties.size() 
+                   + ") doesn't match the expected size of 1 ["
+                   + "JobConfigurationParser]",
+                   1, properties.size());
+      // check if the key is present in the extracted configuration
+      assertTrue("Key " + key + " is missing in the configuration extracted "
+                 + "[JobConfigurationParser]",
+                 properties.keySet().contains(key));
+      // check if the desired property has the correct value
+      assertEquals("JobConfigurationParser couldn't recover the parameters"
+                   + " correctly",
+                  value, properties.get(key));
+      
+      // Test ZombieJob
+      LoggedJob job = new LoggedJob();
+      job.setJobProperties(properties);
+      
+      ZombieJob zjob = new ZombieJob(job, null);
+      Configuration zconf = zjob.getJobConf();
+      // check if the required parameter is loaded
+      assertEquals("ZombieJob couldn't recover the parameters correctly", 
+                   value, zconf.get(key));
+    }
 
-    String[] list2 = { "mapred.job.queue.name", "mapred.child.java.opts" };
+    @Test
+    public void testJobConfigurationParser() throws Exception {
 
-    List<String> interested1 = new ArrayList<String>();
-    for (String interested : list1) {
-      interested1.add(interested);
+      // Validate parser with old mapred config properties from
+      // sample-conf-file.xml
+      validateJobConfParser("sample-conf.file.xml");
     }
 
-    List<String> interested2 = new ArrayList<String>();
-    for (String interested : list2) {
-      interested2.add(interested);
-    }
+    private void validateJobConfParser(String confFile) throws Exception {
 
-    JobConfigurationParser jcp1 = new JobConfigurationParser(interested1);
-    JobConfigurationParser jcp2 = new JobConfigurationParser(interested2);
+      final Configuration conf = new Configuration();
+      final FileSystem lfs = FileSystem.getLocal(conf);
 
-    final Configuration conf = new Configuration();
-    final FileSystem lfs = FileSystem.getLocal(conf);
+      @SuppressWarnings("deprecation")
+      final Path rootInputDir =
+          new Path(System.getProperty("test.tools.input.dir", ""))
+              .makeQualified(lfs);
 
-    @SuppressWarnings("deprecation")
-    final Path rootInputDir =
-        new Path(System.getProperty("test.tools.input.dir", ""))
-            .makeQualified(lfs);
+      final Path rootInputPath = new Path(rootInputDir, "rumen/small-trace-test");
 
-    final Path rootInputPath = new Path(rootInputDir, "rumen/small-trace-test");
+      final Path inputPath = new Path(rootInputPath, confFile);
 
-    final Path inputPath = new Path(rootInputPath, "sample-conf.file.xml");
+      InputStream inputConfStream =
+          new PossiblyDecompressedInputStream(inputPath, conf);
 
-    InputStream inputConfStream =
-        new PossiblyDecompressedInputStream(inputPath, conf);
+      try {
+        Properties props = JobConfigurationParser.parse(inputConfStream);
+        inputConfStream.close();
+
+        // Make sure that parser puts the interested properties into props1 and
+        // props2 as defined by list1 and list2.
+        assertEquals("Config property for job queue name is not "
+            + " extracted properly.", "TheQueue",
+            JobBuilder.extract(props, JobConfPropertyNames.QUEUE_NAMES
+            .getCandidates(), null));
+        assertEquals("Config property for job name is not "
+            + " extracted properly.", "MyMRJob",
+            JobBuilder.extract(props, JobConfPropertyNames.JOB_NAMES
+            .getCandidates(), null));
+
+        validateChildJavaOpts(props);
 
-    try {
-      Properties props1 = jcp1.parse(inputConfStream);
-      inputConfStream.close();
-
-      inputConfStream = new PossiblyDecompressedInputStream(inputPath, conf);
-      Properties props2 = jcp2.parse(inputConfStream);
-
-      assertEquals("testJobConfigurationParser: wrong number of properties", 3,
-          props1.size());
-      assertEquals("testJobConfigurationParser: wrong number of properties", 2,
-          props2.size());
-
-      assertEquals("prop test 1", "TheQueue", props1
-          .get("mapred.job.queue.name"));
-      assertEquals("prop test 2", "job_0001", props1.get("mapreduce.job.name"));
-      assertEquals("prop test 3",
-          "-server -Xmx640m -Djava.net.preferIPv4Stack=true", props1
-              .get("mapred.child.java.opts"));
-      assertEquals("prop test 4", "TheQueue", props2
-          .get("mapred.job.queue.name"));
-      assertEquals("prop test 5",
-          "-server -Xmx640m -Djava.net.preferIPv4Stack=true", props2
-              .get("mapred.child.java.opts"));
+      } finally {
+        inputConfStream.close();
+      }
+    }
+    
+    // Validate child java opts in properties.
+    private void validateChildJavaOpts(Properties props) {
+      // if old property mapred.child.java.opts is set, then extraction of all
+      // the following 3 properties should give that value.
+      assertEquals("mapred.child.java.opts is not extracted properly.",
+          "-server -Xmx640m -Djava.net.preferIPv4Stack=true",
+          JobBuilder.extract(props, JobConfPropertyNames.TASK_JAVA_OPTS_S
+          .getCandidates(), null));
+      assertEquals("New config property " + JobConf.MAPRED_MAP_TASK_JAVA_OPTS
+          + " is not extracted properly when the old config property "
+          + "mapred.child.java.opts is set.",
+          "-server -Xmx640m -Djava.net.preferIPv4Stack=true",
+          JobBuilder.extract(props, JobConfPropertyNames.MAP_JAVA_OPTS_S
+          .getCandidates(), null));
+      assertEquals("New config property " + JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS
+          + " is not extracted properly when the old config property "
+          + "mapred.child.java.opts is set.",
+          "-server -Xmx640m -Djava.net.preferIPv4Stack=true",
+          JobBuilder.extract(props, JobConfPropertyNames.REDUCE_JAVA_OPTS_S
+          .getCandidates(), null));
+    }
 
-    } finally {
-      inputConfStream.close();
+  /**
+   * Test {@link ResourceUsageMetrics}.
+   */
+  @Test
+  public void testResourceUsageMetrics() throws Exception {
+    final long cpuUsage = 100;
+    final long pMemUsage = 200;
+    final long vMemUsage = 300;
+    final long heapUsage = 400;
+    
+    // test ResourceUsageMetrics's setters
+    ResourceUsageMetrics metrics = new ResourceUsageMetrics();
+    metrics.setCumulativeCpuUsage(cpuUsage);
+    metrics.setPhysicalMemoryUsage(pMemUsage);
+    metrics.setVirtualMemoryUsage(vMemUsage);
+    metrics.setHeapUsage(heapUsage);
+    // test cpu usage value
+    assertEquals("Cpu usage values mismatch via set", cpuUsage, 
+                 metrics.getCumulativeCpuUsage());
+    // test pMem usage value
+    assertEquals("Physical memory usage values mismatch via set", pMemUsage, 
+                 metrics.getPhysicalMemoryUsage());
+    // test vMem usage value
+    assertEquals("Virtual memory usage values mismatch via set", vMemUsage, 
+                 metrics.getVirtualMemoryUsage());
+    // test heap usage value
+    assertEquals("Heap usage values mismatch via set", heapUsage, 
+                 metrics.getHeapUsage());
+    
+    // test deepCompare() (pass case)
+    testResourceUsageMetricViaDeepCompare(metrics, cpuUsage, vMemUsage, 
+                                          pMemUsage, heapUsage, true);
+    
+    // test deepCompare (fail case)
+    // test cpu usage mismatch
+    testResourceUsageMetricViaDeepCompare(metrics, 0, vMemUsage, pMemUsage, 
+                                          heapUsage, false);
+    // test pMem usage mismatch
+    testResourceUsageMetricViaDeepCompare(metrics, cpuUsage, vMemUsage, 0, 
+                                          heapUsage, false);
+    // test vMem usage mismatch
+    testResourceUsageMetricViaDeepCompare(metrics, cpuUsage, 0, pMemUsage, 
+                                          heapUsage, false);
+    // test heap usage mismatch
+    testResourceUsageMetricViaDeepCompare(metrics, cpuUsage, vMemUsage, 
+                                          pMemUsage, 0, false);
+    
+    // define a metric with a fixed value of size()
+    ResourceUsageMetrics metrics2 = new ResourceUsageMetrics() {
+      @Override
+      public int size() {
+        return -1;
+      }
+    };
+    metrics2.setCumulativeCpuUsage(cpuUsage);
+    metrics2.setPhysicalMemoryUsage(pMemUsage);
+    metrics2.setVirtualMemoryUsage(vMemUsage);
+    metrics2.setHeapUsage(heapUsage);
+    
+    // test with size mismatch
+    testResourceUsageMetricViaDeepCompare(metrics2, cpuUsage, vMemUsage, 
+                                          pMemUsage, heapUsage, false);
+  }
+  
+  // test ResourceUsageMetric's deepCompare() method
+  private static void testResourceUsageMetricViaDeepCompare(
+                        ResourceUsageMetrics metrics, long cpuUsage, 
+                        long vMemUsage, long pMemUsage, long heapUsage,
+                        boolean shouldPass) {
+    ResourceUsageMetrics testMetrics = new ResourceUsageMetrics();
+    testMetrics.setCumulativeCpuUsage(cpuUsage);
+    testMetrics.setPhysicalMemoryUsage(pMemUsage);
+    testMetrics.setVirtualMemoryUsage(vMemUsage);
+    testMetrics.setHeapUsage(heapUsage);
+    
+    Boolean passed = null;
+    try {
+      metrics.deepCompare(testMetrics, new TreePath(null, "<root>"));
+      passed = true;
+    } catch (DeepInequalityException die) {
+      passed = false;
     }
+    
+    assertEquals("ResourceUsageMetrics deepCompare() failed!", 
+                 shouldPass, passed);
   }
-
+  
+  /**
+   * Testing {@link ResourceUsageMetrics} using {@link HadoopLogsAnalyzer}.
+   */
+  @Test
+  @SuppressWarnings("deprecation")
+  public void testResourceUsageMetricsWithHadoopLogsAnalyzer() 
+  throws IOException {
+    Configuration conf = new Configuration();
+    // get the input trace file
+    Path rootInputDir =
+      new Path(System.getProperty("test.tools.input.dir", ""));
+    Path rootInputSubFolder = new Path(rootInputDir, "rumen/small-trace-test");
+    Path traceFile = new Path(rootInputSubFolder, "v20-resource-usage-log.gz");
+    
+    FileSystem lfs = FileSystem.getLocal(conf);
+    
+    // define the root test directory
+    Path rootTempDir =
+        new Path(System.getProperty("test.build.data", "/tmp"));
+
+    // define output directory
+    Path outputDir = 
+      new Path(rootTempDir, "testResourceUsageMetricsWithHadoopLogsAnalyzer");
+    lfs.delete(outputDir, true);
+    lfs.deleteOnExit(outputDir);
+    
+    // run HadoopLogsAnalyzer
+    HadoopLogsAnalyzer analyzer = new HadoopLogsAnalyzer();
+    analyzer.setConf(conf);
+    Path traceOutput = new Path(outputDir, "trace.json");
+    analyzer.run(new String[] {"-write-job-trace", traceOutput.toString(), 
+                               "-v1", traceFile.toString()});
+    
+    // test HadoopLogsAnalyzer's output w.r.t ResourceUsageMetrics
+    //  get the logged job
+    JsonObjectMapperParser<LoggedJob> traceParser =
+      new JsonObjectMapperParser<LoggedJob>(traceOutput, LoggedJob.class, 
+                                            conf);
+    
+    //  get the logged job from the output trace file
+    LoggedJob job = traceParser.getNext();
+    LoggedTaskAttempt attempt = job.getMapTasks().get(0).getAttempts().get(0);
+    ResourceUsageMetrics metrics = attempt.getResourceUsageMetrics();
+    
+    //  test via deepCompare()
+    testResourceUsageMetricViaDeepCompare(metrics, 200, 100, 75, 50, true);
+  }
+  
   @Test
   public void testTopologyBuilder() throws Exception {
     final TopologyBuilder subject = new TopologyBuilder();

BIN
src/test/tools/data/rumen/small-trace-test/counters-test-trace.json.gz


BIN
src/test/tools/data/rumen/small-trace-test/dispatch-trace-output.json.gz


BIN
src/test/tools/data/rumen/small-trace-test/job-tracker-logs-trace-output.gz


+ 4 - 1
src/test/tools/data/rumen/small-trace-test/sample-conf.file.xml

@@ -19,11 +19,14 @@
  */
 -->
 <configuration>
+<!--
+Old mapred config properties
+-->
    <property>
       <name>mapred.job.queue.name</name><value>TheQueue</value>
    </property>
    <property>
-      <name>mapreduce.job.name</name><value>job_0001</value>
+      <name>mapred.job.name</name><value>MyMRJob</value>
    </property>
    <property>
       <name>maproduce.uninteresting.property</name><value>abcdef</value>

+ 3 - 0
src/test/tools/data/rumen/small-trace-test/truncated-trace-output

@@ -3,6 +3,9 @@
   "user" : "hadoopqa",
   "jobName" : null,
   "jobID" : "job_200904211745_0002",
+   "jobProperties" : {
+    "mapred.child.java.opts" : "-server -Xmx640m -Djava.net.preferIPv4Stack=true"
+  },
   "mapTasks" : [ {
     "startTime" : 1240336753705,
     "attempts" : [ {

BIN
src/test/tools/data/rumen/small-trace-test/v20-resource-usage-log.gz


+ 2 - 2
src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java

@@ -54,13 +54,13 @@ public interface ClusterStory {
   /**
    * Get {@link MachineNode} by its host name.
    * 
-   * @return The {@line MachineNode} with the same name. Or null if not found.
+   * @return The {@link MachineNode} with the same name. Or null if not found.
    */
   public MachineNode getMachineByName(String name);
   
   /**
    * Get {@link RackNode} by its name.
-   * @return The {@line RackNode} with the same name. Or null if not found.
+   * @return The {@link RackNode} with the same name. Or null if not found.
    */
   public RackNode getRackByName(String name);
 

+ 1 - 1
src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java

@@ -72,7 +72,7 @@ public class DeskewedJobTraceReader implements Closeable {
    * 
    * @param reader
    *          the {@link JobTraceReader} that's being protected
-   * @param skewBufferSize
+   * @param skewBufferLength
   *          the number of late jobs that can precede a later out-of-order
   *          earlier job
    * @throws IOException

+ 34 - 0
src/tools/org/apache/hadoop/tools/rumen/HadoopLogsAnalyzer.java

@@ -1208,6 +1208,38 @@ public class HadoopLogsAnalyzer extends Configured implements Tool {
         attempt.spilledRecords = val;
       }
     }, counterString, "SPILLED_RECORDS");
+    
+    // incorporate CPU usage
+    incorporateCounter(new SetField(attempt2) {
+      @Override
+      void set(long val) {
+        attempt.getResourceUsageMetrics().setCumulativeCpuUsage(val);
+      }
+    }, counterString, "CPU_MILLISECONDS");
+    
+    // incorporate virtual memory usage
+    incorporateCounter(new SetField(attempt2) {
+      @Override
+      void set(long val) {
+        attempt.getResourceUsageMetrics().setVirtualMemoryUsage(val);
+      }
+    }, counterString, "VIRTUAL_MEMORY_BYTES");
+    
+    // incorporate physical memory usage
+    incorporateCounter(new SetField(attempt2) {
+      @Override
+      void set(long val) {
+        attempt.getResourceUsageMetrics().setPhysicalMemoryUsage(val);
+      }
+    }, counterString, "PHYSICAL_MEMORY_BYTES");
+    
+    // incorporate heap usage
+    incorporateCounter(new SetField(attempt2) {
+      @Override
+      void set(long val) {
+        attempt.getResourceUsageMetrics().setHeapUsage(val);
+      }
+    }, counterString, "COMMITTED_HEAP_BYTES");
   }
 
   private ParsedHost getAndRecordParsedHost(String hostName) {
@@ -1594,6 +1626,8 @@ public class HadoopLogsAnalyzer extends Configured implements Tool {
       jobBeingTraced.setJobMapMB(jobconf.jobMapMB);
       jobBeingTraced.setJobReduceMB(jobconf.jobReduceMB);
 
+      jobBeingTraced.setJobProperties(jobconf.properties);
+      
       jobconf = null;
 
       finalizeJob();

+ 10 - 2
src/tools/org/apache/hadoop/tools/rumen/JobBuilder.java

@@ -74,6 +74,8 @@ public class JobBuilder {
   private static final Pattern heapPattern =
       Pattern.compile("-Xmx([0-9]+[kKmMgGtT])");
 
+  private Properties jobConfigurationParameters = null;
+
   public JobBuilder(String jobID) {
     this.jobID = jobID;
   }
@@ -142,7 +144,7 @@ public class JobBuilder {
           "JobBuilder.process(HistoryEvent): unknown event type");
   }
 
-  private String extract(Properties conf, String[] names, String defaultValue) {
+  static String extract(Properties conf, String[] names, String defaultValue) {
     for (String name : names) {
       String result = conf.getProperty(name);
 
@@ -206,6 +208,7 @@ public class JobBuilder {
           "JobBuilder.process(Properties conf) called after LoggedJob built");
     }
 
+    //TODO remove this once the deprecate APIs in LoggedJob are removed
     result.setQueue(extract(conf, JobConfPropertyNames.QUEUE_NAMES
         .getCandidates(), "default"));
     result.setJobName(extract(conf, JobConfPropertyNames.JOB_NAMES
@@ -217,6 +220,8 @@ public class JobBuilder {
         JobConfPropertyNames.MAP_JAVA_OPTS_S.getCandidates()));
     maybeSetJobReduceMB(extractMegabytes(conf,
         JobConfPropertyNames.REDUCE_JAVA_OPTS_S.getCandidates()));
+        
+    this.jobConfigurationParameters = conf;
   }
 
   /**
@@ -226,9 +231,12 @@ public class JobBuilder {
    * @return Parsed {@link LoggedJob} object.
    */
   public LoggedJob build() {
-    // The main job here is to build CDFs
+    // The main job here is to build CDFs and manage the conf
     finalized = true;
 
+    // set the conf
+    result.setJobProperties(jobConfigurationParameters);
+    
     // initialize all the per-job statistics gathering places
     Histogram[] successfulMapAttemptTimes =
         new Histogram[ParsedHost.numberOfDistances() + 1];

+ 3 - 18
src/tools/org/apache/hadoop/tools/rumen/JobConfigurationParser.java

@@ -17,13 +17,9 @@
  */
 package org.apache.hadoop.tools.rumen;
 
-import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.HashSet;
-import java.util.List;
 import java.util.Properties;
-import java.util.Set;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -38,22 +34,11 @@ import org.xml.sax.SAXException;
 
 /**
  * {@link JobConfigurationParser} parses the job configuration xml file, and
- * extracts various framework specific properties. It parses the file using a
+ * extracts configuration properties. It parses the file using a
  * stream-parser and thus is more memory efficient. [This optimization may be
  * postponed for a future release]
  */
 public class JobConfigurationParser {
-  final private Set<String> interested;
-
-  /**
-   * Constructor
-   * 
-   * @param interested
-   *          properties we should extract from the job configuration xml.
-   */
-  public JobConfigurationParser(List<String> interested) {
-    this.interested = new HashSet<String>(interested);
-  }
 
   /**
    * Parse the job configuration file (as an input stream) and return a
@@ -66,7 +51,7 @@ public class JobConfigurationParser {
    *         configuration xml.
    * @throws IOException
    */
-  Properties parse(InputStream input) throws IOException {
+  static Properties parse(InputStream input) throws IOException {
     Properties result = new Properties();
 
     try {
@@ -117,7 +102,7 @@ public class JobConfigurationParser {
           }
         }
 
-        if (interested.contains(attr) && value != null) {
+        if (attr != null && value != null) {
           result.put(attr, value);
         }
       }

+ 1 - 1
src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java

@@ -38,7 +38,7 @@ public class JobHistoryParserFactory {
     throw new IOException("No suitable parser.");
   }
 
-  enum VersionDetector {
+  public enum VersionDetector {
     Hadoop20() {
 
       @Override

+ 51 - 0
src/tools/org/apache/hadoop/tools/rumen/LoggedJob.java

@@ -22,6 +22,8 @@ package org.apache.hadoop.tools.rumen;
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
+import java.util.Properties;
 import java.util.Set;
 import java.util.TreeSet;
 
@@ -92,6 +94,8 @@ public class LoggedJob implements DeepCompare {
   double[] mapperTriesToSucceed;
   double failedMapperFraction; // !!!!!
 
+  private Properties jobProperties = new Properties();
+  
   LoggedJob() {
 
   }
@@ -102,6 +106,20 @@ public class LoggedJob implements DeepCompare {
     setJobID(jobID);
   }
 
+  /**
+   * Set the configuration properties of the job.
+   */
+  void setJobProperties(Properties conf) {
+    this.jobProperties = conf;
+  }
+  
+  /**
+   * Get the configuration properties of the job.
+   */
+  public Properties getJobProperties() {
+    return jobProperties;
+  }
+  
   void adjustTimes(long adjustment) {
     submitTime += adjustment;
     launchTime += adjustment;
@@ -537,6 +555,35 @@ public class LoggedJob implements DeepCompare {
     }
   }
 
+  private void compareJobProperties(Properties prop1, Properties prop2,
+                                    TreePath loc, String eltname) 
+  throws DeepInequalityException {
+    if (prop1 == null && prop2 == null) {
+      return;
+    }
+
+    if (prop1 == null || prop2 == null) {
+      throw new DeepInequalityException(eltname + " miscompared [null]", 
+                                        new TreePath(loc, eltname));
+    }
+
+    if (prop1.size() != prop2.size()) {
+      throw new DeepInequalityException(eltname + " miscompared [size]", 
+                                        new TreePath(loc, eltname));
+    }
+    
+    for (Map.Entry<Object, Object> entry : prop1.entrySet()) {
+      Object v1 = entry.getValue();
+      Object v2 = prop2.get(entry.getKey());
+      if (v1 == null || v2 == null || !v1.equals(v2)) {
+        throw new DeepInequalityException(
+          eltname + " miscompared for value of key : " 
+            + entry.getKey().toString(), 
+          new TreePath(loc, eltname));
+      }
+    }
+  }
+  
   public void deepCompare(DeepCompare comparand, TreePath loc)
       throws DeepInequalityException {
     if (!(comparand instanceof LoggedJob)) {
@@ -600,5 +647,9 @@ public class LoggedJob implements DeepCompare {
     compare1(clusterReduceMB, other.clusterReduceMB, loc, "clusterReduceMB");
     compare1(jobMapMB, other.jobMapMB, loc, "jobMapMB");
     compare1(jobReduceMB, other.jobReduceMB, loc, "jobReduceMB");
+
+    // compare the job configuration parameters
+    compareJobProperties(jobProperties, other.getJobProperties(), loc, 
+                         "JobProperties");
   }
 }
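
A note on the new jobProperties field: JobBuilder.build() (above) now copies the full parsed jobconf into the LoggedJob, so trace consumers can look up arbitrary configuration values. A minimal sketch, assuming a LoggedJob obtained from a trace reader; the helper name is illustrative and the property key is just the one exercised by truncated-trace-output above:

    // Sketch only: reading a captured jobconf value from a parsed trace job.
    static String childJavaOpts(LoggedJob job) {
      java.util.Properties props = job.getJobProperties();  // accessor added above
      return props.getProperty("mapred.child.java.opts");   // null if not captured
    }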

+ 45 - 0
src/tools/org/apache/hadoop/tools/rumen/LoggedTaskAttempt.java

@@ -64,6 +64,9 @@ public class LoggedTaskAttempt implements DeepCompare {
 
   LoggedLocation location;
 
+  // Initialize to default object for backward compatibility
+  ResourceUsageMetrics metrics = new ResourceUsageMetrics();
+  
   LoggedTaskAttempt() {
     super();
   }
@@ -349,8 +352,50 @@ public class LoggedTaskAttempt implements DeepCompare {
         attempt.spilledRecords = val;
       }
     }, counters, "SPILLED_RECORDS");
+    
+    // incorporate CPU usage
+    incorporateCounter(new SetField(this) {
+      @Override
+      void set(long val) {
+        metrics.setCumulativeCpuUsage(val);
+      }
+    }, counters, "CPU_MILLISECONDS");
+    
+    // incorporate virtual memory usage
+    incorporateCounter(new SetField(this) {
+      @Override
+      void set(long val) {
+        metrics.setVirtualMemoryUsage(val);
+      }
+    }, counters, "VIRTUAL_MEMORY_BYTES");
+    
+    // incorporate physical memory usage
+    incorporateCounter(new SetField(this) {
+      @Override
+      void set(long val) {
+        metrics.setPhysicalMemoryUsage(val);
+      }
+    }, counters, "PHYSICAL_MEMORY_BYTES");
+    
+    // incorporate heap usage
+    incorporateCounter(new SetField(this) {
+      @Override
+      void set(long val) {
+        metrics.setHeapUsage(val);
+      }
+    }, counters, "COMMITTED_HEAP_BYTES");
   }
 
+  // Get the resource usage metrics
+  public ResourceUsageMetrics getResourceUsageMetrics() {
+    return metrics;
+  }
+  
+  // Set the resource usage metrics
+  void setResourceUsageMetrics(ResourceUsageMetrics metrics) {
+    this.metrics = metrics;
+  }
+  
   private static String canonicalizeCounterName(String nonCanonicalName) {
     String result = nonCanonicalName.toLowerCase();
 

+ 1 - 1
src/tools/org/apache/hadoop/tools/rumen/Node.java

@@ -24,7 +24,7 @@ import java.util.TreeSet;
 
 /**
  * {@link Node} represents a node in the cluster topology. A node can be a
- * {@MachineNode}, or a {@link RackNode}, etc.
+ * {@link MachineNode}, or a {@link RackNode}, etc.
  */
 public class Node implements Comparable<Node> {
   private static final SortedSet<Node> EMPTY_SET = 

+ 5 - 0
src/tools/org/apache/hadoop/tools/rumen/ParsedConfigFile.java

@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.tools.rumen;
 
+import java.util.Properties;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 
@@ -55,6 +56,8 @@ class ParsedConfigFile {
   final String jobID;
 
   final boolean valid;
+  
+  final Properties properties = new Properties();
 
   private int maybeGetIntValue(String propName, String attr, String value,
       int oldValue) {
@@ -143,6 +146,8 @@ class ParsedConfigFile {
                 "true".equals(((Text) field.getFirstChild()).getData());
           }
         }
+        
+        properties.setProperty(attr, value);
 
         if ("mapred.child.java.opts".equals(attr) && value != null) {
           Matcher matcher = heapPattern.matcher(value);

+ 160 - 0
src/tools/org/apache/hadoop/tools/rumen/ResourceUsageMetrics.java

@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.tools.rumen;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableUtils;
+
+/**
+ * Captures the resource usage metrics.
+ */
+public class ResourceUsageMetrics implements Writable, DeepCompare  {
+  private long cumulativeCpuUsage;
+  private long virtualMemoryUsage;
+  private long physicalMemoryUsage;
+  private long heapUsage;
+  
+  public ResourceUsageMetrics() {
+  }
+  
+  /**
+   * Get the cumulative CPU usage.
+   */
+  public long getCumulativeCpuUsage() {
+    return cumulativeCpuUsage;
+  }
+  
+  /**
+   * Set the cumulative CPU usage.
+   */
+  public void setCumulativeCpuUsage(long usage) {
+    cumulativeCpuUsage = usage;
+  }
+  
+  /**
+   * Get the virtual memory usage.
+   */
+  public long getVirtualMemoryUsage() {
+    return virtualMemoryUsage;
+  }
+  
+  /**
+   * Set the virtual memory usage.
+   */
+  public void setVirtualMemoryUsage(long usage) {
+    virtualMemoryUsage = usage;
+  }
+  
+  /**
+   * Get the physical memory usage.
+   */
+  public long getPhysicalMemoryUsage() {
+    return physicalMemoryUsage;
+  }
+  
+  /**
+   * Set the physical memory usage.
+   */
+  public void setPhysicalMemoryUsage(long usage) {
+    physicalMemoryUsage = usage;
+  }
+  
+  /**
+   * Get the total heap usage.
+   */
+  public long getHeapUsage() {
+    return heapUsage;
+  }
+  
+  /**
+   * Set the total heap usage.
+   */
+  public void setHeapUsage(long usage) {
+    heapUsage = usage;
+  }
+  
+  /**
+   * Returns the size of the serialized data
+   */
+  public int size() {
+    int size = 0;
+    size += WritableUtils.getVIntSize(cumulativeCpuUsage);   // long #1
+    size += WritableUtils.getVIntSize(virtualMemoryUsage);   // long #2
+    size += WritableUtils.getVIntSize(physicalMemoryUsage);  // long #3
+    size += WritableUtils.getVIntSize(heapUsage);            // long #4
+    return size;
+  }
+  
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    cumulativeCpuUsage = WritableUtils.readVLong(in);  // long #1
+    virtualMemoryUsage = WritableUtils.readVLong(in);  // long #2
+    physicalMemoryUsage = WritableUtils.readVLong(in); // long #3
+    heapUsage = WritableUtils.readVLong(in);           // long #4
+  }
+  
+  @Override
+  public void write(DataOutput out) throws IOException {
+    // TODO: also write the resource usage version number
+    WritableUtils.writeVLong(out, cumulativeCpuUsage);  // long #1
+    WritableUtils.writeVLong(out, virtualMemoryUsage);  // long #2
+    WritableUtils.writeVLong(out, physicalMemoryUsage); // long #3
+    WritableUtils.writeVLong(out, heapUsage);           // long #4
+  }
+
+  private static void compareMetric(long m1, long m2, TreePath loc) 
+  throws DeepInequalityException {
+    if (m1 != m2) {
+      throw new DeepInequalityException("Value miscompared:" + loc.toString(), 
+                                        loc);
+    }
+  }
+  
+  private static void compareSize(ResourceUsageMetrics m1, 
+                                  ResourceUsageMetrics m2, TreePath loc) 
+  throws DeepInequalityException {
+    if (m1.size() != m2.size()) {
+      throw new DeepInequalityException("Size miscompared: " + loc.toString(), 
+                                        loc);
+    }
+  }
+  
+  @Override
+  public void deepCompare(DeepCompare other, TreePath loc)
+      throws DeepInequalityException {
+    if (!(other instanceof ResourceUsageMetrics)) {
+      throw new DeepInequalityException("Comparand has wrong type", loc);
+    }
+
+    ResourceUsageMetrics metrics2 = (ResourceUsageMetrics) other;
+    compareMetric(getCumulativeCpuUsage(), metrics2.getCumulativeCpuUsage(), 
+                  new TreePath(loc, "cumulativeCpu"));
+    compareMetric(getVirtualMemoryUsage(), metrics2.getVirtualMemoryUsage(), 
+                  new TreePath(loc, "virtualMemory"));
+    compareMetric(getPhysicalMemoryUsage(), metrics2.getPhysicalMemoryUsage(), 
+                  new TreePath(loc, "physicalMemory"));
+    compareMetric(getHeapUsage(), metrics2.getHeapUsage(), 
+                  new TreePath(loc, "heapUsage"));
+    compareSize(this, metrics2, new TreePath(loc, "size"));
+  }
+}
+
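
For reference, a minimal sketch of how the new ResourceUsageMetrics round-trips through its Writable serialization and how deepCompare() reports a mismatch; the method name, values, and stream wiring are illustrative, while the API calls are the ones added above (TreePath usage mirrors the test earlier in this patch):

    // Sketch only: Writable round-trip plus deepCompare() of ResourceUsageMetrics.
    static void roundTrip() throws Exception {
      ResourceUsageMetrics original = new ResourceUsageMetrics();
      original.setCumulativeCpuUsage(200);
      original.setVirtualMemoryUsage(100);
      original.setPhysicalMemoryUsage(75);
      original.setHeapUsage(50);

      // serialize the four metrics as VLongs
      java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
      original.write(new java.io.DataOutputStream(buffer));

      // deserialize into a fresh instance
      ResourceUsageMetrics copy = new ResourceUsageMetrics();
      copy.readFields(new java.io.DataInputStream(
          new java.io.ByteArrayInputStream(buffer.toByteArray())));

      // throws DeepInequalityException if any of the four metrics differ
      original.deepCompare(copy, new TreePath(null, "<root>"));
    }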

+ 1 - 2
src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java

@@ -17,7 +17,6 @@
  */
 package org.apache.hadoop.tools.rumen;
 
-import org.apache.hadoop.mapred.TaskStatus;
 import org.apache.hadoop.mapred.TaskStatus.State;
 
 /**
@@ -38,7 +37,7 @@ public abstract class TaskAttemptInfo {
   }
 
   /**
-   * Get the final {@link TaskStatus.State} of the task-attempt.
+   * Get the final {@link State} of the task-attempt.
    * 
    * @return the final <code>State</code> of the task-attempt
    */

+ 14 - 0
src/tools/org/apache/hadoop/tools/rumen/TaskInfo.java

@@ -23,14 +23,22 @@ public class TaskInfo {
   private final long bytesOut;
   private final int recsOut;
   private final long maxMemory;
+  private final ResourceUsageMetrics metrics;
 
   public TaskInfo(long bytesIn, int recsIn, long bytesOut, int recsOut,
       long maxMemory) {
+    this(bytesIn, recsIn, bytesOut, recsOut, maxMemory, 
+         new ResourceUsageMetrics());
+  }
+  
+  public TaskInfo(long bytesIn, int recsIn, long bytesOut, int recsOut,
+                  long maxMemory, ResourceUsageMetrics metrics) {
     this.bytesIn = bytesIn;
     this.recsIn = recsIn;
     this.bytesOut = bytesOut;
     this.recsOut = recsOut;
     this.maxMemory = maxMemory;
+    this.metrics = metrics;
   }
 
   /**
@@ -70,4 +78,10 @@ public class TaskInfo {
     return maxMemory;
   }
 
+  /**
+   * @return Resource usage metrics
+   */
+  public ResourceUsageMetrics getResourceUsageMetrics() {
+    return metrics;
+  }
 }
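
A small sketch of the extended TaskInfo constructor, which threads an attempt's resource usage through to consumers such as ZombieJob (see its diff below); all values here are made up:

    // Sketch only: TaskInfo can now carry ResourceUsageMetrics.
    ResourceUsageMetrics metrics = new ResourceUsageMetrics();
    metrics.setCumulativeCpuUsage(200);     // CPU_MILLISECONDS
    metrics.setPhysicalMemoryUsage(75);     // PHYSICAL_MEMORY_BYTES

    TaskInfo info = new TaskInfo(1024L, 10, 2048L, 20, 640L, metrics);
    long cpu = info.getResourceUsageMetrics().getCumulativeCpuUsage();  // 200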

+ 111 - 36
src/tools/org/apache/hadoop/tools/rumen/TraceBuilder.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Comparator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Properties;
@@ -35,6 +36,7 @@ import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.mapred.JobHistory;
 import org.apache.hadoop.util.Tool;
@@ -49,7 +51,6 @@ public class TraceBuilder extends Configured implements Tool {
   static final int RUN_METHOD_FAILED_EXIT_CODE = 3;
 
   TopologyBuilder topologyBuilder = new TopologyBuilder();
-  JobConfigurationParser jobConfParser;
   Outputter<LoggedJob> traceWriter;
   Outputter<LoggedNetworkTopology> topologyWriter;
 
@@ -67,48 +68,136 @@ public class TraceBuilder extends Configured implements Tool {
         IOException, ClassNotFoundException {
       int switchTop = 0;
 
+      // to determine if the input paths should be recursively scanned or not
+      boolean doRecursiveTraversal = false;
+
       while (args[switchTop].startsWith("-")) {
         if (args[switchTop].equalsIgnoreCase("-demuxer")) {
           inputDemuxerClass =
-              Class.forName(args[++switchTop]).asSubclass(InputDemuxer.class);
-
-          ++switchTop;
+            Class.forName(args[++switchTop]).asSubclass(InputDemuxer.class);
+        } else if (args[switchTop].equalsIgnoreCase("-recursive")) {
+          doRecursiveTraversal = true;
         }
+        ++switchTop;
       }
 
       traceOutput = new Path(args[0 + switchTop]);
       topologyOutput = new Path(args[1 + switchTop]);
 
       for (int i = 2 + switchTop; i < args.length; ++i) {
+        inputs.addAll(processInputArgument(
+            args[i], conf, doRecursiveTraversal));
+      }
+    }
 
-        Path thisPath = new Path(args[i]);
-
-        FileSystem fs = thisPath.getFileSystem(conf);
-        if (fs.getFileStatus(thisPath).isDir()) {
-          FileStatus[] statuses = fs.listStatus(thisPath);
-
-          List<String> dirNames = new ArrayList<String>();
+    /**
+     * Compare the history file names, not the full paths.
+     * Job history file name format is such that doing lexicographic sort on the
+     * history file names should result in the order of jobs' submission times.
+     */
+    private static class HistoryLogsComparator
+    implements Comparator<FileStatus> {
+      @Override
+      public int compare(FileStatus file1, FileStatus file2) {
+        return file1.getPath().getName().compareTo(
+            file2.getPath().getName());
+      }
+    }
 
-          for (FileStatus s : statuses) {
-            if (s.isDir()) continue;
-            String name = s.getPath().getName();
+    private static class InputFilter implements PathFilter {
+      public boolean accept(Path path) {
+        return !(path.getName().endsWith(".crc")
+                 || path.getName().startsWith("."));
+      }
+    }
 
-            if (!(name.endsWith(".crc") || name.startsWith("."))) {
-              dirNames.add(name);
+    /**
+     * List files (possibly recursively) and get their statuses.
+     * @param path The path of the file/dir for which ls is to be done
+     * @param fs FileSystem of the path
+     * @param filter the user-supplied path filter
+     * @return the list of file statuses under the given path
+     */
+    static List<FileStatus> listFiles(Path path, FileSystem fs,
+        PathFilter filter, boolean isRecursive) throws IOException {
+      List<FileStatus> list = new ArrayList<FileStatus>();
+      FileStatus[] statuses = fs.listStatus(path, filter);
+      if (statuses != null) {
+        for (FileStatus status : statuses) {
+          if (status.isDir()) {
+            if (isRecursive) {
+              list.addAll(listFiles(status.getPath(), fs, filter, isRecursive));
             }
+          } else {
+            list.add(status);
           }
+        }
+      }
+      return list;
+    }
+
+    /**
+     * Processes the input file/folder argument. If the input is a file,
+     * then it is directly considered for further processing by TraceBuilder.
+     * If the input is a folder, then all the history logs in the
+     * input folder are considered for further processing.
+     *
+     * If isRecursive is true, then the input path is recursively scanned
+     * for job history logs for further processing by TraceBuilder.
+     *
+     * NOTE: If the input represents a globbed path, then it is first flattened
+     *       and then the individual paths represented by the globbed input
+     *       path are considered for further processing.
+     *
+     * @param input        input path, possibly globbed
+     * @param conf         configuration
+     * @param isRecursive  whether to recursively traverse the input paths to
+     *                     find history logs
+     * @return the input history log files' paths
+     * @throws FileNotFoundException
+     * @throws IOException
+     */
+    static List<Path> processInputArgument(String input, Configuration conf,
+        boolean isRecursive) throws FileNotFoundException, IOException {
+      Path inPath = new Path(input);
+      FileSystem fs = inPath.getFileSystem(conf);
+      FileStatus[] inStatuses = fs.globStatus(inPath);
+
+      List<Path> inputPaths = new LinkedList<Path>();
+      if (inStatuses == null || inStatuses.length == 0) {
+        return inputPaths;
+      }
 
-          String[] sortableNames = dirNames.toArray(new String[1]);
+      for (FileStatus inStatus : inStatuses) {
+        Path thisPath = inStatus.getPath();
+        if (inStatus.isDir()) {
 
-          Arrays.sort(sortableNames);
+          // Find list of files in this path (recursively if the -recursive
+          // option is specified).
+          List<FileStatus> historyLogs = new ArrayList<FileStatus>();
 
-          for (String dirName : sortableNames) {
-            inputs.add(new Path(thisPath, dirName));
+          List<FileStatus> statuses = listFiles(thisPath, fs, new InputFilter(),
+              isRecursive);
+          for (FileStatus child : statuses) {
+            historyLogs.add(child);
+          }
+          if (historyLogs.size() > 0) {
+            // Add the sorted history log file names in this path to the
+            // inputPaths list
+            FileStatus[] sortableNames =
+              historyLogs.toArray(new FileStatus[historyLogs.size()]);
+            Arrays.sort(sortableNames, new HistoryLogsComparator());
+
+            for (FileStatus historyLog : sortableNames) {
+              inputPaths.add(historyLog.getPath());
+            }
           }
         } else {
-          inputs.add(thisPath);
+          inputPaths.add(thisPath);
         }
       }
+
+      return inputPaths;
     }
   }
 
@@ -169,25 +258,11 @@ public class TraceBuilder extends Configured implements Tool {
     return jobId != null;
   }
 
-  private void addInterestedProperties(List<String> interestedProperties,
-      String[] names) {
-    for (String name : names) {
-      interestedProperties.add(name);
-    }
-  }
 
   @SuppressWarnings("unchecked")
   @Override
   public int run(String[] args) throws Exception {
     MyOptions options = new MyOptions(args, getConf());
-    List<String> interestedProperties = new ArrayList<String>();
-    {
-      for (JobConfPropertyNames candidateSet : JobConfPropertyNames.values()) {
-        addInterestedProperties(interestedProperties, candidateSet
-            .getCandidates());
-      }
-    }
-    jobConfParser = new JobConfigurationParser(interestedProperties);
     traceWriter = options.clazzTraceOutputter.newInstance();
     traceWriter.init(options.traceOutput, getConf());
     topologyWriter = new DefaultOutputter<LoggedNetworkTopology>();
@@ -232,7 +307,7 @@ public class TraceBuilder extends Configured implements Tool {
               }
 
               if (isJobConfXml(filePair.first(), ris)) {
-                processJobConf(jobConfParser.parse(ris.rewind()), jobBuilder);
+                processJobConf(JobConfigurationParser.parse(ris.rewind()), jobBuilder);
               } else {
                 parser = JobHistoryParserFactory.getParser(ris);
                 if (parser == null) {
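
The option parsing above adds a -recursive switch and folds globbed or directory inputs into a sorted list of history logs. A hedged sketch of a programmatic invocation; the class name and paths are illustrative, and the positional arguments follow the <trace output> <topology output> <input ...> order parsed above:

    // Sketch only: driving TraceBuilder with the new -recursive option.
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.tools.rumen.TraceBuilder;
    import org.apache.hadoop.util.ToolRunner;

    public class RunTraceBuilder {
      public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new Configuration(), new TraceBuilder(),
            new String[] { "-recursive",
                           "file:///tmp/job-trace.json",   // trace output
                           "file:///tmp/topology.json",    // topology output
                           "file:///tmp/history-logs" });  // input dir, scanned recursively
        System.exit(exitCode);
      }
    }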

+ 18 - 2
src/tools/org/apache/hadoop/tools/rumen/ZombieJob.java

@@ -120,8 +120,20 @@ public class ZombieJob implements JobStory {
   @Override
   public synchronized JobConf getJobConf() {
     if (jobConf == null) {
-      // TODO : add more to jobConf ?
       jobConf = new JobConf();
+      
+      // Add parameters from the configuration in the job trace
+      //
+      // The reason why the job configuration parameters, as seen in the jobconf
+      // file, are added first because the specialized values obtained from 
+      // Rumen should override the job conf values.
+      //
+      for (Map.Entry<Object, Object> entry : job.getJobProperties().entrySet()) {
+        jobConf.set(entry.getKey().toString(), entry.getValue().toString());
+      }
+      
+      //TODO Eliminate parameters that are already copied from the job's 
+      // configuration file.
       jobConf.setJobName(getName());
       jobConf.setUser(getUser());
       jobConf.setNumMapTasks(getNumberMaps());
@@ -622,6 +634,7 @@ public class ZombieJob implements JobStory {
     long outputBytes = -1;
     long outputRecords = -1;
     long heapMegabytes = -1;
+    ResourceUsageMetrics metrics = new ResourceUsageMetrics();
 
     Values type = loggedTask.getTaskType();
     if ((type != Values.MAP) && (type != Values.REDUCE)) {
@@ -656,12 +669,15 @@ public class ZombieJob implements JobStory {
             (job.getJobReduceMB() > 0) ? job.getJobReduceMB() : job
                 .getHeapMegabytes();
       }
+      // set the resource usage metrics
+      metrics = attempt.getResourceUsageMetrics();
       break;
     }
 
     TaskInfo taskInfo =
         new TaskInfo(inputBytes, (int) inputRecords, outputBytes,
-            (int) outputRecords, (int) heapMegabytes);
+            (int) outputRecords, (int) heapMegabytes,
+            metrics);
     return taskInfo;
   }
 

+ 377 - 0
src/tools/org/apache/hadoop/tools/rumen/package-info.java

@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Rumen is a data extraction and analysis tool built for 
+ * <a href="http://hadoop.apache.org/">Apache Hadoop</a>. Rumen mines job history
+ * logs to extract meaningful data and stores it into an easily-parsed format.
+ * 
+ * The default output format of Rumen is <a href="http://www.json.org">JSON</a>.
+ * Rumen uses the <a href="http://jackson.codehaus.org/">Jackson</a> library to 
+ * create JSON objects.
+ * <br><br>
+ * 
+ * The following classes can be used to programmatically invoke Rumen:
+ * <ol>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobConfigurationParser}<br>
+ *      A parser to parse and filter out interesting properties from job 
+ *      configuration.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to parse and filter out job name
+ *        
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestedProperties = new ArrayList<String>();
+ *        interestedProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp = 
+ *          new JobConfigurationParser(interestedProperties);
+ *
+ *        InputStream in = new FileInputStream(conf_filename);
+ *        Properties parsedProperties = jcp.parse(in);
+ *     </code>
+ *     </pre>
+ *     Some of the commonly used interesting properties are enumerated in 
+ *     {@link org.apache.hadoop.tools.rumen.JobConfPropertyNames}. <br><br>
+ *     
+ *     <b>Note:</b>
+ *        A single instance of {@link org.apache.hadoop.tools.rumen.JobConfigurationParser} 
+ *        can be used to parse multiple job configuration files. 
+ *     
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobHistoryParser} <br>
+ *      A parser that parses job history files. It is an interface and actual 
+ *      implementations are defined as Enum in 
+ *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory}. Note that
+ *      {@link org.apache.hadoop.tools.rumen.RewindableInputStream}<br>
+ *      is a wrapper class around {@link java.io.InputStream} to make the input 
+ *      stream rewindable.
+ *      
+ *      <br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to parse a current job history file i.e a job history 
+ *        // file for which the version is known
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        
+ *        InputStream in = new FileInputStream(filename);
+ *        
+ *        HistoryEvent event = null;
+ *        
+ *        JobHistoryParser parser = new CurrentJHParser(in);
+ *        
+ *        event = parser.nextEvent();
+ *        // process all the events
+ *        while (event != null) {
+ *          // ... process all event
+ *          event = parser.nextEvent();
+ *        }
+ *        
+ *        // close the parser and the underlying stream
+ *        parser.close();
+ *      </code>
+ *      </pre>
+ *      
+ *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory} provides a 
+ *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory#getParser(org.apache.hadoop.tools.rumen.RewindableInputStream)}
+ *      API to get a parser for parsing the job history file. Note that this
+ *      API can be used if the job history version is unknown.<br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to parse a job history for which the version is not 
+ *        // known i.e using JobHistoryParserFactory.getParser()
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        
+ *        InputStream in = new FileInputStream(filename);
+ *        RewindableInputStream ris = new RewindableInputStream(in);
+ *        
+ *        // JobHistoryParserFactory will check and return a parser that can
+ *        // parse the file
+ *        JobHistoryParser parser = JobHistoryParserFactory.getParser(ris);
+ *        
+ *        // now use the parser to parse the events
+ *        HistoryEvent event = parser.nextEvent();
+ *        while (event != null) {
+ *          // ... process the event
+ *          event = parser.nextEvent();
+ *        }
+ *        
+ *        parser.close();
+ *      </code>
+ *      </pre>
+ *      <b>Note:</b>
+ *        Create one instance to parse a job history log and close it after use.
+ *  </li>
+ *  <li>
+ *    {@link org.apache.hadoop.tools.rumen.TopologyBuilder}<br>
+ *      Builds the cluster topology based on the job history events. Every 
+ *      job history file consists of events. Each event can be represented using
+ *      {@link org.apache.hadoop.mapreduce.jobhistory.HistoryEvent}. 
+ *      These events can be passed to {@link org.apache.hadoop.tools.rumen.TopologyBuilder} using 
+ *      {@link org.apache.hadoop.tools.rumen.TopologyBuilder#process(org.apache.hadoop.mapreduce.jobhistory.HistoryEvent)}.
+ *      A cluster topology can be represented using {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}.
+ *      Once all the job history events are processed, the cluster 
+ *      topology can be obtained using {@link org.apache.hadoop.tools.rumen.TopologyBuilder#build()}.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // Building topology for a job history file represented using 
+ *        // 'filename' and the corresponding configuration file represented 
+ *        // using 'conf_filename'
+ *        String filename = .. // assume the job history filename here
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ *        InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *        
+ *        TopologyBuilder tb = new TopologyBuilder();
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestingProperties = new ArrayList<String>();
+ *        // add the interesting properties here
+ *        interestingProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp = 
+ *          new JobConfigurationParser(interestingProperties);
+ *        
+ *        // parse the configuration file
+ *        tb.process(jcp.parse(jobConfInputStream));
+ *        
+ *        // read the job history file and pass it to the 
+ *        // TopologyBuilder.
+ *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ *        HistoryEvent e;
+ *        
+ *        // read and process all the job history events
+ *        while ((e = parser.nextEvent()) != null) {
+ *          tb.process(e);
+ *        }
+ *        
+ *        LoggedNetworkTopology topology = tb.build();
+ *      </code>
+ *      </pre>
+ *  </li>
+ *  <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobBuilder}<br>
+ *      Summarizes a job history file.
+ *      {@link org.apache.hadoop.tools.rumen.TraceBuilder} provides  
+ *      {@link org.apache.hadoop.tools.rumen.TraceBuilder#extractJobID(String)} 
+ *      API for extracting job id from job history or job configuration files
+ *      which can be used for instantiating {@link org.apache.hadoop.tools.rumen.JobBuilder}. 
+ *      {@link org.apache.hadoop.tools.rumen.JobBuilder} generates a 
+ *      {@link org.apache.hadoop.tools.rumen.LoggedJob} object via 
+ *      {@link org.apache.hadoop.tools.rumen.JobBuilder#build()}. 
+ *      See {@link org.apache.hadoop.tools.rumen.LoggedJob} for more details.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to summarize a current job history file 'filename'
+ *        // and the corresponding configuration file 'conf_filename'
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ *        InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *        
+ *        String jobID = TraceBuilder.extractJobID(filename);
+ *        JobBuilder jb = new JobBuilder(jobID);
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestingProperties = new ArrayList<String>();
+ *        // add the interesting properties here
+ *        interestingProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp = 
+ *          new JobConfigurationParser(interestingProperties);
+ *        
+ *        // parse the configuration file
+ *        jb.process(jcp.parse(jobConfInputStream));
+ *        
+ *        // parse the job history file
+ *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ *        try {
+ *          HistoryEvent e;
+ *          // read and process all the job history events
+ *          while ((e = parser.nextEvent()) != null) {
+ *            jb.process(e);
+ *          }
+ *        } finally {
+ *          parser.close();
+ *        }
+ *        
+ *        LoggedJob job = jb.build();
+ *      </code>
+ *      </pre>
+ *     <b>Note:</b>
+ *       The order of parsing the job configuration file or job history file is 
+ *       not important. Create one instance to parse the history file and job 
+ *       configuration.
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.DefaultOutputter}<br>
+ *      Implements {@link org.apache.hadoop.tools.rumen.Outputter} and writes 
+ *      JSON object in text format to the output file. 
+ *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter} can be 
+ *      initialized with the output filename.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:  
+ *      <pre>
+ *      <code>
+ *        // An example to summarize a current job history file represented by
+ *        // 'filename' and the configuration filename represented using 
+ *        // 'conf_filename'. Also output the job summary to 'out.json' along 
+ *        // with the cluster topology to 'topology.json'.
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        Configuration conf = new Configuration();
+ *        DefaultOutputter outputter = new DefaultOutputter();
+ *        outputter.init(new Path("out.json"), conf);
+ *        
+ *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ *        InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *        
+ *        // extract the job-id from the filename
+ *        String jobID = TraceBuilder.extractJobID(filename);
+ *        JobBuilder jb = new JobBuilder(jobID);
+ *        TopologyBuilder tb = new TopologyBuilder();
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestingProperties = new ArrayList<String>();
+ *        // add the interesting properties here
+ *        interestingProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp =
+ *          new JobConfigurationParser(interestingProperties);
+ *          
+ *        // parse the configuration file
+ *        tb.process(jcp.parse(jobConfInputStream));
+ *        
+ *        // read the job history file and pass it to the
+ *        // TopologyBuilder.
+ *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ *        HistoryEvent e;
+ *        while ((e = parser.nextEvent()) != null) {
+ *          jb.process(e);
+ *          tb.process(e);
+ *        }
+ *        
+ *        LoggedJob j = jb.build();
+ *        
+ *        // serialize the job summary in json (text) format
+ *        outputter.output(j);
+ *        
+ *        // close
+ *        outputter.close();
+ *        
+ *        outputter.init(new Path("topology.json"), conf);
+ *        
+ *        // get the cluster topology using TopologyBuilder
+ *        LoggedNetworkTopology topology = tb.build();
+ *        
+ *        // serialize the cluster topology in json (text) format
+ *        outputter.output(topology);
+ *        
+ *        // close
+ *        outputter.close();
+ *      </code>
+ *      </pre>
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobTraceReader}<br>
+ *      A reader for reading {@link org.apache.hadoop.tools.rumen.LoggedJob} serialized using 
+ *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.LoggedJob} 
+ *      provides various APIs for extracting job details. Following are the most
+ *      commonly used ones
+ *        <ul>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getMapTasks()} : Get the map tasks</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getReduceTasks()} : Get the reduce tasks</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getOtherTasks()} : Get the setup/cleanup tasks</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getOutcome()} : Get the job's outcome</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getSubmitTime()} : Get the job's submit time</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getFinishTime()} : Get the job's finish time</li>
+ *        </ul>
+ *        
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to read job summary from a trace file 'out.json'.
+ *        Configuration conf = new Configuration();
+ *        JobTraceReader reader = new JobTraceReader(new Path("out.json"), conf);
+ *        LoggedJob job = reader.getNext();
+ *        while (job != null) {
+ *          // .... process job level information
+ *          for (LoggedTask task : job.getMapTasks()) {
+ *            // process all the map tasks in the job
+ *            for (LoggedTaskAttempt attempt : task.getAttempts()) {
+ *              // process all the map task attempts in the job
+ *            }
+ *          }
+ *          
+ *          // get the next job
+ *          job = reader.getNext();
+ *        }
+ *        reader.close();
+ *      </code>
+ *      </pre>         
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader}<br>
+ *      A reader to read {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology} serialized using 
+ *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader} can be 
+ *      initialized using the serialized topology filename. 
+ *      {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader#get()} can
+ *      be used to get the 
+ *      {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}. 
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to read the cluster topology from a topology output file
+ *        // 'topology.json'
+ *        Configuration conf = new Configuration();
+ *        ClusterTopologyReader reader = 
+ *          new ClusterTopologyReader(new Path("topology.json"), conf);
+ *        LoggedNetworkTopology topology  = reader.get();
+ *        for (LoggedNetworkTopology t : topology.getChildren()) {
+ *          // process the cluster topology
+ *        }
+ *        reader.close();
+ *      </code>
+ *      </pre>
+ *   </li>
+ * </ol>     
+ */
+
+package org.apache.hadoop.tools.rumen;