@@ -23,6 +23,7 @@ import static org.apache.hadoop.mapred.Task.Counter.COMBINE_OUTPUT_RECORDS;
 import static org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_BYTES;
 import static org.apache.hadoop.mapred.Task.Counter.MAP_INPUT_RECORDS;
 import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_BYTES;
+import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_MATERIALIZED_BYTES;
 import static org.apache.hadoop.mapred.Task.Counter.MAP_OUTPUT_RECORDS;
 
 import java.io.DataInput;
@@ -43,6 +44,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileSystem.Statistics;
 import org.apache.hadoop.fs.LocalDirAllocator;
 import org.apache.hadoop.fs.LocalFileSystem;
 import org.apache.hadoop.fs.Path;
@@ -60,6 +62,7 @@ import org.apache.hadoop.io.serializer.Serializer;
 import org.apache.hadoop.mapred.IFile.Writer;
 import org.apache.hadoop.mapred.Merger.Segment;
 import org.apache.hadoop.mapred.SortedRanges.SkipRangeIterator;
+import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapreduce.split.JobSplit;
 import org.apache.hadoop.mapreduce.split.JobSplit.SplitMetaInfo;
 import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitIndex;
@@ -168,17 +171,33 @@ class MapTask extends Task {
     private RecordReader<K,V> rawIn;
     private Counters.Counter inputByteCounter;
     private Counters.Counter inputRecordCounter;
+    private Counters.Counter fileInputByteCounter;
     private InputSplit split;
     private TaskReporter reporter;
     private long beforePos = -1;
     private long afterPos = -1;
+    private long bytesInPrev = -1;
+    private long bytesInCurr = -1;
+    private final Statistics fsStats;
 
-    TrackedRecordReader(InputSplit split, JobConf job, RecordReader<K,V> raw,
-                        TaskReporter reporter)
-      throws IOException{
-      rawIn = raw;
+    TrackedRecordReader(InputSplit split, JobConf job, TaskReporter reporter)
+        throws IOException {
       inputRecordCounter = reporter.getCounter(MAP_INPUT_RECORDS);
       inputByteCounter = reporter.getCounter(MAP_INPUT_BYTES);
+      fileInputByteCounter = reporter
+          .getCounter(FileInputFormat.Counter.BYTES_READ);
+
+      Statistics matchedStats = null;
+      if (split instanceof FileSplit) {
+        matchedStats = getFsStatistics(((FileSplit) split).getPath());
+      }
+      fsStats = matchedStats;
+
+      bytesInPrev = getInputBytes(fsStats);
+      rawIn = job.getInputFormat().getRecordReader(split, job, reporter);
+      bytesInCurr = getInputBytes(fsStats);
+      fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
+
       this.reporter = reporter;
       this.split = split;
       conf = job;
@@ -194,7 +213,7 @@ class MapTask extends Task {
 
     public synchronized boolean next(K key, V value)
     throws IOException {
-      boolean ret = moveToNext(key, value);
+      boolean ret = moveToNext(key, value);
       if (ret) {
         incrCounters();
       }
@@ -204,6 +223,7 @@ class MapTask extends Task {
     protected void incrCounters() {
       inputRecordCounter.increment(1);
       inputByteCounter.increment(afterPos - beforePos);
+      fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
     }
 
     protected synchronized boolean moveToNext(K key, V value)
@@ -212,8 +232,10 @@ class MapTask extends Task {
       try {
         reporter.setProgress(getProgress());
         beforePos = getPos();
+        bytesInPrev = getInputBytes(fsStats);
         ret = rawIn.next(key, value);
         afterPos = getPos();
+        bytesInCurr = getInputBytes(fsStats);
       } catch (IOException ioe) {
         if (split instanceof FileSplit) {
           LOG.error("IO error in map input file " + conf.get("map.input.file"));
@@ -225,13 +247,24 @@ class MapTask extends Task {
     }
 
     public long getPos() throws IOException { return rawIn.getPos(); }
-    public void close() throws IOException { rawIn.close(); }
+
+    public void close() throws IOException {
+      bytesInPrev = getInputBytes(fsStats);
+      rawIn.close();
+      bytesInCurr = getInputBytes(fsStats);
+      fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
+    }
+
     public float getProgress() throws IOException {
       return rawIn.getProgress();
     }
     TaskReporter getTaskReporter() {
       return reporter;
     }
+
+    private long getInputBytes(Statistics stats) {
+      return stats == null ? 0 : stats.getBytesRead();
+    }
   }
 
   /**
@@ -246,9 +279,9 @@ class MapTask extends Task {
     private Counters.Counter skipRecCounter;
     private long recIndex = -1;
 
-    SkippingRecordReader(RecordReader<K,V> raw, TaskUmbilicalProtocol umbilical,
-                         TaskReporter reporter) throws IOException{
-      super(null, conf, raw, reporter);
+    SkippingRecordReader(TaskUmbilicalProtocol umbilical, TaskReporter reporter)
+        throws IOException {
+      super(null, conf, reporter);
       this.umbilical = umbilical;
       this.skipRecCounter = reporter.getCounter(Counter.MAP_SKIPPED_RECORDS);
       this.toWriteSkipRecs = toWriteSkipRecs() &&
@@ -380,11 +413,9 @@ class MapTask extends Task {
     updateJobWithSplit(job, inputSplit);
     reporter.setInputSplit(inputSplit);
 
-    RecordReader<INKEY,INVALUE> rawIn = // open input
-      job.getInputFormat().getRecordReader(inputSplit, job, reporter);
     RecordReader<INKEY,INVALUE> in = isSkipping() ?
-      new SkippingRecordReader<INKEY,INVALUE>(rawIn, umbilical, reporter) :
-      new TrackedRecordReader<INKEY,INVALUE>(inputSplit, job, rawIn, reporter);
+      new SkippingRecordReader<INKEY,INVALUE>(umbilical, reporter) :
+      new TrackedRecordReader<INKEY,INVALUE>(inputSplit, job, reporter);
     job.setBoolean("mapred.skip.on", isSkipping());
 
 
@@ -427,16 +458,31 @@ class MapTask extends Task {
     extends org.apache.hadoop.mapreduce.RecordReader<K,V> {
     private final org.apache.hadoop.mapreduce.RecordReader<K,V> real;
     private final org.apache.hadoop.mapreduce.Counter inputRecordCounter;
+    private final org.apache.hadoop.mapreduce.Counter fileInputByteCounter;
     private final TaskReporter reporter;
     private org.apache.hadoop.mapreduce.InputSplit inputSplit;
+    private final JobConf job;
+    private final Statistics fsStats;
 
     NewTrackingRecordReader(org.apache.hadoop.mapreduce.InputSplit split,
-        org.apache.hadoop.mapreduce.RecordReader<K,V> real,
-        TaskReporter reporter) {
-      this.real = real;
+        org.apache.hadoop.mapreduce.InputFormat inputFormat,
+        TaskReporter reporter, JobConf job,
+        org.apache.hadoop.mapreduce.TaskAttemptContext taskContext)
+        throws IOException, InterruptedException {
+      this.real = inputFormat.createRecordReader(split, taskContext);
       this.reporter = reporter;
       this.inputSplit = split;
+      this.job = job;
       this.inputRecordCounter = reporter.getCounter(MAP_INPUT_RECORDS);
+      this.fileInputByteCounter = reporter
+          .getCounter(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.Counter.BYTES_READ);
+
+      Statistics matchedStats = null;
+      if (split instanceof org.apache.hadoop.mapreduce.lib.input.FileSplit) {
+        matchedStats = getFsStatistics(((org.apache.hadoop.mapreduce.lib.input.FileSplit) split)
+            .getPath());
+      }
+      fsStats = matchedStats;
     }
 
     @Override
@@ -463,16 +509,23 @@ class MapTask extends Task {
     public void initialize(org.apache.hadoop.mapreduce.InputSplit split,
                            org.apache.hadoop.mapreduce.TaskAttemptContext context
                            ) throws IOException, InterruptedException {
+      long bytesInPrev = getInputBytes(fsStats);
       real.initialize(split, context);
+      long bytesInCurr = getInputBytes(fsStats);
+      fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
     }
 
     @Override
     public boolean nextKeyValue() throws IOException, InterruptedException {
       boolean result = false;
       try {
+        long bytesInPrev = getInputBytes(fsStats);
         result = real.nextKeyValue();
+        long bytesInCurr = getInputBytes(fsStats);
+
         if (result) {
           inputRecordCounter.increment(1);
+          fileInputByteCounter.increment(bytesInCurr - bytesInPrev);
         }
         reporter.setProgress(getProgress());
       } catch (IOException ioe) {
@@ -486,6 +539,10 @@ class MapTask extends Task {
       }
       return result;
     }
+
+    private long getInputBytes(Statistics stats) {
+      return stats == null ? 0 : stats.getBytesRead();
+    }
   }
 
   /**
@@ -538,15 +595,29 @@ class MapTask extends Task {
     private final TaskReporter reporter;
 
     private final Counters.Counter mapOutputRecordCounter;
+    private final Counters.Counter fileOutputByteCounter;
+    private final Statistics fsStats;
 
     @SuppressWarnings("unchecked")
     NewDirectOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
         JobConf job, TaskUmbilicalProtocol umbilical, TaskReporter reporter)
         throws IOException, ClassNotFoundException, InterruptedException {
       this.reporter = reporter;
-      out = outputFormat.getRecordWriter(taskContext);
+      Statistics matchedStats = null;
+      if (outputFormat instanceof org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) {
+        matchedStats = getFsStatistics(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
+            .getOutputPath(jobContext));
+      }
+      fsStats = matchedStats;
       mapOutputRecordCounter =
         reporter.getCounter(MAP_OUTPUT_RECORDS);
+      fileOutputByteCounter = reporter
+          .getCounter(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.Counter.BYTES_WRITTEN);
+
+      long bytesOutPrev = getOutputBytes(fsStats);
+      out = outputFormat.getRecordWriter(taskContext);
+      long bytesOutCurr = getOutputBytes(fsStats);
+      fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
     }
 
     @Override
@@ -554,7 +625,10 @@ class MapTask extends Task {
     public void write(K key, V value)
         throws IOException, InterruptedException {
       reporter.progress();
+      long bytesOutPrev = getOutputBytes(fsStats);
       out.write(key, value);
+      long bytesOutCurr = getOutputBytes(fsStats);
+      fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
       mapOutputRecordCounter.increment(1);
     }
 
@@ -563,9 +637,16 @@ class MapTask extends Task {
         throws IOException,InterruptedException {
       reporter.progress();
       if (out != null) {
+        long bytesOutPrev = getOutputBytes(fsStats);
         out.close(context);
+        long bytesOutCurr = getOutputBytes(fsStats);
+        fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
       }
     }
+
+    private long getOutputBytes(Statistics stats) {
+      return stats == null ? 0 : stats.getBytesWritten();
+    }
   }
 
   private class NewOutputCollector<K,V>
@@ -639,9 +720,8 @@ class MapTask extends Task {
 
     org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
       new NewTrackingRecordReader<INKEY,INVALUE>
-        (split, inputFormat.createRecordReader(split, taskContext),
-         reporter);
-
+        (split, inputFormat, reporter, job, taskContext);
+
     job.setBoolean("mapred.skip.on", isSkipping());
     org.apache.hadoop.mapreduce.RecordWriter output = null;
     org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
@@ -704,6 +784,8 @@ class MapTask extends Task {
     private TaskReporter reporter = null;
 
     private final Counters.Counter mapOutputRecordCounter;
+    private final Counters.Counter fileOutputByteCounter;
+    private final Statistics fsStats;
 
     @SuppressWarnings("unchecked")
     public DirectMapOutputCollector(TaskUmbilicalProtocol umbilical,
@@ -712,14 +794,30 @@ class MapTask extends Task {
       String finalName = getOutputName(getPartition());
       FileSystem fs = FileSystem.get(job);
 
-      out = job.getOutputFormat().getRecordWriter(fs, job, finalName, reporter);
-
+
+      OutputFormat<K, V> outputFormat = job.getOutputFormat();
+
+      Statistics matchedStats = null;
+      if (outputFormat instanceof FileOutputFormat) {
+        matchedStats = getFsStatistics(FileOutputFormat.getOutputPath(job));
+      }
+      fsStats = matchedStats;
       mapOutputRecordCounter = reporter.getCounter(MAP_OUTPUT_RECORDS);
+      fileOutputByteCounter = reporter
+          .getCounter(FileOutputFormat.Counter.BYTES_WRITTEN);
+
+      long bytesOutPrev = getOutputBytes(fsStats);
+      out = job.getOutputFormat().getRecordWriter(fs, job, finalName, reporter);
+      long bytesOutCurr = getOutputBytes(fsStats);
+      fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
     }
 
     public void close() throws IOException {
       if (this.out != null) {
+        long bytesOutPrev = getOutputBytes(fsStats);
         out.close(this.reporter);
+        long bytesOutCurr = getOutputBytes(fsStats);
+        fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
       }
 
     }
@@ -730,10 +828,16 @@ class MapTask extends Task {
 
     public void collect(K key, V value, int partition) throws IOException {
       reporter.progress();
+      long bytesOutPrev = getOutputBytes(fsStats);
       out.write(key, value);
+      long bytesOutCurr = getOutputBytes(fsStats);
+      fileOutputByteCounter.increment(bytesOutCurr - bytesOutPrev);
       mapOutputRecordCounter.increment(1);
     }
 
+    private long getOutputBytes(Statistics stats) {
+      return stats == null ? 0 : stats.getBytesWritten();
+    }
   }
 
   class MapOutputBuffer<K extends Object, V extends Object>
@@ -793,6 +897,7 @@ class MapTask extends Task {
     private final Counters.Counter mapOutputByteCounter;
     private final Counters.Counter mapOutputRecordCounter;
     private final Counters.Counter combineOutputCounter;
+    private final Counters.Counter fileOutputByteCounter;
 
     private ArrayList<SpillRecord> indexCacheList;
     private int totalIndexCacheMemory;
@@ -855,6 +960,7 @@ class MapTask extends Task {
       Counters.Counter combineInputCounter =
         reporter.getCounter(COMBINE_INPUT_RECORDS);
       combineOutputCounter = reporter.getCounter(COMBINE_OUTPUT_RECORDS);
+      fileOutputByteCounter = reporter.getCounter(MAP_OUTPUT_MATERIALIZED_BYTES);
       // compression
       if (job.getCompressMapOutput()) {
         Class<? extends CompressionCodec> codecClass =
@@ -1205,6 +1311,8 @@ class MapTask extends Task {
       // release sort buffer before the merge
       kvbuffer = null;
       mergeParts();
+      Path outputPath = mapOutputFile.getOutputFile();
+      fileOutputByteCounter.increment(rfs.getFileStatus(outputPath).getLen());
     }
 
     public void close() { }