Browse Source

HADOOP-5103. FileInputFormat now reuses the clusterMap network topology object, which significantly reduces the number of NetworkTopology.add log messages in the JobClient. Contributed by Jothi Padmanabhan.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@746925 13f79535-47bb-0310-9956-ffa450edef68
Devaraj Das 16 years ago
parent
commit
86813f49a0

+ 4 - 0
CHANGES.txt

@@ -208,6 +208,10 @@ Trunk (unreleased changes)
 
 
     HADOOP-5292. Fix NPE in KFS::getBlockLocations. (Sriram Rao via lohit)

+    HADOOP-5103. FileInputFormat now reuses the clusterMap network topology object
+    and that brings down the log messages in the JobClient to do with 
+    NetworkTopology.add significantly. (Jothi Padmanabhan via ddas)
+
 Release 0.20.0 - Unreleased

   INCOMPATIBLE CHANGES

+ 1 - 1
src/core/org/apache/hadoop/net/NetworkTopology.java

@@ -316,7 +316,6 @@ public class NetworkTopology {
      throw new IllegalArgumentException(
        "Not allow to add an inner node: "+NodeBase.getPath(node));
    }
-    LOG.info("Adding a new node: "+NodeBase.getPath(node));
    netlock.writeLock().lock();
    try {
      Node rack = getNode(node.getNetworkLocation());
@@ -326,6 +325,7 @@ public class NetworkTopology {
                                           + " at an illegal network location");
      }
      if (clusterMap.add(node)) {
+        LOG.info("Adding a new node: "+NodeBase.getPath(node));
        if (rack == null) {
          numOfRacks++;
        }

+ 10 - 4
src/mapred/org/apache/hadoop/mapred/FileInputFormat.java

@@ -214,6 +214,7 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
 
 
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
+    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file: files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job);
@@ -226,7 +227,7 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
        long bytesRemaining = length;
        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
          String[] splitHosts = getSplitHosts(blkLocations, 
-              length-bytesRemaining, splitSize);
+              length-bytesRemaining, splitSize, clusterMap);
          splits.add(new FileSplit(path, length-bytesRemaining, splitSize, 
              splitHosts));
          bytesRemaining -= splitSize;
@@ -237,7 +238,7 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
                     blkLocations[blkLocations.length-1].getHosts()));
        }
      } else if (length != 0) {
-        String[] splitHosts = getSplitHosts(blkLocations,0,length);
+        String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap);
        splits.add(new FileSplit(path, 0, length, splitHosts));
      } else { 
        //Create empty hosts array for zero length files
@@ -417,7 +418,8 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
   * @throws IOException
   */
  protected String[] getSplitHosts(BlockLocation[] blkLocations, 
-      long offset, long splitSize)   throws IOException {
+      long offset, long splitSize, NetworkTopology clusterMap)
+  throws IOException {

    int startIndex = getBlockIndex(blkLocations, offset);

@@ -442,7 +444,6 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
    long bytesInLastBlock = bytesInThisBlock;
    int endIndex = index - 1;
    
-    NetworkTopology clusterMap = new NetworkTopology();
    Map <Node,NodeInfo> hostsMap = new IdentityHashMap<Node,NodeInfo>();
    Map <Node,NodeInfo> racksMap = new IdentityHashMap<Node,NodeInfo>();
    String [] allTopos = new String[0];
@@ -486,6 +487,11 @@ public abstract class FileInputFormat<K, V> implements InputFormat<K, V> {
        if (node == null) {
          node = new NodeBase(topo);
          clusterMap.add(node);
+        }
+        
+        nodeInfo = hostsMap.get(node);
+        
+        if (nodeInfo == null) {
          nodeInfo = new NodeInfo(node);
          hostsMap.put(node,nodeInfo);
          parentNode = node.getParent();

+ 5 - 2
src/test/org/apache/hadoop/mapred/TestGetSplitHosts.java

@@ -18,6 +18,8 @@
 package org.apache.hadoop.mapred;

 import org.apache.hadoop.fs.BlockLocation;
+import org.apache.hadoop.net.NetworkTopology;
+
 import junit.framework.TestCase;

 public class TestGetSplitHosts extends TestCase {
@@ -28,6 +30,7 @@ public class TestGetSplitHosts extends TestCase {
     int block1Size = 100, block2Size = 150, block3Size = 75;
     int fileSize = block1Size + block2Size + block3Size;
     int replicationFactor = 3;
+    NetworkTopology clusterMap = new NetworkTopology();
    
     BlockLocation[] bs = new BlockLocation[numBlocks];
    
@@ -72,7 +75,7 @@ public class TestGetSplitHosts extends TestCase {
    
     SequenceFileInputFormat< String, String> sif = 
       new SequenceFileInputFormat<String,String>();
-    String [] hosts = sif.getSplitHosts(bs, 0, fileSize);
+    String [] hosts = sif.getSplitHosts(bs, 0, fileSize, clusterMap);

     // Contributions By Racks are
     // Rack1   175       
@@ -93,7 +96,7 @@ public class TestGetSplitHosts extends TestCase {
     bs[2] = new BlockLocation(block3Names,block3Hosts,block1Size+block2Size,
                                block3Size);

-    hosts = sif.getSplitHosts(bs, 0, fileSize);
+    hosts = sif.getSplitHosts(bs, 0, fileSize, clusterMap);
    
     // host1 makes the highest contribution among all hosts
     // So, that should be returned before others