瀏覽代碼

HADOOP-8470. Add NetworkTopologyWithNodeGroup, a 4-layer implementation of NetworkTopology. Contributed by Junping Du

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1351445 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 13 年之前
父節點
當前提交
ed7040f07b

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -12,6 +12,9 @@ Trunk (unreleased changes)
     HADOOP-8469. Make NetworkTopology class pluggable.  (Junping Du via
     szetszwo)
 
+    HADOOP-8470. Add NetworkTopologyWithNodeGroup, a 4-layer implementation
+    of NetworkTopology.  (Junping Du via szetszwo)
+
   IMPROVEMENTS
 
     HADOOP-8017. Configure hadoop-main pom to get rid of M2E plugin execution

+ 398 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/net/NetworkTopologyWithNodeGroup.java

@@ -0,0 +1,398 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.net;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * The class extends NetworkTopology to represents a cluster of computer with
+ *  a 4-layers hierarchical network topology.
+ * In this network topology, leaves represent data nodes (computers) and inner
+ * nodes represent switches/routers that manage traffic in/out of data centers,
+ * racks or physical host (with virtual switch).
+ * 
+ * @see NetworkTopology
+ */
+@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
+@InterfaceStability.Unstable
+public class NetworkTopologyWithNodeGroup extends NetworkTopology {
+
+  public final static String DEFAULT_NODEGROUP = "/default-nodegroup";
+
+  public NetworkTopologyWithNodeGroup() {
+    clusterMap = new InnerNodeWithNodeGroup(InnerNode.ROOT);
+  }
+
+  @Override
+  protected Node getNodeForNetworkLocation(Node node) {
+    // if node only with default rack info, here we need to add default
+    // nodegroup info
+    if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) {
+      node.setNetworkLocation(node.getNetworkLocation()
+          + DEFAULT_NODEGROUP);
+    }
+    Node nodeGroup = getNode(node.getNetworkLocation());
+    if (nodeGroup == null) {
+      nodeGroup = new InnerNode(node.getNetworkLocation());
+    }
+    return getNode(nodeGroup.getNetworkLocation());
+  }
+
+  @Override
+  public String getRack(String loc) {
+    netlock.readLock().lock();
+    try {
+      loc = InnerNode.normalize(loc);
+      Node locNode = getNode(loc);
+      if (locNode instanceof InnerNodeWithNodeGroup) {
+        InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode;
+        if (node.isRack()) {
+          return loc;
+        } else if (node.isNodeGroup()) {
+          return node.getNetworkLocation();
+        } else {
+          // may be a data center
+          return null;
+        }
+      } else {
+        // not in cluster map, don't handle it
+        return loc;
+      }
+    } finally {
+      netlock.readLock().unlock();
+    }
+  }
+
+  /**
+   * Given a string representation of a node group for a specific network
+   * location
+   * 
+   * @param loc
+   *            a path-like string representation of a network location
+   * @return a node group string
+   */
+  public String getNodeGroup(String loc) {
+    netlock.readLock().lock();
+    try {
+      loc = InnerNode.normalize(loc);
+      Node locNode = getNode(loc);
+      if (locNode instanceof InnerNodeWithNodeGroup) {
+        InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode;
+        if (node.isNodeGroup()) {
+          return loc;
+        } else if (node.isRack()) {
+          // not sure the node group for a rack
+          return null;
+        } else {
+          // may be a leaf node
+          return getNodeGroup(node.getNetworkLocation());
+        }
+      } else {
+        // not in cluster map, don't handle it
+        return loc;
+      }
+    } finally {
+      netlock.readLock().unlock();
+    }
+  }
+
+  @Override
+  public boolean isOnSameRack( Node node1,  Node node2) {
+    if (node1 == null || node2 == null ||
+        node1.getParent() == null || node2.getParent() == null) {
+      return false;
+    }
+      
+    netlock.readLock().lock();
+    try {
+      return isSameParents(node1.getParent(), node2.getParent());
+    } finally {
+      netlock.readLock().unlock();
+    }
+  }
+
+  /**
+   * Check if two nodes are on the same node group (hypervisor) The
+   * assumption here is: each nodes are leaf nodes.
+   * 
+   * @param node1
+   *            one node (can be null)
+   * @param node2
+   *            another node (can be null)
+   * @return true if node1 and node2 are on the same node group; false
+   *         otherwise
+   * @exception IllegalArgumentException
+   *                when either node1 or node2 is null, or node1 or node2 do
+   *                not belong to the cluster
+   */
+  @Override
+  public boolean isOnSameNodeGroup(Node node1, Node node2) {
+    if (node1 == null || node2 == null) {
+      return false;
+    }
+    netlock.readLock().lock();
+    try {
+      return isSameParents(node1, node2);
+    } finally {
+      netlock.readLock().unlock();
+    }
+  }
+
+  /**
+   * Check if network topology is aware of NodeGroup
+   */
+  @Override
+  public boolean isNodeGroupAware() {
+    return true;
+  }
+
+  /** Add a leaf node
+   * Update node counter & rack counter if necessary
+   * @param node node to be added; can be null
+   * @exception IllegalArgumentException if add a node to a leave 
+   *                                     or node to be added is not a leaf
+   */
+  @Override
+  public void add(Node node) {
+    if (node==null) return;
+    if( node instanceof InnerNode ) {
+      throw new IllegalArgumentException(
+        "Not allow to add an inner node: "+NodeBase.getPath(node));
+    }
+    netlock.writeLock().lock();
+    try {
+      Node rack = null;
+
+      // if node only with default rack info, here we need to add default 
+      // nodegroup info
+      if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) {
+        node.setNetworkLocation(node.getNetworkLocation() + 
+            NetworkTopologyWithNodeGroup.DEFAULT_NODEGROUP);
+      }
+      Node nodeGroup = getNode(node.getNetworkLocation());
+      if (nodeGroup == null) {
+        nodeGroup = new InnerNodeWithNodeGroup(node.getNetworkLocation());
+      }
+      rack = getNode(nodeGroup.getNetworkLocation());
+
+      if (rack != null && !(rack instanceof InnerNode)) {
+        throw new IllegalArgumentException("Unexpected data node " 
+            + node.toString() 
+            + " at an illegal network location");
+      }
+      if (clusterMap.add(node)) {
+        LOG.info("Adding a new node: " + NodeBase.getPath(node));
+        if (rack == null) {
+          // We only track rack number here
+          numOfRacks++;
+        }
+      }
+      if(LOG.isDebugEnabled()) {
+        LOG.debug("NetworkTopology became:\n" + this.toString());
+      }
+    } finally {
+      netlock.writeLock().unlock();
+    }
+  }
+
+  /** Remove a node
+   * Update node counter and rack counter if necessary
+   * @param node node to be removed; can be null
+   */
+  @Override
+  public void remove(Node node) {
+    if (node==null) return;
+    if( node instanceof InnerNode ) {
+      throw new IllegalArgumentException(
+          "Not allow to remove an inner node: "+NodeBase.getPath(node));
+    }
+    LOG.info("Removing a node: "+NodeBase.getPath(node));
+    netlock.writeLock().lock();
+    try {
+      if (clusterMap.remove(node)) {
+        Node nodeGroup = getNode(node.getNetworkLocation());
+        if (nodeGroup == null) {
+          nodeGroup = new InnerNode(node.getNetworkLocation());
+        }
+        InnerNode rack = (InnerNode)getNode(nodeGroup.getNetworkLocation());
+        if (rack == null) {
+          numOfRacks--;
+        }
+      }
+      if(LOG.isDebugEnabled()) {
+        LOG.debug("NetworkTopology became:\n" + this.toString());
+      }
+    } finally {
+      netlock.writeLock().unlock();
+    }
+  }
+
+  /** Sort nodes array by their distances to <i>reader</i>
+   * It linearly scans the array, if a local node is found, swap it with
+   * the first element of the array.
+   * If a local node group node is found, swap it with the first element 
+   * following the local node.
+   * If a local rack node is found, swap it with the first element following
+   * the local node group node.
+   * If neither local node, node group node or local rack node is found, put a 
+   * random replica location at position 0.
+   * It leaves the rest nodes untouched.
+   * @param reader the node that wishes to read a block from one of the nodes
+   * @param nodes the list of nodes containing data for the reader
+   */
+  @Override
+  public void pseudoSortByDistance( Node reader, Node[] nodes ) {
+
+    if (reader != null && !this.contains(reader)) {
+      // if reader is not a datanode (not in NetworkTopology tree), we will 
+      // replace this reader with a sibling leaf node in tree.
+      Node nodeGroup = getNode(reader.getNetworkLocation());
+      if (nodeGroup != null && nodeGroup instanceof InnerNode) {
+        InnerNode parentNode = (InnerNode) nodeGroup;
+        // replace reader with the first children of its parent in tree
+        reader = parentNode.getLeaf(0, null);
+      } else {
+        return;
+      }
+    }
+    int tempIndex = 0;
+    int localRackNode = -1;
+    int localNodeGroupNode = -1;
+    if (reader != null) {  
+      //scan the array to find the local node & local rack node
+      for (int i = 0; i < nodes.length; i++) {
+        if (tempIndex == 0 && reader == nodes[i]) { //local node
+          //swap the local node and the node at position 0
+          if (i != 0) {
+            swap(nodes, tempIndex, i);
+          }
+          tempIndex=1;
+
+          if (localRackNode != -1 && (localNodeGroupNode !=-1)) {
+            if (localRackNode == 0) {
+              localRackNode = i;
+            }
+            if (localNodeGroupNode == 0) {
+              localNodeGroupNode = i;
+            }
+            break;
+          }
+        } else if (localNodeGroupNode == -1 && isOnSameNodeGroup(reader, 
+            nodes[i])) {
+          //local node group
+          localNodeGroupNode = i;
+          // node local and rack local are already found
+          if(tempIndex != 0 && localRackNode != -1) break;
+        } else if (localRackNode == -1 && isOnSameRack(reader, nodes[i])) {
+          localRackNode = i;
+          if (tempIndex != 0 && localNodeGroupNode != -1) break;
+        }
+      }
+
+      // swap the local nodegroup node and the node at position tempIndex
+      if(localNodeGroupNode != -1 && localNodeGroupNode != tempIndex) {
+        swap(nodes, tempIndex, localNodeGroupNode);
+        if (localRackNode == tempIndex) {
+          localRackNode = localNodeGroupNode;
+        }
+        tempIndex++;
+      }
+
+      // swap the local rack node and the node at position tempIndex
+      if(localRackNode != -1 && localRackNode != tempIndex) {
+        swap(nodes, tempIndex, localRackNode);
+        tempIndex++;
+      }
+    }
+
+    // put a random node at position 0 if there is not a local/local-nodegroup/
+    // local-rack node
+    if (tempIndex == 0 && localNodeGroupNode == -1 && localRackNode == -1
+        && nodes.length != 0) {
+      swap(nodes, 0, r.nextInt(nodes.length));
+    }
+  }
+
+  /** InnerNodeWithNodeGroup represents a switch/router of a data center, rack
+   * or physical host. Different from a leaf node, it has non-null children.
+   */
+  static class InnerNodeWithNodeGroup extends InnerNode {
+    public InnerNodeWithNodeGroup(String name, String location, 
+        InnerNode parent, int level) {
+      super(name, location, parent, level);
+    }
+
+    public InnerNodeWithNodeGroup(String name, String location) {
+      super(name, location);
+    }
+
+    public InnerNodeWithNodeGroup(String path) {
+      super(path);
+    }
+
+    @Override
+    boolean isRack() {
+      // it is node group
+      if (getChildren().isEmpty()) {
+        return false;
+      }
+
+      Node firstChild = children.get(0);
+
+      if (firstChild instanceof InnerNode) {
+        Node firstGrandChild = (((InnerNode) firstChild).children).get(0);
+        if (firstGrandChild instanceof InnerNode) {
+          // it is datacenter
+          return false;
+        } else {
+          return true;
+        }
+      }
+      return false;
+    }
+
+    /**
+     * Judge if this node represents a node group
+     * 
+     * @return true if it has no child or its children are not InnerNodes
+     */
+    boolean isNodeGroup() {
+      if (children.isEmpty()) {
+        return true;
+      }
+      Node firstChild = children.get(0);
+      if (firstChild instanceof InnerNode) {
+        // it is rack or datacenter
+        return false;
+      }
+      return true;
+    }
+
+    @Override
+    protected InnerNode createParentNode(String parentName) {
+      return new InnerNodeWithNodeGroup(parentName, getPath(this), this,
+          this.getLevel() + 1);
+    }
+
+    @Override
+    protected boolean areChildrenLeaves() {
+      return isNodeGroup();
+    }
+  }
+}

+ 9 - 3
hadoop-common-project/hadoop-common/src/main/resources/core-default.xml

@@ -599,10 +599,9 @@
   </description>
 </property>
 
-<!-- Rack Configuration -->
-
+<!-- Topology Configuration -->
 <property>
-	<name>net.topology.node.switch.mapping.impl</name>
+  <name>net.topology.node.switch.mapping.impl</name>
   <value>org.apache.hadoop.net.ScriptBasedMapping</value>
   <description> The default implementation of the DNSToSwitchMapping. It
     invokes a script specified in net.topology.script.file.name to resolve
@@ -611,6 +610,13 @@
   </description>
 </property>
 
+<property>
+  <name>net.topology.impl</name>
+  <value>org.apache.hadoop.net.NetworkTopology</value>
+  <description> The default implementation of NetworkTopology which is classic three layer one.
+  </description>
+</property>
+
 <property>
   <name>net.topology.script.file.name</name>
   <value></value>

+ 165 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/net/TestNetworkTopologyWithNodeGroup.java

@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.net;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
+
+public class TestNetworkTopologyWithNodeGroup extends TestCase {
+  private final static NetworkTopologyWithNodeGroup cluster = new 
+      NetworkTopologyWithNodeGroup();
+
+  private final static DatanodeDescriptor dataNodes[] = new DatanodeDescriptor[] {
+      DFSTestUtil.getDatanodeDescriptor("1.1.1.1", "/d1/r1/s1"),
+      DFSTestUtil.getDatanodeDescriptor("2.2.2.2", "/d1/r1/s1"),
+      DFSTestUtil.getDatanodeDescriptor("3.3.3.3", "/d1/r1/s2"),
+      DFSTestUtil.getDatanodeDescriptor("4.4.4.4", "/d1/r2/s3"),
+      DFSTestUtil.getDatanodeDescriptor("5.5.5.5", "/d1/r2/s3"),
+      DFSTestUtil.getDatanodeDescriptor("6.6.6.6", "/d1/r2/s4"),
+      DFSTestUtil.getDatanodeDescriptor("7.7.7.7", "/d2/r3/s5"),
+      DFSTestUtil.getDatanodeDescriptor("8.8.8.8", "/d2/r3/s6")
+  };
+
+  private final static NodeBase computeNode = new NodeBase("/d1/r1/s1/h9");
+
+  static {
+    for(int i=0; i<dataNodes.length; i++) {
+      cluster.add(dataNodes[i]);
+    }
+  }
+
+  public void testNumOfChildren() throws Exception {
+    assertEquals(cluster.getNumOfLeaves(), dataNodes.length);
+  }
+
+  public void testNumOfRacks() throws Exception {
+    assertEquals(cluster.getNumOfRacks(), 3);
+  }
+
+  public void testRacks() throws Exception {
+    assertEquals(cluster.getNumOfRacks(), 3);
+    assertTrue(cluster.isOnSameRack(dataNodes[0], dataNodes[1]));
+    assertTrue(cluster.isOnSameRack(dataNodes[1], dataNodes[2]));
+    assertFalse(cluster.isOnSameRack(dataNodes[2], dataNodes[3]));
+    assertTrue(cluster.isOnSameRack(dataNodes[3], dataNodes[4]));
+    assertTrue(cluster.isOnSameRack(dataNodes[4], dataNodes[5]));
+    assertFalse(cluster.isOnSameRack(dataNodes[5], dataNodes[6]));
+    assertTrue(cluster.isOnSameRack(dataNodes[6], dataNodes[7]));
+  }
+
+  public void testNodeGroups() throws Exception {
+    assertEquals(cluster.getNumOfRacks(), 3);
+    assertTrue(cluster.isOnSameNodeGroup(dataNodes[0], dataNodes[1]));
+    assertFalse(cluster.isOnSameNodeGroup(dataNodes[1], dataNodes[2]));
+    assertFalse(cluster.isOnSameNodeGroup(dataNodes[2], dataNodes[3]));
+    assertTrue(cluster.isOnSameNodeGroup(dataNodes[3], dataNodes[4]));
+    assertFalse(cluster.isOnSameNodeGroup(dataNodes[4], dataNodes[5]));
+    assertFalse(cluster.isOnSameNodeGroup(dataNodes[5], dataNodes[6]));
+    assertFalse(cluster.isOnSameNodeGroup(dataNodes[6], dataNodes[7]));
+  }
+
+  public void testGetDistance() throws Exception {
+    assertEquals(cluster.getDistance(dataNodes[0], dataNodes[0]), 0);
+    assertEquals(cluster.getDistance(dataNodes[0], dataNodes[1]), 2);
+    assertEquals(cluster.getDistance(dataNodes[0], dataNodes[2]), 4);
+    assertEquals(cluster.getDistance(dataNodes[0], dataNodes[3]), 6);
+    assertEquals(cluster.getDistance(dataNodes[0], dataNodes[6]), 8);
+  }
+
+  public void testPseudoSortByDistance() throws Exception {
+    DatanodeDescriptor[] testNodes = new DatanodeDescriptor[4];
+
+    // array contains both local node, local node group & local rack node
+    testNodes[0] = dataNodes[1];
+    testNodes[1] = dataNodes[2];
+    testNodes[2] = dataNodes[3];
+    testNodes[3] = dataNodes[0];
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
+    assertTrue(testNodes[0] == dataNodes[0]);
+    assertTrue(testNodes[1] == dataNodes[1]);
+    assertTrue(testNodes[2] == dataNodes[2]);
+    assertTrue(testNodes[3] == dataNodes[3]);
+
+    // array contains local node & local node group
+    testNodes[0] = dataNodes[3];
+    testNodes[1] = dataNodes[4];
+    testNodes[2] = dataNodes[1];
+    testNodes[3] = dataNodes[0];
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
+    assertTrue(testNodes[0] == dataNodes[0]);
+    assertTrue(testNodes[1] == dataNodes[1]);
+
+    // array contains local node & rack node
+    testNodes[0] = dataNodes[5];
+    testNodes[1] = dataNodes[3];
+    testNodes[2] = dataNodes[2];
+    testNodes[3] = dataNodes[0];
+    cluster.pseudoSortByDistance(dataNodes[0], testNodes );
+    assertTrue(testNodes[0] == dataNodes[0]);
+    assertTrue(testNodes[1] == dataNodes[2]);
+
+    // array contains local-nodegroup node (not a data node also) & rack node
+    testNodes[0] = dataNodes[6];
+    testNodes[1] = dataNodes[7];
+    testNodes[2] = dataNodes[2];
+    testNodes[3] = dataNodes[0];
+    cluster.pseudoSortByDistance(computeNode, testNodes );
+    assertTrue(testNodes[0] == dataNodes[0]);
+    assertTrue(testNodes[1] == dataNodes[2]);
+  }
+
+  /**
+   * This picks a large number of nodes at random in order to ensure coverage
+   * 
+   * @param numNodes the number of nodes
+   * @param excludedScope the excluded scope
+   * @return the frequency that nodes were chosen
+   */
+  private Map<Node, Integer> pickNodesAtRandom(int numNodes,
+      String excludedScope) {
+    Map<Node, Integer> frequency = new HashMap<Node, Integer>();
+    for (DatanodeDescriptor dnd : dataNodes) {
+      frequency.put(dnd, 0);
+    }
+
+    for (int j = 0; j < numNodes; j++) {
+      Node random = cluster.chooseRandom(excludedScope);
+      frequency.put(random, frequency.get(random) + 1);
+    }
+    return frequency;
+  }
+
+  /**
+   * This test checks that chooseRandom works for an excluded node.
+   */
+  public void testChooseRandomExcludedNode() {
+    String scope = "~" + NodeBase.getPath(dataNodes[0]);
+    Map<Node, Integer> frequency = pickNodesAtRandom(100, scope);
+
+    for (Node key : dataNodes) {
+      // all nodes except the first should be more than zero
+      assertTrue(frequency.get(key) > 0 || key == dataNodes[0]);
+    }
+  }
+
+}