Przeglądaj źródła

HADOOP-4116. Balancer should provide better resource management. The same change, with a different patch, has also been made to branch 18 & trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/branches/branch-0.18@699059 13f79535-47bb-0310-9956-ffa450edef68
Hairong Kuang 16 lat temu
rodzic
commit
a09f41465f

+ 6 - 0
CHANGES.txt

@@ -1,5 +1,11 @@
 Hadoop Change Log
 
+Release 0.18.2 - Unreleased
+
+  BUG FIXES
+
+    HADOOP-4116. Balancer should provide better resource management. (hairong)
+
 Release 0.18.1 - 2008-09-17
 
   IMPROVEMENTS

+ 32 - 34
src/hdfs/org/apache/hadoop/dfs/Balancer.java

@@ -28,7 +28,6 @@ import java.io.OutputStream;
 import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.net.Socket;
-import java.net.SocketTimeoutException;
 import java.text.DateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -174,6 +173,11 @@ public class Balancer implements Tool {
     LogFactory.getLog("org.apache.hadoop.dfs.Balancer");
   final private static long MAX_BLOCKS_SIZE_TO_FETCH = 2*1024*1024*1024L; //2GB
 
+  /** The maximum number of concurrent block moves for
+   * balancing purposes at a datanode
+   */
+  public static final int MAX_NUM_CONCURRENT_MOVES = 5;
+  
   private Configuration conf;
 
   private double threshold = 10D;
@@ -208,10 +212,10 @@ public class Balancer implements Tool {
   
   private double avgUtilization = 0.0D;
   
-  final private int MOVER_THREAD_POOL_SIZE = 1000;
+  final static private int MOVER_THREAD_POOL_SIZE = 1000;
   final private ExecutorService moverExecutor = 
     Executors.newFixedThreadPool(MOVER_THREAD_POOL_SIZE);
-  final private int DISPATCHER_THREAD_POOL_SIZE = 200;
+  final static private int DISPATCHER_THREAD_POOL_SIZE = 200;
   final private ExecutorService dispatcherExecutor =
     Executors.newFixedThreadPool(DISPATCHER_THREAD_POOL_SIZE);
   
@@ -256,11 +260,13 @@ public class Balancer implements Tool {
             this.block = block;
             if ( chooseProxySource() ) {
               addToMoved(block);
-              LOG.info("Decided to move block "+ block.getBlockId()
-                  +" with a length of "+FsShell.byteDesc(block.getNumBytes())
-                  + " bytes from " + source.getName() 
-                  + " to " + target.getName()
-                  + " using proxy source " + proxySource.getName() );
+              if (LOG.isDebugEnabled()) {
+                LOG.debug("Decided to move block "+ block.getBlockId()
+                    +" with a length of "+FsShell.byteDesc(block.getNumBytes())
+                    + " bytes from " + source.getName() 
+                    + " to " + target.getName()
+                    + " using proxy source " + proxySource.getName() );
+              }
               return true;
             }
           }
@@ -301,10 +307,8 @@ public class Balancer implements Tool {
       DataInputStream in = null;
       try {
         sock.connect(DataNode.createSocketAddr(
-            proxySource.datanode.getName()), FSConstants.READ_TIMEOUT);
-        long bandwidth = conf.getLong("dfs.balance.bandwidthPerSec", 1024L*1024);
-        sock.setSoTimeout(2*FSConstants.READ_TIMEOUT+
-            (int)(block.getNumBytes()*1500/bandwidth));
+            target.datanode.getName()), FSConstants.READ_TIMEOUT);
+        sock.setKeepAlive(true);
         out = new DataOutputStream( new BufferedOutputStream(
             sock.getOutputStream(), FSConstants.BUFFER_SIZE));
         sendRequest(out);
@@ -312,25 +316,17 @@ public class Balancer implements Tool {
             sock.getInputStream(), FSConstants.BUFFER_SIZE));
         receiveResponse(in);
         bytesMoved.inc(block.getNumBytes());
-        if (LOG.isDebugEnabled()) {
-          LOG.debug( "Moving block " + block.getBlock().getBlockId() +
+        LOG.info( "Moving block " + block.getBlock().getBlockId() +
               " from "+ source.getName() + " to " +
               target.getName() + " through " +
               proxySource.getName() +
-              " succeeded." );
-        }
-      } catch (SocketTimeoutException te) { 
-        LOG.warn("Timeout moving block "+block.getBlockId()+
-            " from " + source.getName() + " to " +
-            target.getName() + " through " +
-            proxySource.getName());
+              " is succeeded." );
       } catch (IOException e) {
         LOG.warn("Error moving block "+block.getBlockId()+
             " from " + source.getName() + " to " +
             target.getName() + " through " +
             proxySource.getName() +
-            ": "+e.getMessage()+ "\n" +
-            StringUtils.stringifyException(e) );
+            ": "+e.getMessage());
       } finally {
         IOUtils.closeStream(out);
         IOUtils.closeStream(in);
@@ -353,11 +349,11 @@ public class Balancer implements Tool {
     /* Send a block copy request to the outputstream*/
     private void sendRequest(DataOutputStream out) throws IOException {
       out.writeShort(FSConstants.DATA_TRANSFER_VERSION);
-      out.writeByte(FSConstants.OP_COPY_BLOCK);
+      out.writeByte(FSConstants.OP_REPLACE_BLOCK);
       out.writeLong(block.getBlock().getBlockId());
       out.writeLong(block.getBlock().getGenerationStamp());
       Text.writeString(out, source.getStorageID());
-      target.write(out);
+      proxySource.write(out);
       out.flush();
     }
     
@@ -365,11 +361,7 @@ public class Balancer implements Tool {
     private void receiveResponse(DataInputStream in) throws IOException {
       short status = in.readShort();
       if (status != FSConstants.OP_STATUS_SUCCESS) {
-        throw new IOException("Moving block "+block.getBlockId()+
-            " from "+source.getName() + " to " +
-            target.getName() + " through " +
-            proxySource.getName() +
-        "failed");
+        throw new IOException("block move is failed.");
       }
     }
 
@@ -385,8 +377,10 @@ public class Balancer implements Tool {
     private void scheduleBlockMove() {
       moverExecutor.execute(new Runnable() {
         public void run() {
-          LOG.info("Starting moving "+ block.getBlockId() +
-              " from " + proxySource.getName() + " to " + target.getName());
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Starting moving "+ block.getBlockId() +
+                " from " + proxySource.getName() + " to " + target.getName());
+          }
           dispatch();
         }
       });
@@ -476,8 +470,6 @@ public class Balancer implements Tool {
   /* A class that keeps track of a datanode in Balancer */
   private static class BalancerDatanode implements Writable {
     final private static long MAX_SIZE_TO_MOVE = 10*1024*1024*1024L; //10GB
-    final protected static short MAX_NUM_CONCURRENT_MOVES =
-      DataNode.MAX_BALANCING_THREADS;
     protected DatanodeInfo datanode;
     private double utilization;
     protected long maxSizeToMove;
@@ -914,6 +906,9 @@ public class Balancer implements Tool {
     // compute average utilization
     long totalCapacity=0L, totalUsedSpace=0L;
     for (DatanodeInfo datanode : datanodes) {
+      if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
+        continue; // ignore decommissioning or decommissioned nodes
+      }
       totalCapacity += datanode.getCapacity();
       totalUsedSpace += datanode.getDfsUsed();
     }
@@ -927,6 +922,9 @@ public class Balancer implements Tool {
     long overLoadedBytes = 0L, underLoadedBytes = 0L;
     shuffleArray(datanodes);
     for (DatanodeInfo datanode : datanodes) {
+      if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
+        continue; // ignore decommissioning or decommissioned nodes
+      }
       cluster.add(datanode);
       BalancerDatanode datanodeS;
       if (getUtilization(datanode) > avgUtilization) {

+ 133 - 85
src/hdfs/org/apache/hadoop/dfs/DataNode.java

@@ -32,6 +32,7 @@ import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
 import org.apache.hadoop.dfs.IncorrectVersionException;
 import org.apache.hadoop.mapred.StatusHttpServer;
+import org.apache.hadoop.dfs.Balancer;
 import org.apache.hadoop.dfs.BlockCommand;
 import org.apache.hadoop.dfs.DatanodeProtocol;
 import org.apache.hadoop.dfs.FSDatasetInterface.MetaDataInputStream;
@@ -45,7 +46,6 @@ import java.nio.channels.FileChannel;
 import java.nio.channels.ServerSocketChannel;
 import java.nio.channels.SocketChannel;
 import java.util.*;
-import java.util.concurrent.Semaphore;
 import java.security.NoSuchAlgorithmException;
 import java.security.SecureRandom;
 
@@ -146,6 +146,45 @@ public class DataNode extends Configured
   private static final int MAX_XCEIVER_COUNT = 256;
   private int maxXceiverCount = MAX_XCEIVER_COUNT;
   
+  /** A manager to make sure that cluster balancing does not
+   * take too many resources.
+   * 
+   * It limits the number of block moves for balancing and
+   * the total amount of bandwidth they can use.
+   */
+  private static class BlockBalanceThrottler extends Throttler {
+   private int numThreads;
+   
+   /**Constructor
+    * 
+    * @param bandwidth Total amount of bandwidth that can be used for balancing
+    */
+   private BlockBalanceThrottler(long bandwidth) {
+     super(bandwidth);
+     LOG.info("Balancing bandwith is "+ bandwidth + " bytes/s");
+   }
+   
+   /** Check if the block move can start. 
+    * 
+    * Return true if the thread quota is not exceeded and 
+    * the counter is incremented; False otherwise.
+    */
+   private synchronized boolean acquire() {
+     if (numThreads >= Balancer.MAX_NUM_CONCURRENT_MOVES) {
+       return false;
+     }
+     numThreads++;
+     return true;
+   }
+   
+   /** Mark that the move is completed. The thread counter is decremented. */
+   private synchronized void release() {
+     numThreads--;
+   }
+  }
+
+  private BlockBalanceThrottler balanceThrottler;
+  
   /**
    * We need an estimate for block size to check if the disk partition has
    * enough space. For now we set it to be the default block size set
@@ -156,12 +195,6 @@ public class DataNode extends Configured
    */
   private long estimateBlockSize;
   
-  // The following three fields are to support balancing
-  final static short MAX_BALANCING_THREADS = 5;
-  private Semaphore balancingSem = new Semaphore(MAX_BALANCING_THREADS);
-  long balanceBandwidth;
-  private Throttler balancingThrottler;
-
   // For InterDataNodeProtocol
   Server ipcServer;
   
@@ -308,9 +341,8 @@ public class DataNode extends Configured
     DataNode.nameNodeAddr = nameNodeAddr;
 
     //set up parameter for cluster balancing
-    this.balanceBandwidth = conf.getLong("dfs.balance.bandwidthPerSec", 1024L*1024);
-    LOG.info("Balancing bandwith is "+balanceBandwidth + " bytes/s");
-    this.balancingThrottler = new Throttler(balanceBandwidth);
+    this.balanceThrottler = new BlockBalanceThrottler(
+      conf.getLong("dfs.balance.bandwidthPerSec", 1024L*1024));
 
     //initialize periodic block scanner
     String reason = null;
@@ -884,24 +916,6 @@ public class DataNode extends Configured
     }
   }
 
-  /* utility function for receiving a response */
-  private static void receiveResponse(Socket s, int numTargets) throws IOException {
-    // check the response
-    DataInputStream reply = new DataInputStream(new BufferedInputStream(
-                                NetUtils.getInputStream(s), BUFFER_SIZE));
-    try {
-      for (int i = 0; i < numTargets; i++) {
-        short opStatus = reply.readShort();
-        if(opStatus != OP_STATUS_SUCCESS) {
-          throw new IOException("operation failed at "+
-              s.getInetAddress());
-        } 
-      }
-    } finally {
-      IOUtils.closeStream(reply);
-    }
-  }
-
   /* utility function for sending a response */
   private static void sendResponse(Socket s, short opStatus, long timeout) 
                                                        throws IOException {
@@ -945,6 +959,7 @@ public class DataNode extends Configured
       this.ss = ss;
     }
 
+
     /**
      */
     public void run() {
@@ -1360,67 +1375,50 @@ public class DataNode extends Configured
       // Read in the header
       long blockId = in.readLong(); // read block id
       Block block = new Block(blockId, 0, in.readLong());
-      String source = Text.readString(in); // read del hint
-      DatanodeInfo target = new DatanodeInfo(); // read target
-      target.readFields(in);
 
-      Socket targetSock = null;
-      short opStatus = OP_STATUS_SUCCESS;
+      if (!balanceThrottler.acquire()) { // not able to start
+        LOG.info("Not able to copy block " + blockId + " to "
+            + s.getRemoteSocketAddress() + " because threads quota is exceeded.");
+        return;
+      }
+
       BlockSender blockSender = null;
-      DataOutputStream targetOut = null;
+      DataOutputStream reply = null;
+      boolean isOpSuccess = true;
+
       try {
-        balancingSem.acquireUninterruptibly();
-        
         // check if the block exists or not
         blockSender = new BlockSender(block, 0, -1, false, false, false);
 
-        // get the output stream to the target
-        InetSocketAddress targetAddr = NetUtils.createSocketAddr(target.getName());
-        targetSock = newSocket();
-        targetSock.connect(targetAddr, socketTimeout);
-        targetSock.setSoTimeout(socketTimeout);
+        // set up response stream
+        OutputStream baseStream = NetUtils.getOutputStream(
+            s, socketWriteTimeout);
+        reply = new DataOutputStream(new BufferedOutputStream(
+            baseStream, SMALL_BUFFER_SIZE));
 
-        OutputStream baseStream = NetUtils.getOutputStream(targetSock, 
-                                                            socketWriteTimeout);
-        targetOut = new DataOutputStream(
-                       new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));
-
-        /* send request to the target */
-        // fist write header info
-        targetOut.writeShort(DATA_TRANSFER_VERSION); // transfer version
-        targetOut.writeByte(OP_REPLACE_BLOCK); // op code
-        targetOut.writeLong(block.getBlockId()); // block id
-        targetOut.writeLong(block.getGenerationStamp()); // block id
-        Text.writeString( targetOut, source); // del hint
 
-        // then send data
-        long read = blockSender.sendBlock(targetOut, baseStream, 
-                                          balancingThrottler);
+        // send block content to the target 
+        long read = blockSender.sendBlock(reply, baseStream, 
+                                          balanceThrottler);
 
         myMetrics.bytesRead.inc((int) read);
         myMetrics.blocksRead.inc();
         
-        // check the response from target
-        receiveResponse(targetSock, 1);
-
-        LOG.info("Copied block " + block + " to " + targetAddr);
+        LOG.info("Copied block " + block + " to " + s.getRemoteSocketAddress());
       } catch (IOException ioe) {
-        opStatus = OP_STATUS_ERROR;
-        LOG.warn("Got exception while serving " + block + " to "
-            + target.getName() + ": " + StringUtils.stringifyException(ioe));
+        isOpSuccess = false;
         throw ioe;
       } finally {
-        /* send response to the requester */
-        try {
-          sendResponse(s, opStatus, socketWriteTimeout);
-        } catch (IOException replyE) {
-          LOG.warn("Error writing the response back to "+
-              s.getRemoteSocketAddress() + "\n" +
-              StringUtils.stringifyException(replyE) );
+        balanceThrottler.release();
+        if (isOpSuccess) {
+          try {
+            // send one last byte to indicate that the resource is cleaned.
+            reply.writeChar('d');
+          } catch (IOException ignored) {
+          }
         }
-        IOUtils.closeStream(targetOut);
+        IOUtils.closeStream(reply);
         IOUtils.closeStream(blockSender);
-        balancingSem.release();
       }
     }
 
@@ -1433,21 +1431,59 @@ public class DataNode extends Configured
      * @throws IOException
      */
     private void replaceBlock(DataInputStream in) throws IOException {
-      balancingSem.acquireUninterruptibly();
-
-      /* read header */
-      Block block = new Block(in.readLong(), estimateBlockSize, in.readLong()); // block id & len
-      String sourceID = Text.readString(in);
-
-      short opStatus = OP_STATUS_SUCCESS;
-      BlockReceiver blockReceiver = null;
-      try {
+        /* read header */
+        long blockId = in.readLong();
+        Block block = new Block(blockId, estimateBlockSize,
+            in.readLong()); // block id & generation stamp
+        String sourceID = Text.readString(in); // read del hint
+        DatanodeInfo proxySource = new DatanodeInfo(); // read proxy source
+        proxySource.readFields(in);
+  
+        if (!balanceThrottler.acquire()) { // not able to start
+          LOG.warn("Not able to receive block " + blockId + " from " 
+              + s.getRemoteSocketAddress() + " because threads quota is exceeded.");
+          sendResponse(s, (short)OP_STATUS_ERROR, 
+              socketWriteTimeout);
+          return;
+        }
+  
+        Socket proxySock = null;
+        DataOutputStream proxyOut = null;
+  
+        short opStatus = OP_STATUS_SUCCESS;
+        BlockReceiver blockReceiver = null;
+        DataInputStream proxyReply = null;
+  
+        try {
+        // get the output stream to the proxy
+        InetSocketAddress proxyAddr = NetUtils.createSocketAddr(
+            proxySource.getName());
+        proxySock = newSocket();
+        proxySock.connect(proxyAddr, socketTimeout);
+        proxySock.setSoTimeout(socketTimeout);
+  
+        OutputStream baseStream = NetUtils.getOutputStream(proxySock, 
+            socketWriteTimeout);
+        proxyOut = new DataOutputStream(
+                       new BufferedOutputStream(baseStream, SMALL_BUFFER_SIZE));
+  
+        /* send request to the proxy */
+        proxyOut.writeShort(DATA_TRANSFER_VERSION); // transfer version
+        proxyOut.writeByte(OP_COPY_BLOCK); // op code
+        proxyOut.writeLong(block.getBlockId()); // block id
+        proxyOut.writeLong(block.getGenerationStamp()); // block id
+        proxyOut.flush();
+  
+        // receive the response from the proxy
+        proxyReply = new DataInputStream(new BufferedInputStream(
+            NetUtils.getInputStream(proxySock), BUFFER_SIZE));
         // open a block receiver and check if the block does not exist
-         blockReceiver = new BlockReceiver(
-            block, in, s.getRemoteSocketAddress().toString(), false, "", null);
+        blockReceiver = new BlockReceiver(
+            block, proxyReply, proxySock.getRemoteSocketAddress().toString(),
+            false, "", null);
 
         // receive a block
-        blockReceiver.receiveBlock(null, null, null, null, balancingThrottler, -1);
+        blockReceiver.receiveBlock(null, null, null, null, balanceThrottler, -1);
                       
         // notify name node
         notifyNamenodeReceivedBlock(block, sourceID);
@@ -1458,14 +1494,26 @@ public class DataNode extends Configured
         opStatus = OP_STATUS_ERROR;
         throw ioe;
       } finally {
+        // receive the last byte that indicates the proxy released its thread resource
+        if (opStatus == OP_STATUS_SUCCESS) {
+          try {
+            proxyReply.readChar();
+          } catch (IOException ignored) {
+          }
+        }
+      
+        // now release the thread resource
+        balanceThrottler.release();
+      
         // send response back
         try {
           sendResponse(s, opStatus, socketWriteTimeout);
         } catch (IOException ioe) {
           LOG.warn("Error writing reply back to " + s.getRemoteSocketAddress());
         }
+        IOUtils.closeStream(proxyOut);
         IOUtils.closeStream(blockReceiver);
-        balancingSem.release();
+        IOUtils.closeStream(proxyReply);
       }
     }
   }

+ 8 - 5
src/hdfs/org/apache/hadoop/dfs/FSConstants.java

@@ -100,12 +100,15 @@ public interface FSConstants {
    * This should change when serialization of DatanodeInfo, not just
    * when protocol changes. It is not very obvious. 
    */
-  /*
-   * Version 11:
-   *    OP_WRITE_BLOCK sends a boolean. If its value is true, an additonal 
-   *    DatanodeInfo of client requesting transfer is also sent. 
+  /* Version 14:
+   *    OP_REPLACE_BLOCK is sent from the Balancer server to the destination,
+   *    including the block id, source, and proxy.
+   *    OP_COPY_BLOCK is sent from the destination to the proxy, which contains
+   *    only the block id.
+   *    A reply to OP_COPY_BLOCK sends the block content.
+   *    A reply to OP_REPLACE_BLOCK includes an operation status.
    */
-  public static final int DATA_TRANSFER_VERSION = 11;
+  public static final int DATA_TRANSFER_VERSION = 14;
 
   // Return codes for file create
   public static final int OPERATION_FAILED = 0;

+ 5 - 5
src/test/org/apache/hadoop/dfs/TestBlockReplacement.java

@@ -203,7 +203,7 @@ public class TestBlockReplacement extends TestCase {
   }
 
   /* Copy a block from sourceProxy to destination. If the block becomes
-   * overreplicated, preferrably remove it from source.
+   * over-replicated, preferably remove it from source.
    * 
    * Return true if a block is successfully copied; otherwise false.
    */
@@ -211,16 +211,16 @@ public class TestBlockReplacement extends TestCase {
       DatanodeInfo sourceProxy, DatanodeInfo destination) throws IOException {
     Socket sock = new Socket();
     sock.connect(NetUtils.createSocketAddr(
-        sourceProxy.getName()), FSConstants.READ_TIMEOUT);
-    sock.setSoTimeout(FSConstants.READ_TIMEOUT);
+        destination.getName()), FSConstants.READ_TIMEOUT);
+    sock.setKeepAlive(true);
     // sendRequest
     DataOutputStream out = new DataOutputStream(sock.getOutputStream());
     out.writeShort(FSConstants.DATA_TRANSFER_VERSION);
-    out.writeByte(FSConstants.OP_COPY_BLOCK);
+    out.writeByte(FSConstants.OP_REPLACE_BLOCK);
     out.writeLong(block.getBlockId());
     out.writeLong(block.getGenerationStamp());
     Text.writeString(out, source.getStorageID());
-    destination.write(out);
+    sourceProxy.write(out);
     out.flush();
     // receiveResponse
     DataInputStream reply = new DataInputStream(sock.getInputStream());