浏览代码

HADOOP-5144. Add a new DFSAdmin command for changing the setting of restore failed storage replicas in namenode. (Boris Shkolnik via szetszwo)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@748728 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 16 年之前
父节点
当前提交
636adcda56

+ 3 - 0
CHANGES.txt

@@ -49,6 +49,9 @@ Trunk (unreleased changes)
     HADOOP-4927. Adds a generic wrapper around outputformat to allow creation of
     output on demand (Jothi Padmanabhan via ddas)
 
+    HADOOP-5144. Add a new DFSAdmin command for changing the setting of restore
+    failed storage replicas in namenode. (Boris Shkolnik via szetszwo)
+
   IMPROVEMENTS
 
     HADOOP-4565. Added CombineFileInputFormat to use data locality information

+ 7 - 0
src/docs/src/documentation/content/xdocs/commands_manual.xml

@@ -480,6 +480,7 @@
 					<code>Usage: hadoop dfsadmin  [</code><a href="commands_manual.html#Generic+Options">GENERIC_OPTIONS</a><code>] [-report] [-safemode enter | leave | get | wait] [-refreshNodes]
 					 [-finalizeUpgrade] [-upgradeProgress status | details | force] [-metasave filename] 
 					 [-setQuota &lt;quota&gt; &lt;dirname&gt;...&lt;dirname&gt;] [-clrQuota &lt;dirname&gt;...&lt;dirname&gt;] 
+					 [-restoreFailedStorage true|false|check] 
 					 [-help [cmd]]</code>
 				</p>
 				<table>
@@ -547,6 +548,12 @@
                 2. user is not an administrator.<br/>
                 It does not fault if the directory has no quota.</td>
 			           </tr>
+			           <tr>
+			          	<td><code>-restoreFailedStorage true | false | check</code></td>
+			            <td>This option turns on/off the automatic attempt to restore failed storage replicas. 
+			            If a failed storage becomes available again, the system will attempt to restore 
+			            edits and/or fsimage during checkpoint. The 'check' option returns the current setting.</td>
+			           </tr>
 			           <tr>
 			          	<td><code>-help [cmd]</code></td>
 			            <td> Displays help for the given command or all commands if none

+ 11 - 0
src/hdfs/org/apache/hadoop/hdfs/DFSClient.java

@@ -730,6 +730,17 @@ public class DFSClient implements FSConstants, java.io.Closeable {
       throw re.unwrapRemoteException(AccessControlException.class);
     }
   }
+  
+  /**
+   * Enable, disable, or check the restore-failed-storage setting.
+   * Forwards the argument to {@code namenode} and returns its answer;
+   * see {@link ClientProtocol#restoreFailedStorage(String)}
+   * for more details.
+   * 
+   * @param arg passed through unchanged to the namenode call
+   * @return the value returned by the namenode call
+   * @throws AccessControlException as declared by the namenode call
+   * @see ClientProtocol#restoreFailedStorage(String)
+   */
+  boolean restoreFailedStorage(String arg) throws AccessControlException {
+    return namenode.restoreFailedStorage(arg);
+  }
 
   /**
    * Refresh the hosts and exclude files.  (Rereads them.)

+ 10 - 0
src/hdfs/org/apache/hadoop/hdfs/DistributedFileSystem.java

@@ -362,6 +362,16 @@ public class DistributedFileSystem extends FileSystem {
     dfs.saveNamespace();
   }
 
+  /**
+   * Enable, disable, or check the restoreFailedStorage setting.
+   * Forwards the argument to {@code dfs} and returns its answer.
+   * 
+   * @param arg passed through unchanged to the {@code dfs} call
+   * @return the value returned by the {@code dfs} call
+   * @throws AccessControlException as declared by the {@code dfs} call
+   * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#restoreFailedStorage(String)
+   */
+  public boolean restoreFailedStorage(String arg) throws AccessControlException {
+    return dfs.restoreFailedStorage(arg);
+  }
+  
+
   /**
    * Refreshes the list of hosts and excluded hosts from the configured 
    * files.  

+ 11 - 2
src/hdfs/org/apache/hadoop/hdfs/protocol/ClientProtocol.java

@@ -41,9 +41,9 @@ public interface ClientProtocol extends VersionedProtocol {
    * Compared to the previous version the following changes have been introduced:
    * (Only the latest change is reflected.
    * The log of historical changes can be retrieved from the svn).
-   * 42: updated to use sticky bit
+   * 43: added restoreFailedStorage command
    */
-  public static final long versionID = 42L;
+  public static final long versionID = 43L;
   
   ///////////////////////////////////////
   // File contents
@@ -374,6 +374,15 @@ public interface ClientProtocol extends VersionedProtocol {
    */
   public void saveNamespace() throws IOException;
 
+  /**
+   * Enable, disable, or check restore of failed storage.
+   * <p>
+   * Sets or reads the flag that enables restore of failed storage replicas.
+   * 
+   * @param arg the requested action; the dfsadmin command sends
+   *            "true", "false" or "check"
+   * @return the value the flag was set to, or the current value for "check"
+   * @throws AccessControlException if the superuser privilege is violated.
+   */
+  public boolean restoreFailedStorage(String arg) throws AccessControlException;
+
   /**
    * Tells the namenode to reread the hosts and exclude files. 
    * @throws IOException

+ 1 - 1
src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSImage.java

@@ -119,7 +119,7 @@ public class FSImage extends Storage {
    */
   private boolean restoreFailedStorage = false;
   public void setRestoreFailedStorage(boolean val) {
-    LOG.info("enabled failed storage replicas restore");
+    LOG.info("set restore failed storage to " + val);
     restoreFailedStorage=val;
   }
   

+ 19 - 0
src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -3441,6 +3441,25 @@ public class FSNamesystem implements FSConstants, FSNamesystemMBean {
     getFSImage().saveFSImage();
     LOG.info("New namespace image has been created.");
   }
+  
+  /**
+   * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
+   * Requires superuser privilege.
+   * 
+   * @throws AccessControlException if superuser privilege is violated.
+   */
+  synchronized boolean restoreFailedStorage(String arg) throws AccessControlException {
+    checkSuperuserPrivilege();
+    
+    // if it is disabled - enable it and vice versa.
+    if(arg.equals("check"))
+      return getFSImage().getRestoreFailedStorage();
+    
+    boolean val = arg.equals("true");  // false if not
+    getFSImage().setRestoreFailedStorage(val);
+    
+    return val;
+  }
 
   /**
    */

+ 9 - 0
src/hdfs/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -47,6 +47,7 @@ import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.net.NetworkTopology;
+import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.authorize.AuthorizationException;
@@ -608,6 +609,14 @@ public class NameNode implements ClientProtocol, DatanodeProtocol,
     return namesystem.isInSafeMode();
   }
 
+  /**
+   * {@inheritDoc}
+   * <p>
+   * Forwards the call to {@code namesystem}.
+   * 
+   * @throws AccessControlException as declared by the {@code namesystem} call
+   */
+  public boolean restoreFailedStorage(String arg) throws AccessControlException {
+    return namesystem.restoreFailedStorage(arg);
+  }
+
   /**
    * @inheritDoc
    */

+ 46 - 0
src/hdfs/org/apache/hadoop/hdfs/tools/DFSAdmin.java

@@ -390,6 +390,33 @@ public class DFSAdmin extends FsShell {
     return exitCode;
   }
 
+  /**
+   * Command to enable/disable/check restoring of failed storage replicas in the namenode.
+   * Usage: java DFSAdmin -restoreFailedStorage true|false|check
+   * 
+   * @param arg one of "true", "false" or "check"; anything else (including
+   *            null) is reported as invalid and nothing is sent to the namenode
+   * @return 0 on success, -1 if the file system is not a
+   *         DistributedFileSystem or the argument is invalid
+   * @exception IOException 
+   * @see org.apache.hadoop.hdfs.protocol.ClientProtocol#restoreFailedStorage(String)
+   */
+  public int restoreFaileStorage(String arg) throws IOException {
+    int exitCode = -1;
+
+    if (!(fs instanceof DistributedFileSystem)) {
+      System.err.println("FileSystem is " + fs.getUri());
+      return exitCode;
+    }
+
+    // constant-first equals: a null argument is rejected as invalid instead of throwing NPE
+    if (!"check".equals(arg) && !"true".equals(arg) && !"false".equals(arg)) {
+      System.err.println("restoreFailedStorage valid args are true|false|check");
+      return exitCode;
+    }
+    
+    DistributedFileSystem dfs = (DistributedFileSystem) fs;
+    // primitive boolean: the call returns a primitive, so boxing is unnecessary
+    boolean res = dfs.restoreFailedStorage(arg);
+    System.out.println("restoreFailedStorage is set to " + res);
+    exitCode = 0;
+
+    return exitCode;
+  }
+
   /**
    * Command to ask the namenode to reread the hosts and excluded hosts 
    * file.
@@ -416,6 +443,7 @@ public class DFSAdmin extends FsShell {
       "The full syntax is: \n\n" +
       "hadoop dfsadmin [-report] [-safemode <enter | leave | get | wait>]\n" +
       "\t[-saveNamespace]\n" +
+      "\t[-restoreFailedStorage true|false|check]\n" +
       "\t[-refreshNodes]\n" +
       "\t[" + SetQuotaCommand.USAGE + "]\n" +
       "\t[" + ClearQuotaCommand.USAGE +"]\n" +
@@ -440,6 +468,10 @@ public class DFSAdmin extends FsShell {
     "Save current namespace into storage directories and reset edits log.\n" +
     "\t\tRequires superuser permissions and safe mode.\n";
 
+    String restoreFailedStorage = "-restoreFailedStorage:\t" +
+    "Set/Unset/Check flag to attempt restore of failed storage replicas if they become available.\n" +
+    "\t\tRequires superuser permissions.\n";
+    
     String refreshNodes = "-refreshNodes: \tUpdates the set of hosts allowed " +
                           "to connect to namenode.\n\n" +
       "\t\tRe-reads the config file to update values defined by \n" +
@@ -480,6 +512,8 @@ public class DFSAdmin extends FsShell {
       System.out.println(safemode);
     } else if ("saveNamespace".equals(cmd)) {
       System.out.println(saveNamespace);
+    } else if ("restoreFailedStorage".equals(cmd)) {
+      System.out.println(restoreFailedStorage);
     } else if ("refreshNodes".equals(cmd)) {
       System.out.println(refreshNodes);
     } else if ("finalizeUpgrade".equals(cmd)) {
@@ -505,6 +539,7 @@ public class DFSAdmin extends FsShell {
       System.out.println(report);
       System.out.println(safemode);
       System.out.println(saveNamespace);
+      System.out.println(restoreFailedStorage);
       System.out.println(refreshNodes);
       System.out.println(finalizeUpgrade);
       System.out.println(upgradeProgress);
@@ -647,6 +682,9 @@ public class DFSAdmin extends FsShell {
     } else if ("-saveNamespace".equals(cmd)) {
       System.err.println("Usage: java DFSAdmin"
                          + " [-saveNamespace]");
+    } else if ("-restoreFailedStorage".equals(cmd)) {
+      System.err.println("Usage: java DFSAdmin"
+          + " [-restoreFailedStorage true|false|check ]");
     } else if ("-refreshNodes".equals(cmd)) {
       System.err.println("Usage: java DFSAdmin"
                          + " [-refreshNodes]");
@@ -679,6 +717,7 @@ public class DFSAdmin extends FsShell {
       System.err.println("           [-report]");
       System.err.println("           [-safemode enter | leave | get | wait]");
       System.err.println("           [-saveNamespace]");
+      System.err.println("           [-restoreFailedStorage true|false|check]");
       System.err.println("           [-refreshNodes]");
       System.err.println("           [-finalizeUpgrade]");
       System.err.println("           [-upgradeProgress status | details | force]");
@@ -729,6 +768,11 @@ public class DFSAdmin extends FsShell {
         printUsage(cmd);
         return exitCode;
       }
+    } else if ("-restoreFailedStorage".equals(cmd)) {
+      if (argv.length != 2) {
+        printUsage(cmd);
+        return exitCode;
+      }
     } else if ("-refreshNodes".equals(cmd)) {
       if (argv.length != 1) {
         printUsage(cmd);
@@ -776,6 +820,8 @@ public class DFSAdmin extends FsShell {
         setSafeMode(argv, i);
       } else if ("-saveNamespace".equals(cmd)) {
         exitCode = saveNamespace();
+      } else if ("-restoreFailedStorage".equals(cmd)) {
+        exitCode = restoreFaileStorage(argv[i]);
       } else if ("-refreshNodes".equals(cmd)) {
         exitCode = refreshNodes();
       } else if ("-finalizeUpgrade".equals(cmd)) {

+ 53 - 2
src/test/org/apache/hadoop/hdfs/server/namenode/TestStorageRestore.java

@@ -20,7 +20,6 @@ package org.apache.hadoop.hdfs.server.namenode;
 
 import java.io.File;
 import java.io.IOException;
-import java.util.Collection;
 import java.util.Iterator;
 import java.util.Random;
 
@@ -28,6 +27,7 @@ import junit.framework.TestCase;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.cli.util.CommandExecutor;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -38,6 +38,7 @@ import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
 import org.apache.hadoop.hdfs.server.namenode.FSImage.NameNodeDirType;
 import org.apache.hadoop.hdfs.server.namenode.FSImage.NameNodeFile;
+import org.apache.hadoop.util.StringUtils;
 
 
 /**
@@ -191,7 +192,6 @@ public class TestStorageRestore extends TestCase {
    */
   public void testStorageRestore() throws Exception {
     int numDatanodes = 2;
-    //Collection<String> dirs = config.getStringCollection("dfs.name.dir");
     cluster = new MiniDFSCluster(0, config, numDatanodes, true, false, true,  null, null, null, null);
     cluster.waitActive();
     
@@ -225,4 +225,55 @@ public class TestStorageRestore extends TestCase {
     secondary.shutdown();
     cluster.shutdown();
   }
+  
+  /**
+   * Test dfsadmin -restoreFailedStorage command: runs it with "false",
+   * "true" and "check" and verifies the flag held by the namenode's FSImage
+   * after each call, plus the console output of the "check" call.
+   * 
+   * @throws IOException
+   */
+  public void testDfsAdminCmd() throws IOException {
+    int numDatanodes = 2;
+    
+    cluster = new MiniDFSCluster(0, config, numDatanodes, true, false, true,  null, null, null, null);
+    try {
+      // wait inside the try so the cluster is shut down even if it never becomes active
+      cluster.waitActive();
+
+      FSImage fsi = cluster.getNameNode().getFSImage();
+
+      // it is started with dfs.name.dir.restore set to true (in SetUp())
+      boolean restore = fsi.getRestoreFailedStorage();
+      LOG.info("Restore is " + restore);
+      assertTrue(restore);
+
+      // now run DFSAdmin command
+
+      String cmd = "-fs NAMENODE -restoreFailedStorage false";
+      String namenode = config.get("fs.default.name", "file:///");
+      CommandExecutor.executeDFSAdminCommand(cmd, namenode);
+      restore = fsi.getRestoreFailedStorage();
+      LOG.info("After set false call restore is " + restore);
+      assertFalse(restore);
+
+      // run one more time - to set it to true again
+      cmd = "-fs NAMENODE -restoreFailedStorage true";
+      CommandExecutor.executeDFSAdminCommand(cmd, namenode);
+      restore = fsi.getRestoreFailedStorage();
+      LOG.info("After set true call restore is " + restore);
+      assertTrue(restore);
+      
+      // run one more time - no change in value
+      cmd = "-fs NAMENODE -restoreFailedStorage check";
+      CommandExecutor.executeDFSAdminCommand(cmd, namenode);
+      restore = fsi.getRestoreFailedStorage();
+      LOG.info("After check call restore is " + restore);
+      assertTrue(restore);
+      // String.trim() returns a new string; keep the result instead of discarding it
+      String commandOutput = CommandExecutor.getLastCommandOutput().trim();
+      assertTrue(commandOutput.contains("restoreFailedStorage is set to true"));
+
+    } finally {
+      cluster.shutdown();
+    }
+  }
 }