
HDFS-4174. Add ability to list the corrupted files in WebUI (backport of HDFS-1031). Contributed by Jing Zhao.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1@1409292 13f79535-47bb-0310-9956-ffa450edef68
Suresh Srinivas 12 years ago
parent
commit
a20aa0bd5c

+ 3 - 0
CHANGES.txt

@@ -329,6 +329,9 @@ Release 1.1.1 - Unreleased
     HDFS-1539. A config option for the datanode to fsycn a block file
     when block is completely written. (dhruba via szetszwo)
 
+    HDFS-4174. Add ability to list the corrupted files in WebUI (backport of
+    HDFS-1031). (Jing Zhao via suresh)
+
   BUG FIXES
 
     HADOOP-8878. Uppercase namenode hostname causes hadoop dfs calls with

+ 3 - 0
src/hdfs/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -246,6 +246,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final int     DFS_BLOCKREPORT_INITIAL_DELAY_DEFAULT = 0;
   public static final String  DFS_BLOCK_INVALIDATE_LIMIT_KEY = "dfs.block.invalidate.limit";
   public static final int     DFS_BLOCK_INVALIDATE_LIMIT_DEFAULT = 100;
+  public static final String  DFS_MAX_CORRUPT_FILES_RETURNED_KEY = "dfs.corruptfilesreturned.max";
+  public static final int     DFS_MAX_CORRUPT_FILES_RETURNED_DEFAULT = 500;
+  
   public static final String  DFS_CLIENT_READ_SHORTCIRCUIT_KEY = "dfs.client.read.shortcircuit";
   public static final boolean DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT = false;
   public static final String  DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY = "dfs.client.read.shortcircuit.skip.checksum";
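
The two new constants define a standard Hadoop configuration knob: dfs.corruptfilesreturned.max caps how many corrupt files the NameNode will report, with a default of 500. A minimal sketch of overriding the limit programmatically, for example in a test-style setup (the value 100 is purely illustrative; operators would normally set the same property in hdfs-site.xml):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    Configuration conf = new Configuration();
    // Report at most 100 corrupt files to the WebUI instead of the default 500.
    conf.setInt(DFSConfigKeys.DFS_MAX_CORRUPT_FILES_RETURNED_KEY, 100);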

+ 65 - 6
src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -45,12 +45,12 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.NavigableMap;
 import java.util.Random;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
-import java.util.Map.Entry;
 import java.util.concurrent.TimeUnit;
 
 import javax.management.NotCompliantMBeanException;
@@ -61,6 +61,7 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.ContentSummary;
+import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsAction;
 import org.apache.hadoop.fs.permission.FsPermission;
@@ -86,27 +87,26 @@ import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretMan
 import org.apache.hadoop.hdfs.server.common.GenerationStamp;
 import org.apache.hadoop.hdfs.server.common.HdfsConstants;
 import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirType;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
-import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport;
 import org.apache.hadoop.hdfs.server.namenode.BlocksMap.BlockInfo;
 import org.apache.hadoop.hdfs.server.namenode.LeaseManager.Lease;
 import org.apache.hadoop.hdfs.server.namenode.UnderReplicatedBlocks.BlockIterator;
 import org.apache.hadoop.hdfs.server.namenode.metrics.FSNamesystemMBean;
+import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
 import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
+import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
 import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
 import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
 import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
-import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
-import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.ipc.Server;
-import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.metrics2.MetricsBuilder;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.MetricsSystem;
@@ -121,8 +121,8 @@ import org.apache.hadoop.net.ScriptBasedMapping;
 import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
-import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.security.token.SecretManager.InvalidToken;
+import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.security.token.delegation.DelegationKey;
 import org.apache.hadoop.util.Daemon;
 import org.apache.hadoop.util.HostsFileReader;
@@ -327,6 +327,8 @@ public class FSNamesystem implements FSConstants, FSNamesystemMBean, FSClusterSt
   private boolean allowBrokenAppend = false;
   // enable durable sync
   private boolean durableSync = true;
+  // How many entries are returned by getCorruptInodes()
+  int maxCorruptFilesReturned;
 
   /**
    * Last block index used for replication work.
@@ -527,6 +529,9 @@ public class FSNamesystem implements FSConstants, FSNamesystemMBean, FSClusterSt
         conf.getClass("net.topology.impl", NetworkTopology.class,
             NetworkTopology.class), conf);
 
+    this.maxCorruptFilesReturned = conf.getInt(
+        DFSConfigKeys.DFS_MAX_CORRUPT_FILES_RETURNED_KEY,
+        DFSConfigKeys.DFS_MAX_CORRUPT_FILES_RETURNED_DEFAULT);
     this.replicator = BlockPlacementPolicy.getInstance(conf, this, clusterMap);
     this.defaultReplication = conf.getInt("dfs.replication", 3);
     this.maxReplication = conf.getInt("dfs.replication.max", 512);
@@ -6271,4 +6276,58 @@ public class FSNamesystem implements FSConstants, FSNamesystemMBean, FSClusterSt
     this.avoidStaleDataNodesForWrite = avoidStaleDataNodesForWrite;
   }
   
+  /**
+   * Used by {@link FSNamesystem#getCorruptFileBlocks()} and
+   * {@link FSNamesystem#listCorruptFileBlocks()} to represent information about
+   * a corrupt file and its corresponding block
+   */
+  static class CorruptFileBlockInfo {
+    String path;
+    Block block;
+    
+    public CorruptFileBlockInfo(String p, Block b) {
+      path = p;
+      block = b;
+    }
+    
+    @Override
+    public String toString() {
+      return block.getBlockName() + "\t" + path;
+    }
+  }
+  
+  /**
+   * @return a collection of corrupt files with their block information, with a
+   *         maximum of {@link FSNamesystem#maxCorruptFilesReturned} files
+   *         listed in total
+   */
+  private Collection<CorruptFileBlockInfo> getCorruptFileBlocks() {
+    ArrayList<CorruptFileBlockInfo> corruptFiles = 
+        new ArrayList<CorruptFileBlockInfo>();
+    for (Block blk : neededReplications.getCorruptQueue()){
+      INode inode = blocksMap.getINode(blk);
+      if (inode != null && countNodes(blk).liveReplicas() == 0) {
+        String filePath = inode.getFullPathName();
+        CorruptFileBlockInfo info = new CorruptFileBlockInfo(filePath, blk);
+        corruptFiles.add(info);
+        if (corruptFiles.size() >= this.maxCorruptFilesReturned) {
+          break;
+        }
+      }
+    }
+    return corruptFiles;
+  }
+  
+  /**
+   * @return Collection of CorruptFileBlockInfo objects representing files with
+   * corrupted blocks.
+   * @throws AccessControlException
+   * @throws IOException
+   */
+  synchronized Collection<CorruptFileBlockInfo> listCorruptFileBlocks()
+      throws AccessControlException, IOException {
+    checkSuperuserPrivilege();
+    return getCorruptFileBlocks();
+  }
+  
 }
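
Taken together, the two new methods give superusers a snapshot of files whose blocks currently have no live replicas, capped at maxCorruptFilesReturned entries. A minimal usage sketch, assuming a handle to the running FSNamesystem (for example cluster.getNameNode().getNamesystem() in a test, as the new test below obtains):

    Collection<FSNamesystem.CorruptFileBlockInfo> badFiles =
        fsn.listCorruptFileBlocks();
    for (FSNamesystem.CorruptFileBlockInfo info : badFiles) {
      // CorruptFileBlockInfo.toString() prints "<blockName>\t<path>", which is
      // what corrupt_files.jsp renders per line.
      System.out.println(info);
    }

Note that listCorruptFileBlocks() is synchronized and checks superuser privilege before delegating to the private getCorruptFileBlocks(), so the scan over the corrupt-block queue runs under the namesystem lock.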

+ 12 - 0
src/hdfs/org/apache/hadoop/hdfs/server/namenode/JspHelper.java

@@ -65,6 +65,7 @@ import org.apache.hadoop.security.authorize.AuthorizationException;
 import org.apache.hadoop.security.authorize.ProxyUsers;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.VersionInfo;
 
 public class JspHelper {
   public static final String CURRENT_CONF = "current.conf";
@@ -625,4 +626,15 @@ public class JspHelper {
   public static int getDefaultChunkSize(Configuration conf) {
     return conf.getInt("dfs.default.chunk.view.size", 32 * 1024);
   }
+  
+  /** Return a table containing version information. */
+  static String getVersionTable(FSNamesystem fsn) {
+    return "<div class='dfstable'><table>"
+        + "\n  <tr><td class='col1'>Started:</td><td>" + fsn.getStartTime()
+        + "</td></tr>\n" + "\n  <tr><td class='col1'>Version:</td><td>"
+        + VersionInfo.getVersion() + ", " + VersionInfo.getRevision()
+        + "</td></tr>\n" + "\n  <tr><td class='col1'>Compiled:</td><td>"
+        + VersionInfo.getDate() + " by " + VersionInfo.getUser()
+        + "</td></tr>\n</table></div>";
+  }
 }

+ 48 - 31
src/hdfs/org/apache/hadoop/hdfs/server/namenode/UnderReplicatedBlocks.java

@@ -27,6 +27,7 @@ import org.apache.hadoop.hdfs.protocol.Block;
  */
 class UnderReplicatedBlocks implements Iterable<Block> {
   static final int LEVEL = 3;
+  static public final int QUEUE_WITH_CORRUPT_BLOCKS = 2;
   private List<TreeSet<Block>> priorityQueues = new ArrayList<TreeSet<Block>>();
       
   /* constructor */
@@ -187,39 +188,55 @@ class UnderReplicatedBlocks implements Iterable<Block> {
     return new BlockIterator();
   }
   
-    class BlockIterator implements Iterator<Block> {
-      private int level;
-      private List<Iterator<Block>> iterators = new ArrayList<Iterator<Block>>();
-      BlockIterator()  
-      {
-        level=0;
-        for(int i=0; i<LEVEL; i++) {
-          iterators.add(priorityQueues.get(i).iterator());
-        }
-      }
-              
-      private void update() {
-        while(level< LEVEL-1 && !iterators.get(level).hasNext()) {
-          level++;
-        }
-      }
-              
-      public Block next() {
-        update();
-        return iterators.get(level).next();
-      }
-              
-      public boolean hasNext() {
-        update();
-        return iterators.get(level).hasNext();
+  /* returns all blocks in a given priority queue, or null if the priority is out of range */
+  private synchronized Iterable<Block> getQueue(int priority) {
+    if (priority < 0 || priority >= LEVEL) {
+      return null;
+    }
+    return priorityQueues.get(priority);
+  }
+  
+  /**
+   * @return an iterator of all the blocks in the QUEUE_WITH_CORRUPT_BLOCKS
+   *         priority queue
+   */
+  Iterable<Block> getCorruptQueue() {
+    return getQueue(QUEUE_WITH_CORRUPT_BLOCKS);
+  }
+  
+  class BlockIterator implements Iterator<Block> {
+    private int level;
+    private List<Iterator<Block>> iterators = new ArrayList<Iterator<Block>>();
+
+    BlockIterator() {
+      level = 0;
+      for (int i = 0; i < LEVEL; i++) {
+        iterators.add(priorityQueues.get(i).iterator());
       }
-              
-      public void remove() {
-        iterators.get(level).remove();
+    }
+
+    private void update() {
+      while (level < LEVEL - 1 && !iterators.get(level).hasNext()) {
+        level++;
       }
-      
-      public int getPriority() {
-        return level;
+    }
+
+    public Block next() {
+      update();
+      return iterators.get(level).next();
+    }
+
+    public boolean hasNext() {
+      update();
+      return iterators.get(level).hasNext();
+    }
+
+    public void remove() {
+      iterators.get(level).remove();
+    }
+
+    public int getPriority() {
+      return level;
     };
   }
 }
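
getQueue(int) returns the live TreeSet backing the requested priority level (or null for an out-of-range priority) rather than a defensive copy, so callers are expected to iterate it while holding the namesystem lock, as the synchronized listCorruptFileBlocks() path above does. A short sketch of that consumption pattern (neededReplications stands in for the FSNamesystem field of this type; the snippet is illustrative, not part of the patch):

    // Must run with the FSNamesystem lock held: the Iterable is backed by the
    // live priority queue, not a snapshot.
    int candidates = 0;
    for (Block blk : neededReplications.getCorruptQueue()) {
      // getCorruptFileBlocks() additionally filters each block with
      // countNodes(blk).liveReplicas() == 0 before reporting its file.
      candidates++;
    }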

+ 118 - 0
src/test/org/apache/hadoop/hdfs/server/namenode/TestCorruptFilesJsp.java

@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import static org.junit.Assert.assertTrue;
+
+import java.net.URL;
+import java.util.Collection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.ChecksumException;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.DFSTestUtil;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.TestDatanodeBlockScanner;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem.CorruptFileBlockInfo;
+import org.junit.Test;
+
+/** A JUnit test for corrupt_files.jsp */
+public class TestCorruptFilesJsp  {
+
+  @Test
+  public void testCorruptFilesJsp() throws Exception {
+    MiniDFSCluster cluster = null;
+    try {
+      final int FILE_SIZE = 512;
+      Path[] filepaths = { new Path("/audiobook"), new Path("/audio/audio1"),
+          new Path("/audio/audio2"), new Path("/audio/audio") };
+
+      Configuration conf = new Configuration();
+      // DataNode scans directories
+      conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_INTERVAL_KEY, 1);
+      // DataNode sends block reports
+      conf.setInt(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 3 * 1000);
+      cluster = new MiniDFSCluster(conf, 1, true, null);
+      cluster.waitActive();
+
+      FileSystem fs = cluster.getFileSystem();
+
+      // create files
+      for (Path filepath : filepaths) {
+        DFSTestUtil.createFile(fs, filepath, FILE_SIZE, (short) 1, 0L);
+        DFSTestUtil.waitReplication(fs, filepath, (short) 1);
+      }
+
+      // verify there are no corrupt files
+      Collection<CorruptFileBlockInfo> badFiles = cluster.getNameNode()
+          .getNamesystem().listCorruptFileBlocks();
+      assertTrue("There are " + badFiles.size()
+          + " corrupt files, but expecting none", badFiles.size() == 0);
+
+      String nnUrl = cluster.getNameNode().getHttpAddress().getHostName() + ":"
+          + cluster.getNameNode().getHttpAddress().getPort(); 
+      URL url = new URL("http://" + nnUrl + "/corrupt_files.jsp");
+      String corruptFilesPage = DFSTestUtil.urlGet(url);
+      assertTrue("Corrupt files page is not showing a healthy filesystem",
+          corruptFilesPage.contains("No missing blocks found at the moment."));
+
+      // Now corrupt all the files except for the last one
+      for (int idx = 0; idx < filepaths.length - 1; idx++) {
+        String blockName = DFSTestUtil.getFirstBlock(fs, filepaths[idx])
+            .getBlockName();
+        TestDatanodeBlockScanner.corruptReplica(blockName, 0);
+
+        // read the file so that the corrupt block is reported to NN
+        FSDataInputStream in = fs.open(filepaths[idx]);
+        try {
+          in.readFully(new byte[FILE_SIZE]);
+        } catch (ChecksumException ignored) { // checksum error is expected.
+        }
+        in.close();
+      }
+
+      // verify if all corrupt files were reported to NN
+      badFiles = cluster.getNameNode().getNamesystem().listCorruptFileBlocks();
+      assertTrue("Expecting 3 corrupt files, but got " + badFiles.size(),
+          badFiles.size() == 3);
+
+      corruptFilesPage = DFSTestUtil.urlGet(url);
+      assertTrue("'/audiobook' should be corrupt", corruptFilesPage
+          .contains("/audiobook"));
+      assertTrue("'/audio/audio1' should be corrupt", corruptFilesPage
+          .contains("/audio/audio1"));
+      assertTrue("'/audio/audio2' should be corrupt", corruptFilesPage
+          .contains("/audio/audio2"));
+      assertTrue("Summary message shall report 3 corrupt files",
+          corruptFilesPage.contains("At least 3 corrupt file(s)"));
+
+      // clean up
+      for (Path filepath : filepaths) {
+        fs.delete(filepath, false);
+      }
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
+
+}

+ 79 - 0
src/webapps/hdfs/corrupt_files.jsp

@@ -0,0 +1,79 @@
+
+<%
+  /*
+   * Licensed to the Apache Software Foundation (ASF) under one
+   * or more contributor license agreements.  See the NOTICE file
+   * distributed with this work for additional information
+   * regarding copyright ownership.  The ASF licenses this file
+   * to you under the Apache License, Version 2.0 (the
+   * "License"); you may not use this file except in compliance
+   * with the License.  You may obtain a copy of the License at
+   *
+   *     http://www.apache.org/licenses/LICENSE-2.0
+   *
+   * Unless required by applicable law or agreed to in writing, software
+   * distributed under the License is distributed on an "AS IS" BASIS,
+   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   * See the License for the specific language governing permissions and
+   * limitations under the License.
+   */
+%>
+<%@ page contentType="text/html; charset=UTF-8"
+	import="org.apache.hadoop.util.ServletUtil"
+	import="org.apache.hadoop.fs.FileStatus"
+	import="org.apache.hadoop.fs.FileUtil"
+	import="org.apache.hadoop.fs.Path"
+	import="java.util.Collection"
+	import="java.util.Arrays" %>
+<%!//for java.io.Serializable
+  private static final long serialVersionUID = 1L;%>
+<%
+  NameNode nn = (NameNode) application.getAttribute("name.node");
+  FSNamesystem fsn = nn.getNamesystem();
+  // String namenodeRole = nn.getRole().toString();
+  String namenodeLabel = nn.getNameNodeAddress().getHostName() + ":"
+      + nn.getNameNodeAddress().getPort();
+  Collection<FSNamesystem.CorruptFileBlockInfo> corruptFileBlocks = 
+	fsn.listCorruptFileBlocks();
+  int corruptFileCount = corruptFileBlocks.size();
+
+%>
+
+<html>
+<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
+<title>Hadoop <%=namenodeLabel%></title>
+<body>
+<h1>'<%=namenodeLabel%>'</h1>
+<%=JspHelper.getVersionTable(fsn)%>
+<br>
+<b><a href="/nn_browsedfscontent.jsp">Browse the filesystem</a></b>
+<br>
+<b><a href="/logs/"><%=namenodeLabel%> Logs</a></b>
+<br>
+<b><a href=/dfshealth.jsp> Go back to DFS home</a></b>
+<hr>
+<h3>Reported Corrupt Files</h3>
+<%
+  if (corruptFileCount == 0) {
+%>
+    <i>No missing blocks found at the moment.</i> <br>
+    Please run fsck for a thorough health analysis.
+<%
+  } else {
+    for (FSNamesystem.CorruptFileBlockInfo c : corruptFileBlocks) {
+      String currentFileBlock = c.toString();
+%>
+      <%=currentFileBlock%><br>
+<%
+    }
+%>
+    <p>
+      <b>Total:</b> At least <%=corruptFileCount%> corrupt file(s)
+    </p>
+<%
+  }
+%>
+
+<%
+  out.println(ServletUtil.htmlFooter());
+%>

+ 4 - 1
src/webapps/hdfs/dfshealth.jsp

@@ -262,7 +262,10 @@
 <h3>Cluster Summary</h3>
 <b> <%= jspHelper.getSafeModeText()%> </b>
 <b> <%= jspHelper.getInodeLimitText()%> </b>
-<a class="warning"> <%= JspHelper.getWarningText(fsn)%></a>
+<a class="warning" href="/corrupt_files.jsp" title="List corrupt files">
+  <%= JspHelper.getWarningText(fsn)%>
+</a>
+
 
 <%
     generateDFSHealthReport(out, nn, request);