
HADOOP-227. Add support for backup namenodes, which periodically get snapshots of the namenode state. Contributed by Dhruba.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@500415 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 years ago
parent
commit
2d1c66a80f

+ 5 - 2
CHANGES.txt

@@ -87,8 +87,11 @@ Trunk (unreleased changes)
     with different versions of Lucene without worrying about CLASSPATH
     order.  (Milind Bhandarkar via cutting)
 
-27. HADOOP-248.  Optimize location of map outputs to not use random
-    probes.  (Devaraj Das via cutting)
+27. HADOOP-248.  Optimize location of map outputs to no longer use
+    random probes.  (Devaraj Das via cutting)
+
+28. HADOOP-227.  Add support for backup namenodes, which periodically
+    get snapshots of the namenode state.  (Dhruba Borthakur via cutting) 
 
 
 Release 0.10.1 - 2007-01-10

+ 15 - 12
bin/hadoop

@@ -25,20 +25,21 @@ bin=`cd "$bin"; pwd`
 if [ $# = 0 ]; then
   echo "Usage: hadoop [--config confdir] COMMAND"
   echo "where COMMAND is one of:"
-  echo "  namenode -format  format the DFS filesystem"
-  echo "  namenode          run the DFS namenode"
-  echo "  datanode          run a DFS datanode"
-  echo "  dfsadmin          run a DFS admin client"
-  echo "  fsck              run a DFS filesystem checking utility"
-  echo "  fs                run a generic filesystem user client"
-  echo "  jobtracker        run the MapReduce job Tracker node" 
-  echo "  tasktracker       run a MapReduce task Tracker node" 
-  echo "  job               manipulate MapReduce jobs" 
-  echo "  version           print the version"
-  echo "  jar <jar>         run a jar file"
+  echo "  namenode -format     format the DFS filesystem"
+  echo "  secondarynamenode    run the DFS secondary namenode"
+  echo "  namenode             run the DFS namenode"
+  echo "  datanode             run a DFS datanode"
+  echo "  dfsadmin             run a DFS admin client"
+  echo "  fsck                 run a DFS filesystem checking utility"
+  echo "  fs                   run a generic filesystem user client"
+  echo "  jobtracker           run the MapReduce job Tracker node" 
+  echo "  tasktracker          run a MapReduce task Tracker node" 
+  echo "  job                  manipulate MapReduce jobs" 
+  echo "  version              print the version"
+  echo "  jar <jar>            run a jar file"
   echo "  distcp <srcurl> <desturl> copy file or directories recursively"
   echo " or"
-  echo "  CLASSNAME         run the class named CLASSNAME"
+  echo "  CLASSNAME            run the class named CLASSNAME"
   echo "Most commands print help when invoked w/o parameters."
   exit 1
 fi
@@ -139,6 +140,8 @@ fi
 # figure out which class to run
 if [ "$COMMAND" = "namenode" ] ; then
   CLASS='org.apache.hadoop.dfs.NameNode'
+elif [ "$COMMAND" = "secondarynamenode" ] ; then
+  CLASS='org.apache.hadoop.dfs.SecondaryNameNode'
 elif [ "$COMMAND" = "datanode" ] ; then
   CLASS='org.apache.hadoop.dfs.DataNode'
 elif [ "$COMMAND" = "fs" ] ; then

+ 13 - 0
bin/hadoop-config.sh

@@ -38,3 +38,16 @@ fi
  
 # Allow alternate conf dir location.
 HADOOP_CONF_DIR="${HADOOP_CONF_DIR:-$HADOOP_HOME/conf}"
+
+# check to see if it is specified whether to use the slaves or
+# the masters file
+if [ $# -gt 1 ]
+then
+    if [ "--hosts" = "$1" ]
+    then
+        shift
+        slavesfile=$1
+        shift
+        export HADOOP_SLAVES="${HADOOP_CONF_DIR}/$slavesfile"
+    fi
+fi

+ 1 - 1
bin/hadoop-daemon.sh

@@ -12,7 +12,7 @@
 #   HADOOP_NICENESS The scheduling priority for daemons. Defaults to 0.
 ##
 
-usage="Usage: hadoop-daemon.sh [--config <conf-dir>] (start|stop) <hadoop-command> <args...>"
+usage="Usage: hadoop-daemon.sh [--config <conf-dir>] [--hosts hostlistfile] (start|stop) <hadoop-command> <args...>"
 
 # if no args specified, show usage
 if [ $# -le 1 ]; then

+ 1 - 1
bin/hadoop-daemons.sh

@@ -2,7 +2,7 @@
 # 
 # Run a Hadoop command on all slave hosts.
 
-usage="Usage: hadoop-daemons.sh [--config confdir] [start|stop] command args..."
+usage="Usage: hadoop-daemons.sh [--config confdir] [--hosts hostlistfile] [start|stop] command args..."
 
 # if no args specified, show usage
 if [ $# -le 1 ]; then

+ 12 - 3
bin/slaves.sh

@@ -24,15 +24,24 @@ bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh
 
+# If the slaves file is specified in the command line,
+# then it takes precedence over the definition in 
+# hadoop-env.sh. Save it here.
+HOSTLIST=$HADOOP_SLAVES
+
 if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
   . "${HADOOP_CONF_DIR}/hadoop-env.sh"
 fi
 
-if [ "$HADOOP_SLAVES" = "" ]; then
-  export HADOOP_SLAVES="${HADOOP_CONF_DIR}/slaves"
+if [ "$HOSTLIST" = "" ]; then
+  if [ "$HADOOP_SLAVES" = "" ]; then
+    export HOSTLIST="${HADOOP_CONF_DIR}/slaves"
+  else
+    export HOSTLIST="${HADOOP_SLAVES}"
+  fi
 fi
 
-for slave in `cat "$HADOOP_SLAVES"`; do
+for slave in `cat "$HOSTLIST"`; do
  ssh $HADOOP_SSH_OPTS $slave $"${@// /\\ }" \
    2>&1 | sed "s/^/$slave: /" &
  if [ "$HADOOP_SLAVE_SLEEP" != "" ]; then

+ 1 - 0
bin/start-dfs.sh

@@ -12,3 +12,4 @@ bin=`cd "$bin"; pwd`
 # note: datanodes will log connection errors until namenode starts
 "$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start namenode
 "$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR start datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters start secondarynamenode

+ 2 - 1
bin/stop-dfs.sh

@@ -8,5 +8,6 @@ bin=`cd "$bin"; pwd`
 . "$bin"/hadoop-config.sh
 
 "$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop namenode
-"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR stop datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts slaves stop datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters stop secondarynamenode
 

+ 38 - 0
conf/hadoop-default.xml

@@ -136,6 +136,44 @@ creations/deletions), or "all".</description>
  <description>The size of the in-memory filesystem instance in MB</description>
 </property>
 
+<property>
+  <name>fs.checkpoint.dir</name>
+  <value>${hadoop.tmp.dir}/dfs/namesecondary</value>
+  <description>Determines where on the local filesystem the DFS secondary
+      name node should store the temporary images and edits to merge.  
+  </description>
+</property>
+
+<property>
+  <name>fs.checkpoint.period</name>
+  <value>3600</value>
+  <description>The number of seconds between two periodic checkpoints.
+  </description>
+</property>
+
+<property>
+  <name>fs.checkpoint.size</name>
+  <value>67108864</value>
+  <description>The size of the current edit log (in bytes) that triggers
+       a periodic checkpoint even if the fs.checkpoint.period hasn't expired.
+  </description>
+</property>
+
+<property>
+  <name>dfs.secondary.info.port</name>
+  <value>50090</value>
+  <description>The base number for the Secondary namenode info port.
+  </description>
+</property>
+
+<property>
+  <name>dfs.secondary.info.bindAddress</name>
+  <value>0.0.0.0</value>
+  <description>
+    The address the secondary namenode web UI will listen on.
+  </description>
+</property>
+
 <property>
   <name>dfs.datanode.bindAddress</name>
   <value>0.0.0.0</value>

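These checkpoint settings are ordinary Configuration entries, so the secondary namenode reads them with the standard getters. A minimal sketch of how a daemon might pick them up (mirroring the constructor in SecondaryNameNode.java below; variable names are illustrative):

    Configuration conf = new Configuration();
    // Work area where the secondary merges image and edits.
    String[] checkpointDirs = conf.getStrings("fs.checkpoint.dir");
    // Seconds between two periodic checkpoints.
    long checkpointPeriod = conf.getLong("fs.checkpoint.period", 3600);
    // Edit-log size in bytes that forces a checkpoint early.
    long checkpointSize = conf.getLong("fs.checkpoint.size", 67108864);
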
+ 1 - 0
conf/masters.template

@@ -0,0 +1 @@
+localhost

+ 27 - 1
src/java/org/apache/hadoop/dfs/ClientProtocol.java

@@ -29,7 +29,7 @@ import org.apache.hadoop.ipc.VersionedProtocol;
  **********************************************************************/
 interface ClientProtocol extends VersionedProtocol {
 
-    public static final long versionID = 6L; // reportBadBlocks added
+  public static final long versionID = 7L;  // periodic checkpoint added
   
     ///////////////////////////////////////
     // File contents
@@ -314,4 +314,30 @@ interface ClientProtocol extends VersionedProtocol {
     public boolean setSafeMode( FSConstants.SafeModeAction action ) throws IOException;
 
     public boolean decommission( FSConstants.DecommissionAction action, String[] nodenames) throws IOException;
+
+    /**
+     * Get the size of the current edit log (in bytes).
+     * @return The number of bytes in the current edit log.
+     * @throws IOException
+     */
+    public long getEditLogSize() throws IOException;
+
+    /**
+     * Closes the current edit log and opens a new one. The
+     * call fails if there are already two or more edit log files or
+     * if the file system is in SafeMode.
+     * @throws IOException
+     */
+    public void rollEditLog() throws IOException;
+
+    /**
+     * Rolls the fsImage log. It removes the old fsImage, copies the
+     * new image to fsImage, removes the old edits and renames edits.new 
+     * to edits. The call fails if any of the four files are missing.
+     * @throws IOException
+     */
+    public void rollFsImage() throws IOException;
+
 }

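Taken together, the three new calls define the checkpoint handshake that a secondary drives against the namenode. A hedged sketch of the client-side sequence, assuming a ClientProtocol proxy named namenode (the HTTP image/edits transfers are elided):

    long size = namenode.getEditLogSize(); // decide whether a checkpoint is due
    namenode.rollEditLog();                // namenode freezes 'edits', logs to 'edits.new'
    // ... fetch fsimage and edits over HTTP, merge locally,
    //     upload the merged image back as fsimage.ckpt ...
    namenode.rollFsImage();                // fsimage.ckpt -> fsimage, edits.new -> edits
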
+ 4 - 0
src/java/org/apache/hadoop/dfs/FSDirectory.java

@@ -315,6 +315,10 @@ class FSDirectory implements FSConstants {
     public FSDirectory(File[] dirs) throws IOException {
       this.fsImage = new FSImage( dirs );
     }
+
+    public FSDirectory(FSImage fsImage) throws IOException {
+      this.fsImage = fsImage;
+    }
     
     void loadFSImage( Configuration conf ) throws IOException {
       fsImage.loadFSImage( conf );

+ 269 - 47
src/java/org/apache/hadoop/dfs/FSEditLog.java

@@ -47,18 +47,32 @@ class FSEditLog {
   private static final byte OP_SET_REPLICATION = 4;
   private static final byte OP_DATANODE_ADD = 5;
   private static final byte OP_DATANODE_REMOVE = 6;
+
+  private static final String FS_EDIT = "edits";
+  private static final String FS_EDIT_NEW = "edits.new";
   
-  private File[] editFiles;
+  private File[] editFiles = null;
+  private File[] editFilesNew = null;
+
   DataOutputStream[] editStreams = null;
   FileDescriptor[] editDescriptors = null;
-  
-  FSEditLog( File[] edits ) {
-    this.editFiles = edits;
-  }
-  
-  File[] getEditFiles() {
-    return this.editFiles;
-  }
+  private FSImage fsimage = null;
+
+  FSEditLog(File[] fsDirs, FSImage image)  throws IOException {
+    fsimage = image;
+    editFiles = new File[fsDirs.length];
+    editFilesNew = new File[fsDirs.length];
+    for (int idx = 0; idx < fsDirs.length; idx++) {
+       editFiles[idx] = new File(fsDirs[idx], FS_EDIT);
+       editFilesNew[idx] = new File(fsDirs[idx], FS_EDIT_NEW);
+     }
+   }
+
+  FSEditLog(File imageDir, FSImage image, String edits)  throws IOException {
+    fsimage = image;
+    editFiles = new File[1];
+    editFiles[0] = new File(imageDir, edits);
+   }
 
   /**
    * Initialize the output stream for logging.
@@ -75,40 +89,108 @@ class FSEditLog {
       editStreams[idx].writeInt( FSConstants.DFS_CURRENT_VERSION );
     }
   }
+
+  /**
+   * Create edits.new if it does not already exist.
+   */
+  void createNewIfMissing() throws IOException {
+    for (int idx = 0; idx < editFilesNew.length; idx++) {
+      if (!editFilesNew[idx].exists()) {
+        FileOutputStream stream = new FileOutputStream(editFilesNew[idx]);
+        DataOutputStream editStr = new DataOutputStream(stream);
+        editStr.writeInt( FSConstants.DFS_CURRENT_VERSION );
+        editStr.flush();
+        editStr.close();
+      } 
+    }
+  }
   
   /**
    * Shutdown the filestore
    */
   void close() throws IOException {
+    if (editStreams == null) {
+      return;
+    }
     for (int idx = 0; idx < editStreams.length; idx++) {
-      editStreams[idx].flush();
-      editDescriptors[idx].sync();
-      editStreams[idx].close();
+      try {
+        editStreams[idx].flush();
+        editDescriptors[idx].sync();
+        editStreams[idx].close();
+      } catch (IOException e) {
+        processIOError(idx);
+        idx--;
+      }
     }
   }
 
+  /**
+   * If there is an IO Error on any log operations, remove that
+   * directory from the list of directories. If no more directories
+   * remain, then raise an exception that will possibly cause the
+   * server to exit
+   */
+   void processIOError(int index) throws IOException {
+     if (editStreams.length == 1) {
+       throw new IOException("Checkpoint directories inaccessible.");
+     }
+     assert(index < editFiles.length);
+     assert(editFiles.length == editFilesNew.length);
+     assert(editFiles.length == editStreams.length);
+     int newsize = editStreams.length - 1;
+     int oldsize = editStreams.length;
+
+     //
+     // save existing values and allocate space for new ones
+     //
+     File[] editFiles1 = editFiles;
+     File[] editFilesNew1 = editFilesNew;
+     DataOutputStream[] editStreams1 = editStreams;
+     FileDescriptor[] editDescriptors1 = editDescriptors;
+     editFiles = new File[newsize];
+     editFilesNew = new File[newsize];
+     editStreams = new DataOutputStream[newsize];
+     editDescriptors = new FileDescriptor[newsize];
+
+     //
+     // copy values from old into new, skip the one with error.
+     //
+     for (int idx = 0; idx < index; idx++) {
+       editFiles[idx] = editFiles1[idx];
+       editFilesNew[idx] = editFilesNew1[idx];
+       editStreams[idx] = editStreams1[idx];
+       editDescriptors[idx] = editDescriptors1[idx];
+     }
+     for (int idx = index; idx < oldsize - 1; idx++) {
+       editFiles[idx] = editFiles1[idx+1];
+       editFilesNew[idx] = editFilesNew1[idx+1];
+       editStreams[idx] = editStreams1[idx+1];
+       editDescriptors[idx] = editDescriptors1[idx+1];
+     }
+     //
+     // Invoke the ioerror routine of the fsimage
+     //
+     fsimage.processIOError(index);
+   }
+
   /**
    * Delete specified editLog
    */
   void delete(int idx) throws IOException {
     if (editStreams != null) {
-      editStreams[idx].close();
+      try {
+        editStreams[idx].close();
+      } catch (IOException e) {
+        processIOError(idx);
+      }
     }
-    editFiles[idx].delete();
-  }
-  
-  /**
-   * Delete all editLogs
-   */
-  void deleteAll() throws IOException {
-    for (int idx = 0; idx < editFiles.length; idx++ ) {
+    if (!editFiles[idx].delete() || !editFilesNew[idx].delete()) {
       if (editStreams != null) {
-        editStreams[idx].close();
+        processIOError(idx);
       }
-      editFiles[idx].delete();
     }
   }
-  
+
   /**
    * check if ANY edits log exists
    */
@@ -120,38 +202,51 @@ class FSEditLog {
     }
     return false;
   }
+
+  /**
+   * check if ANY edits.new log exists
+   */
+  boolean existsNew() throws IOException {
+    for (int idx = 0; idx < editFilesNew.length; idx++) {
+      if (editFilesNew[idx].exists()) { 
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * check if a particular edits.new log exists
+   */
+  boolean existsNew(int idx) throws IOException {
+    if (editFilesNew[idx].exists()) { 
+      return true;
+    }
+    return false;
+  }
+
   
   /**
    * Load an edit log, and apply the changes to the in-memory structure
-   *
    * This is where we apply edits that we've been writing to disk all
    * along.
    */
-  int loadFSEdits( Configuration conf ) throws IOException {
+  int loadFSEdits(Configuration conf, int index) throws IOException {
+    int numEdits = 0;
+    numEdits = loadFSEdits(conf, editFiles[index]);
+    if (editFilesNew[index].exists()) { 
+      numEdits += loadFSEdits(conf, editFilesNew[index]);
+    }
+    return numEdits;
+  }
+
+  int loadFSEdits( Configuration conf, File edits)
+                                                 throws IOException {
     FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
     FSDirectory fsDir = fsNamesys.dir;
     int numEdits = 0;
     int logVersion = 0;
     
-    // first check how many editFiles exist
-    // and choose the largest editFile, because it is the most recent
-    Vector<File> files = new Vector<File>();
-    for (int idx = 0; idx < editFiles.length; idx++) {
-      if (editFiles[idx].exists()) {
-        files.add(editFiles[idx]);
-      }
-    }
-    long maxLength = Long.MIN_VALUE;
-    File edits = null;
-    for (Iterator<File> it = files.iterator(); it.hasNext();) {
-      File f = it.next();
-      long length = f.length();
-      if (length > maxLength) {
-        maxLength = length;
-        edits = f;
-      }
-    }
-    
     if (edits != null) {
       DataInputStream in = new DataInputStream(
           new BufferedInputStream(
@@ -322,11 +417,16 @@ class FSEditLog {
           editStreams[idx].flush();
           editDescriptors[idx].sync();
         } catch (IOException ie) {
-          // TODO: Must report an error here
+          try {
+            processIOError(idx);         
+          } catch (IOException e) {
+            FSNamesystem.LOG.error("Unable to append to edit log. " +
+                                   "Fatal Error.");
+            System.exit(-1);
+          }
         }
       }
     }
-    // TODO: initialize checkpointing if the log is large enough
   }
 
   /** 
@@ -395,4 +495,126 @@ class FSEditLog {
   static short fromLogReplication( UTF8 replication ) {
     return Short.parseShort(replication.toString());
   }
+
+  /**
+   * Return the size of the current EditLog
+   */
+  long getEditLogSize() throws IOException {
+    assert(editFiles.length == editStreams.length);
+    long size = 0;
+    for (int idx = 0; idx < editFiles.length; idx++) {
+      synchronized (editStreams[idx]) {
+        assert(size == 0 || size == editFiles[idx].length());
+        size = editFiles[idx].length();
+      }
+    }
+    return size;
+  }
+ 
+  /**
+   * Closes the current edit log and opens edits.new. 
+   */
+  void rollEditLog() throws IOException {
+    //
+    // If edits.new already exists, then return error.
+    //
+    if (existsNew()) {
+      throw new IOException("Attempt to roll edit log but edits.new exists");
+    }
+
+    close();                     // close existing edit log
+
+    //
+    // Open edits.new
+    //
+    for (int idx = 0; idx < editFiles.length; idx++ ) {
+      try {
+        FileOutputStream stream = new FileOutputStream(editFilesNew[idx]);
+        editStreams[idx] = new DataOutputStream(stream);
+        editDescriptors[idx] = stream.getFD();
+        editStreams[idx].writeInt( FSConstants.DFS_CURRENT_VERSION );
+      } catch (IOException e) {
+        processIOError(idx);
+        idx--;
+      }
+    }
+  }
+
+  /**
+   * Closes the current edit log and opens edits.new. 
+   * If edits.new already exists, then ignore it.
+   */
+  void rollEditLogIfNeeded() throws IOException {
+
+    //
+    // Open edits.new
+    //
+    for (int idx = 0; idx < editFiles.length; idx++ ) {
+      if (existsNew(idx)) {
+        continue;
+      }
+      try {
+        FileOutputStream stream = new FileOutputStream(editFilesNew[idx]);
+        editStreams[idx] = new DataOutputStream(stream);
+        editDescriptors[idx] = stream.getFD();
+        editStreams[idx].writeInt( FSConstants.DFS_CURRENT_VERSION );
+      } catch (IOException e) {
+        processIOError(idx);
+        idx--;
+      }
+    }
+  }
+
+  /**
+   * Removes the old edit log and renames edits.new to edits.
+   * Reopens the edits file.
+   */
+  void purgeEditLog() throws IOException {
+    purgeEditLog(true);
+  }
+
+  /**
+   * Removes the old edit log and renames edits.new to edits.
+   */
+  void purgeEditLog(boolean reopenEdits) throws IOException {
+    //
+    // If edits.new does not exist, then return error.
+    //
+    if (!existsNew()) {
+      throw new IOException("Attempt to purge edit log " +
+                            "but edits.new does not exist.");
+    }
+    close();
+
+    //
+    // Delete edits and rename edits.new to edits.
+    //
+    for (int idx = 0; idx < editFiles.length; idx++ ) {
+      if (!editFilesNew[idx].renameTo(editFiles[idx])) {
+        processIOError(idx); 
+        idx--; 
+      }
+    }
+    //
+    // Reopen all the edits logs.
+    //
+    boolean append = true;
+    for (int idx = 0; reopenEdits && idx < editStreams.length; idx++) {
+      try {
+        FileOutputStream stream = new FileOutputStream(editFiles[idx],
+                                                       append);
+        editStreams[idx] = new DataOutputStream(stream);
+        editDescriptors[idx] = stream.getFD();
+      } catch (IOException e) {
+        processIOError(idx); 
+        idx--; 
+      }
+    }
+  }
+
+  /**
+   * Return the name of the edit file
+   */
+  File getFsEditName() throws IOException {
+      return editFiles[0];
+  }
 }

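On the namenode side, rollEditLog() and purgeEditLog() bracket the checkpoint window. A compact sketch of the lifecycle, assuming an FSEditLog instance named editLog (method names from the patch):

    editLog.rollEditLog();   // close 'edits', open 'edits.new'; fails if edits.new exists
    // ... new transactions accumulate in 'edits.new' while the
    //     secondary merges the frozen 'edits' into the image ...
    editLog.purgeEditLog();  // rename 'edits.new' to 'edits' and reopen it for logging
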
+ 245 - 67
src/java/org/apache/hadoop/dfs/FSImage.java

@@ -48,14 +48,26 @@ import org.apache.hadoop.io.UTF8;
  * @author Konstantin Shvachko
  */
 class FSImage {
-  private static final String FS_IMAGE = "fsimage";
-  private static final String NEW_FS_IMAGE = "fsimage.new";
-  private static final String OLD_FS_IMAGE = "fsimage.old";
-  private static final String FS_TIME = "fstime";
+
+  //
+  // The filenames used for storing the images
+  //
+  private enum NameNodeFile {
+    IMAGE ("fsimage"),
+    CKPT ("fsimage.ckpt"),
+    TIME ("fstime");
+
+    private String fileName;
+    private NameNodeFile(String name) {
+      this.fileName = name;
+    }
+    String getName() {
+      return fileName;
+    }
+  }
 
   private File[] imageDirs;  /// directories that contains the image file 
   private FSEditLog editLog;
-  // private int namespaceID = 0;    /// a persistent attribute of the namespace
 
   /**
    * 
@@ -68,13 +80,61 @@ class FSImage {
         throw new IOException("NameNode not formatted: " + imageDirs[idx]);
       }
     }
-    File[] edits = new File[fsDirs.length];
-    for (int idx = 0; idx < edits.length; idx++) {
-      edits[idx] = new File(fsDirs[idx], "edits");
+    this.editLog = new FSEditLog( fsDirs , this);
+  }
+
+  /**
+   * Represents an Image (image and edit file).
+   */
+  FSImage(File imageDir, String edits) throws IOException {
+    this.imageDirs = new File[1];
+    imageDirs[0] = imageDir;
+    if (!imageDirs[0].exists()) {
+      throw new IOException("File " + imageDirs[0] + " not found.");
     }
-    this.editLog = new FSEditLog( edits );
+    this.editLog = new FSEditLog(imageDir, this, edits);
+  }
+
+  /**
+   * Create an fsimage and edits log from scratch.
+   */
+  void create() throws IOException {
+    saveFSImage(NameNodeFile.IMAGE.getName());
+    editLog.create();
   }
 
+  /**
+   * If there is an IO Error on any log operations, remove that
+   * directory from the list of directories. If no more directories
+   * remain, then raise an exception that will possibly cause the
+   * server to exit
+   */
+  void processIOError(int index) throws IOException {
+    if (imageDirs.length == 1) {
+      throw new IOException("Checkpoint directories inaccessible.");
+    }
+    assert(index < imageDirs.length);
+    int newsize = imageDirs.length - 1;
+    int oldsize = imageDirs.length;
+
+    //
+    // save existing values and allocate space for new ones
+    //
+    File[] imageDirs1 = imageDirs;
+    imageDirs = new File[newsize];
+
+    //
+    // copy in saved values, skipping the one on which we had
+    // an error
+    //
+    for (int idx = 0; idx < index; idx++) {
+      imageDirs[idx] = imageDirs1[idx];
+    }
+    for (int idx = index; idx < oldsize - 1; idx++) {
+      imageDirs[idx] = imageDirs1[idx+1];
+    }
+  }
+        
   FSEditLog getEditLog() {
     return editLog;
   }
@@ -85,40 +145,57 @@ class FSImage {
    * "re-save" and consolidate the edit-logs
    */
   void loadFSImage( Configuration conf ) throws IOException {
-    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
-    FSDirectory fsDir = fsNamesys.dir;
     for (int idx = 0; idx < imageDirs.length; idx++) {
       //
       // Atomic move sequence, to recover from interrupted save
       //
-      File curFile = new File(imageDirs[idx], FS_IMAGE);
-      File newFile = new File(imageDirs[idx], NEW_FS_IMAGE);
-      File oldFile = new File(imageDirs[idx], OLD_FS_IMAGE);
-
-      // Maybe we were interrupted between 2 and 4
-      if (oldFile.exists() && curFile.exists()) {
-        oldFile.delete();
-        if (editLog.exists()) {
-          editLog.deleteAll();
+      File curFile = new File(imageDirs[idx], 
+                              NameNodeFile.IMAGE.getName());
+      File ckptFile = new File(imageDirs[idx], 
+                              NameNodeFile.CKPT.getName());
+
+      //
+      // If we were in the midst of a checkpoint
+      //
+      if (ckptFile.exists()) {
+        if (editLog.existsNew(idx)) {
+          //
+          // checkpointing might have uploaded a new
+          // merged image, but we discard it here because we are
+          // not sure whether the entire merged image was uploaded
+          // before the namenode crashed.
+          //
+          if (!ckptFile.delete()) {
+            throw new IOException("Unable to delete " + ckptFile);
+          }
+        } else {
+          //
+          // checkpointing was in progress when the namenode
+          // shut down. The fsimage.ckpt was created and the edits.new
+          // file was moved to edits. We complete that checkpoint by
+          // moving fsimage.ckpt to fsimage. There is no need to 
+          // update the fstime file here.
+          //
+          if (!ckptFile.renameTo(curFile)) {
+            throw new IOException("Unable to rename " + ckptFile +
+                                  " to " + curFile);
+          }
         }
-      } else if (oldFile.exists() && newFile.exists()) {
-        // Or maybe between 1 and 2
-        newFile.renameTo(curFile);
-        oldFile.delete();
-      } else if (curFile.exists() && newFile.exists()) {
-        // Or else before stage 1, in which case we lose the edits
-        newFile.delete();
       }
     }
-    
+
     // Now check all curFiles and see which is the newest
     File curFile = null;
     long maxTimeStamp = Long.MIN_VALUE;
+    int saveidx = 0;
+    boolean needToSave = false;
     for (int idx = 0; idx < imageDirs.length; idx++) {
-      File file = new File(imageDirs[idx], FS_IMAGE);
+      File file = new File(imageDirs[idx], 
+                           NameNodeFile.IMAGE.getName());
       if (file.exists()) {
         long timeStamp = 0;
-        File timeFile = new File(imageDirs[idx], FS_TIME);
+        File timeFile = new File(imageDirs[idx], 
+                                 NameNodeFile.TIME.getName());
         if (timeFile.exists() && timeFile.canRead()) {
           DataInputStream in = new DataInputStream(
               new FileInputStream(timeFile));
@@ -127,14 +204,38 @@ class FSImage {
           } finally {
             in.close();
           }
+        } else {
+          needToSave |= true;
         }
         if (maxTimeStamp < timeStamp) {
           maxTimeStamp = timeStamp;
           curFile = file;
+          saveidx = idx;
         }
+      } else {
+        needToSave |= true;
       }
     }
 
+    //
+    // Load in bits
+    //
+    needToSave |= loadFSImage(conf, curFile);
+
+    //
+    // read in the editlog from the same directory from
+    // which we read in the image
+    //
+    needToSave |= (editLog.loadFSEdits(conf, saveidx) > 0);
+    if (needToSave) {
+      saveFSImage();
+    }
+  }
+
+  boolean loadFSImage(Configuration conf, File curFile)
+                      throws IOException {
+    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
+    FSDirectory fsDir = fsNamesys.dir;
     //
     // Load in bits
     //
@@ -154,16 +255,17 @@ class FSImage {
         int numFiles = 0;
         // version 0 does not store version #
         // starts directly with the number of files
-        if( imgVersion >= 0 ) {  
+        if( imgVersion >= 0 ) {
           numFiles = imgVersion;
           imgVersion = 0;
-        } else 
+        } else {
           numFiles = in.readInt();
-        
+        }
+
         needToSave = ( imgVersion != FSConstants.DFS_CURRENT_VERSION );
         if( imgVersion < FSConstants.DFS_CURRENT_VERSION ) // future version
           throw new IncorrectVersionException(imgVersion, "file system image");
-        
+
         // read file info
         short replication = (short)conf.getInt("dfs.replication", 3);
         for (int i = 0; i < numFiles; i++) {
@@ -185,30 +287,27 @@ class FSImage {
           }
           fsDir.unprotectedAddFile(name, blocks, replication );
         }
-        
+
         // load datanode info
         this.loadDatanodes( imgVersion, in );
       } finally {
         in.close();
       }
     }
-    
     if( fsDir.namespaceID == 0 )
       fsDir.namespaceID = newNamespaceID();
-    
-    needToSave |= ( editLog.exists() && editLog.loadFSEdits(conf) > 0 );
-    if( needToSave )
-      saveFSImage();
+
+    return needToSave;
   }
 
   /**
    * Save the contents of the FS image
    */
-  void saveFSImage() throws IOException {
+  void saveFSImage(String filename) throws IOException {
     FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
     FSDirectory fsDir = fsNamesys.dir;
     for (int idx = 0; idx < imageDirs.length; idx++) {
-      File newFile = new File(imageDirs[idx], NEW_FS_IMAGE);
+      File newFile = new File(imageDirs[idx], filename);
       
       //
       // Write out data
@@ -226,31 +325,25 @@ class FSImage {
         out.close();
       }
     }
-    
-    //
-    // Atomic move sequence
-    //
-    for (int idx = 0; idx < imageDirs.length; idx++) {
-      File curFile = new File(imageDirs[idx], FS_IMAGE);
-      File newFile = new File(imageDirs[idx], NEW_FS_IMAGE);
-      File oldFile = new File(imageDirs[idx], OLD_FS_IMAGE);
-      File timeFile = new File(imageDirs[idx], FS_TIME);
-      // 1.  Move cur to old and delete timeStamp
-      curFile.renameTo(oldFile);
-      if (timeFile.exists()) { timeFile.delete(); }
-      // 2.  Move new to cur and write timestamp
-      newFile.renameTo(curFile);
-      DataOutputStream out = new DataOutputStream(
-            new FileOutputStream(timeFile));
-      try {
-        out.writeLong(System.currentTimeMillis());
-      } finally {
-        out.close();
-      }
-      // 3.  Remove pending-edits file (it's been integrated with newFile)
-      editLog.delete(idx);
-      // 4.  Delete old
-      oldFile.delete();
+  }
+
+  /**
+   * Save the contents of the FS image
+   */
+  void saveFSImage() throws IOException {
+    editLog.createNewIfMissing();
+    saveFSImage(NameNodeFile.CKPT.getName());
+    rollFSImage(false);
+  }
+
+  void updateTimeFile(File timeFile, long timestamp) throws IOException {
+    if (timeFile.exists()) { timeFile.delete(); }
+    DataOutputStream out = new DataOutputStream(
+          new FileOutputStream(timeFile));
+    try {
+      out.writeLong(timestamp);
+    } finally {
+      out.close();
     }
   }
 
@@ -343,6 +436,91 @@ class FSImage {
       fsNamesys.unprotectedAddDatanode(nodeImage.getDatanodeDescriptor());
     }
   }
+  /**
+   * Moves fsimage.ckpt to fsImage and edits.new to edits
+   * Reopens the new edits file.
+   */
+  void rollFSImage() throws IOException {
+    rollFSImage(true);
+  }
+
+  /**
+   * Moves fsimage.ckpt to fsImage and edits.new to edits
+   */
+  void rollFSImage(boolean reopenEdits) throws IOException {
+    //
+    // First, verify that edits.new and fsimage.ckpt exists in all
+    // checkpoint directories.
+    //
+    if (!editLog.existsNew()) {
+      throw new IOException("New Edits file does not exist");
+    }
+    for (int idx = 0; idx < imageDirs.length; idx++) {
+      File ckpt = new File(imageDirs[idx], 
+                           NameNodeFile.CKPT.getName());
+      File curFile = new File(imageDirs[idx], 
+                              NameNodeFile.IMAGE.getName());
+
+      if (!curFile.exists()) {
+        throw new IOException("Image file " + curFile +
+                              " does not exist");
+      }
+      if (!ckpt.exists()) {
+        throw new IOException("Checkpoint file " + ckpt +
+                              " does not exist");
+      }
+    }
+    editLog.purgeEditLog(reopenEdits); // renamed edits.new to edits
+
+    //
+    // Renames new image
+    //
+    for (int idx = 0; idx < imageDirs.length; idx++) {
+      File ckpt = new File(imageDirs[idx], 
+                           NameNodeFile.CKPT.getName());
+      File curFile = new File(imageDirs[idx], 
+                              NameNodeFile.IMAGE.getName());
+      if (!ckpt.renameTo(curFile)) {
+        editLog.processIOError(idx);
+        idx--;
+      }
+    }
+
+    //
+    // Updates the fstime file
+    //
+    long now = System.currentTimeMillis();
+    for (int idx = 0; idx < imageDirs.length; idx++) {
+	  File timeFile = new File(imageDirs[idx], 
+                               NameNodeFile.TIME.getName());
+      try {
+        updateTimeFile(timeFile, now);
+      } catch (IOException e) {
+        editLog.processIOError(idx);
+        idx--;
+      }
+    }
+  }
+
+  /**
+   * Return the name of the image file.
+   */
+  File getFsImageName() {
+      return new File(imageDirs[0], NameNodeFile.IMAGE.getName());
+  }
+
+  /**
+   * Return the name of the image file that is uploaded by periodic
+   * checkpointing.
+   */
+  File[] getFsImageNameCheckpoint() {
+      File[] list = new File[imageDirs.length];
+      for (int i = 0; i < imageDirs.length; i++) {
+        list[i] = new File(imageDirs[i], 
+                           NameNodeFile.CKPT.getName());
+      }
+      return list;
+  }
 
   static class DatanodeImage implements WritableComparable {
 

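The startup recovery in loadFSImage reduces to a two-case decision per image directory. A condensed sketch of that rule (names as in the hunk above):

    if (ckptFile.exists()) {                 // a checkpoint was in flight
      if (editLog.existsNew(idx)) {
        // The merged image may have been only partially uploaded:
        // discard fsimage.ckpt and fall back to fsimage + edits.
        ckptFile.delete();
      } else {
        // edits.new was already renamed to edits, so the checkpoint
        // is complete except for promoting fsimage.ckpt to fsimage.
        ckptFile.renameTo(curFile);
      }
    }
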
+ 84 - 0
src/java/org/apache/hadoop/dfs/FSNamesystem.java

@@ -242,8 +242,19 @@ class FSNamesystem implements FSConstants {
         this.infoServer.setAttribute("name.node", nn);
         this.infoServer.setAttribute("name.conf", conf);
         this.infoServer.addServlet("fsck", "/fsck", FsckServlet.class);
+        this.infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
         this.infoServer.start();
     }
+
+    /**
+     * fsImage is the filesystem image from which the in-memory
+     * directory state is loaded
+     */
+    FSNamesystem(FSImage fsImage) throws IOException {
+        fsNamesystemObject = this;
+        this.dir = new FSDirectory(fsImage);
+    }
+
     /** Return the FSNamesystem object
      * 
      */
@@ -3017,6 +3028,40 @@ class FSNamesystem implements FSConstants {
         return "";
       return safeMode.getTurnOffTip();
     }
+
+    long getEditLogSize() throws IOException {
+      return getEditLog().getEditLogSize();
+    }
+
+    synchronized void rollEditLog() throws IOException {
+      if (isInSafeMode()) {
+        throw new SafeModeException("Checkpoint not created",
+                                     safeMode);
+      }
+      LOG.info("Roll Edit Log");
+      getEditLog().rollEditLog();
+    }
+
+    synchronized void rollFSImage() throws IOException {
+      LOG.info("Roll FSImage");
+      if (isInSafeMode()) {
+        throw new SafeModeException("Checkpoint not created",
+                                    safeMode);
+      }
+      dir.fsImage.rollFSImage();
+    }
+
+    File getFsImageName() throws IOException {
+      return dir.fsImage.getFsImageName();
+    }
+
+    File[] getFsImageNameCheckpoint() throws IOException {
+      return dir.fsImage.getFsImageNameCheckpoint();
+    }
+
+    File getFsEditName() throws IOException {
+      return getEditLog().getFsEditName();
+    }
     
     /**
      * This class is used in Namesystem's jetty to do fsck on namenode
@@ -3042,4 +3087,43 @@ class FSNamesystem implements FSConstants {
         }
       }
     }
+
+    /**
+     * This class is used in Namesystem's jetty to retrieve a file.
+     * Typically used by the Secondary NameNode to retrieve image and
+     * edit file for periodic checkpointing.
+     * @author Dhruba Borthakur
+     */
+    public static class GetImageServlet extends HttpServlet {
+      public void doGet(HttpServletRequest request,
+          HttpServletResponse response
+          ) throws ServletException, IOException {
+        Map<String,String[]> pmap = request.getParameterMap();
+        try {
+          ServletContext context = getServletContext();
+          NameNode nn = (NameNode) context.getAttribute("name.node");
+          Configuration conf = (Configuration) context.getAttribute("name.conf");
+          TransferFsImage ff = new TransferFsImage(pmap, request, response);
+          if (ff.getImage()) {
+            // send fsImage to Secondary
+            TransferFsImage.getFileServer(response.getOutputStream(),
+                                          nn.getFsImageName()); 
+          } else if (ff.getEdit()) {
+            // send old edits to Secondary
+            TransferFsImage.getFileServer(response.getOutputStream(),
+                                          nn.getFsEditName());
+          } else if (ff.putImage()) {
+            // issue a HTTP get request to download the new fsimage 
+            TransferFsImage.getFileClient(ff.getInfoServer(), "getimage=1", 
+                                          nn.getFsImageNameCheckpoint());
+          }
+        } catch (IOException ie) {
+          StringUtils.stringifyException(ie);
+          LOG.warn(ie);
+          String errMsg = "GetImage failed.";
+          response.sendError(HttpServletResponse.SC_GONE, errMsg);
+          throw ie;
+        }
+      }
+    }
 }

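GetImageServlet dispatches on the query parameters that TransferFsImage parses. The three request shapes issued by the secondary look roughly as follows (host and ports are examples; parameter names come from the patch):

    // http://namenode:50070/getimage?getimage=1
    //     download the current fsimage from the namenode
    // http://namenode:50070/getimage?getedit=1
    //     download the frozen edits file
    // http://namenode:50070/getimage?putimage=1&port=50090&machine=10.0.0.5
    //     ask the namenode to pull the merged image back from the secondary
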
+ 53 - 3
src/java/org/apache/hadoop/dfs/NameNode.java

@@ -92,12 +92,20 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
       for (int idx = 0; idx < dirs.length; idx++) {
         FSImage.format(dirs[idx]);
       }
+      FSImage fsimage = new FSImage(dirs);
+      FSNamesystem namesystem = new FSNamesystem(fsimage);
+      fsimage.create();
     }
 
     /** Format a new filesystem.  Destroys any filesystem that may already
      * exist at this location.  **/
     public static void format(File dir) throws IOException {
+      File dirs[] = new File[1];
+      dirs[0] = dir;
       FSImage.format(dir);
+      FSImage fsimage = new FSImage(dirs);
+      FSNamesystem namesystem = new FSNamesystem(fsimage);
+      fsimage.create();
     }
 
     private class NameNodeMetrics {
@@ -165,7 +173,7 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
     }
 
     /** Return the configured directories where name data is stored. */
-    private static File[] getDirs(Configuration conf) {
+    static File[] getDirs(Configuration conf) {
       String[] dirNames = conf.getStrings("dfs.name.dir");
       if (dirNames == null) { dirNames = new String[] {"/tmp/hadoop/dfs/name"}; }
       File[] dirs = new File[dirNames.length];
@@ -503,7 +511,27 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
         return ret;
     }
 
-    
+    /**
+     * Returns the size of the current edit log.
+     */
+    public long getEditLogSize() throws IOException {
+      return namesystem.getEditLogSize();
+    }
+
+    /**
+     * Roll the edit log.
+     */
+    public void rollEditLog() throws IOException {
+      namesystem.rollEditLog();
+    }
+
+    /**
+     * Roll the image 
+     */
+    public void rollFsImage() throws IOException {
+      namesystem.rollFSImage();
+    }
+
     ////////////////////////////////////////////////////////////////
     // DatanodeProtocol
     ////////////////////////////////////////////////////////////////
@@ -618,7 +646,29 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
       if( version != DFS_CURRENT_VERSION )
         throw new IncorrectVersionException( version, "data node" );
     }
-    
+
+    /**
+     * Returns the name of the fsImage file
+     */
+    public File getFsImageName() throws IOException {
+      return namesystem.getFsImageName();
+    }
+
+    /**
+     * Returns the name of the fsImage file uploaded by periodic
+     * checkpointing
+     */
+    public File[] getFsImageNameCheckpoint() throws IOException {
+      return namesystem.getFsImageNameCheckpoint();
+    }
+
+    /**
+     * Returns the name of the edits file
+     */
+    public File getFsEditName() throws IOException {
+      return namesystem.getFsEditName();
+    }
+
     /**
      */
     public static void main(String argv[]) throws Exception {

+ 477 - 0
src/java/org/apache/hadoop/dfs/SecondaryNameNode.java

@@ -0,0 +1,477 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import org.apache.commons.logging.*;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.ipc.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.mapred.StatusHttpServer;
+
+import java.io.*;
+import java.net.*;
+import java.util.Map;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+/**********************************************************
+ * The Secondary NameNode is a helper to the primary NameNode.
+ * The Secondary is responsible for supporting periodic checkpoints 
+ * of the HDFS metadata. The current design allows only one Secondary
+ * NameNode per HDFS cluster.
+ *
+ * The Secondary NameNode is a daemon that periodically wakes
+ * up (determined by the schedule specified in the configuration),
+ * triggers a periodic checkpoint and then goes back to sleep.
+ * The Secondary NameNode uses the ClientProtocol to talk to the
+ * primary NameNode.
+ *
+ * @author  Dhruba Borthakur
+ **********************************************************/
+public class SecondaryNameNode implements FSConstants, Runnable {
+    
+    public static final Log LOG = LogFactory.getLog(
+                                  "org.apache.hadoop.dfs.NameNode.Secondary");
+    private static final String SRC_FS_IMAGE = "srcimage.tmp";
+    private static final String FS_EDITS = "edits.tmp";
+    private static final String DEST_FS_IMAGE = "destimage.tmp";
+
+    private ClientProtocol namenode;
+    private Configuration conf;
+    private String localName;
+    private InetSocketAddress nameNodeAddr;
+    private boolean shouldRun;
+    private StatusHttpServer infoServer;
+    private int infoPort;
+    private String infoBindAddress;
+
+    private File checkpointDir;
+    private long checkpointPeriod;	// in seconds
+    private long checkpointSize;    // size (in bytes) of current Edit Log
+    private File srcImage;
+    private File destImage;
+    private File editFile;
+
+    private boolean[] simulation = null; // error simulation events
+
+    /**
+     * Create a connection to the primary namenode.
+     */
+    public SecondaryNameNode(Configuration conf)  throws IOException {
+
+      //
+      // initialize error simulation code for junit test
+      //
+      initializeErrorSimulationEvent(2);
+
+      //
+      // Create connection to the namenode.
+      //
+      shouldRun = true;
+      nameNodeAddr = DataNode.createSocketAddr(
+                                 conf.get("fs.default.name", "local"));
+      this.conf = conf;
+      this.namenode = (ClientProtocol) RPC.getProxy(ClientProtocol.class,
+                       ClientProtocol.versionID, nameNodeAddr, conf);
+      try {
+        this.localName = InetAddress.getLocalHost().getHostName();
+      } catch (UnknownHostException uhe) {
+        this.localName = "";
+      }
+
+      //
+      // initialize the webserver for uploading files.
+      //
+      infoPort = conf.getInt("dfs.secondary.info.port", 50090);
+      infoBindAddress = conf.get("dfs.secondary.info.bindAddress", "0.0.0.0");
+      infoServer = new StatusHttpServer("dfs", infoBindAddress, infoPort, false);
+      infoServer.setAttribute("name.secondary", this);
+      infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
+      infoServer.start();
+
+      //
+      // Initialize other scheduling parameters from the configuration
+      //
+      String[] dirName = conf.getStrings("fs.checkpoint.dir");
+      checkpointDir = new File(dirName[0]);
+      checkpointPeriod = conf.getLong("fs.checkpoint.period", 3600);
+      checkpointSize = conf.getLong("fs.checkpoint.size", 4194304);
+      doSetup();
+
+      LOG.warn("Checkpoint Directory:" + checkpointDir);
+      LOG.warn("Checkpoint Period   :" + checkpointPeriod + " secs " +
+               "(" + checkpointPeriod/60 + " min)");
+      LOG.warn("Log Size Trigger    :" + checkpointSize + " bytes " +
+               "(" + checkpointSize/1024 + " KB)");
+    }
+
+   /**
+     * Shut down this instance of the secondary namenode.
+     * Returns only after shutdown is complete.
+     */
+    public void shutdown() {
+      shouldRun = false;
+      try {
+          infoServer.stop();
+      } catch (Exception e) {
+      }
+    }
+
+    private void doSetup() throws IOException {
+      //
+      // Create the checkpoint directory if needed. 
+      //
+      checkpointDir.mkdirs();
+      srcImage = new File(checkpointDir, SRC_FS_IMAGE);
+      destImage = new File(checkpointDir, DEST_FS_IMAGE);
+      editFile = new File(checkpointDir, FS_EDITS);
+      srcImage.delete();
+      destImage.delete();
+      editFile.delete();
+    }
+
+    File getNewImage() {
+      return destImage;
+    }
+
+    //
+    // The main work loop
+    //
+    public void run() {
+
+      //
+      // Poll the Namenode (once every 5 minutes) to find the size of the
+      // pending edit log.
+      //
+      long period = 5 * 60;              // 5 minutes
+      long lastCheckpointTime = 0;
+      if (checkpointPeriod < period) {
+        period = checkpointPeriod;
+      }
+
+      while (shouldRun) {
+        try {
+            Thread.sleep(1000 * period);
+        } catch (InterruptedException ie) {
+          // do nothing
+        }
+        if (!shouldRun) {
+          break;
+        }
+        try {
+          long now = System.currentTimeMillis();
+
+          long size = namenode.getEditLogSize();
+          if (size >= checkpointSize || 
+              now >= lastCheckpointTime + 1000 * checkpointPeriod) {
+            doCheckpoint();
+            lastCheckpointTime = now;
+          }
+        } catch (IOException e) {
+          LOG.error("Exception in doCheckpoint:");
+          LOG.error(StringUtils.stringifyException(e));
+          e.printStackTrace();
+        }
+      }
+    }
+
+    /**
+     * get the current fsimage from Namenode.
+     */
+     private void getFSImage() throws IOException {
+       String fsName = getInfoServer();
+       String fileid = "getimage=1";
+       TransferFsImage.getFileClient(fsName, fileid, srcImage);
+       LOG.info("Downloaded file " + srcImage + " size " +
+                srcImage.length() + " bytes.");
+    }
+
+    /**
+     * get the old edits file from the NameNode
+     */
+     private void getFSEdits() throws IOException {
+       String fsName = getInfoServer();
+       String fileid = "getedit=1";
+       TransferFsImage.getFileClient(fsName, fileid, editFile);
+       LOG.info("Downloaded file " + editFile + " size " +
+                editFile.length() + " bytes.");
+    }
+
+    /**
+     * Copy the new fsimage into the NameNode
+     */
+     private void putFSImage() throws IOException {
+       String fsName = getInfoServer();
+       String fileid = "putimage=1&port=" + infoPort +
+                       "&machine=" +
+                       InetAddress.getLocalHost().getHostAddress();
+       LOG.info("Posted URL " + fsName + fileid);
+       TransferFsImage.getFileClient(fsName, fileid, (File[])null);
+     }
+
+    /*
+     * Returns the host:port of the Jetty info server that the Namenode listens on.
+     */
+    private String getInfoServer() throws IOException {
+      String fsName = conf.get("fs.default.name", "local");
+      if (fsName.equals("local")) {
+        throw new IOException("This is not a DFS");
+      }
+      String[] splits = fsName.split(":", 2);
+      int infoPort = conf.getInt("dfs.info.port", 50070);
+      return splits[0]+":"+infoPort;
+    }
+
+    /*
+     * Create a new checkpoint
+     */
+    void doCheckpoint() throws IOException {
+
+      //
+      // Do the required initialization of the merge work area.
+      //
+      doSetup();
+
+      //
+      // Tell the namenode to start logging transactions in a new edit file
+      //
+      namenode.rollEditLog();
+
+      //
+      // error simulation code for junit test
+      //
+      if (simulation != null && simulation[0]) {
+        throw new IOException("Simulating error0 " +
+                              "after creating edits.new");
+      }
+
+      getFSImage();                // Fetch fsimage
+      getFSEdits();                // Fetch edits
+      doMerge();                   // Do the merge
+  
+      //
+      // Upload the new image into the NameNode. Then tell the Namenode
+      // to make this new uploaded image as the most current image.
+      //
+      putFSImage();
+
+      //
+      // error simulation code for junit test
+      //
+      if (simulation != null && simulation[1]) {
+        throw new IOException("Simulating error1 " +
+                              "after uploading new image to NameNode");
+      }
+
+      namenode.rollFsImage();
+
+      LOG.warn("Checkpoint done. Image Size:" + srcImage.length() +
+               " Edit Size:" + editFile.length() +
+               " New Image Size:" + destImage.length());
+    }
+
+    /**
+     * merges SRC_FS_IMAGE with FS_EDITS and writes the output into
+     * DEST_FS_IMAGE
+     */
+    private void doMerge() throws IOException {
+      FSImage fsImage = new FSImage(checkpointDir, FS_EDITS);
+      FSNamesystem namesystem = new FSNamesystem(fsImage);
+      fsImage.loadFSImage(conf, srcImage);
+      fsImage.getEditLog().loadFSEdits(conf, editFile);
+      fsImage.saveFSImage(DEST_FS_IMAGE);
+    }
+
+    /**
+     * @param argv The parameters passed to this program.
+     * @exception Exception if the filesystem does not exist.
+     * @return 0 on success, non zero on error.
+     */
+    private int processArgs(String[] argv) throws Exception {
+
+      if (argv.length < 1) {
+          printUsage("");
+          return -1;
+      }
+
+      int exitCode = -1;
+      int i = 0;
+      String cmd = argv[i++];
+
+      //
+      // verify that we have enough command line parameters
+      //
+      if ("-geteditsize".equals(cmd)) {
+        if (argv.length != 1) {
+          printUsage(cmd);
+          return exitCode;
+        }
+      } else if ("-checkpoint".equals(cmd)) {
+        if (argv.length != 1 && argv.length != 2) {
+          printUsage(cmd);
+          return exitCode;
+        }
+        if (argv.length == 2 && !"force".equals(argv[i])) {
+          printUsage(cmd);
+          return exitCode;
+        }
+      }
+
+      exitCode = 0;
+      try {
+        if ("-checkpoint".equals(cmd)) {
+          long size = namenode.getEditLogSize();
+          if (size >= checkpointSize || 
+              argv.length == 2 && "force".equals(argv[i])) {
+            doCheckpoint();
+          } else {
+            System.err.println("EditLog size " + size + " bytes is " +
+                               "smaller than configured checkpoint " +
+                               "size " + checkpointSize + " bytes.");
+            System.err.println("Skipping checkpoint.");
+          }
+        } else if ("-geteditsize".equals(cmd)) {
+          long size = namenode.getEditLogSize();
+          System.out.println("EditLog size is " + size + " bytes");
+        } else {
+          exitCode = -1;
+          LOG.error(cmd.substring(1) + ": Unknown command");
+          printUsage("");
+        }
+      } catch (RemoteException e) {
+          //
+          // This is an error returned by the hadoop server. Print
+          // out the first line of the error message, ignore the stack trace.
+          exitCode = -1;
+          try {
+            String[] content;
+            content = e.getLocalizedMessage().split("\n");
+            LOG.error(cmd.substring(1) + ": "
+                               + content[0]);
+          } catch (Exception ex) {
+            LOG.error(cmd.substring(1) + ": "
+                               + ex.getLocalizedMessage());
+          }
+        } catch (IOException e) {
+          //
+          // IO exception encountered locally.
+          //
+          exitCode = -1;
+          LOG.error(cmd.substring(1) + ": "
+                             + e.getLocalizedMessage());
+        } finally {
+            // Does the RPC connection need to be closed?
+        }
+        return exitCode;
+    }
+
+    /**
+     * Displays format of commands.
+     * @param cmd The command that is being executed.
+     */
+    private void printUsage(String cmd) {
+      if ("-geteditsize".equals(cmd)) {
+        System.err.println("Usage: java SecondaryNameNode"
+                           + " [-geteditsize]");
+      } else if ("-checkpoint".equals(cmd)) {
+        System.err.println("Usage: java SecondaryNameNode"
+                           + " [-checkpoint [force]]");
+      } else {
+        System.err.println("Usage: java SecondaryNameNode " +
+                           "[-checkpoint [force]] " +
+                           "[-geteditsize] ");
+      }
+    }
+
+    //
+    // utility method to facilitate junit test error simulation
+    //
+    void initializeErrorSimulationEvent(int numberOfEvents) {
+      simulation = new boolean[numberOfEvents]; 
+      for (int i = 0; i < numberOfEvents; i++) {
+        simulation[i] = false;
+      }
+    }
+
+    void setErrorSimulation(int index) {
+      assert(index < simulation.length);
+      simulation[index] = true;
+    }
+
+    void clearErrorSimulation(int index) {
+      assert(index < simulation.length);
+      simulation[index] = false;
+    }
+
+   /**
+     * This class is used in Namesystem's jetty to retrieve a file.
+     * Typically used by the Secondary NameNode to retrieve image and
+     * edit file for periodic checkpointing.
+     * @author Dhruba Borthakur
+     */
+    public static class GetImageServlet extends HttpServlet {
+      public void doGet(HttpServletRequest request,
+          HttpServletResponse response
+          ) throws ServletException, IOException {
+        Map<String,String[]> pmap = request.getParameterMap();
+        try {
+          ServletContext context = getServletContext();
+          SecondaryNameNode nn = (SecondaryNameNode) 
+                                  context.getAttribute("name.secondary");
+          TransferFsImage ff = new TransferFsImage(pmap, request, response);
+          if (ff.getImage()) {
+            TransferFsImage.getFileServer(response.getOutputStream(),
+                                   nn.getNewImage());
+          }
+          LOG.info("New Image " + nn.getNewImage() + " retrieved by Namenode.");
+        } catch (IOException ie) {
+          StringUtils.stringifyException(ie);
+          LOG.error(ie);
+          String errMsg = "GetImage failed.";
+          response.sendError(HttpServletResponse.SC_GONE, errMsg);
+          throw ie;
+
+        }
+      }
+    }
+
+    /**
+     * main() has some simple utility methods.
+     * @param argv Command line parameters.
+     * @exception Exception if the filesystem does not exist.
+     */
+    public static void main(String[] argv) throws Exception {
+        Configuration tconf = new Configuration();
+        if (argv.length >= 1) {
+          SecondaryNameNode secondary = new SecondaryNameNode(tconf);
+          int ret = secondary.processArgs(argv);
+          System.exit(ret);
+        }
+
+        // Create a never-ending daemon
+        Daemon checkpointThread = new Daemon(new SecondaryNameNode(tconf)); 
+        checkpointThread.start();
+    }
+}
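
A minimal sketch of driving the new class programmatically, mirroring how
TestCheckpoint below uses it.  The sketch is placed in the patch's own
package (so it can reach doCheckpoint() regardless of its visibility), it
assumes the Configuration on the classpath points at a running namenode,
and the class name OneShotCheckpoint is hypothetical:

    package org.apache.hadoop.dfs;  // same package as the patch

    import org.apache.hadoop.conf.Configuration;

    public class OneShotCheckpoint {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();  // must locate the namenode
        SecondaryNameNode secondary = new SecondaryNameNode(conf);
        try {
          secondary.doCheckpoint();  // roll edits, fetch, merge, upload image
        } finally {
          secondary.shutdown();
        }
      }
    }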

+ 186 - 0
src/java/org/apache/hadoop/dfs/TransferFsImage.java

@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import java.io.*;
+import java.net.URL;
+import java.net.URLConnection;
+import java.util.Map;
+import javax.servlet.http.HttpServletResponse;
+import javax.servlet.http.HttpServletRequest;
+
+
+/**
+ * This class handles the transfer of image and edits files between the
+ * namenode and the secondary namenode: it parses transfer requests,
+ * serves local files over HTTP, and fetches files from a remote server.
+ * @author Dhruba Borthakur
+ */
+class TransferFsImage implements FSConstants {
+  
+  private HttpServletResponse response;
+  private boolean isGetImage;
+  private boolean isGetEdit;
+  private boolean isPutImage;
+  private int remoteport;
+  private String machineName;
+  
+  /**
+   * File downloader.
+   * @param pmap key=value[] map that is passed to the http servlet as 
+   *        url parameters
+   * @param request the object from which this servlet reads the url contents
+   * @param response the object into which this servlet writes the url contents
+   * @throws IOException
+   */
+  public TransferFsImage(Map<String,String[]> pmap,
+                         HttpServletRequest request,
+                         HttpServletResponse response
+                         ) throws IOException {
+    isGetImage = isGetEdit = isPutImage = false;
+    remoteport = 0;
+    machineName = null;
+
+    for (String key : pmap.keySet()) {
+      if (key.equals("getimage")) { 
+        isGetImage = true;
+      } else if (key.equals("getedit")) { 
+        isGetEdit = true;
+      } else if (key.equals("putimage")) { 
+        isPutImage = true;
+      } else if (key.equals("port")) { 
+        remoteport = Integer.parseInt(pmap.get("port")[0]);
+      } else if (key.equals("machine")) { 
+        machineName = pmap.get("machine")[0];
+      }
+    }
+    if ((isGetImage && isGetEdit) ||
+        (!isGetImage && !isGetEdit && !isPutImage)) {
+      throw new IOException("Invalid parameter combination to TransferFsImage");
+    }
+  }
+
+  boolean getEdit() {
+    return isGetEdit;
+  }
+
+  boolean getImage() {
+    return isGetImage;
+  }
+
+  boolean putImage() {
+    return isPutImage;
+  }
+
+  String getInfoServer() throws IOException {
+    if (machineName == null || remoteport == 0) {
+      throw new IOException("Machine name and port undefined");
+    }
+    return machineName + ":" + remoteport;
+  }
+
+  /**
+   * A server-side method to respond to a getfile http request.
+   * Copies the contents of the local file into the output stream.
+   */
+  static void getFileServer(OutputStream outstream, File localfile) 
+                                                 throws IOException {
+    byte buf[] = new byte[BUFFER_SIZE];
+    FileInputStream infile = null;
+    try {
+      infile = new FileInputStream(localfile);
+      int num;
+      while ((num = infile.read(buf)) > 0) {
+        outstream.write(buf, 0, num);
+      }
+    } finally {
+      outstream.close();
+      if (infile != null) {
+        infile.close();
+      }
+    }
+  }
+
+  /**
+   * Client-side method to fetch a file from a server.
+   * Copies the response from the URL to a list of local files.
+   */
+   static void getFileClient(String fsName, String id, File[] localPath)
+                             throws IOException {
+     byte[] buf = new byte[BUFFER_SIZE];
+     StringBuffer str = new StringBuffer("http://"+fsName+"/getimage?");
+     str.append(id);
+
+     //
+     // open connection to remote server
+     //
+     URL url = new URL(str.toString());
+     URLConnection connection = url.openConnection();
+     InputStream stream = connection.getInputStream();
+     FileOutputStream[] output = null;
+     if (localPath != null) {
+       output = new FileOutputStream[localPath.length];
+       for (int i = 0; i < output.length; i++) {
+         output[i] = new FileOutputStream(localPath[i]);
+       }
+     }
+
+     try {
+       int num;
+       while ((num = stream.read(buf)) > 0) {
+         if (localPath != null) {
+           for (int i = 0; i < output.length; i++) {
+             output[i].write(buf, 0, num);
+           }
+         }
+       }
+     } finally {
+       stream.close();
+       if (localPath != null) {
+         for (int i = 0; i < output.length; i++) {
+           output[i].close();
+         }
+       }
+     }
+   }
+
+  /**
+   * Client-side method to fetch a file from a server.
+   * Copies the response from the URL to the local file.
+   */
+   static void getFileClient(String fsName, String id, File localPath)
+                             throws IOException {
+     File[] filelist = new File[1];
+     filelist[0] = localPath;
+     getFileClient(fsName, id, filelist);
+   }
+}
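
As a usage sketch, the single-file getFileClient() overload fetches the
current image over HTTP.  TransferFsImage is package-private, so the sketch
sits in the same package; the host:port, destination path, and exact query
string are assumptions (the query key matches the "getimage" parameter the
servlet side parses):

    package org.apache.hadoop.dfs;

    import java.io.File;

    public class FetchImageExample {
      public static void main(String[] args) throws Exception {
        // "nn.example.com:50070" is a placeholder for the namenode's
        // info server address; the destination path is also a placeholder.
        File dest = new File("/tmp/fsimage.copy");
        TransferFsImage.getFileClient("nn.example.com:50070",
                                      "getimage=1", dest);
      }
    }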

+ 7 - 0
src/test/org/apache/hadoop/dfs/MiniDFSCluster.java

@@ -278,4 +278,11 @@ public class MiniDFSCluster {
   public FileSystem getFileSystem() throws IOException {
     return FileSystem.get(conf);
   }
+
+  /**
+   * Get the directories where the namenode stores its image.
+   */
+  public File[] getNameDirs() {
+    return NameNode.getDirs(conf);
+  }
 }

+ 360 - 0
src/test/org/apache/hadoop/dfs/TestCheckpoint.java

@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import junit.framework.TestCase;
+import java.io.*;
+import java.util.Random;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * This class tests the creation and validation of a checkpoint.
+ * @author Dhruba Borthakur
+ */
+public class TestCheckpoint extends TestCase {
+  static final long seed = 0xDEADBEEFL;
+  static final int blockSize = 8192;
+  static final int fileSize = 16384;
+  static final int numDatanodes = 4;
+
+  private void writeFile(FileSystem fileSys, Path name, int repl)
+  throws IOException {
+    // create and write a file that contains two blocks of data
+    FSOutputStream stm = fileSys.createRaw(name, true, (short)repl,
+        (long)blockSize);
+    byte[] buffer = new byte[fileSize];
+    Random rand = new Random(seed);
+    rand.nextBytes(buffer);
+    stm.write(buffer);
+    stm.close();
+  }
+  
+  
+  private void checkFile(FileSystem fileSys, Path name, int repl)
+  throws IOException {
+    assertTrue(fileSys.exists(name));
+    String[][] locations = fileSys.getFileCacheHints(name, 0, fileSize);
+    for (int idx = 0; idx < locations.length; idx++) {
+      assertEquals("Number of replicas for block" + idx,
+          Math.min(numDatanodes, repl), locations[idx].length);  
+    }
+  }
+  
+  private void cleanupFile(FileSystem fileSys, Path name)
+  throws IOException {
+    assertTrue(fileSys.exists(name));
+    fileSys.delete(name);
+    assertTrue(!fileSys.exists(name));
+  }
+
+  /**
+   * put back the old namedir
+   */
+  private void resurrectNameDir(File[] namedirs) 
+  throws IOException {
+    String parentdir = namedirs[0].getParent();
+    String name = namedirs[0].getName();
+    File oldname = new File(parentdir, name + ".old");
+    assertTrue(oldname.renameTo(namedirs[0]));
+  }
+
+  /**
+   * remove one namedir
+   */
+  private void removeOneNameDir(File[] namedirs) 
+  throws IOException {
+    String parentdir = namedirs[0].getParent();
+    String name = namedirs[0].getName();
+    File newname = new File(parentdir, name + ".old");
+    assertTrue(namedirs[0].renameTo(newname));
+  }
+
+  /*
+   * Verify that the namenode does not start up if one namedir is bad.
+   */
+  private void testNamedirError(Configuration conf, File[] namedirs) 
+  throws IOException {
+    System.out.println("Starting testNamedirError");
+    MiniDFSCluster cluster = null;
+
+    if (namedirs.length <= 1) {
+      return;
+    }
+    
+    //
+    // Remove one namedir and restart the cluster. This should fail.
+    //
+    removeOneNameDir(namedirs);
+    try {
+      cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                                false, false);
+      assertTrue(false);
+    } catch (IOException e) {
+      // do nothing: the startup failure is expected
+    }
+    resurrectNameDir(namedirs); // put back namedir
+  }
+
+  /*
+   * Simulate the namenode crashing after rolling the edit log.
+   */
+  private void testSecondaryNamenodeError1(Configuration conf)
+  throws IOException {
+    System.out.println("Starting testSecondaryNamenodeError 1");
+    Path file1 = new Path("checkpoint.dat");
+    MiniDFSCluster cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                                false, false);
+    FileSystem fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      assertTrue(!fileSys.exists(file1));
+      //
+      // Make the checkpoint fail after rolling the
+      // edit log.
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.initializeErrorSimulationEvent(2);
+      secondary.setErrorSimulation(0);
+
+      try {
+        secondary.doCheckpoint();  // this should fail
+        assertTrue(false);
+      } catch (IOException e) {
+        // expected
+      }
+      secondary.shutdown();
+
+      //
+      // Create a new file
+      //
+      writeFile(fileSys, file1, 3);
+      checkFile(fileSys, file1, 3);
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file exists.
+    // Then take another checkpoint to verify that the 
+    // namenode restart accounted for the rolled edit logs.
+    //
+    System.out.println("Starting testSecondaryNamenodeError 2");
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                 false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      checkFile(fileSys, file1, 3);
+      cleanupFile(fileSys, file1);
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+  }
+
+  /*
+   * Simulate a namenode crash after uploading the new image.
+   */
+  private void testSecondaryNamenodeError2(Configuration conf)
+  throws IOException {
+    System.out.println("Starting testSecondaryNamenodeError 21");
+    Path file1 = new Path("checkpoint.dat");
+    MiniDFSCluster cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                                false, false);
+    FileSystem fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      assertTrue(!fileSys.exists(file1));
+      //
+      // Make the checkpoint fail after uploading the new
+      // image to the namenode.
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.initializeErrorSimulationEvent(2);
+      secondary.setErrorSimulation(1);
+
+      try {
+        secondary.doCheckpoint();  // this should fail
+        assertTrue(false);
+      } catch (IOException e) {
+        // expected
+      }
+      secondary.shutdown();
+
+      //
+      // Create a new file
+      //
+      writeFile(fileSys, file1, 3);
+      checkFile(fileSys, file1, 3);
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file exists.
+    // Then take another checkpoint to verify that the 
+    // namenode restart accounted for the rolled edit logs.
+    //
+    System.out.println("Starting testSecondaryNamenodeError 22");
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                 false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      checkFile(fileSys, file1, 3);
+      cleanupFile(fileSys, file1);
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+  }
+
+  /**
+   * Tests checkpoint in DFS.
+   */
+  public void testCheckpoint() throws IOException {
+    Path file1 = new Path("checkpoint.dat");
+    Path file2 = new Path("checkpoint2.dat");
+    File[] namedirs = null;
+
+    Configuration conf = new Configuration();
+    MiniDFSCluster cluster = new MiniDFSCluster(65312, conf, numDatanodes, false);
+    FileSystem fileSys = cluster.getFileSystem();
+
+    // Now wait for 15 seconds to give the datanodes a chance to register
+    // themselves and to report heartbeats
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      //
+      // verify that 'format' really blew away all pre-existing files
+      //
+      assertTrue(!fileSys.exists(file1));
+      assertTrue(!fileSys.exists(file2));
+      namedirs = cluster.getNameDirs();
+
+      //
+      // Create file1
+      //
+      writeFile(fileSys, file1, 3);
+      checkFile(fileSys, file1, 3);
+
+      //
+      // Take a checkpoint
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file1 still exists.
+    //
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      // check that file1 still exists
+      checkFile(fileSys, file1, 3);
+      cleanupFile(fileSys, file1);
+
+      // create new file file2
+      writeFile(fileSys, file2, 3);
+      checkFile(fileSys, file2, 3);
+
+      //
+      // Take a checkpoint
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file2 exists and
+    // file1 does not exist.
+    //
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+
+    assertTrue(!fileSys.exists(file1));
+
+    try {
+      // verify that file2 exists
+      checkFile(fileSys, file2, 3);
+      cleanupFile(fileSys, file2);
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    testSecondaryNamenodeError1(conf);
+    testSecondaryNamenodeError2(conf);
+    testNamedirError(conf, namedirs);
+  }
+}
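
The new test can be run on its own with the stock JUnit 3 text runner; the
wrapper class here is hypothetical:

    // Hypothetical standalone invocation of the checkpoint test.
    public class RunTestCheckpoint {
      public static void main(String[] args) {
        junit.textui.TestRunner.run(org.apache.hadoop.dfs.TestCheckpoint.class);
      }
    }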