
HADOOP-227. Add support for backup namenodes, which periodically get snapshots of the namenode state. Contributed by Dhruba.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@500415 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting
parent commit 2d1c66a80f

+ 5 - 2
CHANGES.txt

@@ -87,8 +87,11 @@ Trunk (unreleased changes)
     with different versions of Lucene without worrying about CLASSPATH
     order.  (Milind Bhandarkar via cutting)
 
-27. HADOOP-248.  Optimize location of map outputs to not use random
-    probes.  (Devaraj Das via cutting)
+27. HADOOP-248.  Optimize location of map outputs to no longer use
+    random probes.  (Devaraj Das via cutting)
+
+28. HADOOP-227.  Add support for backup namenodes, which periodically
+    get snapshots of the namenode state.  (Dhruba Borthakur via cutting)
 
 
 Release 0.10.1 - 2007-01-10

+ 15 - 12
bin/hadoop

@@ -25,20 +25,21 @@ bin=`cd "$bin"; pwd`
 if [ $# = 0 ]; then
   echo "Usage: hadoop [--config confdir] COMMAND"
   echo "where COMMAND is one of:"
-  echo "  namenode -format  format the DFS filesystem"
-  echo "  namenode          run the DFS namenode"
-  echo "  datanode          run a DFS datanode"
-  echo "  dfsadmin          run a DFS admin client"
-  echo "  fsck              run a DFS filesystem checking utility"
-  echo "  fs                run a generic filesystem user client"
-  echo "  jobtracker        run the MapReduce job Tracker node" 
-  echo "  tasktracker       run a MapReduce task Tracker node" 
-  echo "  job               manipulate MapReduce jobs" 
-  echo "  version           print the version"
-  echo "  jar <jar>         run a jar file"
+  echo "  namenode -format     format the DFS filesystem"
+  echo "  secondarynamenode    run the DFS secondary namenode"
+  echo "  namenode             run the DFS namenode"
+  echo "  datanode             run a DFS datanode"
+  echo "  dfsadmin             run a DFS admin client"
+  echo "  fsck                 run a DFS filesystem checking utility"
+  echo "  fs                   run a generic filesystem user client"
+  echo "  jobtracker           run the MapReduce job Tracker node" 
+  echo "  tasktracker          run a MapReduce task Tracker node" 
+  echo "  job                  manipulate MapReduce jobs" 
+  echo "  version              print the version"
+  echo "  jar <jar>            run a jar file"
   echo "  distcp <srcurl> <desturl> copy file or directories recursively"
   echo " or"
-  echo "  CLASSNAME         run the class named CLASSNAME"
+  echo "  CLASSNAME            run the class named CLASSNAME"
   echo "Most commands print help when invoked w/o parameters."
   exit 1
 fi
@@ -139,6 +140,8 @@ fi
 # figure out which class to run
 if [ "$COMMAND" = "namenode" ] ; then
   CLASS='org.apache.hadoop.dfs.NameNode'
+elif [ "$COMMAND" = "secondarynamenode" ] ; then
+  CLASS='org.apache.hadoop.dfs.SecondaryNameNode'
 elif [ "$COMMAND" = "datanode" ] ; then
   CLASS='org.apache.hadoop.dfs.DataNode'
 elif [ "$COMMAND" = "fs" ] ; then
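With the new command wired into bin/hadoop, the secondary namenode can be run like any other component. A minimal illustration (paths assume a default install layout; this is a usage sketch, not part of the commit):

    # print the updated usage message
    bin/hadoop

    # run the secondary namenode in the foreground
    bin/hadoop secondarynamenode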

+ 13 - 0
bin/hadoop-config.sh

@@ -38,3 +38,16 @@ fi
 
 # Allow alternate conf dir location.
 HADOOP_CONF_DIR="${HADOOP_CONF_DIR:-$HADOOP_HOME/conf}"
+
+# check to see if it is specified whether to use the slaves or the
+# masters file
+if [ $# -gt 1 ]
+then
+    if [ "--hosts" = "$1" ]
+    then
+        shift
+        slavesfile=$1
+        shift
+        export HADOOP_SLAVES="${HADOOP_CONF_DIR}/$slavesfile"
+    fi
+fi
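The --hosts handling above lets the daemon wrappers target a host list other than conf/slaves; the file name is resolved relative to $HADOOP_CONF_DIR. A sketch of how the updated start-dfs.sh and stop-dfs.sh use it (see the script changes further down):

    # secondary namenode(s) come from conf/masters, datanodes from conf/slaves
    bin/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters start secondarynamenode
    bin/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts slaves stop datanode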

+ 1 - 1
bin/hadoop-daemon.sh

@@ -12,7 +12,7 @@
 #   HADOOP_NICENESS The scheduling priority for daemons. Defaults to 0.
 ##
 
-usage="Usage: hadoop-daemon.sh [--config <conf-dir>] (start|stop) <hadoop-command> <args...>"
+usage="Usage: hadoop-daemon.sh [--config <conf-dir>] [--hosts hostlistfile] (start|stop) <hadoop-command> <args...>"
 
 # if no args specified, show usage
 if [ $# -le 1 ]; then

+ 1 - 1
bin/hadoop-daemons.sh

@@ -2,7 +2,7 @@
 # 
 # Run a Hadoop command on all slave hosts.
 
-usage="Usage: hadoop-daemons.sh [--config confdir] [start|stop] command args..."
+usage="Usage: hadoop-daemons.sh [--config confdir] [--hosts hostlistfile] [start|stop] command args..."
 
 # if no args specified, show usage
 if [ $# -le 1 ]; then

+ 12 - 3
bin/slaves.sh

@@ -24,15 +24,24 @@ bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh
 
+# If the slaves file is specified in the command line,
+# then it takes precedence over the definition in 
+# hadoop-env.sh. Save it here.
+HOSTLIST=$HADOOP_SLAVES
+
 if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then
   . "${HADOOP_CONF_DIR}/hadoop-env.sh"
 fi
 
-if [ "$HADOOP_SLAVES" = "" ]; then
-  export HADOOP_SLAVES="${HADOOP_CONF_DIR}/slaves"
+if [ "$HOSTLIST" = "" ]; then
+  if [ "$HADOOP_SLAVES" = "" ]; then
+    export HOSTLIST="${HADOOP_CONF_DIR}/slaves"
+  else
+    export HOSTLIST="${HADOOP_SLAVES}"
+  fi
 fi
 
-for slave in `cat "$HADOOP_SLAVES"`; do
+for slave in `cat "$HOSTLIST"`; do
  ssh $HADOOP_SSH_OPTS $slave $"${@// /\\ }" \
    2>&1 | sed "s/^/$slave: /" &
  if [ "$HADOOP_SLAVE_SLEEP" != "" ]; then

+ 1 - 0
bin/start-dfs.sh

@@ -12,3 +12,4 @@ bin=`cd "$bin"; pwd`
 # note: datanodes will log connection errors until namenode starts
 "$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start namenode
 "$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR start datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters start secondarynamenode

+ 2 - 1
bin/stop-dfs.sh

@@ -8,5 +8,6 @@ bin=`cd "$bin"; pwd`
 . "$bin"/hadoop-config.sh
 
 "$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop namenode
-"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR stop datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts slaves stop datanode
+"$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR --hosts masters stop secondarynamenode
 
 

+ 38 - 0
conf/hadoop-default.xml

@@ -136,6 +136,44 @@ creations/deletions), or "all".</description>
   <description>The size of the in-memory filesystem instance in MB</description>
 </property>
 
+<property>
+  <name>fs.checkpoint.dir</name>
+  <value>${hadoop.tmp.dir}/dfs/namesecondary</value>
+  <description>Determines where on the local filesystem the DFS secondary
+      name node should store the temporary images and edits to merge.  
+  </description>
+</property>
+
+<property>
+  <name>fs.checkpoint.period</name>
+  <value>3600</value>
+  <description>The number of seconds between two periodic checkpoints.
+  </description>
+</property>
+
+<property>
+  <name>fs.checkpoint.size</name>
+  <value>67108864</value>
+  <description>The size of the current edit log (in bytes) that triggers
+       a periodic checkpoint even if the fs.checkpoint.period hasn't expired.
+  </description>
+</property>
+
+<property>
+  <name>dfs.secondary.info.port</name>
+  <value>50090</value>
+  <description>The base number for the Secondary namenode info port.
+  </description>
+</property>
+
+<property>
+  <name>dfs.secondary.info.bindAddress</name>
+  <value>0.0.0.0</value>
+  <description>
+    The address where the secondary namenode web UI will listen to.
+  </description>
+</property>
+
 <property>
   <name>dfs.datanode.bindAddress</name>
   <value>0.0.0.0</value>
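The new defaults above can be overridden per cluster in hadoop-site.xml as usual; for example, a site that wants a shorter checkpoint interval and a dedicated checkpoint directory might use something like the following (values are illustrative only, not part of the commit):

<property>
  <name>fs.checkpoint.period</name>
  <value>1800</value>
</property>

<property>
  <name>fs.checkpoint.dir</name>
  <value>/var/hadoop/namesecondary</value>
</property>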

+ 1 - 0
conf/masters.template

@@ -0,0 +1 @@
+localhost

+ 27 - 1
src/java/org/apache/hadoop/dfs/ClientProtocol.java

@@ -29,7 +29,7 @@ import org.apache.hadoop.ipc.VersionedProtocol;
  **********************************************************************/
 interface ClientProtocol extends VersionedProtocol {
 
-    public static final long versionID = 6L; // reportBadBlocks added
+  public static final long versionID = 7L;  // periodic checkpoint added
   
     ///////////////////////////////////////
     // File contents
@@ -314,4 +314,30 @@ interface ClientProtocol extends VersionedProtocol {
     public boolean setSafeMode( FSConstants.SafeModeAction action ) throws IOException;
 
     public boolean decommission( FSConstants.DecommissionAction action, String[] nodenames) throws IOException;
+
+    /**
+     * Get the size of the current edit log (in bytes).
+     * @return The number of bytes in the current edit log.
+     * @throws IOException
+     */
+    public long getEditLogSize() throws IOException;
+
+    /**
+     * Closes the current edit log and opens a new one. The 
+     * call fails if there are already two or more edits log files or
+     * if the file system is in SafeMode.
+     * @throws IOException
+     */
+    public void rollEditLog() throws IOException;
+
+    /**
+     * Rolls the fsImage log. It removes the old fsImage, copies the
+     * new image to fsImage, removes the old edits and renames edits.new 
+     * to edits. The call fails if any of the four files are missing.
+     * @throws IOException
+     */
+    public void rollFsImage() throws IOException;
+
 }

+ 4 - 0
src/java/org/apache/hadoop/dfs/FSDirectory.java

@@ -315,6 +315,10 @@ class FSDirectory implements FSConstants {
     public FSDirectory(File[] dirs) throws IOException {
       this.fsImage = new FSImage( dirs );
     }
+
+    public FSDirectory(FSImage fsImage) throws IOException {
+      this.fsImage = fsImage;
+    }
     
     void loadFSImage( Configuration conf ) throws IOException {
       fsImage.loadFSImage( conf );

+ 269 - 47
src/java/org/apache/hadoop/dfs/FSEditLog.java

@@ -47,18 +47,32 @@ class FSEditLog {
   private static final byte OP_SET_REPLICATION = 4;
   private static final byte OP_DATANODE_ADD = 5;
   private static final byte OP_DATANODE_REMOVE = 6;
+
+  private static final String FS_EDIT = "edits";
+  private static final String FS_EDIT_NEW = "edits.new";
   
-  private File[] editFiles;
+  private File[] editFiles = null;
+  private File[] editFilesNew = null;
+
   DataOutputStream[] editStreams = null;
   FileDescriptor[] editDescriptors = null;
-  
-  FSEditLog( File[] edits ) {
-    this.editFiles = edits;
-  }
-  
-  File[] getEditFiles() {
-    return this.editFiles;
-  }
+  private FSImage fsimage = null;
+
+  FSEditLog(File[] fsDirs, FSImage image)  throws IOException {
+    fsimage = image;
+    editFiles = new File[fsDirs.length];
+    editFilesNew = new File[fsDirs.length];
+    for (int idx = 0; idx < fsDirs.length; idx++) {
+       editFiles[idx] = new File(fsDirs[idx], FS_EDIT);
+       editFilesNew[idx] = new File(fsDirs[idx], FS_EDIT_NEW);
+     }
+   }
+
+  FSEditLog(File imageDir, FSImage image, String edits)  throws IOException {
+    fsimage = image;
+    editFiles = new File[1];
+    editFiles[0] = new File(imageDir, edits);
+   }
 
   /**
    * Initialize the output stream for logging.
@@ -75,40 +89,108 @@ class FSEditLog {
       editStreams[idx].writeInt( FSConstants.DFS_CURRENT_VERSION );
     }
   }
+
+  /**
+   * Create edits.new if it does not exist.
+   */
+  void createNewIfMissing() throws IOException {
+    for (int idx = 0; idx < editFilesNew.length; idx++) {
+      if (!editFilesNew[idx].exists()) {
+        FileOutputStream stream = new FileOutputStream(editFilesNew[idx]);
+        DataOutputStream editStr = new DataOutputStream(stream);
+        editStr.writeInt( FSConstants.DFS_CURRENT_VERSION );
+        editStr.flush();
+        editStr.close();
+      } 
+    }
+  }
   
   /**
    * Shutdown the filestore
    */
   void close() throws IOException {
+    if (editStreams == null) {
+      return;
+    }
     for (int idx = 0; idx < editStreams.length; idx++) {
-      editStreams[idx].flush();
-      editDescriptors[idx].sync();
-      editStreams[idx].close();
+      try {
+        editStreams[idx].flush();
+        editDescriptors[idx].sync();
+        editStreams[idx].close();
+      } catch (IOException e) {
+        processIOError(idx);
+        idx--;
+      }
     }
   }
 
+  /**
+   * If there is an IO Error on any log operations, remove that
+   * directory from the list of directories. If no more directories
+   * remain, then raise an exception that will possibly cause the
+   * server to exit
+   */
+   void processIOError(int index) throws IOException {
+     if (editStreams.length == 1) {
+       throw new IOException("Checkpoint directories inaccessible.");
+     }
+     assert(index < editFiles.length);
+     assert(editFiles.length == editFilesNew.length);
+     assert(editFiles.length == editStreams.length);
+     int newsize = editStreams.length - 1;
+     int oldsize = editStreams.length;
+
+     //
+     // save existing values and allocate space for new ones
+     //
+     File[] editFiles1 = editFiles;
+     File[] editFilesNew1 = editFilesNew;
+     DataOutputStream[] editStreams1 = editStreams;
+     FileDescriptor[] editDescriptors1 = editDescriptors;
+     editFiles = new File[newsize];
+     editFilesNew = new File[newsize];
+     editStreams = new DataOutputStream[newsize];
+     editDescriptors = new FileDescriptor[newsize];
+
+     //
+     // copy values from old into new, skip the one with error.
+     //
+     for (int idx = 0; idx < index; idx++) {
+       editFiles[idx] = editFiles1[idx];
+       editFilesNew[idx] = editFilesNew1[idx];
+       editStreams[idx] = editStreams1[idx];
+       editDescriptors[idx] = editDescriptors1[idx];
+     }
+     for (int idx = index; idx < oldsize - 1; idx++) {
+       editFiles[idx] = editFiles1[idx+1];
+       editFilesNew[idx] = editFilesNew1[idx+1];
+       editStreams[idx] = editStreams1[idx+1];
+       editDescriptors[idx] = editDescriptors1[idx+1];
+     }
+     //
+     // Invoke the ioerror routine of the fsimage
+     //
+     fsimage.processIOError(index);
+   }
+
   /**
    * Delete specified editLog
    */
   void delete(int idx) throws IOException {
     if (editStreams != null) {
-      editStreams[idx].close();
+      try {
+        editStreams[idx].close();
+      } catch (IOException e) {
+        processIOError(idx);
+      }
     }
-    editFiles[idx].delete();
-  }
-  
-  /**
-   * Delete all editLogs
-   */
-  void deleteAll() throws IOException {
-    for (int idx = 0; idx < editFiles.length; idx++ ) {
+    if (!editFiles[idx].delete() || !editFilesNew[idx].delete()) {
       if (editStreams != null) {
-        editStreams[idx].close();
+        processIOError(idx);
       }
-      editFiles[idx].delete();
     }
   }
-  
+
   /**
    * check if ANY edits log exists
    */
@@ -120,38 +202,51 @@ class FSEditLog {
     }
     }
     return false;
     return false;
   }
   }
+
+  /**
+   * check if ANY edits.new log exists
+   */
+  boolean existsNew() throws IOException {
+    for (int idx = 0; idx < editFilesNew.length; idx++) {
+      if (editFilesNew[idx].exists()) { 
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * check if a particular edits.new log exists
+   */
+  boolean existsNew(int idx) throws IOException {
+    if (editFilesNew[idx].exists()) { 
+      return true;
+    }
+    return false;
+  }
+
   
   
   /**
   /**
    * Load an edit log, and apply the changes to the in-memory structure
    * Load an edit log, and apply the changes to the in-memory structure
-   *
    * This is where we apply edits that we've been writing to disk all
    * This is where we apply edits that we've been writing to disk all
    * along.
    * along.
    */
    */
-  int loadFSEdits( Configuration conf ) throws IOException {
+  int loadFSEdits(Configuration conf, int index) throws IOException {
+    int numEdits = 0;
+    numEdits = loadFSEdits(conf, editFiles[index]);
+    if (editFilesNew[index].exists()) { 
+      numEdits += loadFSEdits(conf, editFilesNew[index]);
+    }
+    return numEdits;
+  }
+
+  int loadFSEdits( Configuration conf, File edits)
+                                                 throws IOException {
     FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
     FSDirectory fsDir = fsNamesys.dir;
     int numEdits = 0;
     int logVersion = 0;
     
-    // first check how many editFiles exist
-    // and choose the largest editFile, because it is the most recent
-    Vector<File> files = new Vector<File>();
-    for (int idx = 0; idx < editFiles.length; idx++) {
-      if (editFiles[idx].exists()) {
-        files.add(editFiles[idx]);
-      }
-    }
-    long maxLength = Long.MIN_VALUE;
-    File edits = null;
-    for (Iterator<File> it = files.iterator(); it.hasNext();) {
-      File f = it.next();
-      long length = f.length();
-      if (length > maxLength) {
-        maxLength = length;
-        edits = f;
-      }
-    }
-    
     if (edits != null) {
       DataInputStream in = new DataInputStream(
           new BufferedInputStream(
@@ -322,11 +417,16 @@ class FSEditLog {
           editStreams[idx].flush();
           editDescriptors[idx].sync();
         } catch (IOException ie) {
-          // TODO: Must report an error here
+          try {
+            processIOError(idx);         
+          } catch (IOException e) {
+            FSNamesystem.LOG.error("Unable to append to edit log. " +
+                                   "Fatal Error.");
+            System.exit(-1);
+          }
         }
       }
     }
-    // TODO: initialize checkpointing if the log is large enough
   }
 
   /** 
@@ -395,4 +495,126 @@ class FSEditLog {
   static short fromLogReplication( UTF8 replication ) {
     return Short.parseShort(replication.toString());
   }
+
+  /**
+   * Return the size of the current EditLog
+   */
+  long getEditLogSize() throws IOException {
+    assert(editFiles.length == editStreams.length);
+    long size = 0;
+    for (int idx = 0; idx < editFiles.length; idx++) {
+      synchronized (editStreams[idx]) {
+        assert(size == 0 || size == editFiles[idx].length());
+        size = editFiles[idx].length();
+      }
+    }
+    return size;
+  }
+ 
+  /**
+   * Closes the current edit log and opens edits.new. 
+   */
+  void rollEditLog() throws IOException {
+    //
+    // If edits.new already exists, then return error.
+    //
+    if (existsNew()) {
+      throw new IOException("Attempt to roll edit log but edits.new exists");
+    }
+
+    close();                     // close existing edit log
+
+    //
+    // Open edits.new
+    //
+    for (int idx = 0; idx < editFiles.length; idx++ ) {
+      try {
+        FileOutputStream stream = new FileOutputStream(editFilesNew[idx]);
+        editStreams[idx] = new DataOutputStream(stream);
+        editDescriptors[idx] = stream.getFD();
+        editStreams[idx].writeInt( FSConstants.DFS_CURRENT_VERSION );
+      } catch (IOException e) {
+        processIOError(idx);
+        idx--;
+      }
+    }
+  }
+
+  /**
+   * Closes the current edit log and opens edits.new. 
+   * If edits.new already exists, then ignore it.
+   */
+  void rollEditLogIfNeeded() throws IOException {
+
+    //
+    // Open edits.new
+    //
+    for (int idx = 0; idx < editFiles.length; idx++ ) {
+      if (existsNew(idx)) {
+        continue;
+      }
+      try {
+        FileOutputStream stream = new FileOutputStream(editFilesNew[idx]);
+        editStreams[idx] = new DataOutputStream(stream);
+        editDescriptors[idx] = stream.getFD();
+        editStreams[idx].writeInt( FSConstants.DFS_CURRENT_VERSION );
+      } catch (IOException e) {
+        processIOError(idx);
+        idx--;
+      }
+    }
+  }
+  /**
+   * Removes the old edit log and renames edits.new to edits.
+   * Reopens the edits file.
+   */
+  void purgeEditLog() throws IOException {
+    purgeEditLog(true);
+  }
+
+  /**
+   * Removes the old edit log and renames edits.new to edits.
+   */
+  void purgeEditLog(boolean reopenEdits) throws IOException {
+    //
+    // If edits.new does not exists, then return error.
+    //
+    if (!existsNew()) {
+      throw new IOException("Attempt to purge edit log " +
+                            "but edits.new does not exist.");
+    }
+    close();
+
+    //
+    // Delete edits and rename edits.new to edits.
+    //
+    for (int idx = 0; idx < editFiles.length; idx++ ) {
+      if (!editFilesNew[idx].renameTo(editFiles[idx])) {
+        processIOError(idx); 
+        idx--; 
+      }
+    }
+    //
+    // Reopen all the edits logs.
+    //
+    boolean append = true;
+    for (int idx = 0; reopenEdits && idx < editStreams.length; idx++) {
+      try {
+        FileOutputStream stream = new FileOutputStream(editFiles[idx],
+                                                       append);
+        editStreams[idx] = new DataOutputStream(stream);
+        editDescriptors[idx] = stream.getFD();
+      } catch (IOException e) {
+        processIOError(idx); 
+        idx--; 
+      }
+    }
+  }
+
+  /**
+   * Return the name of the edit file
+   */
+  File getFsEditName() throws IOException {
+      return editFiles[0];
+  }
 }

+ 245 - 67
src/java/org/apache/hadoop/dfs/FSImage.java

@@ -48,14 +48,26 @@ import org.apache.hadoop.io.UTF8;
  * @author Konstantin Shvachko
  */
 class FSImage {
-  private static final String FS_IMAGE = "fsimage";
-  private static final String NEW_FS_IMAGE = "fsimage.new";
-  private static final String OLD_FS_IMAGE = "fsimage.old";
-  private static final String FS_TIME = "fstime";
+
+  //
+  // The filenames used for storing the images
+  //
+  private enum NameNodeFile {
+    IMAGE ("fsimage"),
+    CKPT ("fsimage.ckpt"),
+    TIME ("fstime");
+
+    private String fileName;
+    private NameNodeFile(String name) {
+      this.fileName = name;
+    }
+    String getName() {
+      return fileName;
+    }
+  }
 
   private File[] imageDirs;  /// directories that contains the image file 
   private FSEditLog editLog;
-  // private int namespaceID = 0;    /// a persistent attribute of the namespace
 
   /**
    * 
@@ -68,13 +80,61 @@ class FSImage {
         throw new IOException("NameNode not formatted: " + imageDirs[idx]);
       }
     }
-    File[] edits = new File[fsDirs.length];
-    for (int idx = 0; idx < edits.length; idx++) {
-      edits[idx] = new File(fsDirs[idx], "edits");
+    this.editLog = new FSEditLog( fsDirs , this);
+  }
+
+  /**
+   * Represents an Image (image and edit file).
+   */
+  FSImage(File imageDir, String edits) throws IOException {
+    this.imageDirs = new File[1];
+    imageDirs[0] = imageDir;
+    if (!imageDirs[0].exists()) {
+      throw new IOException("File " + imageDirs[0] + " not found.");
     }
-    this.editLog = new FSEditLog( edits );
+    this.editLog = new FSEditLog(imageDir, this, edits);
+  }
+
+  /*
+   * Create an fsimage and edits log from scratch.
+   */
+  void create() throws IOException {
+    saveFSImage(NameNodeFile.IMAGE.getName());
+    editLog.create();
   }
 
+  /**
+   * If there is an IO Error on any log operations, remove that
+   * directory from the list of directories. If no more directories
+   * remain, then raise an exception that will possibly cause the
+   * server to exit
+   */
+  void processIOError(int index) throws IOException {
+    if (imageDirs.length == 1) {
+      throw new IOException("Checkpoint directories inaccessible.");
+    }
+    assert(index < imageDirs.length);
+    int newsize = imageDirs.length - 1;
+    int oldsize = imageDirs.length;
+
+    //
+    // save existing values and allocate space for new ones
+    //
+    File[] imageDirs1 = imageDirs;
+    imageDirs = new File[newsize];
+
+    //
+    // copy in saved values, skipping the one on which we had
+    // an error
+    //
+    for (int idx = 0; idx < index; idx++) {
+      imageDirs[idx] = imageDirs1[idx];
+    }
+    for (int idx = index; idx < oldsize - 1; idx++) {
+      imageDirs[idx] = imageDirs1[idx+1];
+    }
+  }
+        
   FSEditLog getEditLog() {
     return editLog;
   }
@@ -85,40 +145,57 @@ class FSImage {
    * "re-save" and consolidate the edit-logs
    */
   void loadFSImage( Configuration conf ) throws IOException {
-    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
-    FSDirectory fsDir = fsNamesys.dir;
     for (int idx = 0; idx < imageDirs.length; idx++) {
       //
       // Atomic move sequence, to recover from interrupted save
       //
-      File curFile = new File(imageDirs[idx], FS_IMAGE);
-      File newFile = new File(imageDirs[idx], NEW_FS_IMAGE);
-      File oldFile = new File(imageDirs[idx], OLD_FS_IMAGE);
-
-      // Maybe we were interrupted between 2 and 4
-      if (oldFile.exists() && curFile.exists()) {
-        oldFile.delete();
-        if (editLog.exists()) {
-          editLog.deleteAll();
+      File curFile = new File(imageDirs[idx], 
+                              NameNodeFile.IMAGE.getName());
+      File ckptFile = new File(imageDirs[idx], 
+                              NameNodeFile.CKPT.getName());
+
+      //
+      // If we were in the midst of a checkpoint
+      //
+      if (ckptFile.exists()) {
+        if (editLog.existsNew(idx)) {
+          //
+          // checkpointing might have uploaded a new
+          // merged image, but we discard it here because we are
+          // not sure whether the entire merged image was uploaded
+          // before the namenode crashed.
+          //
+          if (!ckptFile.delete()) {
+            throw new IOException("Unable to delete " + ckptFile);
+          }
+        } else {
+          //
+          // checkpointing was in progress when the namenode
+          // shutdown. The fsimage.ckpt was created and the edits.new
+          // file was moved to edits. We complete that checkpoint by
+          // moving fsimage.new to fsimage. There is no need to 
+          // update the fstime file here.
+          //
+          if (!ckptFile.renameTo(curFile)) {
+            throw new IOException("Unable to rename " + ckptFile +
+                                  " to " + curFile);
+          }
         }
-      } else if (oldFile.exists() && newFile.exists()) {
-        // Or maybe between 1 and 2
-        newFile.renameTo(curFile);
-        oldFile.delete();
-      } else if (curFile.exists() && newFile.exists()) {
-        // Or else before stage 1, in which case we lose the edits
-        newFile.delete();
       }
     }
-    
+
     // Now check all curFiles and see which is the newest
     File curFile = null;
     long maxTimeStamp = Long.MIN_VALUE;
+    int saveidx = 0;
+    boolean needToSave = false;
     for (int idx = 0; idx < imageDirs.length; idx++) {
-      File file = new File(imageDirs[idx], FS_IMAGE);
+      File file = new File(imageDirs[idx], 
+                           NameNodeFile.IMAGE.getName());
       if (file.exists()) {
         long timeStamp = 0;
-        File timeFile = new File(imageDirs[idx], FS_TIME);
+        File timeFile = new File(imageDirs[idx], 
+                                 NameNodeFile.TIME.getName());
         if (timeFile.exists() && timeFile.canRead()) {
           DataInputStream in = new DataInputStream(
               new FileInputStream(timeFile));
@@ -127,14 +204,38 @@ class FSImage {
           } finally {
             in.close();
           }
+        } else {
+          needToSave |= true;
         }
         if (maxTimeStamp < timeStamp) {
           maxTimeStamp = timeStamp;
           curFile = file;
+          saveidx = idx;
         }
+      } else {
+        needToSave |= true;
       }
     }
 
+    //
+    // Load in bits
+    //
+    needToSave |= loadFSImage(conf, curFile);
+
+    //
+    // read in the editlog from the same directory from
+    // which we read in the image
+    //
+    needToSave |= (editLog.loadFSEdits(conf, saveidx) > 0);
+    if (needToSave) {
+      saveFSImage();
+    }
+  }
+
+  boolean loadFSImage(Configuration conf, File curFile)
+                      throws IOException {
+    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
+    FSDirectory fsDir = fsNamesys.dir;
     //
     // Load in bits
     //
@@ -154,16 +255,17 @@ class FSImage {
         int numFiles = 0;
         // version 0 does not store version #
         // starts directly with the number of files
-        if( imgVersion >= 0 ) {  
+        if( imgVersion >= 0 ) {
           numFiles = imgVersion;
           imgVersion = 0;
-        } else 
+        } else {
           numFiles = in.readInt();
           numFiles = in.readInt();
-        
+        }
+
         needToSave = ( imgVersion != FSConstants.DFS_CURRENT_VERSION );
         needToSave = ( imgVersion != FSConstants.DFS_CURRENT_VERSION );
         if( imgVersion < FSConstants.DFS_CURRENT_VERSION ) // future version
         if( imgVersion < FSConstants.DFS_CURRENT_VERSION ) // future version
           throw new IncorrectVersionException(imgVersion, "file system image");
           throw new IncorrectVersionException(imgVersion, "file system image");
-        
+
         // read file info
         // read file info
         short replication = (short)conf.getInt("dfs.replication", 3);
         short replication = (short)conf.getInt("dfs.replication", 3);
         for (int i = 0; i < numFiles; i++) {
         for (int i = 0; i < numFiles; i++) {
@@ -185,30 +287,27 @@ class FSImage {
           }
           }
           fsDir.unprotectedAddFile(name, blocks, replication );
           fsDir.unprotectedAddFile(name, blocks, replication );
         }
         }
-        
+
         // load datanode info
         // load datanode info
         this.loadDatanodes( imgVersion, in );
         this.loadDatanodes( imgVersion, in );
       } finally {
       } finally {
         in.close();
         in.close();
       }
       }
     }
     }
-    
     if( fsDir.namespaceID == 0 )
     if( fsDir.namespaceID == 0 )
       fsDir.namespaceID = newNamespaceID();
       fsDir.namespaceID = newNamespaceID();
-    
-    needToSave |= ( editLog.exists() && editLog.loadFSEdits(conf) > 0 );
-    if( needToSave )
-      saveFSImage();
+
+    return needToSave;
   }
   }
 
 
   /**
   /**
    * Save the contents of the FS image
    * Save the contents of the FS image
    */
    */
-  void saveFSImage() throws IOException {
+  void saveFSImage(String filename) throws IOException {
     FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
     FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
     FSDirectory fsDir = fsNamesys.dir;
     FSDirectory fsDir = fsNamesys.dir;
     for (int idx = 0; idx < imageDirs.length; idx++) {
     for (int idx = 0; idx < imageDirs.length; idx++) {
-      File newFile = new File(imageDirs[idx], NEW_FS_IMAGE);
+      File newFile = new File(imageDirs[idx], filename);
       
       
       //
       //
       // Write out data
       // Write out data
@@ -226,31 +325,25 @@ class FSImage {
         out.close();
         out.close();
       }
     }
-    //
-    // Atomic move sequence
-    //
-    for (int idx = 0; idx < imageDirs.length; idx++) {
-      File curFile = new File(imageDirs[idx], FS_IMAGE);
-      File newFile = new File(imageDirs[idx], NEW_FS_IMAGE);
-      File oldFile = new File(imageDirs[idx], OLD_FS_IMAGE);
-      File timeFile = new File(imageDirs[idx], FS_TIME);
-      // 1.  Move cur to old and delete timeStamp
-      curFile.renameTo(oldFile);
-      if (timeFile.exists()) { timeFile.delete(); }
-      // 2.  Move new to cur and write timestamp
-      newFile.renameTo(curFile);
-      DataOutputStream out = new DataOutputStream(
-            new FileOutputStream(timeFile));
-      try {
-        out.writeLong(System.currentTimeMillis());
-      } finally {
-        out.close();
-      }
-      // 3.  Remove pending-edits file (it's been integrated with newFile)
-      editLog.delete(idx);
-      // 4.  Delete old
-      oldFile.delete();
+  }
+
+  /**
+   * Save the contents of the FS image
+   */
+  void saveFSImage() throws IOException {
+    editLog.createNewIfMissing();
+    saveFSImage(NameNodeFile.CKPT.getName());
+    rollFSImage(false);
+  }
+
+  void updateTimeFile(File timeFile, long timestamp) throws IOException {
+    if (timeFile.exists()) { timeFile.delete(); }
+    DataOutputStream out = new DataOutputStream(
+          new FileOutputStream(timeFile));
+    try {
+      out.writeLong(timestamp);
+    } finally {
+      out.close();
     }
   }
 
@@ -343,6 +436,91 @@ class FSImage {
       fsNamesys.unprotectedAddDatanode(nodeImage.getDatanodeDescriptor());
     }
   }
+  /**
+   * Moves fsimage.ckpt to fsImage and edits.new to edits
+   * Reopens the new edits file.
+   */
+  void rollFSImage() throws IOException {
+    rollFSImage(true);
+  }
+
+  /**
+   * Moves fsimage.ckpt to fsImage and edits.new to edits
+   */
+  void rollFSImage(boolean reopenEdits) throws IOException {
+    //
+    // First, verify that edits.new and fsimage.ckpt exists in all
+    // checkpoint directories.
+    //
+    if (!editLog.existsNew()) {
+      throw new IOException("New Edits file does not exist");
+    }
+    for (int idx = 0; idx < imageDirs.length; idx++) {
+      File ckpt = new File(imageDirs[idx], 
+                           NameNodeFile.CKPT.getName());
+      File curFile = new File(imageDirs[idx], 
+                              NameNodeFile.IMAGE.getName());
+
+      if (!curFile.exists()) {
+        throw new IOException("Image file " + curFile +
+                              " does not exist");
+      }
+      if (!ckpt.exists()) {
+        throw new IOException("Checkpoint file " + ckpt +
+                              " does not exist");
+      }
+    }
+    editLog.purgeEditLog(reopenEdits); // renamed edits.new to edits
+
+    //
+    // Renames new image
+    //
+    for (int idx = 0; idx < imageDirs.length; idx++) {
+      File ckpt = new File(imageDirs[idx], 
+                           NameNodeFile.CKPT.getName());
+      File curFile = new File(imageDirs[idx], 
+                              NameNodeFile.IMAGE.getName());
+      if (!ckpt.renameTo(curFile)) {
+        editLog.processIOError(idx);
+        idx--;
+      }
+    }
+
+    //
+    // Updates the fstime file
+    //
+    long now = System.currentTimeMillis();
+    for (int idx = 0; idx < imageDirs.length; idx++) {
+	  File timeFile = new File(imageDirs[idx], 
+                               NameNodeFile.TIME.getName());
+      try {
+        updateTimeFile(timeFile, now);
+      } catch (IOException e) {
+        editLog.processIOError(idx);
+        idx--;
+      }
+    }
+  }
+
+  /**
+   * Return the name of the image file.
+   */
+  File getFsImageName() {
+      return new File(imageDirs[0], NameNodeFile.IMAGE.getName());
+  }
+
+  /**
+   * Return the name of the image file that is uploaded by periodic
+   * checkpointing.
+   */
+  File[] getFsImageNameCheckpoint() {
+      File[] list = new File[imageDirs.length];
+      for (int i = 0; i < imageDirs.length; i++) {
+        list[i] = new File(imageDirs[i], 
+                           NameNodeFile.CKPT.getName());
+      }
+      return list;
+  }
 
   static class DatanodeImage implements WritableComparable {
 

+ 84 - 0
src/java/org/apache/hadoop/dfs/FSNamesystem.java

@@ -242,8 +242,19 @@ class FSNamesystem implements FSConstants {
         this.infoServer.setAttribute("name.node", nn);
         this.infoServer.setAttribute("name.conf", conf);
         this.infoServer.addServlet("fsck", "/fsck", FsckServlet.class);
+        this.infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
         this.infoServer.start();
     }
+
+    /**
+     * dirs is a list of directories where the filesystem directory state 
+     * is stored
+     */
+    FSNamesystem(FSImage fsImage) throws IOException {
+        fsNamesystemObject = this;
+        this.dir = new FSDirectory(fsImage);
+    }
+
     /** Return the FSNamesystem object
      * 
      */
@@ -3017,6 +3028,40 @@ class FSNamesystem implements FSConstants {
         return "";
       return safeMode.getTurnOffTip();
     }
+
+    long getEditLogSize() throws IOException {
+      return getEditLog().getEditLogSize();
+    }
+
+    synchronized void rollEditLog() throws IOException {
+      if (isInSafeMode()) {
+        throw new SafeModeException("Checkpoint not created",
+                                     safeMode);
+      }
+      LOG.info("Roll Edit Log");
+      getEditLog().rollEditLog();
+    }
+
+    synchronized void rollFSImage() throws IOException {
+      LOG.info("Roll FSImage");
+      if (isInSafeMode()) {
+        throw new SafeModeException("Checkpoint not created",
+                                    safeMode);
+      }
+      dir.fsImage.rollFSImage();
+    }
+
+    File getFsImageName() throws IOException {
+      return dir.fsImage.getFsImageName();
+    }
+
+    File[] getFsImageNameCheckpoint() throws IOException {
+      return dir.fsImage.getFsImageNameCheckpoint();
+    }
+
+    File getFsEditName() throws IOException {
+      return getEditLog().getFsEditName();
+    }
     
     /**
      * This class is used in Namesystem's jetty to do fsck on namenode
@@ -3042,4 +3087,43 @@ class FSNamesystem implements FSConstants {
         }
       }
     }
+
+    /**
+     * This class is used in Namesystem's jetty to retrieve a file.
+     * Typically used by the Secondary NameNode to retrieve image and
+     * edit file for periodic checkpointing.
+     * @author Dhruba Borthakur
+     */
+    public static class GetImageServlet extends HttpServlet {
+      public void doGet(HttpServletRequest request,
+          HttpServletResponse response
+          ) throws ServletException, IOException {
+        Map<String,String[]> pmap = request.getParameterMap();
+        try {
+          ServletContext context = getServletContext();
+          NameNode nn = (NameNode) context.getAttribute("name.node");
+          Configuration conf = (Configuration) context.getAttribute("name.conf");
+          TransferFsImage ff = new TransferFsImage(pmap, request, response);
+          if (ff.getImage()) {
+            // send fsImage to Secondary
+            TransferFsImage.getFileServer(response.getOutputStream(),
+                                          nn.getFsImageName()); 
+          } else if (ff.getEdit()) {
+            // send old edits to Secondary
+            TransferFsImage.getFileServer(response.getOutputStream(),
+                                          nn.getFsEditName());
+          } else if (ff.putImage()) {
+            // issue a HTTP get request to download the new fsimage 
+            TransferFsImage.getFileClient(ff.getInfoServer(), "getimage=1", 
+                                          nn.getFsImageNameCheckpoint());
+          }
+        } catch (IOException ie) {
+          StringUtils.stringifyException(ie);
+          LOG.warn(ie);
+          String errMsg = "GetImage failed.";
+          response.sendError(HttpServletResponse.SC_GONE, errMsg);
+          throw ie;
+        }
+      }
+    }
 }

+ 53 - 3
src/java/org/apache/hadoop/dfs/NameNode.java

@@ -92,12 +92,20 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
       for (int idx = 0; idx < dirs.length; idx++) {
         FSImage.format(dirs[idx]);
       }
+      FSImage fsimage = new FSImage(dirs);
+      FSNamesystem namesystem = new FSNamesystem(fsimage);
+      fsimage.create();
     }
 
     /** Format a new filesystem.  Destroys any filesystem that may already
      * exist at this location.  **/
     public static void format(File dir) throws IOException {
+      File dirs[] = new File[1];
+      dirs[0] = dir;
       FSImage.format(dir);
+      FSImage fsimage = new FSImage(dirs);
+      FSNamesystem namesystem = new FSNamesystem(fsimage);
+      fsimage.create();
     }
 
     private class NameNodeMetrics {
@@ -165,7 +173,7 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
     }
 
     /** Return the configured directories where name data is stored. */
-    private static File[] getDirs(Configuration conf) {
+    static File[] getDirs(Configuration conf) {
       String[] dirNames = conf.getStrings("dfs.name.dir");
       if (dirNames == null) { dirNames = new String[] {"/tmp/hadoop/dfs/name"}; }
       File[] dirs = new File[dirNames.length];
@@ -503,7 +511,27 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
         return ret;
     }
 
-    
+    /**
+     * Returns the size of the current edit log.
+     */
+    public long getEditLogSize() throws IOException {
+      return namesystem.getEditLogSize();
+    }
+
+    /**
+     * Roll the edit log.
+     */
+    public void rollEditLog() throws IOException {
+      namesystem.rollEditLog();
+    }
+
+    /**
+     * Roll the image 
+     */
+    public void rollFsImage() throws IOException {
+      namesystem.rollFSImage();
+    }
+
     ////////////////////////////////////////////////////////////////
     // DatanodeProtocol
     ////////////////////////////////////////////////////////////////
@@ -618,7 +646,29 @@ public class NameNode implements ClientProtocol, DatanodeProtocol, FSConstants {
       if( version != DFS_CURRENT_VERSION )
         throw new IncorrectVersionException( version, "data node" );
     }
-    
+
+    /**
+     * Returns the name of the fsImage file
+     */
+    public File getFsImageName() throws IOException {
+      return namesystem.getFsImageName();
+    }
+
+    /**
+     * Returns the name of the fsImage file uploaded by periodic
+     * checkpointing
+     */
+    public File[] getFsImageNameCheckpoint() throws IOException {
+      return namesystem.getFsImageNameCheckpoint();
+    }
+
+    /**
+     * Returns the name of the edits file
+     */
+    public File getFsEditName() throws IOException {
+      return namesystem.getFsEditName();
+    }
+
     /**
      */
     public static void main(String argv[]) throws Exception {

+ 477 - 0
src/java/org/apache/hadoop/dfs/SecondaryNameNode.java

@@ -0,0 +1,477 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import org.apache.commons.logging.*;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.ipc.*;
+import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.mapred.StatusHttpServer;
+
+import java.io.*;
+import java.net.*;
+import java.util.Map;
+import javax.servlet.ServletContext;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+/**********************************************************
+ * The Secondary NameNode is a helper to the primary NameNode.
+ * The Secondary is responsible for supporting periodic checkpoints 
+ * of the HDFS metadata. The current design allows only one Secondary
+ * NameNode per HDFS cluster.
+ *
+ * The Secondary NameNode is a daemon that periodically wakes
+ * up (determined by the schedule specified in the configuration),
+ * triggers a periodic checkpoint and then goes back to sleep.
+ * The Secondary NameNode uses the ClientProtocol to talk to the
+ * primary NameNode.
+ *
+ * @author  Dhruba Borthakur
+ **********************************************************/
+public class SecondaryNameNode implements FSConstants, Runnable {
+    
+    public static final Log LOG = LogFactory.getLog(
+                                  "org.apache.hadoop.dfs.NameNode.Secondary");
+    private static final String SRC_FS_IMAGE = "srcimage.tmp";
+    private static final String FS_EDITS = "edits.tmp";
+    private static final String DEST_FS_IMAGE = "destimage.tmp";
+
+    private ClientProtocol namenode;
+    private Configuration conf;
+    private String localName;
+    private InetSocketAddress nameNodeAddr;
+    private boolean shouldRun;
+    private StatusHttpServer infoServer;
+    private int infoPort;
+    private String infoBindAddress;
+
+    private File checkpointDir;
+    private long checkpointPeriod;	// in seconds
+    private long checkpointSize;    // size (in bytes) of current Edit Log
+    private File srcImage;
+    private File destImage;
+    private File editFile;
+
+    private boolean[] simulation = null; // error simulation events
+
+    /**
+     * Create a connection to the primary namenode.
+     */
+    public SecondaryNameNode(Configuration conf)  throws IOException {
+
+      //
+      // initialize error simulation code for junit test
+      //
+      initializeErrorSimulationEvent(2);
+
+      //
+      // Create connection to the namenode.
+      //
+      shouldRun = true;
+      nameNodeAddr = DataNode.createSocketAddr(
+                                 conf.get("fs.default.name", "local"));
+      this.conf = conf;
+      this.namenode = (ClientProtocol) RPC.getProxy(ClientProtocol.class,
+                       ClientProtocol.versionID, nameNodeAddr, conf);
+      try {
+        this.localName = InetAddress.getLocalHost().getHostName();
+      } catch (UnknownHostException uhe) {
+        this.localName = "";
+      }
+
+      //
+      // initialize the webserver for uploading files.
+      //
+      infoPort = conf.getInt("dfs.secondary.info.port", 50090);
+      infoBindAddress = conf.get("dfs.secondary.info.bindAddress", "0.0.0.0");
+      infoServer = new StatusHttpServer("dfs", infoBindAddress, infoPort, false);
+      infoServer.setAttribute("name.secondary", this);
+      infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
+      infoServer.start();
+
+      //
+      // Initialize other scheduling parameters from the configuration
+      //
+      String[] dirName = conf.getStrings("fs.checkpoint.dir");
+      checkpointDir = new File(dirName[0]);
+      checkpointPeriod = conf.getLong("fs.checkpoint.period", 3600);
+      checkpointSize = conf.getLong("fs.checkpoint.size", 4194304);
+      doSetup();
+
+      LOG.warn("Checkpoint Directory:" + checkpointDir);
+      LOG.warn("Checkpoint Period   :" + checkpointPeriod + " secs " +
+               "(" + checkpointPeriod/60 + " min)");
+      LOG.warn("Log Size Trigger    :" + checkpointSize + " bytes " +
+               "(" + checkpointSize/1024 + " KB)");
+    }
+
+   /**
+     * Shut down this instance of the secondary namenode.
+     * Returns only after shutdown is complete.
+     */
+    public void shutdown() {
+      shouldRun = false;
+      try {
+          infoServer.stop();
+      } catch (Exception e) {
+      }
+    }
+
+    private void doSetup() throws IOException {
+      //
+      // Create the checkpoint directory if needed. 
+      //
+      checkpointDir.mkdirs();
+      srcImage = new File(checkpointDir, SRC_FS_IMAGE);
+      destImage = new File(checkpointDir, DEST_FS_IMAGE);
+      editFile = new File(checkpointDir, FS_EDITS);
+      srcImage.delete();
+      destImage.delete();
+      editFile.delete();
+    }
+
+    File getNewImage() {
+      return destImage;
+    }
+
+    //
+    // The main work loop
+    //
+    public void run() {
+
+      //
+      // Poll the Namenode (once every 5 minutes) to find the size of the
+      // pending edit log.
+      //
+      long period = 5 * 60;              // 5 minutes
+      long lastCheckpointTime = 0;
+      if (checkpointPeriod < period) {
+        period = checkpointPeriod;
+      }
+
+      while (shouldRun) {
+        try {
+            Thread.sleep(1000 * period);
+        } catch (InterruptedException ie) {
+          // do nothing
+        }
+        if (!shouldRun) {
+          break;
+        }
+        try {
+          long now = System.currentTimeMillis();
+
+          long size = namenode.getEditLogSize();
+          if (size >= checkpointSize || 
+              now >= lastCheckpointTime + 1000 * checkpointPeriod) {
+            doCheckpoint();
+            lastCheckpointTime = now;
+          }
+        } catch (IOException e) {
+          LOG.error("Exception in doCheckpoint:");
+          LOG.error(StringUtils.stringifyException(e));
+          e.printStackTrace();
+        }
+      }
+    }
+
+    /**
+     * get the current fsimage from Namenode.
+     */
+     private void getFSImage() throws IOException {
+       String fsName = getInfoServer();
+       String fileid = "getimage=1";
+       TransferFsImage.getFileClient(fsName, fileid, srcImage);
+       LOG.info("Downloaded file " + srcImage + " size " +
+                srcImage.length() + " bytes.");
+    }
+
+    /**
+     * get the old edits file from the NameNode
+     */
+     private void getFSEdits() throws IOException {
+       String fsName = getInfoServer();
+       String fileid = "getedit=1";
+       TransferFsImage.getFileClient(fsName, fileid, editFile);
+       LOG.info("Downloaded file " + editFile + " size " +
+                editFile.length() + " bytes.");
+    }
+
+    /**
+     * Copy the new fsimage into the NameNode
+     */
+     private void putFSImage() throws IOException {
+       String fsName = getInfoServer();
+       String fileid = "putimage=1&port=" + infoPort +
+                       "&machine=" +
+                       InetAddress.getLocalHost().getHostAddress();
+       LOG.info("Posted URL " + fsName + fileid);
+       TransferFsImage.getFileClient(fsName, fileid, (File[])null);
+     }
+
+    /*
+     * Returns the Jetty server that the Namenode is listening on.
+     */
+    private String getInfoServer() throws IOException {
+      String fsName = conf.get("fs.default.name", "local");
+      if (fsName.equals("local")) {
+        throw new IOException("This is not a DFS");
+      }
+      String[] splits = fsName.split(":", 2);
+      int infoPort = conf.getInt("dfs.info.port", 50070);
+      return splits[0]+":"+infoPort;
+    }
+
+    /*
+     * Create a new checkpoint
+     */
+    void doCheckpoint() throws IOException {
+
+      //
+      // Do the required initialization of the merge work area.
+      //
+      doSetup();
+
+      //
+      // Tell the namenode to start logging transactions in a new edit file
+      //
+      namenode.rollEditLog();
+
+      //
+      // error simulation code for junit test
+      //
+      if (simulation != null && simulation[0]) {
+        throw new IOException("Simulating error0 " +
+                              "after creating edits.new");
+      }
+
+      getFSImage();                // Fetch fsimage
+      getFSEdits();                // Fetch edits
+      doMerge();                   // Do the merge
+  
+      //
+      // Upload the new image into the NameNode. Then tell the Namenode
+      // to make this newly uploaded image the most current image.
+      //
+      putFSImage();
+
+      //
+      // error simulation code for junit test
+      //
+      if (simulation != null && simulation[1]) {
+        throw new IOException("Simulating error1 " +
+                              "after uploading new image to NameNode");
+      }
+
+      namenode.rollFsImage();
+
+      LOG.warn("Checkpoint done. Image Size:" + srcImage.length() +
+               " Edit Size:" + editFile.length() +
+               " New Image Size:" + destImage.length());
+    }
+
+    /**
+     * merges SRC_FS_IMAGE with FS_EDITS and writes the output into
+     * DEST_FS_IMAGE
+     */
+    private void doMerge() throws IOException {
+      FSImage fsImage = new FSImage(checkpointDir, FS_EDITS);
+      FSNamesystem namesystem = new FSNamesystem(fsImage);
+      fsImage.loadFSImage(conf, srcImage);
+      fsImage.getEditLog().loadFSEdits(conf, editFile);
+      fsImage.saveFSImage(DEST_FS_IMAGE);
+    }
+
+    /**
+     * @param argv The parameters passed to this program.
+     * @exception Exception if the filesystem does not exist.
+     * @return 0 on success, non zero on error.
+     */
+    private int processArgs(String[] argv) throws Exception {
+
+      if (argv.length < 1) {
+          printUsage("");
+          return -1;
+      }
+
+      int exitCode = -1;
+      int i = 0;
+      String cmd = argv[i++];
+
+      //
+      // verify that we have enough command line parameters
+      //
+      if ("-geteditsize".equals(cmd)) {
+        if (argv.length != 1) {
+          printUsage(cmd);
+          return exitCode;
+        }
+      } else if ("-checkpoint".equals(cmd)) {
+        if (argv.length != 1 && argv.length != 2) {
+          printUsage(cmd);
+          return exitCode;
+        }
+        if (argv.length == 2 && !"force".equals(argv[i])) {
+          printUsage(cmd);
+          return exitCode;
+        }
+      }
+
+      exitCode = 0;
+      try {
+        if ("-checkpoint".equals(cmd)) {
+          long size = namenode.getEditLogSize();
+          if (size >= checkpointSize || 
+              argv.length == 2 && "force".equals(argv[i])) {
+            doCheckpoint();
+          } else {
+            System.err.println("EditLog size " + size + " bytes is " +
+                               "smaller than configured checkpoint " +
+                               "size " + checkpointSize + " bytes.");
+            System.err.println("Skipping checkpoint.");
+          }
+        } else if ("-geteditsize".equals(cmd)) {
+          long size = namenode.getEditLogSize();
+          System.out.println("EditLog size is " + size + " bytes");
+        } else {
+          exitCode = -1;
+          LOG.error(cmd.substring(1) + ": Unknown command");
+          printUsage("");
+        }
+      } catch (RemoteException e) {
+          //
+          // This is an error returned by the hadoop server. Print
+          // out the first line of the error message, ignore the stack trace.
+          exitCode = -1;
+          try {
+            String[] content;
+            content = e.getLocalizedMessage().split("\n");
+            LOG.error(cmd.substring(1) + ": "
+                               + content[0]);
+          } catch (Exception ex) {
+            LOG.error(cmd.substring(1) + ": "
+                               + ex.getLocalizedMessage());
+          }
+        } catch (IOException e) {
+          //
+          // IO exception encountered locally.
+          //
+          exitCode = -1;
+          LOG.error(cmd.substring(1) + ": "
+                             + e.getLocalizedMessage());
+        } finally {
+            // Does the RPC connection need to be closed?
+        }
+        return exitCode;
+    }
+
+    /**
+     * Displays format of commands.
+     * @param cmd The command that is being executed.
+     */
+    private void printUsage(String cmd) {
+      if ("-geteditsize".equals(cmd)) {
+        System.err.println("Usage: java SecondaryNameNode"
+                           + " [-geteditsize]");
+      } else if ("-checkpoint".equals(cmd)) {
+        System.err.println("Usage: java SecondaryNameNode"
+                           + " [-checkpoint [force]]");
+      } else {
+        System.err.println("Usage: java SecondaryNameNode " +
+                           "[-checkpoint [force]] " +
+                           "[-geteditsize] ");
+      }
+    }
+
+    //
+    // utility method to facilitate junit test error simulation
+    //
+    void initializeErrorSimulationEvent(int numberOfEvents) {
+      simulation = new boolean[numberOfEvents]; 
+      for (int i = 0; i < numberOfEvents; i++) {
+        simulation[i] = false;
+      }
+    }
+
+    void setErrorSimulation(int index) {
+      assert(index < simulation.length);
+      simulation[index] = true;
+    }
+
+    void clearErrorSimulation(int index) {
+      assert(index < simulation.length);
+      simulation[index] = false;
+    }
+
+    /**
+     * This servlet runs in the secondary namenode's Jetty server. It serves
+     * the newly merged fsimage so that the Namenode can retrieve it after a
+     * periodic checkpoint.
+     * @author Dhruba Borthakur
+     */
+    public static class GetImageServlet extends HttpServlet {
+      public void doGet(HttpServletRequest request,
+          HttpServletResponse response
+          ) throws ServletException, IOException {
+        Map<String,String[]> pmap = request.getParameterMap();
+        try {
+          ServletContext context = getServletContext();
+          SecondaryNameNode nn = (SecondaryNameNode) 
+                                  context.getAttribute("name.secondary");
+          TransferFsImage ff = new TransferFsImage(pmap, request, response);
+          if (ff.getImage()) {
+            TransferFsImage.getFileServer(response.getOutputStream(),
+                                   nn.getNewImage());
+          }
+          LOG.info("New Image " + nn.getNewImage() + " retrieved by Namenode.");
+        } catch (IOException ie) {
+          LOG.error(StringUtils.stringifyException(ie));
+          String errMsg = "GetImage failed.";
+          response.sendError(HttpServletResponse.SC_GONE, errMsg);
+          throw ie;
+
+        }
+      }
+    }
+
+    /**
+     * Main program: runs a single command when arguments are supplied,
+     * otherwise starts the periodic checkpoint daemon.
+     * @param argv Command line parameters.
+     * @exception Exception if the filesystem does not exist.
+     */
+    public static void main(String[] argv) throws Exception {
+        Configuration tconf = new Configuration();
+        if (argv.length >= 1) {
+          SecondaryNameNode secondary = new SecondaryNameNode(tconf);
+          int ret = secondary.processArgs(argv);
+          System.exit(ret);
+        }
+
+        // Create a never-ending daemon
+        Daemon checkpointThread = new Daemon(new SecondaryNameNode(tconf)); 
+        checkpointThread.start();
+    }
+}
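
For orientation, a minimal sketch (not part of the patch) of driving the class above through its public main() entry point, matching the commands listed in printUsage(); the wrapper class name is hypothetical:

    // Sketch only; placed in org.apache.hadoop.dfs alongside the class it exercises.
    package org.apache.hadoop.dfs;

    public class SecondaryNameNodeUsageSketch {
      public static void main(String[] args) throws Exception {
        // One-shot mode: prints the size of the namenode's pending edit log,
        // then exits the JVM (SecondaryNameNode.main calls System.exit for commands).
        SecondaryNameNode.main(new String[] { "-geteditsize" });

        // Other invocations (each also exits once the command completes):
        //   SecondaryNameNode.main(new String[] { "-checkpoint" });          // checkpoint only if the edit log is large enough
        //   SecondaryNameNode.main(new String[] { "-checkpoint", "force" }); // checkpoint unconditionally
        //   SecondaryNameNode.main(new String[0]);                           // no arguments: run the periodic checkpoint daemon
      }
    }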

+ 186 - 0
src/java/org/apache/hadoop/dfs/TransferFsImage.java

@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import java.io.*;
+import java.net.*;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Random;
+import java.util.TreeSet;
+import javax.servlet.http.HttpServletResponse;
+import javax.servlet.http.HttpServletRequest;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSOutputStream;
+import org.apache.hadoop.io.UTF8;
+
+
+/**
+ * This class handles fetching a specified file from the NameNode.
+ * @author Dhruba Borthakur
+ */
+class TransferFsImage implements FSConstants {
+  
+  private HttpServletResponse response;
+  private boolean isGetImage;
+  private boolean isGetEdit;
+  private boolean isPutImage;
+  private int remoteport;
+  private String machineName;
+  
+  /**
+   * File downloader.
+   * @param pmap key=value[] map that is passed to the http servlet as 
+   *        url parameters
+   * @param request the object from which this servlet reads the url contents
+   * @param response the object into which this servlet writes the url contents
+   * @throws IOException
+   */
+  public TransferFsImage(Map<String,String[]> pmap,
+                         HttpServletRequest request,
+                         HttpServletResponse response
+                         ) throws IOException {
+    isGetImage = isGetEdit = isPutImage = false;
+    remoteport = 0;
+    machineName = null;
+
+    for (Iterator<String> it = pmap.keySet().iterator(); it.hasNext();) {
+      String key = it.next();
+      if (key.equals("getimage")) { 
+        isGetImage = true;
+      } else if (key.equals("getedit")) { 
+        isGetEdit = true;
+      } else if (key.equals("putimage")) { 
+        isPutImage = true;
+      } else if (key.equals("port")) { 
+        remoteport = new Integer(pmap.get("port")[0]).intValue();
+      } else if (key.equals("machine")) { 
+        machineName = pmap.get("machine")[0];
+      }
+    }
+    if ((isGetImage && isGetEdit) ||
+        (!isGetImage && !isGetEdit && !isPutImage)) {
+      throw new IOException("No good parameters to TransferFsImage");
+    }
+  }
+
+  boolean getEdit() {
+    return isGetEdit;
+  }
+
+  boolean getImage() {
+    return isGetImage;
+  }
+
+  boolean putImage() {
+    return isPutImage;
+  }
+
+  String getInfoServer() throws IOException{
+    if (machineName == null || remoteport == 0) {
+      throw new IOException ("MachineName and port undefined");
+    }
+    return machineName + ":" + remoteport;
+  }
+
+  /**
+   * A server-side method to respond to a getfile http request.
+   * Copies the contents of the local file into the output stream.
+   */
+  static void getFileServer(OutputStream outstream, File localfile) 
+                                                 throws IOException {
+    byte buf[] = new byte[BUFFER_SIZE];
+    FileInputStream infile = null;
+    try {
+      infile = new FileInputStream(localfile);
+      int num = 1;
+      while (num > 0) {
+        num = infile.read(buf);
+        if (num <= 0) {
+          break;
+        }
+        outstream.write(buf, 0, num);
+      }
+    } finally {
+      outstream.close();
+      if (infile != null) {
+        infile.close();
+      }
+    }
+  }
+
+  /**
+   * Client-side method to fetch a file from a server.
+   * Copies the response from the URL to a list of local files.
+   */
+   static void getFileClient(String fsName, String id, File[] localPath)
+                             throws IOException {
+     byte[] buf = new byte[BUFFER_SIZE];
+     StringBuffer str = new StringBuffer("http://"+fsName+"/getimage?");
+     str.append(id);
+
+     //
+     // open connection to remote server
+     //
+     URL url = new URL(str.toString());
+     URLConnection connection = url.openConnection();
+     InputStream stream = connection.getInputStream();
+     FileOutputStream[] output = null;
+     if (localPath != null) {
+       output = new FileOutputStream[localPath.length];
+       for (int i = 0; i < output.length; i++) {
+         output[i] = new FileOutputStream(localPath[i]);
+       }
+     }
+
+     try {
+       int num = 1;
+       while (num > 0) {
+         num = stream.read(buf);
+         if (num > 0 && localPath != null) {
+           for (int i = 0; i < output.length; i++) {
+             output[i].write(buf, 0, num);
+           }
+         }
+       }
+     } finally {
+       stream.close();
+       if (localPath != null) {
+         for (int i = 0; i < output.length; i++) {
+           output[i].close();
+         }
+       }
+     }
+   }
+
+  /**
+   * Client-side method to fetch a file from a server.
+   * Copies the response from the URL to the local file.
+   */
+   static void getFileClient(String fsName, String id, File localPath)
+                             throws IOException {
+     File[] filelist = new File[1];
+     filelist[0] = localPath;
+     getFileClient(fsName, id, filelist);
+   }
+}
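
For context, a minimal sketch (not part of the patch) of fetching the current image and edit log with the getFileClient helper above, mirroring what SecondaryNameNode.getFSImage() and getFSEdits() do; the class name and local file names are hypothetical, and the sketch sits in org.apache.hadoop.dfs because TransferFsImage is package-private:

    // Sketch only; fsName is the namenode's info server, e.g. "host:50070".
    package org.apache.hadoop.dfs;

    import java.io.File;

    class FetchCheckpointFilesSketch {
      public static void main(String[] args) throws Exception {
        String fsName = args[0];
        // Download the current fsimage, as SecondaryNameNode.getFSImage() does.
        TransferFsImage.getFileClient(fsName, "getimage=1", new File("fsimage.ckpt"));
        // Download the pending edit log, as SecondaryNameNode.getFSEdits() does.
        TransferFsImage.getFileClient(fsName, "getedit=1", new File("edits.ckpt"));
      }
    }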

+ 7 - 0
src/test/org/apache/hadoop/dfs/MiniDFSCluster.java

@@ -278,4 +278,11 @@ public class MiniDFSCluster {
   public FileSystem getFileSystem() throws IOException {
     return FileSystem.get(conf);
   }
+
+  /**
+   * Get the directories where the namenode stores its image
+   */
+  public File[] getNameDirs() {
+    return NameNode.getDirs(conf);
+  }
 }

+ 360 - 0
src/test/org/apache/hadoop/dfs/TestCheckpoint.java

@@ -0,0 +1,360 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import junit.framework.TestCase;
+import java.io.*;
+import java.util.Random;
+import java.net.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSInputStream;
+import org.apache.hadoop.fs.FSOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * This class tests the creation and validation of a checkpoint.
+ * @author Dhruba Borthakur
+ */
+public class TestCheckpoint extends TestCase {
+  static final long seed = 0xDEADBEEFL;
+  static final int blockSize = 8192;
+  static final int fileSize = 16384;
+  static final int numDatanodes = 4;
+
+  private void writeFile(FileSystem fileSys, Path name, int repl)
+  throws IOException {
+    // create and write a file that contains two blocks of data
+    FSOutputStream stm = fileSys.createRaw(name, true, (short)repl,
+        (long)blockSize);
+    byte[] buffer = new byte[fileSize];
+    Random rand = new Random(seed);
+    rand.nextBytes(buffer);
+    stm.write(buffer);
+    stm.close();
+  }
+  
+  
+  private void checkFile(FileSystem fileSys, Path name, int repl)
+  throws IOException {
+    assertTrue(fileSys.exists(name));
+    String[][] locations = fileSys.getFileCacheHints(name, 0, fileSize);
+    for (int idx = 0; idx < locations.length; idx++) {
+      assertEquals("Number of replicas for block" + idx,
+          Math.min(numDatanodes, repl), locations[idx].length);  
+    }
+  }
+  
+  private void cleanupFile(FileSystem fileSys, Path name)
+  throws IOException {
+    assertTrue(fileSys.exists(name));
+    fileSys.delete(name);
+    assertTrue(!fileSys.exists(name));
+  }
+
+  /**
+   * put back the old namedir
+   */
+  private void resurrectNameDir(File[] namedirs) 
+  throws IOException {
+    String parentdir = namedirs[0].getParent();
+    String name = namedirs[0].getName();
+    File oldname =  new File(parentdir, name + ".old");
+    if (!oldname.renameTo(namedirs[0])) {
+      assertTrue(false);
+    }
+  }
+
+  /**
+   * remove one namedir
+   */
+  private void removeOneNameDir(File[] namedirs) 
+  throws IOException {
+    String parentdir = namedirs[0].getParent();
+    String name = namedirs[0].getName();
+    File newname =  new File(parentdir, name + ".old");
+    if (!namedirs[0].renameTo(newname)) {
+      assertTrue(false);
+    }
+  }
+
+  /*
+   * Verify that the namenode does not start up if one namedir is bad.
+   */
+  private void testNamedirError(Configuration conf, File[] namedirs) 
+  throws IOException {
+    System.out.println("Starting testNamedirError");
+    Path file1 = new Path("checkpoint.dat");
+    MiniDFSCluster cluster = null;
+
+    if (namedirs.length <= 1) {
+      return;
+    }
+    
+    //
+    // Remove one namedir & Restart cluster. This should fail.
+    //
+    removeOneNameDir(namedirs);
+    try {
+      cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                                false, false);
+      assertTrue(false);
+    } catch (IOException e) {
+      // expected: the namenode must not start with a missing name directory
+    }
+    resurrectNameDir(namedirs); // put back namedir
+  }
+
+  /*
+   * Simulate namenode crashing after rolling edit log.
+   */
+  private void testSecondaryNamenodeError1(Configuration conf)
+  throws IOException {
+    System.out.println("Starting testSecondaryNamenodeError 1");
+    Path file1 = new Path("checkpoint.dat");
+    MiniDFSCluster cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                                false, false);
+    FileSystem fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      assertTrue(!fileSys.exists(file1));
+      //
+      // Make the checkpoint fail after rolling the
+      // edit log.
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.initializeErrorSimulationEvent(2);
+      secondary.setErrorSimulation(0);
+
+      try {
+        secondary.doCheckpoint();  // this should fail
+        assertTrue(false);      
+      } catch (IOException e) {
+        // expected: the simulated error aborts the checkpoint
+      }
+      secondary.shutdown();
+
+      //
+      // Create a new file
+      //
+      writeFile(fileSys, file1, 3);
+      checkFile(fileSys, file1, 3);
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file exists.
+    // Then take another checkpoint to verify that the 
+    // namenode restart accounted for the rolled edit logs.
+    //
+    System.out.println("Starting testSecondaryNamenodeError 2");
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                 false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      checkFile(fileSys, file1, 3);
+      cleanupFile(fileSys, file1);
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+  }
+
+  /*
+   * Simulate a namenode crash after uploading new image
+   */
+  private void testSecondaryNamenodeError2(Configuration conf)
+  throws IOException {
+    System.out.println("Starting testSecondaryNamenodeError 21");
+    Path file1 = new Path("checkpoint.dat");
+    MiniDFSCluster cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                                false, false);
+    FileSystem fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      assertTrue(!fileSys.exists(file1));
+      //
+      // Make the checkpoint fail after rolling the
+      // edit log.
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.initializeErrorSimulationEvent(2);
+      secondary.setErrorSimulation(1);
+
+      try {
+        secondary.doCheckpoint();  // this should fail
+        assertTrue(false);      
+      } catch (IOException e) {
+        // expected: the simulated error aborts the checkpoint
+      }
+      secondary.shutdown();
+
+      //
+      // Create a new file
+      //
+      writeFile(fileSys, file1, 3);
+      checkFile(fileSys, file1, 3);
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file exists.
+    // Then take another checkpoint to verify that the 
+    // namenode restart accounted for the rolled edit logs.
+    //
+    System.out.println("Starting testSecondaryNamenodeError 22");
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, 
+                                 false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      checkFile(fileSys, file1, 3);
+      cleanupFile(fileSys, file1);
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+  }
+
+  /**
+   * Tests checkpoint in DFS.
+   */
+  public void testCheckpoint() throws IOException {
+    Path file1 = new Path("checkpoint.dat");
+    Path file2 = new Path("checkpoint2.dat");
+    File[] namedirs = null;
+
+    Configuration conf = new Configuration();
+    MiniDFSCluster cluster = new MiniDFSCluster(65312, conf, numDatanodes, false);
+    FileSystem fileSys = cluster.getFileSystem();
+
+    // Now wait for 15 seconds to give the datanodes a chance to register
+    // themselves and to report a heartbeat
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      //
+      // verify that 'format' really blew away all pre-existing files
+      //
+      assertTrue(!fileSys.exists(file1));
+      assertTrue(!fileSys.exists(file2));
+      namedirs = cluster.getNameDirs();
+
+      //
+      // Create file1
+      //
+      writeFile(fileSys, file1, 3);
+      checkFile(fileSys, file1, 3);
+
+      //
+      // Take a checkpoint
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file1 still exists.
+    //
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+    try {
+      // check that file1 still exists
+      checkFile(fileSys, file1, 3);
+      cleanupFile(fileSys, file1);
+
+      // create new file file2
+      writeFile(fileSys, file2, 3);
+      checkFile(fileSys, file2, 3);
+
+      //
+      // Take a checkpoint
+      //
+      SecondaryNameNode secondary = new SecondaryNameNode(conf);
+      secondary.doCheckpoint();
+      secondary.shutdown();
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    //
+    // Restart cluster and verify that file2 exists and
+    // file1 does not exist.
+    //
+    cluster = new MiniDFSCluster(65312, conf, numDatanodes, false, false);
+    fileSys = cluster.getFileSystem();
+    try {
+      Thread.sleep(15000L);
+    } catch (InterruptedException e) {
+      // nothing
+    }
+
+    assertTrue(!fileSys.exists(file1));
+
+    try {
+      // verify that file2 exists
+      checkFile(fileSys, file2, 3);
+      cleanupFile(fileSys, file2);
+    } finally {
+      fileSys.close();
+      cluster.shutdown();
+    }
+
+    testSecondaryNamenodeError1(conf);
+    testSecondaryNamenodeError2(conf);
+    testNamedirError(conf, namedirs);
+  }
+}
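
As a usage note, a minimal sketch (not part of the patch) of driving this JUnit 3 test case directly with the text runner; the wrapper class name is hypothetical:

    // Sketch only; assumes JUnit 3.x on the classpath, as the test above does.
    import junit.framework.TestSuite;
    import junit.textui.TestRunner;
    import org.apache.hadoop.dfs.TestCheckpoint;

    public class RunTestCheckpointSketch {
      public static void main(String[] args) {
        // Runs testCheckpoint(): a full checkpoint cycle, the two error-simulation
        // scenarios, and the bad name-directory case, all against a MiniDFSCluster.
        TestRunner.run(new TestSuite(TestCheckpoint.class));
      }
    }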