
HADOOP-2585. Name-node imports namespace data from a recent checkpoint accessible via an NFS mount. Contributed by Konstantin Shvachko.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/core/trunk@647313 13f79535-47bb-0310-9956-ffa450edef68
Konstantin Shvachko, 17 years ago
parent
commit ab1c4d0939

+ 5 - 2
CHANGES.txt

@@ -7,14 +7,17 @@ Trunk (unreleased changes)
 
   NEW FEATURES
 
+    HADOOP-2585. Name-node imports namespace data from a recent checkpoint
+    accessible via an NFS mount. (shv)
+
   IMPROVEMENTS
 
   OPTIMIZATIONS
 
   BUG FIXES
 
-   HADOOP-2905. 'fsck -move' triggers NPE in NameNode. 
-   (Lohit Vjayarenu via rangadi)
+    HADOOP-2905. 'fsck -move' triggers NPE in NameNode. 
+    (Lohit Vjayarenu via rangadi)
 
 Release 0.17.0 - Unreleased
 

+ 1 - 0
build.xml

@@ -187,6 +187,7 @@
     <mkdir dir="${build.webapps}/job/WEB-INF"/>
     <mkdir dir="${build.webapps}/dfs/WEB-INF"/>
     <mkdir dir="${build.webapps}/datanode/WEB-INF"/>
+    <mkdir dir="${build.webapps}/secondary/WEB-INF"/>
     <mkdir dir="${build.examples}"/>
     <mkdir dir="${build.anttasks}"/>
     <mkdir dir="${build.dir}/c++"/>

+ 3 - 1
conf/hadoop-default.xml

@@ -213,7 +213,9 @@ creations/deletions), or "all".</description>
   <name>fs.checkpoint.dir</name>
   <value>${hadoop.tmp.dir}/dfs/namesecondary</value>
   <description>Determines where on the local filesystem the DFS secondary
-      name node should store the temporary images and edits to merge.  
+      name node should store the temporary images and edits to merge.
+      If this is a comma-delimited list of directories then the image is
+      replicated in all of the directories for redundancy.
   </description>
 </property>
 

+ 16 - 0
src/java/org/apache/hadoop/conf/Configuration.java

@@ -40,6 +40,7 @@ import java.util.Set;
 import java.util.StringTokenizer;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.Collection;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -526,6 +527,21 @@ public class Configuration implements Iterable<Map.Entry<String,String>> {
     return new IntegerRanges(get(name, defaultValue));
   }
 
+  /** 
+   * Get the comma delimited values of the <code>name</code> property as 
+   * a collection of <code>String</code>s.  
+   * If no such property is specified then empty collection is returned.
+   * <p>
+   * This is an optimized version of {@link #getStrings(String)}
+   * 
+   * @param name property name.
+   * @return property value as a collection of <code>String</code>s. 
+   */
+  public Collection<String> getStringCollection(String name) {
+    String valueString = get(name);
+    return StringUtils.getStringCollection(valueString);
+  }
+
   /** 
    * Get the comma delimited values of the <code>name</code> property as 
    * an array of <code>String</code>s.  
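
The new accessor backs comma-delimited properties such as fs.checkpoint.dir (see the hadoop-default.xml change above). A minimal usage sketch, with illustrative paths and a hypothetical demo class:

    import java.util.Collection;
    import org.apache.hadoop.conf.Configuration;

    public class StringCollectionDemo {            // hypothetical demo class
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Illustrative paths, not defaults:
        conf.set("fs.checkpoint.dir", "/mnt/nfs1/namesecondary,/mnt/nfs2/namesecondary");
        // Yields ["/mnt/nfs1/namesecondary", "/mnt/nfs2/namesecondary"];
        // an unset property yields an empty collection, not null.
        Collection<String> dirs = conf.getStringCollection("fs.checkpoint.dir");
        System.out.println(dirs);
      }
    }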

+ 116 - 0
src/java/org/apache/hadoop/dfs/CheckpointSignature.java

@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.dfs;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * A unique signature intended to identify checkpoint transactions.
+ */
+class CheckpointSignature extends StorageInfo 
+                      implements WritableComparable<CheckpointSignature> {
+  private static final String FIELD_SEPARATOR = ":";
+  long editsTime = -1L;
+  long checkpointTime = -1L;
+
+  CheckpointSignature() {}
+
+  CheckpointSignature(FSImage fsImage) {
+    super(fsImage);
+    editsTime = fsImage.getEditLog().getFsEditTime();
+    checkpointTime = fsImage.checkpointTime;
+  }
+
+  CheckpointSignature(String str) {
+    String[] fields = str.split(FIELD_SEPARATOR);
+    assert fields.length == 5 : "Must be 5 fields in CheckpointSignature";
+    layoutVersion = Integer.valueOf(fields[0]);
+    namespaceID = Integer.valueOf(fields[1]);
+    cTime = Long.valueOf(fields[2]);
+    editsTime = Long.valueOf(fields[3]);
+    checkpointTime = Long.valueOf(fields[4]);
+  }
+
+  public String toString() {
+    return String.valueOf(layoutVersion) + FIELD_SEPARATOR
+         + String.valueOf(namespaceID) + FIELD_SEPARATOR
+         + String.valueOf(cTime) + FIELD_SEPARATOR
+         + String.valueOf(editsTime) + FIELD_SEPARATOR
+         + String.valueOf(checkpointTime);
+  }
+
+  void validateStorageInfo(StorageInfo si) throws IOException {
+    if(layoutVersion != si.layoutVersion
+        || namespaceID != si.namespaceID || cTime != si.cTime) {
+      // checkpointTime can change when the image is saved - do not compare
+      throw new IOException("Inconsistent checkpoint fields. "
+          + "LV = " + layoutVersion + " namespaceID = " + namespaceID
+          + " cTime = " + cTime + ". Expecting respectively: "
+          + si.layoutVersion + "; " + si.namespaceID + "; " + si.cTime);
+    }
+  }
+
+  //
+  // Comparable interface
+  //
+  public int compareTo(CheckpointSignature o) {
+    return 
+      (layoutVersion < o.layoutVersion) ? -1 : 
+                  (layoutVersion > o.layoutVersion) ? 1 :
+      (namespaceID < o.namespaceID) ? -1 : (namespaceID > o.namespaceID) ? 1 :
+      (cTime < o.cTime) ? -1 : (cTime > o.cTime) ? 1 :
+      (editsTime < o.editsTime) ? -1 : (editsTime > o.editsTime) ? 1 :
+      (checkpointTime < o.checkpointTime) ? -1 : 
+                  (checkpointTime > o.checkpointTime) ? 1 : 0;
+  }
+
+  public boolean equals(Object o) {
+    if (!(o instanceof CheckpointSignature)) {
+      return false;
+    }
+    return compareTo((CheckpointSignature)o) == 0;
+  }
+
+  public int hashCode() {
+    return layoutVersion ^ namespaceID ^
+            (int)(cTime ^ editsTime ^ checkpointTime);
+  }
+
+  /////////////////////////////////////////////////
+  // Writable
+  /////////////////////////////////////////////////
+  public void write(DataOutput out) throws IOException {
+    out.writeInt(getLayoutVersion());
+    out.writeInt(getNamespaceID());
+    out.writeLong(getCTime());
+    out.writeLong(editsTime);
+    out.writeLong(checkpointTime);
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    layoutVersion = in.readInt();
+    namespaceID = in.readInt();
+    cTime = in.readLong();
+    editsTime = in.readLong();
+    checkpointTime = in.readLong();
+  }
+}
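
The signature travels between the primary and secondary name-node as a string token (see the token handling in GetImageServlet and TransferFsImage below), so toString() and the parsing constructor must round-trip exactly. A sketch of that round trip, assuming an initialized FSImage named fsImage; the class is package-private, so this only compiles inside org.apache.hadoop.dfs:

    // Serialize to the colon-separated form
    //   layoutVersion:namespaceID:cTime:editsTime:checkpointTime
    // and parse it back; the two signatures must compare equal.
    CheckpointSignature sig = new CheckpointSignature(fsImage);
    String token = sig.toString();
    CheckpointSignature parsed = new CheckpointSignature(token);
    assert sig.equals(parsed) : "signature must survive the string round trip";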

+ 5 - 4
src/java/org/apache/hadoop/dfs/ClientProtocol.java

@@ -18,6 +18,7 @@
 package org.apache.hadoop.dfs;
 
 import java.io.*;
+
 import org.apache.hadoop.ipc.VersionedProtocol;
 import org.apache.hadoop.dfs.FSConstants.UpgradeAction;
 import org.apache.hadoop.fs.permission.*;
@@ -36,7 +37,7 @@ interface ClientProtocol extends VersionedProtocol {
    * Compared to the previous version the following changes have been introduced:
    * (Only the latest change is reflected.
    * The log of historical changes can be retrieved from the svn).
-   * 27 : removed getContentLength(String), open(String, long, long) and isDir(String)
+   * 28 : rollEditLog() returns CheckpointSignature instead of long.
    */
   public static final long versionID = 27L;
   
@@ -376,10 +377,10 @@ interface ClientProtocol extends VersionedProtocol {
   /**
    * Closes the current edit log and opens a new one. The 
    * call fails if the file system is in SafeMode.
-   * Returns a unique token to identify this transaction.
    * @throws IOException
+   * @return a unique token to identify this transaction.
    */
-  public long rollEditLog() throws IOException;
+  public CheckpointSignature rollEditLog() throws IOException;
 
   /**
    * Rolls the fsImage log. It removes the old fsImage, copies the
@@ -433,7 +434,7 @@ interface ClientProtocol extends VersionedProtocol {
    * Write all metadata for this file into persistent storage.
    * The file must be currently open for writing.
    * @param src The string representation of the path
-   * @param clientName The string representation of the client
+   * @param client The string representation of the client
    */
   public void fsync(String src, String client) throws IOException;
 }

+ 12 - 1
src/java/org/apache/hadoop/dfs/FSConstants.java

@@ -146,7 +146,18 @@ public interface FSConstants {
   public enum SafeModeAction{ SAFEMODE_LEAVE, SAFEMODE_ENTER, SAFEMODE_GET; }
 
   // Startup options
-  public enum StartupOption{ FORMAT, REGULAR, UPGRADE, ROLLBACK, FINALIZE; }
+  public enum StartupOption{
+    FORMAT  ("-format"),
+    REGULAR ("-regular"),
+    UPGRADE ("-upgrade"),
+    ROLLBACK("-rollback"),
+    FINALIZE("-finalize"),
+    IMPORT  ("-importCheckpoint");
+    
+    private String name = null;
+    private StartupOption(String arg) {this.name = arg;}
+    String getName() {return name;}
+  }
 
   // type of the datanode report
   public static enum DatanodeReportType {ALL, LIVE, DEAD }
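
Each option now carries its command-line spelling, so printUsage() and parseArguments() in NameNode.java (further down) derive both from the same enum:

    // The constant maps directly to its flag string:
    assert StartupOption.IMPORT.getName().equals("-importCheckpoint");
    assert StartupOption.FORMAT.getName().equals("-format");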

+ 9 - 1
src/java/org/apache/hadoop/dfs/FSDirectory.java

@@ -52,6 +52,7 @@ class FSDirectory implements FSConstants {
   /** Access an existing dfs name directory. */
   public FSDirectory(FSNamesystem ns, Configuration conf) throws IOException {
     this(new FSImage(), ns, conf);
+    fsImage.setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null));
   }
 
   public FSDirectory(FSImage fsImage, FSNamesystem ns, Configuration conf) throws IOException {
@@ -77,7 +78,14 @@ class FSDirectory implements FSConstants {
       startOpt = StartupOption.REGULAR;
     }
     try {
-      fsImage.recoverTransitionRead(dataDirs, startOpt);
+      if (fsImage.recoverTransitionRead(dataDirs, startOpt)) {
+        fsImage.saveFSImage();
+      }
+      FSEditLog editLog = fsImage.getEditLog();
+      assert editLog != null : "editLog must be initialized";
+      if (!editLog.isOpen())
+        editLog.open();
+      fsImage.setCheckpointDirectories(null);
     } catch(IOException e) {
       fsImage.close();
       throw e;

+ 1 - 1
src/java/org/apache/hadoop/dfs/FSEditLog.java

@@ -1055,7 +1055,7 @@ class FSEditLog {
   /**
    * Returns the timestamp of the edit log
    */
-  synchronized long getFsEditTime() throws IOException {
+  synchronized long getFsEditTime() {
     return getEditFile(0).lastModified();
   }
 

+ 151 - 13
src/java/org/apache/hadoop/dfs/FSImage.java

@@ -28,15 +28,19 @@ import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.text.SimpleDateFormat;
 import java.util.AbstractList;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Date;
 import java.util.Iterator;
 import java.util.Properties;
 import java.util.Random;
 import java.lang.Math;
 
 import org.apache.hadoop.fs.permission.PermissionStatus;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.dfs.FSConstants.CheckpointStates;
 import org.apache.hadoop.dfs.FSConstants.StartupOption;
 import org.apache.hadoop.dfs.FSConstants.NodeType;
 import org.apache.hadoop.io.UTF8;
@@ -49,6 +53,9 @@ import org.apache.hadoop.dfs.BlocksMap.BlockInfo;
  */
 class FSImage extends Storage {
 
+  private static final SimpleDateFormat DATE_FORM =
+    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+
   //
   // The filenames used for storing the images
   //
@@ -64,9 +71,18 @@ class FSImage extends Storage {
     String getName() {return fileName;}
   }
   
-  private long checkpointTime = -1L;
+  protected long checkpointTime = -1L;
   private FSEditLog editLog = null;
   private boolean isUpgradeFinalized = false;
+  /**
+   * Directories for importing an image from a checkpoint.
+   */
+  private Collection<File> checkpointDirs;
+
+  /**
+   * Can fs-image be rolled?
+   */
+  volatile private CheckpointStates ckptState = CheckpointStates.START; 
 
   /**
    */
@@ -102,6 +118,10 @@ class FSImage extends Storage {
       this.addStorageDir(new StorageDirectory(it.next()));
   }
 
+  void setCheckpointDirectories(Collection<File> dirs) {
+    checkpointDirs = dirs;
+  }
+
   /**
    */
   File getImageFile(int imageDirIdx, NameNodeFile type) {
@@ -119,7 +139,28 @@ class FSImage extends Storage {
   File getEditNewFile(int idx) {
     return getImageFile(idx, NameNodeFile.EDITS_NEW);
   }
-  
+
+  File[] getFileNames(NameNodeFile type) {
+    File[] list = new File[getNumStorageDirs()];
+    int i=0;
+    for(StorageDirectory sd : storageDirs) {
+      list[i++] = getImageFile(sd, type);
+    }
+    return list;
+  }
+
+  File[] getImageFiles() {
+    return getFileNames(NameNodeFile.IMAGE);
+  }
+
+  File[] getEditsFiles() {
+    return getFileNames(NameNodeFile.EDITS);
+  }
+
+  File[] getTimeFiles() {
+    return getFileNames(NameNodeFile.TIME);
+  }
+
   /**
    * Analyze storage directories.
    * Recover from previous transitions if required. 
@@ -129,12 +170,19 @@ class FSImage extends Storage {
    * @param dataDirs
    * @param startOpt startup option
    * @throws IOException
+   * @return true if the image needs to be saved or false otherwise
    */
-  void recoverTransitionRead(Collection<File> dataDirs,
+  boolean recoverTransitionRead(Collection<File> dataDirs,
                              StartupOption startOpt
                              ) throws IOException {
     assert startOpt != StartupOption.FORMAT : 
       "NameNode formatting should be performed before reading the image";
+
+    if(startOpt == StartupOption.IMPORT 
+        && (checkpointDirs == null || checkpointDirs.isEmpty()))
+      throw new IOException("Cannot import image from a checkpoint. "
+                          + "\"fs.checkpoint.dir\" is not set." );
+
     // 1. For each data directory calculate its state and 
     // check whether all is consistent before transitioning.
     this.storageDirs = new ArrayList<StorageDirectory>(dataDirs.size());
@@ -169,6 +217,10 @@ class FSImage extends Storage {
           sd.read(); // read and verify consistency with other directories
           isFormatted = true;
         }
+        if (startOpt == StartupOption.IMPORT && isFormatted)
+          // import of a checkpoint is allowed only into empty image directories
+          throw new IOException("Cannot import image from a checkpoint. " 
+              + " NameNode already contains an image in " + sd.root);
       } catch (IOException ioe) {
         sd.unlock();
         throw ioe;
@@ -180,8 +232,9 @@ class FSImage extends Storage {
 
     if (dataDirs.size() == 0)  // none of the data dirs exist
       throw new IOException(
-                            "All specified directories are not accessible or do not exist.");
-    if (!isFormatted && startOpt != StartupOption.ROLLBACK)
+        "All specified directories are not accessible or do not exist.");
+    if (!isFormatted && startOpt != StartupOption.ROLLBACK 
+                     && startOpt != StartupOption.IMPORT)
       throw new IOException("NameNode is not formatted.");
     if (startOpt != StartupOption.UPGRADE
           && layoutVersion < LAST_PRE_UPGRADE_LAYOUT_VERSION
@@ -215,17 +268,17 @@ class FSImage extends Storage {
     switch(startOpt) {
     case UPGRADE:
       doUpgrade();
-      break;
+      return false; // upgrade saved image already
+    case IMPORT:
+      doImportCheckpoint();
+      return true;
     case ROLLBACK:
       doRollback();
-      // and now load that image
+      break;
     case REGULAR:
-      if (loadFSImage())
-        saveFSImage();
+      // just load the image
     }
-    assert editLog != null : "editLog must be initialized";
-    if(!editLog.isOpen())
-      editLog.open();
+    return loadFSImage();
   }
 
   private void doUpgrade() throws IOException {
@@ -360,6 +413,30 @@ class FSImage extends Storage {
     LOG.info("Finalize upgrade for " + sd.root + " is complete.");
   }
 
+  /**
+   * Load image from a checkpoint directory and save it into the current one.
+   * @throws IOException
+   */
+  void doImportCheckpoint() throws IOException {
+    FSImage ckptImage = new FSImage();
+    FSNamesystem fsNamesys = FSNamesystem.getFSNamesystem();
+    // replace real image with the checkpoint image
+    FSImage realImage = fsNamesys.getFSImage();
+    assert realImage == this;
+    fsNamesys.dir.fsImage = ckptImage;
+    // load from the checkpoint dirs
+    try {
+      ckptImage.recoverTransitionRead(checkpointDirs, StartupOption.REGULAR);
+    } finally {
+      ckptImage.close();
+    }
+    // return back the real image
+    realImage.setStorageInfo(ckptImage);
+    fsNamesys.dir.fsImage = realImage;
+    // and save it
+    saveFSImage();
+  }
+
   void finalizeUpgrade() throws IOException {
     for(int idx = 0; idx < getNumStorageDirs(); idx++)
       doFinalize(getStorageDir(idx));
@@ -808,6 +885,7 @@ class FSImage extends Storage {
       if (editsNew.exists()) 
         editLog.createEditLogFile(editsNew);
     }
+    ckptState = CheckpointStates.UPLOAD_DONE;
     rollFSImage();
   }
 
@@ -1009,6 +1087,9 @@ class FSImage extends Storage {
    * Reopens the new edits file.
    */
   void rollFSImage() throws IOException {
+    if (ckptState != CheckpointStates.UPLOAD_DONE) {
+      throw new IOException("Cannot roll fsImage before rolling edits log.");
+    }
     //
     // First, verify that edits.new and fsimage.ckpt exists in all
     // checkpoint directories.
@@ -1059,6 +1140,43 @@ class FSImage extends Storage {
         idx--;
       }
     }
+    ckptState = CheckpointStates.START;
+  }
+
+  CheckpointSignature rollEditLog() throws IOException {
+    getEditLog().rollEditLog();
+    ckptState = CheckpointStates.ROLLED_EDITS;
+    return new CheckpointSignature(this);
+  }
+
+  /**
+   * This is called just before a new checkpoint is uploaded to the
+   * namenode.
+   */
+  void validateCheckpointUpload(CheckpointSignature sig) throws IOException {
+    if (ckptState != CheckpointStates.ROLLED_EDITS) {
+      throw new IOException("Namenode is not expecting a new image " +
+                             ckptState);
+    } 
+    // verify token
+    long modtime = getEditLog().getFsEditTime();
+    if (sig.editsTime != modtime) {
+      throw new IOException("Namenode has an edit log with timestamp of " +
+                            DATE_FORM.format(new Date(modtime)) +
+                            " but new checkpoint was created using editlog " +
+                            " with timestamp " + 
+                            DATE_FORM.format(new Date(sig.editsTime)) + 
+                            ". Checkpoint Aborted.");
+    }
+    sig.validateStorageInfo(this);
+    ckptState = CheckpointStates.UPLOAD_START;
+  }
+
+  /**
+   * This is called when a checkpoint upload finishes successfully.
+   */
+  synchronized void checkpointUploadDone() {
+    ckptState = CheckpointStates.UPLOAD_DONE;
   }
 
   void close() throws IOException {
@@ -1073,6 +1191,14 @@ class FSImage extends Storage {
     return getImageFile(0, NameNodeFile.IMAGE);
   }
 
+  File getFsEditName() throws IOException {
+    return getEditLog().getFsEditName();
+  }
+
+  File getFsTimeName() {
+    return getImageFile(0, NameNodeFile.TIME);
+  }
+
   /**
    * Return the name of the image file that is uploaded by periodic
    * checkpointing.
@@ -1191,7 +1317,7 @@ class FSImage extends Storage {
 
   private void verifyDistributedUpgradeProgress(StartupOption startOpt
                                                 ) throws IOException {
-    if(startOpt == StartupOption.ROLLBACK)
+    if(startOpt == StartupOption.ROLLBACK || startOpt == StartupOption.IMPORT)
       return;
     UpgradeManager um = FSNamesystem.getFSNamesystem().upgradeManager;
     assert um != null : "FSNameSystem.upgradeManager is null.";
@@ -1218,4 +1344,16 @@ class FSImage extends Storage {
         + FSConstants.LAYOUT_VERSION + " is initialized.");
   }
 
+  static Collection<File> getCheckpointDirs(Configuration conf,
+                                            String defaultName) {
+    Collection<String> dirNames = conf.getStringCollection("fs.checkpoint.dir");
+    if (dirNames.size() == 0 && defaultName != null) {
+      dirNames.add(defaultName);
+    }
+    Collection<File> dirs = new ArrayList<File>(dirNames.size());
+    for(String name : dirNames) {
+      dirs.add(new File(name));
+    }
+    return dirs;
+  }
 }
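
For reference, the new getCheckpointDirs helper is how both sides resolve fs.checkpoint.dir: FSDirectory (above) passes null as the default, so an unset property yields an empty collection, while the SecondaryNameNode (below) supplies a default. A sketch, which only compiles inside org.apache.hadoop.dfs since FSImage is package-private:

    Configuration conf = new Configuration();
    // Mirrors the SecondaryNameNode call site; FSDirectory passes null instead.
    Collection<File> checkpointDirs =
        FSImage.getCheckpointDirs(conf, "/tmp/hadoop/dfs/namesecondary");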

+ 14 - 60
src/java/org/apache/hadoop/dfs/FSNamesystem.java

@@ -46,7 +46,6 @@ import java.net.InetSocketAddress;
 import java.util.*;
 import java.util.Map.Entry;
 import java.util.concurrent.LinkedBlockingQueue;
-import java.text.SimpleDateFormat;
 
 import javax.management.NotCompliantMBeanException;
 import javax.management.ObjectName;
@@ -227,12 +226,6 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
   private HostsFileReader hostsReader; 
   private Daemon dnthread = null;
 
-  // can fs-image be rolled?
-  volatile private CheckpointStates ckptState = CheckpointStates.START; 
-
-  private static final SimpleDateFormat DATE_FORM =
-    new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-
   private long maxFsObjects = 0;          // maximum number of fs objects
 
   /**
@@ -325,8 +318,8 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
         conf.get("dfs.datanode.https.address", infoHost + ":" + 50475));
     this.infoServer.setAttribute("datanode.https.port",
         datanodeSslPort.getPort());
-    this.infoServer.setAttribute("name.system", this);
     this.infoServer.setAttribute("name.node", nn);
+    this.infoServer.setAttribute("name.system.image", getFSImage());
     this.infoServer.setAttribute("name.conf", conf);
     this.infoServer.addServlet("fsck", "/fsck", FsckServlet.class);
     this.infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
@@ -341,12 +334,12 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
   }
 
   static Collection<File> getNamespaceDirs(Configuration conf) {
-    String[] dirNames = conf.getStrings("dfs.name.dir");
-    if (dirNames == null)
-      dirNames = new String[] {"/tmp/hadoop/dfs/name"};
-    Collection<File> dirs = new ArrayList<File>(dirNames.length);
-    for(int idx = 0; idx < dirNames.length; idx++) {
-      dirs.add(new File(dirNames[idx]));
+    Collection<String> dirNames = conf.getStringCollection("dfs.name.dir");
+    if (dirNames.isEmpty())
+      dirNames.add("/tmp/hadoop/dfs/name");
+    Collection<File> dirs = new ArrayList<File>(dirNames.size());
+    for(String name : dirNames) {
+      dirs.add(new File(name));
     }
     return dirs;
   }
@@ -2105,7 +2098,7 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
   /**
    * Get registrationID for datanodes based on the namespaceID.
    * 
-   * @see #registerDatanode(DatanodeRegistration,String)
+   * @see #registerDatanode(DatanodeRegistration)
    * @see FSImage#newNamespaceID()
    * @return registration ID
    */
@@ -4055,61 +4048,22 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
     return getEditLog().getEditLogSize();
   }
 
-  synchronized long rollEditLog() throws IOException {
+  synchronized CheckpointSignature rollEditLog() throws IOException {
     if (isInSafeMode()) {
       throw new SafeModeException("Checkpoint not created",
                                   safeMode);
     }
     LOG.info("Roll Edit Log from " + Server.getRemoteAddress());
-    getEditLog().rollEditLog();
-    ckptState = CheckpointStates.ROLLED_EDITS;
-    return getEditLog().getFsEditTime();
+    return getFSImage().rollEditLog();
   }
 
   synchronized void rollFSImage() throws IOException {
-    LOG.info("Roll FSImage from " + Server.getRemoteAddress());
     if (isInSafeMode()) {
       throw new SafeModeException("Checkpoint not created",
                                   safeMode);
     }
-    if (ckptState != CheckpointStates.UPLOAD_DONE) {
-      throw new IOException("Cannot roll fsImage before rolling edits log.");
-    }
-    dir.fsImage.rollFSImage();
-    ckptState = CheckpointStates.START;
-  }
-
-  File getFsEditName() throws IOException {
-    return getEditLog().getFsEditName();
-  }
-
-  /**
-   * This is called just before a new checkpoint is uploaded to the
-   * namenode.
-   */
-  synchronized void validateCheckpointUpload(long token) throws IOException {
-    if (ckptState != CheckpointStates.ROLLED_EDITS) {
-      throw new IOException("Namenode is not expecting an new image " +
-                             ckptState);
-    } 
-    // verify token
-    long modtime = getEditLog().getFsEditTime();
-    if (token != modtime) {
-      throw new IOException("Namenode has an edit log with timestamp of " +
-                            DATE_FORM.format(new Date(modtime)) +
-                            " but new checkpoint was created using editlog " +
-                            " with timestamp " + 
-                            DATE_FORM.format(new Date(token)) + 
-                            ". Checkpoint Aborted.");
-    }
-    ckptState = CheckpointStates.UPLOAD_START;
-  }
-
-  /**
-   * This is called when a checkpoint upload finishes successfully.
-   */
-  synchronized void checkpointUploadDone() {
-    ckptState = CheckpointStates.UPLOAD_DONE;
+    LOG.info("Roll FSImage from " + Server.getRemoteAddress());
+    getFSImage().rollFSImage();
   }
 
   /**
@@ -4273,7 +4227,7 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
 
   /**
    * Number of live data nodes
-   * @returns Number of live data nodes
+   * @return Number of live data nodes
    */
   public int numLiveDataNodes() {
     int numLive = 0;
@@ -4292,7 +4246,7 @@ class FSNamesystem implements FSConstants, FSNamesystemMBean {
 
   /**
    * Number of dead data nodes
-   * @returns Number of dead data nodes
+   * @return Number of dead data nodes
    */
   public int numDeadDataNodes() {
     int numDead = 0;

+ 10 - 12
src/java/org/apache/hadoop/dfs/GetImageServlet.java

@@ -19,14 +19,13 @@ package org.apache.hadoop.dfs;
 
 import java.util.*;
 import java.io.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.commons.logging.*;
 import javax.servlet.ServletContext;
 import javax.servlet.ServletException;
 import javax.servlet.http.HttpServlet;
 import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 
+import org.apache.hadoop.util.StringUtils;
 
 /**
  * This class is used in Namesystem's jetty to retrieve a file.
@@ -34,8 +33,7 @@ import javax.servlet.http.HttpServletResponse;
  * edit file for periodic checkpointing.
  */
 public class GetImageServlet extends HttpServlet {
-
-  private static final Log LOG = LogFactory.getLog("org.apache.hadoop.dfs.FSNamesystem");
+  private static final long serialVersionUID = -7669068179452648952L;
 
   @SuppressWarnings("unchecked")
   public void doGet(HttpServletRequest request,
@@ -44,22 +42,22 @@ public class GetImageServlet extends HttpServlet {
     Map<String,String[]> pmap = request.getParameterMap();
     try {
       ServletContext context = getServletContext();
-      NameNode nn = (NameNode) context.getAttribute("name.node");
+      FSImage nnImage = (FSImage)context.getAttribute("name.system.image");
       TransferFsImage ff = new TransferFsImage(pmap, request, response);
       if (ff.getImage()) {
-        // send fsImage to Secondary
+        // send fsImage
         TransferFsImage.getFileServer(response.getOutputStream(),
-                                      nn.getFsImageName()); 
+                                      nnImage.getFsImageName()); 
       } else if (ff.getEdit()) {
-        // send old edits to Secondary
+        // send edits
         TransferFsImage.getFileServer(response.getOutputStream(),
-                                      nn.getFsEditName());
+                                      nnImage.getFsEditName());
       } else if (ff.putImage()) {
         // issue a HTTP get request to download the new fsimage 
-        nn.validateCheckpointUpload(ff.getToken());
+        nnImage.validateCheckpointUpload(ff.getToken());
         TransferFsImage.getFileClient(ff.getInfoServer(), "getimage=1", 
-                                      nn.getFsImageNameCheckpoint());
-        nn.checkpointUploadDone();
+                                      nnImage.getFsImageNameCheckpoint());
+        nnImage.checkpointUploadDone();
       }
     } catch (Exception ie) {
       String errMsg = "GetImage failed. " + StringUtils.stringifyException(ie);

+ 14 - 28
src/java/org/apache/hadoop/dfs/NameNode.java

@@ -492,7 +492,7 @@ public class NameNode implements ClientProtocol, DatanodeProtocol,
   /**
    * Roll the edit log.
    */
-  public long rollEditLog() throws IOException {
+  public CheckpointSignature rollEditLog() throws IOException {
     return namesystem.rollEditLog();
   }
 
@@ -705,27 +705,6 @@ public class NameNode implements ClientProtocol, DatanodeProtocol,
     return getFSImage().getFsImageNameCheckpoint();
   }
 
-  /**
-   * Validates that this is a valid checkpoint upload request
-   */
-  public void validateCheckpointUpload(long token) throws IOException {
-    namesystem.validateCheckpointUpload(token);
-  }
-
-  /**
-   * Indicates that a new checkpoint has been successfully uploaded.
-   */
-  public void checkpointUploadDone() {
-    namesystem.checkpointUploadDone();
-  }
-
-  /**
-   * Returns the name of the edits file
-   */
-  public File getFsEditName() throws IOException {
-    return namesystem.getFsEditName();
-  }
-
   /**
    * Returns the address on which the NameNodes is listening to.
    * @return the address on which the NameNodes is listening to.
@@ -794,7 +773,12 @@ public class NameNode implements ClientProtocol, DatanodeProtocol,
 
   private static void printUsage() {
     System.err.println(
-      "Usage: java NameNode [-format] | [-upgrade] | [-rollback] | [-finalize]");
+      "Usage: java NameNode [" +
+      StartupOption.FORMAT.getName() + "] | [" +
+      StartupOption.UPGRADE.getName() + "] | [" +
+      StartupOption.ROLLBACK.getName() + "] | [" +
+      StartupOption.FINALIZE.getName() + "] | [" +
+      StartupOption.IMPORT.getName() + "]");
   }
 
   private static StartupOption parseArguments(String args[], 
@@ -803,16 +787,18 @@ public class NameNode implements ClientProtocol, DatanodeProtocol,
     StartupOption startOpt = StartupOption.REGULAR;
     for(int i=0; i < argsLen; i++) {
       String cmd = args[i];
-      if ("-format".equalsIgnoreCase(cmd)) {
+      if (StartupOption.FORMAT.getName().equalsIgnoreCase(cmd)) {
         startOpt = StartupOption.FORMAT;
-      } else if ("-regular".equalsIgnoreCase(cmd)) {
+      } else if (StartupOption.REGULAR.getName().equalsIgnoreCase(cmd)) {
         startOpt = StartupOption.REGULAR;
-      } else if ("-upgrade".equalsIgnoreCase(cmd)) {
+      } else if (StartupOption.UPGRADE.getName().equalsIgnoreCase(cmd)) {
         startOpt = StartupOption.UPGRADE;
-      } else if ("-rollback".equalsIgnoreCase(cmd)) {
+      } else if (StartupOption.ROLLBACK.getName().equalsIgnoreCase(cmd)) {
         startOpt = StartupOption.ROLLBACK;
-      } else if ("-finalize".equalsIgnoreCase(cmd)) {
+      } else if (StartupOption.FINALIZE.getName().equalsIgnoreCase(cmd)) {
         startOpt = StartupOption.FINALIZE;
+      } else if (StartupOption.IMPORT.getName().equalsIgnoreCase(cmd)) {
+        startOpt = StartupOption.IMPORT;
       } else
         return null;
     }

+ 209 - 132
src/java/org/apache/hadoop/dfs/SecondaryNameNode.java

@@ -29,12 +29,8 @@ import org.apache.hadoop.net.NetUtils;
 
 import java.io.*;
 import java.net.*;
-import java.util.Map;
-import javax.servlet.ServletContext;
-import javax.servlet.ServletException;
-import javax.servlet.http.HttpServlet;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
+import java.util.ArrayList;
+import java.util.Collection;
 import org.apache.hadoop.metrics.jvm.JvmMetrics;
 
 /**********************************************************
@@ -52,11 +48,11 @@ import org.apache.hadoop.metrics.jvm.JvmMetrics;
  **********************************************************/
 public class SecondaryNameNode implements FSConstants, Runnable {
     
-  public static final Log LOG = LogFactory.getLog(
-                                                  "org.apache.hadoop.dfs.NameNode.Secondary");
-  private static final String SRC_FS_IMAGE = "srcimage.tmp";
-  private static final String FS_EDITS = "edits.tmp";
-  private static final String DEST_FS_IMAGE = "destimage.tmp";
+  public static final Log LOG = 
+    LogFactory.getLog("org.apache.hadoop.dfs.NameNode.Secondary");
+
+  private String fsName;
+  private CheckpointStorage checkpointImage;
 
   private ClientProtocol namenode;
   private Configuration conf;
@@ -66,12 +62,9 @@ public class SecondaryNameNode implements FSConstants, Runnable {
   private int infoPort;
   private String infoBindAddress;
 
-  private File checkpointDir;
+  private Collection<File> checkpointDirs;
   private long checkpointPeriod;	// in seconds
   private long checkpointSize;    // size (in MB) of current Edit Log
-  private File srcImage;
-  private File destImage;
-  private File editFile;
 
   /**
    * Utility class to facilitate junit test error simulation.
@@ -103,17 +96,30 @@ public class SecondaryNameNode implements FSConstants, Runnable {
     }
   }
 
+  FSImage getFSImage() {
+    return checkpointImage;
+  }
+
   /**
    * Create a connection to the primary namenode.
    */
-  public SecondaryNameNode(Configuration conf)  throws IOException {
+  SecondaryNameNode(Configuration conf)  throws IOException {
+    try {
+      initialize(conf);
+    } catch(IOException e) {
+      shutdown();
+      throw e;
+    }
+  }
 
+  /**
+   * Initialize SecondaryNameNode.
+   */
+  private void initialize(Configuration conf) throws IOException {
     // initiate Java VM metrics
     JvmMetrics.init("SecondaryNameNode", conf.get("session.id"));
     
-    //
     // Create connection to the namenode.
-    //
     shouldRun = true;
     nameNodeAddr =
       NetUtils.createSocketAddr(FileSystem.getDefaultUri(conf).getAuthority());
@@ -122,9 +128,18 @@ public class SecondaryNameNode implements FSConstants, Runnable {
         (ClientProtocol) RPC.waitForProxy(ClientProtocol.class,
             ClientProtocol.versionID, nameNodeAddr, conf);
 
-    //
+    // initialize checkpoint directories
+    fsName = getInfoServer();
+    checkpointDirs = FSImage.getCheckpointDirs(conf,
+                                  "/tmp/hadoop/dfs/namesecondary");
+    checkpointImage = new CheckpointStorage();
+    checkpointImage.recoverCreate(checkpointDirs);
+
+    // Initialize other scheduling parameters from the configuration
+    checkpointPeriod = conf.getLong("fs.checkpoint.period", 3600);
+    checkpointSize = conf.getLong("fs.checkpoint.size", 4194304);
+
     // initialize the webserver for uploading files.
-    //
     String infoAddr = 
       NetUtils.getServerAddress(conf, 
                                 "dfs.secondary.info.bindAddress",
@@ -133,9 +148,9 @@ public class SecondaryNameNode implements FSConstants, Runnable {
     InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(infoAddr);
     infoBindAddress = infoSocAddr.getHostName();
     int tmpInfoPort = infoSocAddr.getPort();
-    infoServer = new StatusHttpServer("dfs", infoBindAddress, tmpInfoPort, 
+    infoServer = new StatusHttpServer("secondary", infoBindAddress, tmpInfoPort, 
                                       tmpInfoPort == 0);
-    infoServer.setAttribute("name.secondary", this);
+    infoServer.setAttribute("name.system.image", checkpointImage);
     this.infoServer.setAttribute("name.conf", conf);
     infoServer.addServlet("getimage", "/getimage", GetImageServlet.class);
     infoServer.start();
@@ -144,17 +159,6 @@ public class SecondaryNameNode implements FSConstants, Runnable {
     infoPort = infoServer.getPort();
     conf.set("dfs.secondary.http.address", infoBindAddress + ":" +infoPort); 
     LOG.info("Secondary Web-server up at: " + infoBindAddress + ":" +infoPort);
-
-    //
-    // Initialize other scheduling parameters from the configuration
-    //
-    String[] dirName = conf.getStrings("fs.checkpoint.dir");
-    checkpointDir = new File(dirName[0]);
-    checkpointPeriod = conf.getLong("fs.checkpoint.period", 3600);
-    checkpointSize = conf.getLong("fs.checkpoint.size", 4194304);
-    doSetup();
-
-    LOG.warn("Checkpoint Directory:" + checkpointDir);
     LOG.warn("Checkpoint Period   :" + checkpointPeriod + " secs " +
              "(" + checkpointPeriod/60 + " min)");
     LOG.warn("Log Size Trigger    :" + checkpointSize + " bytes " +
@@ -168,26 +172,15 @@ public class SecondaryNameNode implements FSConstants, Runnable {
   public void shutdown() {
     shouldRun = false;
     try {
-      infoServer.stop();
-    } catch (Exception e) {
+      if (infoServer != null) infoServer.stop();
+    } catch(InterruptedException ie) {
+      LOG.warn(StringUtils.stringifyException(ie));
+    }
+    try {
+      if (checkpointImage != null) checkpointImage.close();
+    } catch(IOException e) {
+      LOG.warn(StringUtils.stringifyException(e));
     }
-  }
-
-  private void doSetup() throws IOException {
-    //
-    // Create the checkpoint directory if needed. 
-    //
-    checkpointDir.mkdirs();
-    srcImage = new File(checkpointDir, SRC_FS_IMAGE);
-    destImage = new File(checkpointDir, DEST_FS_IMAGE);
-    editFile = new File(checkpointDir, FS_EDITS);
-    srcImage.delete();
-    destImage.delete();
-    editFile.delete();
-  }
-
-  File getNewImage() {
-    return destImage;
   }
 
   //
@@ -224,11 +217,11 @@ public class SecondaryNameNode implements FSConstants, Runnable {
           lastCheckpointTime = now;
         }
       } catch (IOException e) {
-        LOG.error("Exception in doCheckpoint:");
+        LOG.error("Exception in doCheckpoint: ");
         LOG.error(StringUtils.stringifyException(e));
         e.printStackTrace();
       } catch (Throwable e) {
-        LOG.error("Throwable Exception in doCheckpoint:");
+        LOG.error("Throwable Exception in doCheckpoint: ");
         LOG.error(StringUtils.stringifyException(e));
         e.printStackTrace();
         Runtime.getRuntime().exit(-1);
@@ -237,41 +230,48 @@ public class SecondaryNameNode implements FSConstants, Runnable {
   }
 
   /**
-   * get the current fsimage from Namenode.
+   * Download <code>fsimage</code> and <code>edits</code>
+   * files from the name-node.
+   * @throws IOException
    */
-  private void getFSImage() throws IOException {
-    String fsName = getInfoServer();
-    String fileid = "getimage=1";
-    TransferFsImage.getFileClient(fsName, fileid, srcImage);
-    LOG.info("Downloaded file " + srcImage + " size " +
-             srcImage.length() + " bytes.");
-  }
+  private void downloadCheckpointFiles(CheckpointSignature sig
+                                      ) throws IOException {
+    
+    checkpointImage.cTime = sig.cTime;
+    checkpointImage.checkpointTime = sig.checkpointTime;
 
-  /**
-   * get the old edits file from the NameNode
-   */
-  private void getFSEdits() throws IOException {
-    String fsName = getInfoServer();
-    String fileid = "getedit=1";
-    TransferFsImage.getFileClient(fsName, fileid, editFile);
-    LOG.info("Downloaded file " + editFile + " size " +
-             editFile.length() + " bytes.");
+    // get fsimage
+    String fileid = "getimage=1";
+    File[] srcNames = checkpointImage.getImageFiles();
+    assert srcNames.length > 0 : "No checkpoint targets.";
+    TransferFsImage.getFileClient(fsName, fileid, srcNames);
+    LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
+             srcNames[0].length() + " bytes.");
+
+    // get edits file
+    fileid = "getedit=1";
+    srcNames = checkpointImage.getEditsFiles();
+    assert srcNames.length > 0 : "No checkpoint targets.";
+    TransferFsImage.getFileClient(fsName, fileid, srcNames);
+    LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
+        srcNames[0].length() + " bytes.");
+
+    checkpointImage.checkpointUploadDone();
   }
 
   /**
    * Copy the new fsimage into the NameNode
    */
-  private void putFSImage(long token) throws IOException {
-    String fsName = getInfoServer();
+  private void putFSImage(CheckpointSignature sig) throws IOException {
     String fileid = "putimage=1&port=" + infoPort +
       "&machine=" +
       InetAddress.getLocalHost().getHostAddress() +
-      "&token=" + token;
+      "&token=" + sig.toString();
     LOG.info("Posted URL " + fsName + fileid);
     TransferFsImage.getFileClient(fsName, fileid, (File[])null);
   }
 
-  /*
+  /**
    * Returns the Jetty server that the Namenode is listening on.
    */
   private String getInfoServer() throws IOException {
@@ -283,66 +283,62 @@ public class SecondaryNameNode implements FSConstants, Runnable {
                                      "dfs.info.port", "dfs.http.address");
   }
 
-  /*
+  /**
    * Create a new checkpoint
    */
   void doCheckpoint() throws IOException {
 
-    //
     // Do the required initialization of the merge work area.
-    //
-    doSetup();
+    startCheckpoint();
 
-    //
     // Tell the namenode to start logging transactions in a new edit file
     // Returns a token that would be used to upload the merged image.
-    //
-    long token = namenode.rollEditLog();
+    CheckpointSignature sig = (CheckpointSignature)namenode.rollEditLog();
 
-    //
     // error simulation code for junit test
-    //
     if (ErrorSimulator.getErrorSimulation(0)) {
       throw new IOException("Simulating error0 " +
                             "after creating edits.new");
     }
 
-    getFSImage();                // Fetch fsimage
-    getFSEdits();                // Fetch edist
-    doMerge();                   // Do the merge
+    downloadCheckpointFiles(sig);   // Fetch fsimage and edits
+    doMerge(sig);                   // Do the merge
   
     //
     // Upload the new image into the NameNode. Then tell the Namenode
     // to make this new uploaded image as the most current image.
     //
-    putFSImage(token);
+    putFSImage(sig);
 
-    //
     // error simulation code for junit test
-    //
     if (ErrorSimulator.getErrorSimulation(1)) {
       throw new IOException("Simulating error1 " +
                             "after uploading new image to NameNode");
     }
 
     namenode.rollFsImage();
+    checkpointImage.endCheckpoint();
 
-    LOG.warn("Checkpoint done. Image Size:" + srcImage.length() +
-             " Edit Size:" + editFile.length() +
-             " New Image Size:" + destImage.length());
+    LOG.warn("Checkpoint done. New Image Size: " 
+              + checkpointImage.getFsImageName().length());
+  }
+
+  private void startCheckpoint() throws IOException {
+    checkpointImage.unlockAll();
+    checkpointImage.getEditLog().close();
+    checkpointImage.recoverCreate(checkpointDirs);
+    checkpointImage.startCheckpoint();
   }
 
   /**
-   * merges SRC_FS_IMAGE with FS_EDITS and writes the output into
-   * DEST_FS_IMAGE
+   * Merge downloaded image and edits and write the new image into
+   * current storage directory.
    */
-  private void doMerge() throws IOException {
+  private void doMerge(CheckpointSignature sig) throws IOException {
     FSNamesystem namesystem = 
-            new FSNamesystem(new FSImage(checkpointDir), conf);
-    FSImage fsImage = namesystem.dir.fsImage;
-    fsImage.loadFSImage(srcImage);
-    fsImage.getEditLog().loadFSEdits(editFile);
-    fsImage.saveFSImage(destImage);
+            new FSNamesystem(checkpointImage, conf);
+    assert namesystem.dir.fsImage == checkpointImage;
+    checkpointImage.doMerge(sig);
   }
 
   /**
@@ -446,37 +442,6 @@ public class SecondaryNameNode implements FSConstants, Runnable {
     }
   }
 
-  /**
-   * This class is used in Namesystem's jetty to retrieve a file.
-   * Typically used by the Secondary NameNode to retrieve image and
-   * edit file for periodic checkpointing.
-   */
-  public static class GetImageServlet extends HttpServlet {
-    @SuppressWarnings("unchecked")
-    public void doGet(HttpServletRequest request,
-                      HttpServletResponse response
-                      ) throws ServletException, IOException {
-      Map<String,String[]> pmap = request.getParameterMap();
-      try {
-        ServletContext context = getServletContext();
-        SecondaryNameNode nn = (SecondaryNameNode) 
-          context.getAttribute("name.secondary");
-        TransferFsImage ff = new TransferFsImage(pmap, request, response);
-        if (ff.getImage()) {
-          TransferFsImage.getFileServer(response.getOutputStream(),
-                                        nn.getNewImage());
-        }
-        LOG.info("New Image " + nn.getNewImage() + " retrieved by Namenode.");
-      } catch (Exception ie) {
-        String errMsg = "GetImage failed. " + StringUtils.stringifyException(ie);
-        response.sendError(HttpServletResponse.SC_GONE, errMsg);
-        throw new IOException(errMsg);
-      } finally {
-        response.getOutputStream().close();
-      }
-    }
-  }
-
   /**
    * main() has some simple utility methods.
    * @param argv Command line parameters.
@@ -495,4 +460,116 @@ public class SecondaryNameNode implements FSConstants, Runnable {
     Daemon checkpointThread = new Daemon(new SecondaryNameNode(tconf)); 
     checkpointThread.start();
   }
+
+  static class CheckpointStorage extends FSImage {
+    /**
+     */
+    CheckpointStorage() throws IOException {
+      super();
+    }
+
+    @Override
+    boolean isConversionNeeded(StorageDirectory sd) {
+      return false;
+    }
+
+    /**
+     * Analyze checkpoint directories.
+     * Create directories if they do not exist.
+     * Recover from an unsuccessful checkpoint if necessary.
+     * 
+     * @param dataDirs
+     * @throws IOException
+     */
+    void recoverCreate(Collection<File> dataDirs) throws IOException {
+      this.storageDirs = new ArrayList<StorageDirectory>(dataDirs.size());
+      for(File dataDir : dataDirs) {
+        boolean isAccessible = true;
+        try { // create directories if don't exist yet
+          if(!dataDir.mkdirs()) {
+            // do nothing, directory is already created
+          }
+        } catch(SecurityException se) {
+          isAccessible = false;
+        }
+        if(!isAccessible)
+          throw new InconsistentFSStateException(dataDir,
+              "cannot access checkpoint directory.");
+        StorageDirectory sd = new StorageDirectory(dataDir);
+        StorageState curState;
+        try {
+          curState = sd.analyzeStorage(StartupOption.REGULAR);
+          // sd is locked but not opened
+          switch(curState) {
+          case NON_EXISTENT:
+            // fail if any of the configured checkpoint dirs are inaccessible 
+            throw new InconsistentFSStateException(sd.root,
+                  "checkpoint directory does not exist or is not accessible.");
+          case NOT_FORMATTED:
+            break;  // it's ok since initially there is no current and VERSION
+          case CONVERT:
+            throw new InconsistentFSStateException(sd.root,
+                  "not a checkpoint directory.");
+          case NORMAL:
+            break;
+          default:  // recovery is possible
+            sd.doRecover(curState);
+          }
+        } catch (IOException ioe) {
+          sd.unlock();
+          throw ioe;
+        }
+        // add to the storage list
+        addStorageDir(sd);
+        LOG.warn("Checkpoint directory " + sd.root + " is added.");
+      }
+    }
+
+    /**
+     * Prepare directories for a new checkpoint.
+     * <p>
+     * Rename <code>current</code> to <code>lastcheckpoint.tmp</code>
+     * and recreate <code>current</code>.
+     * @throws IOException
+     */
+    void startCheckpoint() throws IOException {
+      for(StorageDirectory sd : storageDirs) {
+        File curDir = sd.getCurrentDir();
+        File tmpCkptDir = sd.getLastCheckpointTmp();
+        assert !tmpCkptDir.exists() : 
+          tmpCkptDir.getName() + " directory must not exist.";
+        if(curDir.exists()) {
+          // rename current to tmp
+          rename(curDir, tmpCkptDir);
+        }
+        if (!curDir.mkdir())
+          throw new IOException("Cannot create directory " + curDir);
+      }
+    }
+
+    void endCheckpoint() throws IOException {
+      for(StorageDirectory sd : storageDirs) {
+        File tmpCkptDir = sd.getLastCheckpointTmp();
+        File prevCkptDir = sd.getPreviousCheckpoint();
+        // delete previous dir
+        if (prevCkptDir.exists())
+          deleteDir(prevCkptDir);
+        // rename tmp to previous
+        if (tmpCkptDir.exists())
+          rename(tmpCkptDir, prevCkptDir);
+      }
+    }
+
+    /**
+     * Merge image and edits, and verify consistency with the signature.
+     */
+    private void doMerge(CheckpointSignature sig) throws IOException {
+      getEditLog().open();
+      StorageDirectory sd = getStorageDir(0);
+      loadFSImage(FSImage.getImageFile(sd, NameNodeFile.IMAGE));
+      loadFSEdits(sd);
+      sig.validateStorageInfo(this);
+      saveFSImage();
+    }
+  }
 }
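
Taken together, startCheckpoint() and endCheckpoint() above give each checkpoint directory a simple lifecycle: current is renamed to lastcheckpoint.tmp and an empty current is recreated before the merge; once the new image is saved and uploaded, previous.checkpoint is deleted and lastcheckpoint.tmp is renamed into its place. If the secondary dies in between, the leftover lastcheckpoint.tmp is resolved on restart by Storage.doRecover() (next file), which either completes the checkpoint or restores current from the tmp directory.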

+ 111 - 16
src/java/org/apache/hadoop/dfs/Storage.java

@@ -23,6 +23,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.channels.FileLock;
+import java.nio.channels.OverlappingFileLockException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Iterator;
@@ -57,14 +58,18 @@ class StorageInfo {
   }
   
   StorageInfo(StorageInfo from) {
-    layoutVersion = from.layoutVersion;
-    namespaceID = from.namespaceID;
-    cTime = from.cTime;
+    setStorageInfo(from);
   }
 
   public int    getLayoutVersion(){ return layoutVersion; }
   public int    getNamespaceID()  { return namespaceID; }
   public long   getCTime()        { return cTime; }
+
+  public void   setStorageInfo(StorageInfo from) {
+    layoutVersion = from.layoutVersion;
+    namespaceID = from.namespaceID;
+    cTime = from.cTime;
+  }
 }
 
 /**
@@ -100,6 +105,8 @@ abstract class Storage extends StorageInfo {
   private   static final String STORAGE_TMP_REMOVED   = "removed.tmp";
   private   static final String STORAGE_TMP_PREVIOUS  = "previous.tmp";
   private   static final String STORAGE_TMP_FINALIZED = "finalized.tmp";
+  private   static final String STORAGE_TMP_LAST_CKPT = "lastcheckpoint.tmp";
+  private   static final String STORAGE_PREVIOUS_CKPT = "previous.checkpoint";
   
   protected enum StorageState {
     NON_EXISTENT,
@@ -110,6 +117,8 @@ abstract class Storage extends StorageInfo {
     COMPLETE_FINALIZE,
     COMPLETE_ROLLBACK,
     RECOVER_ROLLBACK,
+    COMPLETE_CHECKPOINT,
+    RECOVER_CHECKPOINT,
     NORMAL;
   }
   
@@ -237,6 +246,12 @@ abstract class Storage extends StorageInfo {
     File getFinalizedTmp() {
       return new File(root, STORAGE_TMP_FINALIZED);
     }
+    File getLastCheckpointTmp() {
+      return new File(root, STORAGE_TMP_LAST_CKPT);
+    }
+    File getPreviousCheckpoint() {
+      return new File(root, STORAGE_PREVIOUS_CKPT);
+    }
 
     /**
      * Check consistency of the storage directory
@@ -280,7 +295,7 @@ abstract class Storage extends StorageInfo {
       if (startOpt == StartupOption.FORMAT)
         return StorageState.NOT_FORMATTED;
       // check whether a conversion is required
-      if (isConversionNeeded(this))
+      if (startOpt != StartupOption.IMPORT && isConversionNeeded(this))
         return StorageState.CONVERT;
       // check whether current directory is valid
       File versionFile = getVersionFile();
@@ -291,8 +306,10 @@ abstract class Storage extends StorageInfo {
       boolean hasPreviousTmp = getPreviousTmp().exists();
       boolean hasRemovedTmp = getRemovedTmp().exists();
       boolean hasFinalizedTmp = getFinalizedTmp().exists();
+      boolean hasCheckpointTmp = getLastCheckpointTmp().exists();
 
-      if (!(hasPreviousTmp || hasRemovedTmp || hasFinalizedTmp)) {
+      if (!(hasPreviousTmp || hasRemovedTmp
+          || hasFinalizedTmp || hasCheckpointTmp)) {
         // no temp dirs - no recovery
         if (hasCurrent)
           return StorageState.NORMAL;
@@ -302,12 +319,18 @@ abstract class Storage extends StorageInfo {
         return StorageState.NOT_FORMATTED;
       }
 
-      if ((hasPreviousTmp?1:0)+(hasRemovedTmp?1:0)+(hasFinalizedTmp?1:0) > 1)
+      if ((hasPreviousTmp?1:0) + (hasRemovedTmp?1:0)
+          + (hasFinalizedTmp?1:0) + (hasCheckpointTmp?1:0) > 1)
         // more than one temp dirs
         throw new InconsistentFSStateException(root,
                                                "too many temporary directories.");
 
       // # of temp dirs == 1 should either recover or complete a transition
+      if (hasCheckpointTmp) {
+        return hasCurrent ? StorageState.COMPLETE_CHECKPOINT
+                          : StorageState.RECOVER_CHECKPOINT;
+      }
+
       if (hasFinalizedTmp) {
         if (hasPrevious)
           throw new InconsistentFSStateException(root,
@@ -375,34 +398,70 @@ abstract class Storage extends StorageInfo {
                  + rootPath + ".");
         deleteDir(getFinalizedTmp());
         return;
+      case COMPLETE_CHECKPOINT: // mv lastcheckpoint.tmp -> previous.checkpoint
+        LOG.info("Completing previous checkpoint for storage directory " 
+                 + rootPath + ".");
+        File prevCkptDir = getPreviousCheckpoint();
+        if (prevCkptDir.exists())
+          deleteDir(prevCkptDir);
+        rename(getLastCheckpointTmp(), prevCkptDir);
+        return;
+      case RECOVER_CHECKPOINT:  // mv lastcheckpoint.tmp -> current
+        LOG.info("Recovering storage directory " + rootPath
+                 + " from failed checkpoint.");
+        if (curDir.exists())
+          deleteDir(curDir);
+        rename(getLastCheckpointTmp(), curDir);
+        return;
       default:
         throw new IOException("Unexpected FS state: " + curState);
       }
     }
 
     /**
-     * Lock storage.
+     * Lock storage to provide exclusive access.
+     * 
+     * <p> Locking is not supported by all file systems.
+     * E.g., NFS does not consistently support exclusive locks.
+     * 
+     * <p> If locking is supported, we guarantee exclusive access to the
+     * storage directory. Otherwise, no guarantee is given.
      * 
      * @throws IOException if locking fails
      */
     void lock() throws IOException {
+      this.lock = tryLock();
+      if (lock == null) {
+        String msg = "Cannot lock storage " + this.root 
+          + ". The directory is already locked.";
+        LOG.info(msg);
+        throw new IOException(msg);
+      }
+    }
+
+    /**
+     * Attempts to acquire an exclusive lock on the storage.
+     * 
+     * @return A lock object representing the newly-acquired lock or
+     * <code>null</code> if storage is already locked.
+     * @throws IOException if locking fails.
+     */
+    FileLock tryLock() throws IOException {
       File lockF = new File(root, STORAGE_FILE_LOCK);
       lockF.deleteOnExit();
       RandomAccessFile file = new RandomAccessFile(lockF, "rws");
+      FileLock res = null;
       try {
-        this.lock = file.getChannel().tryLock();
+        res = file.getChannel().tryLock();
+      } catch(OverlappingFileLockException oe) {
+        file.close();
+        return null;
       } catch(IOException e) {
         LOG.info(StringUtils.stringifyException(e));
         file.close();
         throw e;
       }
-      if (lock == null) {
-        String msg = "Cannot lock storage " + this.root 
-          + ". The directory is already locked.";
-        LOG.info(msg);
-        file.close();
-        throw new IOException(msg);
-      }
+      return res;
     }
 
     /**
@@ -415,6 +474,7 @@ abstract class Storage extends StorageInfo {
         return;
       this.lock.release();
       lock.channel().close();
+      lock = null;
     }
   }
 
@@ -524,7 +584,7 @@ abstract class Storage extends StorageInfo {
   }
 
   /**
-   * Close all the version files.
+   * Unlock all storage directories.
    * @throws IOException
    */
   public void unlockAll() throws IOException {
@@ -533,6 +593,41 @@ abstract class Storage extends StorageInfo {
     }
   }
 
+  /**
+   * Check whether the underlying file system supports file locking.
+   * 
+   * @param idx index of the storage directory to check
+   * @return <code>true</code> if exclusive locks are supported or
+   *         <code>false</code> otherwise.
+   * @throws IOException
+   * @see StorageDirectory#lock()
+   */
+  boolean isLockSupported(int idx) throws IOException {
+    StorageDirectory sd = storageDirs.get(idx);
+    FileLock firstLock = null;
+    FileLock secondLock = null;
+    try {
+      firstLock = sd.lock;
+      if(firstLock == null) {
+        firstLock = sd.tryLock();
+        if(firstLock == null)
+          return true;
+      }
+      secondLock = sd.tryLock();
+      if(secondLock == null)
+        return true;
+    } finally {
+      if(firstLock != null && firstLock != sd.lock) {
+        firstLock.release();
+        firstLock.channel().close();
+      }
+      if(secondLock != null) {
+        secondLock.release();
+        secondLock.channel().close();
+      }
+    }
+    return false;
+  }
+
   public static String getBuildVersion() {
     return VersionInfo.getRevision();
   }
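Reviewer note: the lock()/tryLock() split makes the locking primitive reusable — tryLock() reports an already-held lock as null instead of throwing, which is exactly what isLockSupported() needs to probe a directory twice. A minimal, self-contained sketch of the same pattern on a plain java.nio channel (the path and file name below are illustrative; in the patch the name comes from STORAGE_FILE_LOCK). Note that within a single JVM an overlapping lock attempt surfaces as OverlappingFileLockException rather than a null return, which is why tryLock() maps that exception to null as well:

    import java.io.File;
    import java.io.RandomAccessFile;
    import java.nio.channels.FileLock;
    import java.nio.channels.OverlappingFileLockException;

    public class LockProbe {
      public static void main(String[] args) throws Exception {
        File lockF = new File("/tmp/storage", "in_use.lock"); // illustrative path
        RandomAccessFile file = new RandomAccessFile(lockF, "rws");
        FileLock lock = null;
        try {
          lock = file.getChannel().tryLock(); // null => held by another process
        } catch (OverlappingFileLockException e) {
          // already locked within this JVM; treat the same as null
        }
        if (lock == null) {
          System.out.println("Storage " + lockF.getParent() + " is already locked");
          file.close();
          return;
        }
        System.out.println("Acquired exclusive lock on " + lockF);
        lock.release();
        file.getChannel().close();
      }
    }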

+ 9 - 19
src/java/org/apache/hadoop/dfs/TransferFsImage.java

@@ -36,7 +36,7 @@ class TransferFsImage implements FSConstants {
   private boolean isPutImage;
   private int remoteport;
   private String machineName;
-  private long token;
+  private CheckpointSignature token;
   
   /**
    * File downloader.
@@ -53,7 +53,7 @@ class TransferFsImage implements FSConstants {
     isGetImage = isGetEdit = isPutImage = false;
     remoteport = 0;
     machineName = null;
-    token = 0;
+    token = null;
 
     for (Iterator<String> it = pmap.keySet().iterator(); it.hasNext();) {
       String key = it.next();
@@ -68,12 +68,13 @@ class TransferFsImage implements FSConstants {
       } else if (key.equals("machine")) { 
         machineName = pmap.get("machine")[0];
       } else if (key.equals("token")) { 
-        token = new Long(pmap.get("token")[0]).longValue();
+        token = new CheckpointSignature(pmap.get("token")[0]);
       }
     }
-    if ((isGetImage && isGetEdit) ||
-        (!isGetImage && !isGetEdit && !isPutImage)) {
-      throw new IOException("No good parameters to TransferFsImage");
+
+    int numGets = (isGetImage?1:0) + (isGetEdit?1:0);
+    if ((numGets > 1) || ((numGets == 0) && !isPutImage)) {
+      throw new IOException("Illegal parameters to TransferFsImage");
     }
   }
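Reviewer note: spelled out, the new check accepts exactly one of getimage/getedit, or putimage alone; only a request with both gets, or with neither a get nor a put, is rejected. The accepted combinations, as an equivalent worked-out fragment:

    int numGets = (isGetImage ? 1 : 0) + (isGetEdit ? 1 : 0);
    // request             isGetImage  isGetEdit  isPutImage   accepted?
    // getimage                1           0          0           yes
    // getedit                 0           1          0           yes
    // putimage                0           0          1           yes
    // getimage + getedit      1           1          *           no  (numGets > 1)
    // none                    0           0          0           no  (nothing to do)
    boolean illegal = (numGets > 1) || ((numGets == 0) && !isPutImage);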
 
@@ -89,7 +90,7 @@ class TransferFsImage implements FSConstants {
     return isPutImage;
   }
 
-  long getToken() {
+  CheckpointSignature getToken() {
     return token;
   }
 
@@ -167,7 +168,7 @@ class TransferFsImage implements FSConstants {
       }
     } finally {
       stream.close();
-      if (localPath != null) {
+      if (output != null) {
         for (int i = 0; i < output.length; i++) {
           if (output[i] != null) {
             output[i].close();
@@ -176,15 +177,4 @@ class TransferFsImage implements FSConstants {
       }
     }
   }
-
-  /**
-   * Client-side Method to fetch file from a server
-   * Copies the response from the URL to the local file.
-   */
-  static void getFileClient(String fsName, String id, File localPath)
-    throws IOException {
-    File[] filelist = new File[1];
-    filelist[0] = localPath;
-    getFileClient(fsName, id, filelist);
-  }
 }

+ 1 - 1
src/java/org/apache/hadoop/io/WritableComparable.java

@@ -51,5 +51,5 @@ package org.apache.hadoop.io;
  *     }
  * </pre></blockquote></p>
  */
-public interface WritableComparable extends Writable, Comparable {
+public interface WritableComparable<T> extends Writable, Comparable<T> {
 }
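Reviewer note: with the interface genericized, implementations compare against their own type and drop the Object cast that the raw interface forced on compareTo. A minimal key class under the new signature (the class name is illustrative; the shape mirrors the example in the interface's own javadoc):

    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import org.apache.hadoop.io.WritableComparable;

    public class MillisKey implements WritableComparable<MillisKey> {
      private long millis;

      public void write(DataOutput out) throws IOException { out.writeLong(millis); }
      public void readFields(DataInput in) throws IOException { millis = in.readLong(); }

      // typed compareTo: no Object parameter, no cast
      public int compareTo(MillisKey o) {
        return millis < o.millis ? -1 : millis == o.millis ? 0 : 1;
      }
    }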

+ 19 - 5
src/java/org/apache/hadoop/util/StringUtils.java

@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.util;
 
-import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.StringWriter;
 import java.net.InetAddress;
@@ -32,6 +31,7 @@ import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 import java.util.StringTokenizer;
+import java.util.Collection;
 
 import org.apache.hadoop.fs.*;
 
@@ -262,19 +262,33 @@ public class StringUtils {
   }
   
   /**
-   * returns an arraylist of strings  
+   * Returns an array of strings.
    * @param str the comma separated string values
    * @return the array of the comma separated string values
    */
   public static String[] getStrings(String str){
-    if (str == null)
+    Collection<String> values = getStringCollection(str);
+    if (values.isEmpty()) {
       return null;
-    StringTokenizer tokenizer = new StringTokenizer (str,",");
+    }
+    return values.toArray(new String[values.size()]);
+  }
+
+  /**
+   * Returns a collection of strings.
+   * @param str comma separated string values
+   * @return a <code>Collection</code> of <code>String</code> values,
+   *         empty if <code>str</code> is <code>null</code>
+   */
+  public static Collection<String> getStringCollection(String str){
     List<String> values = new ArrayList<String>();
+    if (str == null)
+      return values;
+    StringTokenizer tokenizer = new StringTokenizer(str, ",");
     while (tokenizer.hasMoreTokens()) {
       values.add(tokenizer.nextToken());
     }
-    return values.toArray(new String[values.size()]);
+    return values;
   }
 
   final public static char COMMA = ',';
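Reviewer note: the two helpers now share one tokenizer but keep deliberately different empty-input contracts, which matters for existing callers of getStrings(). A few illustrative calls and their results:

    import java.util.Collection;
    import org.apache.hadoop.util.StringUtils;

    // getStringCollection never returns null ...
    Collection<String> dirs = StringUtils.getStringCollection("/d1,/d2"); // ["/d1", "/d2"]
    Collection<String> none = StringUtils.getStringCollection(null);     // [] (empty, not null)
    // ... while getStrings keeps its old null-on-empty contract.
    String[] arr = StringUtils.getStrings("");      // null
    String[] two = StringUtils.getStrings("a,b");   // {"a", "b"}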

+ 161 - 0
src/test/org/apache/hadoop/dfs/TestCheckpoint.java

@@ -20,12 +20,15 @@ package org.apache.hadoop.dfs;
 import junit.framework.TestCase;
 import java.io.*;
 import java.util.Collection;
+import java.util.List;
 import java.util.Random;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.dfs.FSConstants.StartupOption;
 import org.apache.hadoop.dfs.FSImage.NameNodeFile;
 import org.apache.hadoop.dfs.SecondaryNameNode.ErrorSimulator;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 
 /**
@@ -360,6 +363,163 @@ public class TestCheckpoint extends TestCase {
     }
   }
 
+  /**
+   * Test different startup scenarios.
+   * <p><ol>
+   * <li> Start of primary name-node in secondary directory must succeed. 
+   * <li> Start of secondary node when the primary is already running in 
+   *      this directory must fail.
+   * <li> Start of primary name-node if secondary node is already running in 
+   *      this directory must fail.
+   * <li> Start of two secondary nodes in the same directory must fail.
+   * <li> Import of a checkpoint must fail if primary 
+   * directory contains a valid image.
+   * <li> Import of a checkpoint must succeed if the primary 
+   * image directory is empty.
+   * <li> Recover failed checkpoint for secondary node.
+   * <li> Complete failed checkpoint for secondary node.
+   * </ol>
+   */
+  void testStartup(Configuration conf) throws IOException {
+    System.out.println("Startup of the name-node in the checkpoint directory.");
+    String primaryDirs = conf.get("dfs.name.dir");
+    String checkpointDirs = conf.get("fs.checkpoint.dir");
+    conf.set("dfs.http.address", "0.0.0.0:0");  
+    conf.set("dfs.name.dir", checkpointDirs);
+    String[] args = new String[]{};
+    NameNode nn = NameNode.createNameNode(args, conf);
+    assertTrue(nn.isInSafeMode());
+
+    // Starting secondary node in the same directory as the primary
+    System.out.println("Startup of secondary in the same dir as the primary.");
+    SecondaryNameNode secondary = null;
+    try {
+      conf.set("dfs.secondary.http.address", "0.0.0.0:0");
+      secondary = new SecondaryNameNode(conf);
+      assertFalse(secondary.getFSImage().isLockSupported(0));
+      secondary.shutdown();
+    } catch (IOException e) { // expected to fail
+      assertTrue(secondary == null);
+    }
+    nn.stop(); nn = null;
+
+    // Starting primary node in the same directory as the secondary
+    System.out.println("Startup of primary in the same dir as the secondary.");
+    conf.set("dfs.http.address", "0.0.0.0:0");  
+    conf.set("dfs.name.dir", primaryDirs);
+    // secondary won't start without primary
+    nn = NameNode.createNameNode(args, conf);
+    conf.set("dfs.secondary.http.address", "0.0.0.0:0");
+    boolean succeed = false;
+    do {
+      try {
+        secondary = new SecondaryNameNode(conf);
+        succeed = true;
+      } catch(IOException ie) { // keep trying
+        System.out.println("Try again: " + ie.getLocalizedMessage());
+      }
+    } while(!succeed);
+    nn.stop(); nn = null;
+    try {
+      conf.set("dfs.http.address", "0.0.0.0:0");  
+      conf.set("dfs.name.dir", checkpointDirs);
+      nn = NameNode.createNameNode(args, conf);
+      assertFalse(nn.getFSImage().isLockSupported(0));
+      nn.stop(); nn = null;
+    } catch (IOException e) { // expected to fail
+      assertTrue(nn == null);
+    }
+
+    // Try another secondary in the same directory
+    System.out.println("Startup of two secondaries in the same dir.");
+    conf.set("dfs.http.address", "0.0.0.0:0");  
+    conf.set("dfs.name.dir", primaryDirs);
+    // secondary won't start without primary
+    nn = NameNode.createNameNode(args, conf);
+    SecondaryNameNode secondary2 = null;
+    try {
+      conf.set("dfs.secondary.http.address", "0.0.0.0:0");
+      secondary2 = new SecondaryNameNode(conf);
+      assertFalse(secondary2.getFSImage().isLockSupported(0));
+      secondary2.shutdown();
+    } catch (IOException e) { // expected to fail
+      assertTrue(secondary2 == null);
+    }
+    nn.stop(); nn = null;
+    secondary.shutdown();
+
+    // Import a checkpoint with existing primary image.
+    System.out.println("Import a checkpoint with existing primary image.");
+    args = new String[]{StartupOption.IMPORT.getName()};
+    try {
+      conf.set("dfs.http.address", "0.0.0.0:0");  
+      conf.set("dfs.name.dir", primaryDirs);
+      nn = NameNode.createNameNode(args, conf);
+      fail("NameNode should not start over an existing image");
+    } catch (IOException e) { // expected to fail
+      assertTrue(nn == null);
+    }
+
+    // Remove current image and import a checkpoint.
+    System.out.println("Import a checkpoint with existing primary image.");
+    List<File> nameDirs = (List<File>)FSNamesystem.getNamespaceDirs(conf);
+    long fsimageLength = new File(new File(nameDirs.get(0), "current"), 
+                                        NameNodeFile.IMAGE.getName()).length();
+    for(File dir : nameDirs) {
+      if(dir.exists())
+        if(!(FileUtil.fullyDelete(dir)))
+          throw new IOException("Cannot remove directory: " + dir);
+      if (!dir.mkdirs())
+        throw new IOException("Cannot create directory " + dir);
+    }
+    conf.set("dfs.http.address", "0.0.0.0:0");  
+    conf.set("dfs.name.dir", primaryDirs);
+    nn = NameNode.createNameNode(args, conf);
+    // Verify that image file sizes did not change.
+    FSImage image = nn.getFSImage();
+    int nrDirs = image.getNumStorageDirs();
+    for(int idx = 0; idx < nrDirs; idx++) {
+      assertTrue(image.getImageFile(idx, 
+                              NameNodeFile.IMAGE).length() == fsimageLength);
+    }
+    nn.stop();
+
+    // recover failed checkpoint
+    conf.set("dfs.name.dir", primaryDirs);
+    args = new String[]{};
+    nn = NameNode.createNameNode(args, conf);
+    Collection<File> secondaryDirs = FSImage.getCheckpointDirs(conf, null);
+    for(File dir : secondaryDirs) {
+      Storage.rename(new File(dir, "current"), 
+                     new File(dir, "lastcheckpoint.tmp"));
+    }
+    secondary = new SecondaryNameNode(conf);
+    secondary.shutdown();
+    for(File dir : secondaryDirs) {
+      assertTrue(new File(dir, "current").exists()); 
+      assertFalse(new File(dir, "lastcheckpoint.tmp").exists());
+    }
+    
+    // complete failed checkpoint
+    for(File dir : secondaryDirs) {
+      Storage.rename(new File(dir, "previous.checkpoint"), 
+                     new File(dir, "lastcheckpoint.tmp"));
+    }
+    secondary = new SecondaryNameNode(conf);
+    secondary.shutdown();
+    for(File dir : secondaryDirs) {
+      assertTrue(new File(dir, "current").exists()); 
+      assertTrue(new File(dir, "previous.checkpoint").exists()); 
+      assertFalse(new File(dir, "lastcheckpoint.tmp").exists());
+    }
+    nn.stop(); nn = null;
+    
+    // Check that everything starts ok now.
+    MiniDFSCluster cluster = new MiniDFSCluster(conf, numDatanodes, false, null);
+    cluster.waitActive();
+    cluster.shutdown();
+  }
+
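Reviewer note: the test drives the new import path the same way an operator would — point dfs.name.dir at empty directories and start the name-node with the IMPORT startup option. A condensed sketch of that sequence, using only calls that appear in the test above (the paths are placeholders):

    Configuration conf = new Configuration();
    conf.set("dfs.name.dir", "/path/to/name");            // placeholder
    conf.set("fs.checkpoint.dir", "/path/to/checkpoint"); // placeholder
    conf.set("dfs.http.address", "0.0.0.0:0");
    // Fails with an IOException if dfs.name.dir already holds a valid image,
    // as verified above; succeeds when the name directories are empty.
    NameNode nn = NameNode.createNameNode(
        new String[]{ StartupOption.IMPORT.getName() }, conf);
    nn.stop();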
   /**
    * Tests checkpoint in DFS.
    */
@@ -452,5 +612,6 @@ public class TestCheckpoint extends TestCase {
     testSecondaryNamenodeError3(conf);
     testNamedirError(conf, namedirs);
     testSecondaryFailsToReturnImage(conf);
+    testStartup(conf);
   }
 }