|
@@ -21,20 +21,14 @@ import static org.apache.hadoop.util.Time.now;
|
|
|
|
|
|
import java.io.DataInput;
|
|
|
import java.io.DataInputStream;
|
|
|
-import java.io.DataOutputStream;
|
|
|
import java.io.File;
|
|
|
import java.io.FileInputStream;
|
|
|
import java.io.FileNotFoundException;
|
|
|
-import java.io.FileOutputStream;
|
|
|
import java.io.IOException;
|
|
|
import java.security.DigestInputStream;
|
|
|
-import java.security.DigestOutputStream;
|
|
|
import java.security.MessageDigest;
|
|
|
-import java.util.ArrayList;
|
|
|
import java.util.Arrays;
|
|
|
import java.util.Collection;
|
|
|
-import java.util.HashMap;
|
|
|
-import java.util.List;
|
|
|
import java.util.Map;
|
|
|
import java.util.TreeMap;
|
|
|
|
|
@@ -56,7 +50,6 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
|
|
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
|
|
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
|
|
|
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
|
|
|
-import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature;
|
|
|
import org.apache.hadoop.hdfs.server.namenode.snapshot.FileDiffList;
|
|
|
import org.apache.hadoop.hdfs.server.namenode.snapshot.INodeDirectorySnapshottable;
|
|
|
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
|
|
@@ -67,7 +60,6 @@ import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
|
|
|
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress.Counter;
|
|
|
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Step;
|
|
|
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StepType;
|
|
|
-import org.apache.hadoop.hdfs.util.ReadOnlyList;
|
|
|
import org.apache.hadoop.io.IOUtils;
|
|
|
import org.apache.hadoop.io.MD5Hash;
|
|
|
import org.apache.hadoop.io.Text;
|
|
@@ -77,105 +69,8 @@ import com.google.common.base.Preconditions;
|
|
|
import com.google.common.annotations.VisibleForTesting;
|
|
|
|
|
|
/**
|
|
|
- * Contains inner classes for reading or writing the on-disk format for
|
|
|
- * FSImages.
|
|
|
- *
|
|
|
- * In particular, the format of the FSImage looks like:
|
|
|
- * <pre>
|
|
|
- * FSImage {
|
|
|
- * layoutVersion: int, namespaceID: int, numberItemsInFSDirectoryTree: long,
|
|
|
- * namesystemGenerationStampV1: long, namesystemGenerationStampV2: long,
|
|
|
- * generationStampAtBlockIdSwitch:long, lastAllocatedBlockId:
|
|
|
- * long transactionID: long, snapshotCounter: int, numberOfSnapshots: int,
|
|
|
- * numOfSnapshottableDirs: int,
|
|
|
- * {FSDirectoryTree, FilesUnderConstruction, SecretManagerState} (can be compressed)
|
|
|
- * }
|
|
|
- *
|
|
|
- * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported) {
|
|
|
- * INodeInfo of root, numberOfChildren of root: int
|
|
|
- * [list of INodeInfo of root's children],
|
|
|
- * [list of INodeDirectoryInfo of root's directory children]
|
|
|
- * }
|
|
|
- *
|
|
|
- * FSDirectoryTree (if {@link Feature#FSIMAGE_NAME_OPTIMIZATION} not supported){
|
|
|
- * [list of INodeInfo of INodes in topological order]
|
|
|
- * }
|
|
|
- *
|
|
|
- * INodeInfo {
|
|
|
- * {
|
|
|
- * localName: short + byte[]
|
|
|
- * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is supported
|
|
|
- * or
|
|
|
- * {
|
|
|
- * fullPath: byte[]
|
|
|
- * } when {@link Feature#FSIMAGE_NAME_OPTIMIZATION} is not supported
|
|
|
- * replicationFactor: short, modificationTime: long,
|
|
|
- * accessTime: long, preferredBlockSize: long,
|
|
|
- * numberOfBlocks: int (-1 for INodeDirectory, -2 for INodeSymLink),
|
|
|
- * {
|
|
|
- * nsQuota: long, dsQuota: long,
|
|
|
- * {
|
|
|
- * isINodeSnapshottable: byte,
|
|
|
- * isINodeWithSnapshot: byte (if isINodeSnapshottable is false)
|
|
|
- * } (when {@link Feature#SNAPSHOT} is supported),
|
|
|
- * fsPermission: short, PermissionStatus
|
|
|
- * } for INodeDirectory
|
|
|
- * or
|
|
|
- * {
|
|
|
- * symlinkString, fsPermission: short, PermissionStatus
|
|
|
- * } for INodeSymlink
|
|
|
- * or
|
|
|
- * {
|
|
|
- * [list of BlockInfo]
|
|
|
- * [list of FileDiff]
|
|
|
- * {
|
|
|
- * isINodeFileUnderConstructionSnapshot: byte,
|
|
|
- * {clientName: short + byte[], clientMachine: short + byte[]} (when
|
|
|
- * isINodeFileUnderConstructionSnapshot is true),
|
|
|
- * } (when {@link Feature#SNAPSHOT} is supported and writing snapshotINode),
|
|
|
- * fsPermission: short, PermissionStatus
|
|
|
- * } for INodeFile
|
|
|
- * }
|
|
|
- *
|
|
|
- * INodeDirectoryInfo {
|
|
|
- * fullPath of the directory: short + byte[],
|
|
|
- * numberOfChildren: int, [list of INodeInfo of children INode],
|
|
|
- * {
|
|
|
- * numberOfSnapshots: int,
|
|
|
- * [list of Snapshot] (when NumberOfSnapshots is positive),
|
|
|
- * numberOfDirectoryDiffs: int,
|
|
|
- * [list of DirectoryDiff] (NumberOfDirectoryDiffs is positive),
|
|
|
- * number of children that are directories,
|
|
|
- * [list of INodeDirectoryInfo of the directory children] (includes
|
|
|
- * snapshot copies of deleted sub-directories)
|
|
|
- * } (when {@link Feature#SNAPSHOT} is supported),
|
|
|
- * }
|
|
|
- *
|
|
|
- * Snapshot {
|
|
|
- * snapshotID: int, root of Snapshot: INodeDirectoryInfo (its local name is
|
|
|
- * the name of the snapshot)
|
|
|
- * }
|
|
|
- *
|
|
|
- * DirectoryDiff {
|
|
|
- * full path of the root of the associated Snapshot: short + byte[],
|
|
|
- * childrenSize: int,
|
|
|
- * isSnapshotRoot: byte,
|
|
|
- * snapshotINodeIsNotNull: byte (when isSnapshotRoot is false),
|
|
|
- * snapshotINode: INodeDirectory (when SnapshotINodeIsNotNull is true), Diff
|
|
|
- * }
|
|
|
- *
|
|
|
- * Diff {
|
|
|
- * createdListSize: int, [Local name of INode in created list],
|
|
|
- * deletedListSize: int, [INode in deleted list: INodeInfo]
|
|
|
- * }
|
|
|
- *
|
|
|
- * FileDiff {
|
|
|
- * full path of the root of the associated Snapshot: short + byte[],
|
|
|
- * fileSize: long,
|
|
|
- * snapshotINodeIsNotNull: byte,
|
|
|
- * snapshotINode: INodeFile (when SnapshotINodeIsNotNull is true), Diff
|
|
|
- * }
|
|
|
- * </pre>
|
|
|
+ * This class loads and stores the FSImage of the NameNode. The file
|
|
|
+ * src/main/proto/fsimage.proto describes the on-disk layout of the FSImage.
|
|
|
*/
|
|
|
@InterfaceAudience.Private
|
|
|
@InterfaceStability.Evolving
|
|
@@ -683,11 +578,6 @@ public class FSImageFormat {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- /** @return The FSDirectory of the namesystem where the fsimage is loaded */
|
|
|
- public FSDirectory getFSDirectoryInLoading() {
|
|
|
- return namesystem.dir;
|
|
|
- }
|
|
|
-
|
|
|
public INode loadINodeWithLocalName(boolean isSnapshotINode, DataInput in,
|
|
|
boolean updateINodeMap) throws IOException {
|
|
|
return loadINodeWithLocalName(isSnapshotINode, in, updateINodeMap, null);
|
|
@@ -920,7 +810,7 @@ public class FSImageFormat {
|
|
|
if (path != null && FSDirectory.isReservedName(path) &&
|
|
|
NameNodeLayoutVersion.supports(
|
|
|
LayoutVersion.Feature.ADD_INODE_ID, getLayoutVersion())) {
|
|
|
- // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
|
|
|
+ // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
|
|
|
// snapshot. If we support INode ID in the layout version, we can use
|
|
|
// the inode id to find the oldnode.
|
|
|
oldnode = namesystem.dir.getInode(cons.getId()).asFile();
|
|
@@ -1117,7 +1007,7 @@ public class FSImageFormat {
|
|
|
+ " option to automatically rename these paths during upgrade.";
|
|
|
|
|
|
/**
|
|
|
- * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
|
|
|
+ * Same as {@link #renameReservedPathsOnUpgrade}, but for a single
|
|
|
* byte array path component.
|
|
|
*/
|
|
|
private static byte[] renameReservedComponentOnUpgrade(byte[] component,
|
|
@@ -1138,7 +1028,7 @@ public class FSImageFormat {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Same as {@link #renameReservedPathsOnUpgrade(String)}, but for a single
|
|
|
+ * Same as {@link #renameReservedPathsOnUpgrade}, but for a single
|
|
|
* byte array path component.
|
|
|
*/
|
|
|
private static byte[] renameReservedRootComponentOnUpgrade(byte[] component,
|
|
@@ -1160,266 +1050,4 @@ public class FSImageFormat {
|
|
|
}
|
|
|
return component;
|
|
|
}
|
|
|
-
|
|
|
- /**
|
|
|
- * A one-shot class responsible for writing an image file.
|
|
|
- * The write() function should be called once, after which the getter
|
|
|
- * functions may be used to retrieve information about the file that was written.
|
|
|
- */
|
|
|
- static class Saver {
|
|
|
- private final SaveNamespaceContext context;
|
|
|
- /** Set to true once an image has been written */
|
|
|
- private boolean saved = false;
|
|
|
-
|
|
|
- /** The MD5 checksum of the file that was written */
|
|
|
- private MD5Hash savedDigest;
|
|
|
- private final ReferenceMap referenceMap = new ReferenceMap();
|
|
|
-
|
|
|
- private final Map<Long, INodeFile> snapshotUCMap =
|
|
|
- new HashMap<Long, INodeFile>();
|
|
|
-
|
|
|
- /** @throws IllegalStateException if the instance has not yet saved an image */
|
|
|
- private void checkSaved() {
|
|
|
- if (!saved) {
|
|
|
- throw new IllegalStateException("FSImageSaver has not saved an image");
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /** @throws IllegalStateException if the instance has already saved an image */
|
|
|
- private void checkNotSaved() {
|
|
|
- if (saved) {
|
|
|
- throw new IllegalStateException("FSImageSaver has already saved an image");
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- Saver(SaveNamespaceContext context) {
|
|
|
- this.context = context;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * Return the MD5 checksum of the image file that was saved.
|
|
|
- */
|
|
|
- MD5Hash getSavedDigest() {
|
|
|
- checkSaved();
|
|
|
- return savedDigest;
|
|
|
- }
|
|
|
-
|
|
|
- void save(File newFile, FSImageCompression compression) throws IOException {
|
|
|
- checkNotSaved();
|
|
|
-
|
|
|
- final FSNamesystem sourceNamesystem = context.getSourceNamesystem();
|
|
|
- final INodeDirectory rootDir = sourceNamesystem.dir.rootDir;
|
|
|
- final long numINodes = rootDir.getDirectoryWithQuotaFeature()
|
|
|
- .getSpaceConsumed().get(Quota.NAMESPACE);
|
|
|
- String sdPath = newFile.getParentFile().getParentFile().getAbsolutePath();
|
|
|
- Step step = new Step(StepType.INODES, sdPath);
|
|
|
- StartupProgress prog = NameNode.getStartupProgress();
|
|
|
- prog.beginStep(Phase.SAVING_CHECKPOINT, step);
|
|
|
- prog.setTotal(Phase.SAVING_CHECKPOINT, step, numINodes);
|
|
|
- Counter counter = prog.getCounter(Phase.SAVING_CHECKPOINT, step);
|
|
|
- long startTime = now();
|
|
|
- //
|
|
|
- // Write out data
|
|
|
- //
|
|
|
- MessageDigest digester = MD5Hash.getDigester();
|
|
|
- FileOutputStream fout = new FileOutputStream(newFile);
|
|
|
- DigestOutputStream fos = new DigestOutputStream(fout, digester);
|
|
|
- DataOutputStream out = new DataOutputStream(fos);
|
|
|
- try {
|
|
|
- out.writeInt(HdfsConstants.NAMENODE_LAYOUT_VERSION);
|
|
|
- LayoutFlags.write(out);
|
|
|
- // We use the non-locked version of getNamespaceInfo here since
|
|
|
- // the coordinating thread of saveNamespace already has read-locked
|
|
|
- // the namespace for us. If we attempt to take another readlock
|
|
|
- // from the actual saver thread, there's a potential of a
|
|
|
- // fairness-related deadlock. See the comments on HDFS-2223.
|
|
|
- out.writeInt(sourceNamesystem.unprotectedGetNamespaceInfo()
|
|
|
- .getNamespaceID());
|
|
|
- out.writeLong(numINodes);
|
|
|
- out.writeLong(sourceNamesystem.getGenerationStampV1());
|
|
|
- out.writeLong(sourceNamesystem.getGenerationStampV2());
|
|
|
- out.writeLong(sourceNamesystem.getGenerationStampAtblockIdSwitch());
|
|
|
- out.writeLong(sourceNamesystem.getLastAllocatedBlockId());
|
|
|
- out.writeLong(context.getTxId());
|
|
|
- out.writeLong(sourceNamesystem.getLastInodeId());
|
|
|
-
|
|
|
-
|
|
|
- sourceNamesystem.getSnapshotManager().write(out);
|
|
|
-
|
|
|
- // write compression info and set up compressed stream
|
|
|
- out = compression.writeHeaderAndWrapStream(fos);
|
|
|
- LOG.info("Saving image file " + newFile +
|
|
|
- " using " + compression);
|
|
|
-
|
|
|
- // save the root
|
|
|
- saveINode2Image(rootDir, out, false, referenceMap, counter);
|
|
|
- // save the rest of the nodes
|
|
|
- saveImage(rootDir, out, true, false, counter);
|
|
|
- prog.endStep(Phase.SAVING_CHECKPOINT, step);
|
|
|
- // Now that the step is finished, set counter equal to total to adjust
|
|
|
- // for possible under-counting due to reference inodes.
|
|
|
- prog.setCount(Phase.SAVING_CHECKPOINT, step, numINodes);
|
|
|
- // save files under construction
|
|
|
- // TODO: for HDFS-5428, since we cannot break the compatibility of
|
|
|
- // fsimage, we store part of the under-construction files that are only
|
|
|
- // in snapshots in this "under-construction-file" section. As a
|
|
|
- // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
|
|
|
- // paths, so that when loading fsimage we do not put them into the lease
|
|
|
- // map. In the future, we can remove this hack when we can bump the
|
|
|
- // layout version.
|
|
|
- sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
|
|
|
-
|
|
|
- context.checkCancelled();
|
|
|
- sourceNamesystem.saveSecretManagerStateCompat(out, sdPath);
|
|
|
- context.checkCancelled();
|
|
|
- sourceNamesystem.getCacheManager().saveStateCompat(out, sdPath);
|
|
|
- context.checkCancelled();
|
|
|
- out.flush();
|
|
|
- context.checkCancelled();
|
|
|
- fout.getChannel().force(true);
|
|
|
- } finally {
|
|
|
- out.close();
|
|
|
- }
|
|
|
-
|
|
|
- saved = true;
|
|
|
- // set md5 of the saved image
|
|
|
- savedDigest = new MD5Hash(digester.digest());
|
|
|
-
|
|
|
- LOG.info("Image file " + newFile + " of size " + newFile.length() +
|
|
|
- " bytes saved in " + (now() - startTime)/1000 + " seconds.");
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * Save children INodes.
|
|
|
- * @param children The list of children INodes
|
|
|
- * @param out The DataOutputStream to write
|
|
|
- * @param inSnapshot Whether the parent directory or its ancestor is in
|
|
|
- * the deleted list of some snapshot (caused by rename or
|
|
|
- * deletion)
|
|
|
- * @param counter Counter to increment for namenode startup progress
|
|
|
- * @return Number of children that are directory
|
|
|
- */
|
|
|
- private int saveChildren(ReadOnlyList<INode> children,
|
|
|
- DataOutputStream out, boolean inSnapshot, Counter counter)
|
|
|
- throws IOException {
|
|
|
- // Write normal children INode.
|
|
|
- out.writeInt(children.size());
|
|
|
- int dirNum = 0;
|
|
|
- int i = 0;
|
|
|
- for(INode child : children) {
|
|
|
- // print all children first
|
|
|
- // TODO: for HDFS-5428, we cannot change the format/content of fsimage
|
|
|
- // here, thus even if the parent directory is in snapshot, we still
|
|
|
- // do not handle INodeUC as those stored in deleted list
|
|
|
- saveINode2Image(child, out, false, referenceMap, counter);
|
|
|
- if (child.isDirectory()) {
|
|
|
- dirNum++;
|
|
|
- } else if (inSnapshot && child.isFile()
|
|
|
- && child.asFile().isUnderConstruction()) {
|
|
|
- this.snapshotUCMap.put(child.getId(), child.asFile());
|
|
|
- }
|
|
|
- if (i++ % 50 == 0) {
|
|
|
- context.checkCancelled();
|
|
|
- }
|
|
|
- }
|
|
|
- return dirNum;
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * Save file tree image starting from the given root.
|
|
|
- * This is a recursive procedure, which first saves all children and
|
|
|
- * snapshot diffs of a current directory and then moves inside the
|
|
|
- * sub-directories.
|
|
|
- *
|
|
|
- * @param current The current node
|
|
|
- * @param out The DataoutputStream to write the image
|
|
|
- * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
|
|
|
- * reference node, its subtree may already have been
|
|
|
- * saved before.
|
|
|
- * @param inSnapshot Whether the current directory is in snapshot
|
|
|
- * @param counter Counter to increment for namenode startup progress
|
|
|
- */
|
|
|
- private void saveImage(INodeDirectory current, DataOutputStream out,
|
|
|
- boolean toSaveSubtree, boolean inSnapshot, Counter counter)
|
|
|
- throws IOException {
|
|
|
- // write the inode id of the directory
|
|
|
- out.writeLong(current.getId());
|
|
|
-
|
|
|
- if (!toSaveSubtree) {
|
|
|
- return;
|
|
|
- }
|
|
|
-
|
|
|
- final ReadOnlyList<INode> children = current
|
|
|
- .getChildrenList(Snapshot.CURRENT_STATE_ID);
|
|
|
- int dirNum = 0;
|
|
|
- List<INodeDirectory> snapshotDirs = null;
|
|
|
- DirectoryWithSnapshotFeature sf = current.getDirectoryWithSnapshotFeature();
|
|
|
- if (sf != null) {
|
|
|
- snapshotDirs = new ArrayList<INodeDirectory>();
|
|
|
- sf.getSnapshotDirectory(snapshotDirs);
|
|
|
- dirNum += snapshotDirs.size();
|
|
|
- }
|
|
|
-
|
|
|
- // 2. Write INodeDirectorySnapshottable#snapshotsByNames to record all
|
|
|
- // Snapshots
|
|
|
- if (current instanceof INodeDirectorySnapshottable) {
|
|
|
- INodeDirectorySnapshottable snapshottableNode =
|
|
|
- (INodeDirectorySnapshottable) current;
|
|
|
- SnapshotFSImageFormat.saveSnapshots(snapshottableNode, out);
|
|
|
- } else {
|
|
|
- out.writeInt(-1); // # of snapshots
|
|
|
- }
|
|
|
-
|
|
|
- // 3. Write children INode
|
|
|
- dirNum += saveChildren(children, out, inSnapshot, counter);
|
|
|
-
|
|
|
- // 4. Write DirectoryDiff lists, if there is any.
|
|
|
- SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
|
|
|
-
|
|
|
- // Write sub-tree of sub-directories, including possible snapshots of
|
|
|
- // deleted sub-directories
|
|
|
- out.writeInt(dirNum); // the number of sub-directories
|
|
|
- for(INode child : children) {
|
|
|
- if(!child.isDirectory()) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- // make sure we only save the subtree under a reference node once
|
|
|
- boolean toSave = child.isReference() ?
|
|
|
- referenceMap.toProcessSubtree(child.getId()) : true;
|
|
|
- saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
|
|
|
- }
|
|
|
- if (snapshotDirs != null) {
|
|
|
- for (INodeDirectory subDir : snapshotDirs) {
|
|
|
- // make sure we only save the subtree under a reference node once
|
|
|
- boolean toSave = subDir.getParentReference() != null ?
|
|
|
- referenceMap.toProcessSubtree(subDir.getId()) : true;
|
|
|
- saveImage(subDir, out, toSave, true, counter);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * Saves inode and increments progress counter.
|
|
|
- *
|
|
|
- * @param inode INode to save
|
|
|
- * @param out DataOutputStream to receive inode
|
|
|
- * @param writeUnderConstruction boolean true if this is under construction
|
|
|
- * @param referenceMap ReferenceMap containing reference inodes
|
|
|
- * @param counter Counter to increment for namenode startup progress
|
|
|
- * @throws IOException thrown if there is an I/O error
|
|
|
- */
|
|
|
- private void saveINode2Image(INode inode, DataOutputStream out,
|
|
|
- boolean writeUnderConstruction, ReferenceMap referenceMap,
|
|
|
- Counter counter) throws IOException {
|
|
|
- FSImageSerialization.saveINode2Image(inode, out, writeUnderConstruction,
|
|
|
- referenceMap);
|
|
|
- // Intentionally do not increment counter for reference inodes, because it
|
|
|
- // is too difficult at this point to assess whether or not this is a
|
|
|
- // reference that counts toward quota.
|
|
|
- if (!(inode instanceof INodeReference)) {
|
|
|
- counter.increment();
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
}
|