@@ -61,6 +61,7 @@ import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportReplica;
 import org.apache.hadoop.hdfs.protocol.DatanodeID;
 import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
 import org.apache.hadoop.hdfs.protocol.LocatedStripedBlock;
@@ -93,6 +94,7 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage.State;
 import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
 import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
 import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
+import org.apache.hadoop.hdfs.util.FoldedTreeSet;
 import org.apache.hadoop.hdfs.util.LightWeightHashSet;
 import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
@@ -106,6 +108,7 @@ import org.apache.hadoop.util.Daemon;
 import org.apache.hadoop.util.ExitUtil;
 import org.apache.hadoop.util.LightWeightGSet;
 import org.apache.hadoop.util.Time;
+import org.apache.hadoop.util.VersionInfo;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
@@ -195,7 +198,12 @@ public class BlockManager implements BlockStatsMXBean {
 
   /**replicationRecheckInterval is how often namenode checks for new replication work*/
   private final long replicationRecheckInterval;
-
+
+  /** How often to check storageinfo efficiency, and the compaction limits. */
+  private final long storageInfoDefragmentInterval;
+  private final long storageInfoDefragmentTimeout;
+  private final double storageInfoDefragmentRatio;
+
   /**
    * Mapping: Block -> { BlockCollection, datanodes, self ref }
    * Updated only in response to client-sent information.
@@ -204,6 +212,10 @@ public class BlockManager implements BlockStatsMXBean {
 
   /** Replication thread. */
   final Daemon replicationThread = new Daemon(new ReplicationMonitor());
+
+  /** StorageInfoDefragmenter thread. */
+  private final Daemon storageInfoDefragmenterThread =
+      new Daemon(new StorageInfoDefragmenter());
 
   /** Block report thread for handling async reports. */
   private final BlockReportProcessingThread blockReportThread =
@@ -376,7 +388,20 @@ public class BlockManager implements BlockStatsMXBean {
     this.replicationRecheckInterval =
         conf.getInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY,
            DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_DEFAULT) * 1000L;
-
+
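+    // The interval and timeout are in milliseconds; the ratio is the fill
+    // ratio below which a storage's block set is queued for compaction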
+    this.storageInfoDefragmentInterval =
+        conf.getLong(
+            DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_INTERVAL_MS_KEY,
+            DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_INTERVAL_MS_DEFAULT);
+    this.storageInfoDefragmentTimeout =
+        conf.getLong(
+            DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_TIMEOUT_MS_KEY,
+            DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_TIMEOUT_MS_DEFAULT);
+    this.storageInfoDefragmentRatio =
+        conf.getDouble(
+            DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_RATIO_KEY,
+            DFSConfigKeys.DFS_NAMENODE_STORAGEINFO_DEFRAGMENT_RATIO_DEFAULT);
+
     this.encryptDataTransfer =
         conf.getBoolean(DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_KEY,
             DFSConfigKeys.DFS_ENCRYPT_DATA_TRANSFER_DEFAULT);
@@ -508,6 +533,8 @@ public class BlockManager implements BlockStatsMXBean {
     datanodeManager.activate(conf);
     this.replicationThread.setName("ReplicationMonitor");
     this.replicationThread.start();
+    storageInfoDefragmenterThread.setName("StorageInfoMonitor");
+    storageInfoDefragmenterThread.start();
     this.blockReportThread.start();
     mxBeanName = MBeans.register("NameNode", "BlockStats", this);
     bmSafeMode.activate(blockTotal);
@@ -517,8 +544,10 @@ public class BlockManager implements BlockStatsMXBean {
     bmSafeMode.close();
     try {
       replicationThread.interrupt();
+      storageInfoDefragmenterThread.interrupt();
      blockReportThread.interrupt();
       replicationThread.join(3000);
+      storageInfoDefragmenterThread.join(3000);
       blockReportThread.join(3000);
     } catch (InterruptedException ie) {
     }
@@ -1165,9 +1194,15 @@ public class BlockManager implements BlockStatsMXBean {
 
   /** Remove the blocks associated to the given datanode. */
   void removeBlocksAssociatedTo(final DatanodeDescriptor node) {
-    final Iterator<BlockInfo> it = node.getBlockIterator();
-    while(it.hasNext()) {
-      removeStoredBlock(it.next(), node);
+    for (DatanodeStorageInfo storage : node.getStorageInfos()) {
+      final Iterator<BlockInfo> it = storage.getBlockIterator();
+      while (it.hasNext()) {
+        BlockInfo block = it.next();
+        // The replica must be removed via the iterator to avoid a
+        // ConcurrentModificationException in the underlying storage
+        it.remove();
+        removeStoredBlock(block, node);
+      }
     }
     // Remove all pending DN messages referencing this DN.
     pendingDNMessages.removeAllMessagesForDatanode(node);
@@ -1183,6 +1218,9 @@ public class BlockManager implements BlockStatsMXBean {
     DatanodeDescriptor node = storageInfo.getDatanodeDescriptor();
     while(it.hasNext()) {
       BlockInfo block = it.next();
+      // The replica must be removed via the iterator to avoid a
+      // ConcurrentModificationException in the underlying storage
+      it.remove();
       removeStoredBlock(block, node);
       final Block b = getBlockOnStorage(block, storageInfo);
       if (b != null) {
@@ -2033,8 +2071,8 @@ public class BlockManager implements BlockStatsMXBean {
    */
   public boolean processReport(final DatanodeID nodeID,
       final DatanodeStorage storage,
-      final BlockListAsLongs newReport, BlockReportContext context,
-      boolean lastStorageInRpc) throws IOException {
+      final BlockListAsLongs newReport,
+      BlockReportContext context, boolean lastStorageInRpc) throws IOException {
     namesystem.writeLock();
     final long startTime = Time.monotonicNow(); //after acquiring write lock
     final long endTime;
@@ -2079,7 +2117,8 @@ public class BlockManager implements BlockStatsMXBean {
             nodeID.getDatanodeUuid());
         processFirstBlockReport(storageInfo, newReport);
       } else {
-        invalidatedBlocks = processReport(storageInfo, newReport);
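+        // A null context means the DataNode made no sorting guarantee, so
+        // the report is treated as unsorted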
+        invalidatedBlocks = processReport(storageInfo, newReport,
+            context != null ? context.isSorted() : false);
       }
 
       storageInfo.receivedBlockReport();
@@ -2149,6 +2188,9 @@ public class BlockManager implements BlockStatsMXBean {
       // TODO: remove this assumption in case we want to put a block on
       // more than one storage on a datanode (and because it's a difficult
       // assumption to really enforce)
+      // The replica must be removed via the iterator to avoid a
+      // ConcurrentModificationException in the underlying storage
+      iter.remove();
       removeStoredBlock(block, zombie.getDatanodeDescriptor());
       Block b = getBlockOnStorage(block, zombie);
       if (b != null) {
@@ -2238,7 +2280,7 @@ public class BlockManager implements BlockStatsMXBean {
 
   private Collection<Block> processReport(
       final DatanodeStorageInfo storageInfo,
-      final BlockListAsLongs report) throws IOException {
+      final BlockListAsLongs report, final boolean sorted) throws IOException {
     // Normal case:
     // Modify the (block-->datanode) map, according to the difference
     // between the old and new block report.
@@ -2248,9 +2290,29 @@ public class BlockManager implements BlockStatsMXBean {
     Collection<Block> toInvalidate = new LinkedList<>();
     Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<>();
     Collection<StatefulBlockInfo> toUC = new LinkedList<>();
-    reportDiff(storageInfo, report,
-        toAdd, toRemove, toInvalidate, toCorrupt, toUC);
-
+
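+    // Reports from up-to-date DataNodes arrive pre-sorted by block ID; an
+    // unsorted report from an older DataNode is copied into a FoldedTreeSet
+    // here so the sorted diff below works in both cases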
+    Iterable<BlockReportReplica> sortedReport;
+    if (!sorted) {
+      blockLog.warn("BLOCK* processReport: Report from the DataNode ({}) is "
+          + "unsorted. This will cause overhead on the NameNode "
+          + "which needs to sort the Full BR. Please update the "
+          + "DataNode to the same version of Hadoop HDFS as the "
+          + "NameNode ({}).",
+          storageInfo.getDatanodeDescriptor().getDatanodeUuid(),
+          VersionInfo.getVersion());
+      Set<BlockReportReplica> set = new FoldedTreeSet<>();
+      for (BlockReportReplica iblk : report) {
+        set.add(new BlockReportReplica(iblk));
+      }
+      sortedReport = set;
+    } else {
+      sortedReport = report;
+    }
+
+    reportDiffSorted(storageInfo, sortedReport,
+        toAdd, toRemove, toInvalidate, toCorrupt, toUC);
+
+
     DatanodeDescriptor node = storageInfo.getDatanodeDescriptor();
     // Process the blocks on each queue
     for (StatefulBlockInfo b : toUC) {
@@ -2399,126 +2461,111 @@ public class BlockManager implements BlockStatsMXBean {
     }
   }
 
-  private void reportDiff(DatanodeStorageInfo storageInfo,
-      BlockListAsLongs newReport,
+  private void reportDiffSorted(DatanodeStorageInfo storageInfo,
+      Iterable<BlockReportReplica> newReport,
       Collection<BlockInfoToAdd> toAdd,         // add to DatanodeDescriptor
       Collection<BlockInfo> toRemove,           // remove from DatanodeDescriptor
       Collection<Block> toInvalidate,           // should be removed from DN
       Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list
       Collection<StatefulBlockInfo> toUC) {     // add to under-construction list
 
-    // place a delimiter in the list which separates blocks
-    // that have been reported from those that have not
-    Block delimiterBlock = new Block();
-    BlockInfo delimiter = new BlockInfoContiguous(delimiterBlock,
-        (short) 1);
-    AddBlockResult result = storageInfo.addBlock(delimiter, delimiterBlock);
-    assert result == AddBlockResult.ADDED
-        : "Delimiting block cannot be present in the node";
-    int headIndex = 0; //currently the delimiter is in the head of the list
-    int curIndex;
-
-    if (newReport == null) {
-      newReport = BlockListAsLongs.EMPTY;
-    }
-    // scan the report and process newly reported blocks
-    for (BlockReportReplica iblk : newReport) {
-      ReplicaState iState = iblk.getState();
-      BlockInfo storedBlock = processReportedBlock(storageInfo,
-          iblk, iState, toAdd, toInvalidate, toCorrupt, toUC);
-
-      // move block to the head of the list
-      if (storedBlock != null &&
-          (curIndex = storedBlock.findStorageInfo(storageInfo)) >= 0) {
-        headIndex = storageInfo.moveBlockToHead(storedBlock, curIndex, headIndex);
+    // Both the report and the storage's block list must be sorted by block ID
+    Iterator<BlockInfo> storageBlocksIterator = storageInfo.getBlockIterator();
+    DatanodeDescriptor dn = storageInfo.getDatanodeDescriptor();
+    BlockInfo storageBlock = null;
+
+    for (BlockReportReplica replica : newReport) {
+
+      long replicaID = replica.getBlockId();
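+      // A striped replica reports its internal-block ID; translate it to the
+      // block-group ID for the lookup, unless a legacy non-EC block actually
+      // uses a striped-looking ID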
+      if (BlockIdManager.isStripedBlockID(replicaID)
+          && (!hasNonEcBlockUsingStripedID ||
+              !blocksMap.containsBlock(replica))) {
+        replicaID = BlockIdManager.convertToStripedID(replicaID);
+      }
+
+      ReplicaState reportedState = replica.getState();
+
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("Reported block " + replica
+            + " on " + dn + " size " + replica.getNumBytes()
+            + " replicaState = " + reportedState);
+      }
+
+      if (shouldPostponeBlocksFromFuture
+          && isGenStampInFuture(replica)) {
+        queueReportedBlock(storageInfo, replica, reportedState,
+            QUEUE_REASON_FUTURE_GENSTAMP);
+        continue;
+      }
+
+      if (storageBlock == null && storageBlocksIterator.hasNext()) {
+        storageBlock = storageBlocksIterator.next();
       }
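+      // Merge-join the sorted report with the storage's sorted block list,
+      // advancing whichever side has the smaller block ID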
+
+      do {
+        int cmp;
+        if (storageBlock == null ||
+            (cmp = Long.compare(replicaID, storageBlock.getBlockId())) < 0) {
+          // Check if block is available in NN but not yet on this storage
+          BlockInfo nnBlock = blocksMap.getStoredBlock(new Block(replicaID));
+          if (nnBlock != null) {
+            reportDiffSortedInner(storageInfo, replica, reportedState,
+                nnBlock, toAdd, toCorrupt, toUC);
+          } else {
+            // Replica not found anywhere, so it should be invalidated
+            toInvalidate.add(new Block(replica));
+          }
+          break;
+        } else if (cmp == 0) {
+          // Replica matches the current storage block
+          reportDiffSortedInner(storageInfo, replica, reportedState,
+              storageBlock, toAdd, toCorrupt, toUC);
+          storageBlock = null;
+        } else {
+          // The replica has a higher ID than the storage block, so remove
+          // all stored blocks with IDs lower than the replica's
+          do {
+            toRemove.add(storageBlock);
+            storageBlock = storageBlocksIterator.hasNext()
+                ? storageBlocksIterator.next() : null;
+          } while (storageBlock != null &&
+              Long.compare(replicaID, storageBlock.getBlockId()) > 0);
+        }
+      } while (storageBlock != null);
     }
 
-    // collect blocks that have not been reported
-    // all of them are next to the delimiter
-    Iterator<BlockInfo> it =
-        storageInfo.new BlockIterator(delimiter.getNext(0));
-    while (it.hasNext()) {
-      toRemove.add(it.next());
+    // Iterate over any remaining blocks that have not been reported and
+    // remove them
+    while (storageBlocksIterator.hasNext()) {
+      toRemove.add(storageBlocksIterator.next());
     }
-    storageInfo.removeBlock(delimiter);
   }
 
-  /**
-   * Process a block replica reported by the data-node.
-   * No side effects except adding to the passed-in Collections.
-   *
-   * <ol>
-   * <li>If the block is not known to the system (not in blocksMap) then the
-   * data-node should be notified to invalidate this block.</li>
-   * <li>If the reported replica is valid that is has the same generation stamp
-   * and length as recorded on the name-node, then the replica location should
-   * be added to the name-node.</li>
-   * <li>If the reported replica is not valid, then it is marked as corrupt,
-   * which triggers replication of the existing valid replicas.
-   * Corrupt replicas are removed from the system when the block
-   * is fully replicated.</li>
-   * <li>If the reported replica is for a block currently marked "under
-   * construction" in the NN, then it should be added to the
-   * BlockUnderConstructionFeature's list of replicas.</li>
-   * </ol>
-   *
-   * @param storageInfo DatanodeStorageInfo that sent the report.
-   * @param block reported block replica
-   * @param reportedState reported replica state
-   * @param toAdd add to DatanodeDescriptor
-   * @param toInvalidate missing blocks (not in the blocks map)
-   *        should be removed from the data-node
-   * @param toCorrupt replicas with unexpected length or generation stamp;
-   *        add to corrupt replicas
-   * @param toUC replicas of blocks currently under construction
-   * @return the up-to-date stored block, if it should be kept.
-   *         Otherwise, null.
-   */
-  private BlockInfo processReportedBlock(
+  private void reportDiffSortedInner(
       final DatanodeStorageInfo storageInfo,
-      final Block block, final ReplicaState reportedState,
+      final BlockReportReplica replica, final ReplicaState reportedState,
+      final BlockInfo storedBlock,
       final Collection<BlockInfoToAdd> toAdd,
-      final Collection<Block> toInvalidate,
       final Collection<BlockToMarkCorrupt> toCorrupt,
       final Collection<StatefulBlockInfo> toUC) {
-
-    DatanodeDescriptor dn = storageInfo.getDatanodeDescriptor();
-
-    if(LOG.isDebugEnabled()) {
-      LOG.debug("Reported block " + block
-          + " on " + dn + " size " + block.getNumBytes()
-          + " replicaState = " + reportedState);
-    }
-
-    if (shouldPostponeBlocksFromFuture && isGenStampInFuture(block)) {
-      queueReportedBlock(storageInfo, block, reportedState,
-          QUEUE_REASON_FUTURE_GENSTAMP);
-      return null;
-    }
-
-    // find block by blockId
-    BlockInfo storedBlock = getStoredBlock(block);
-    if(storedBlock == null) {
-      // If blocksMap does not contain reported block id,
-      // the replica should be removed from the data-node.
-      toInvalidate.add(new Block(block));
-      return null;
-    }
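+    // Classifies a single reported replica against its stored block: queue
+    // it on the standby, mark it corrupt, track it as under construction,
+    // or record the storage as a valid replica location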
+    assert replica != null;
+    assert storedBlock != null;
+
+    DatanodeDescriptor dn = storageInfo.getDatanodeDescriptor();
     BlockUCState ucState = storedBlock.getBlockUCState();
-
+
     // Block is on the NN
-    if(LOG.isDebugEnabled()) {
+    if (LOG.isDebugEnabled()) {
       LOG.debug("In memory blockUCState = " + ucState);
     }
 
     // Ignore replicas already scheduled to be removed from the DN
-    if(invalidateBlocks.contains(dn, block)) {
-      return storedBlock;
+    if (invalidateBlocks.contains(dn, replica)) {
+      return;
     }
 
-    BlockToMarkCorrupt c = checkReplicaCorrupt(
-        block, reportedState, storedBlock, ucState, dn);
+    BlockToMarkCorrupt c = checkReplicaCorrupt(replica, reportedState,
+        storedBlock, ucState, dn);
     if (c != null) {
       if (shouldPostponeBlocksFromFuture) {
         // If the block is an out-of-date generation stamp or state,
@@ -2532,23 +2579,16 @@ public class BlockManager implements BlockStatsMXBean {
       } else {
         toCorrupt.add(c);
       }
-      return storedBlock;
-    }
-
-    if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) {
-      toUC.add(new StatefulBlockInfo(storedBlock,
-          new Block(block), reportedState));
-      return storedBlock;
-    }
-
-    // Add replica if appropriate. If the replica was previously corrupt
-    // but now okay, it might need to be updated.
-    if (reportedState == ReplicaState.FINALIZED
-        && (storedBlock.findStorageInfo(storageInfo) == -1 ||
-        corruptReplicas.isReplicaCorrupt(storedBlock, dn))) {
-      toAdd.add(new BlockInfoToAdd(storedBlock, block));
+    } else if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) {
+      toUC.add(new StatefulBlockInfo(storedBlock, new Block(replica),
+          reportedState));
+    } else if (reportedState == ReplicaState.FINALIZED &&
+        (storedBlock.findStorageInfo(storageInfo) == -1 ||
+            corruptReplicas.isReplicaCorrupt(storedBlock, dn))) {
+      // Add replica if appropriate. If the replica was previously corrupt
+      // but now okay, it might need to be updated.
+      toAdd.add(new BlockInfoToAdd(storedBlock, replica));
     }
-    return storedBlock;
   }
 
   /**
@@ -2774,7 +2814,7 @@ public class BlockManager implements BlockStatsMXBean {
     }
 
     // just add it
-    AddBlockResult result = storageInfo.addBlock(storedBlock, reported);
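+    // This path runs while processing an initial block report, so the
+    // initial-report variant of addBlock() can be used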
+    AddBlockResult result = storageInfo.addBlockInitial(storedBlock, reported);
 
     // Now check for completion of blocks and safe block count
     int numCurrentReplica = countLiveNodes(storedBlock);
@@ -3497,40 +3537,75 @@ public class BlockManager implements BlockStatsMXBean {
       DatanodeStorageInfo storageInfo, Block block,
       ReplicaState reportedState, DatanodeDescriptor delHintNode)
       throws IOException {
-    // blockReceived reports a finalized block
-    Collection<BlockInfoToAdd> toAdd = new LinkedList<>();
-    Collection<Block> toInvalidate = new LinkedList<Block>();
-    Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
-    Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
+
     final DatanodeDescriptor node = storageInfo.getDatanodeDescriptor();
 
-    processReportedBlock(storageInfo, block, reportedState, toAdd, toInvalidate,
-        toCorrupt, toUC);
-    // the block is only in one of the to-do lists
-    // if it is in none then data-node already has it
-    assert toUC.size() + toAdd.size() + toInvalidate.size() + toCorrupt.size() <= 1
-        : "The block should be only in one of the lists.";
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Reported block " + block
+          + " on " + node + " size " + block.getNumBytes()
+          + " replicaState = " + reportedState);
+    }
 
-    for (StatefulBlockInfo b : toUC) {
-      addStoredBlockUnderConstruction(b, storageInfo);
+    if (shouldPostponeBlocksFromFuture &&
+        isGenStampInFuture(block)) {
+      queueReportedBlock(storageInfo, block, reportedState,
+          QUEUE_REASON_FUTURE_GENSTAMP);
+      return;
     }
-    long numBlocksLogged = 0;
-    for (BlockInfoToAdd b : toAdd) {
-      addStoredBlock(b.stored, b.reported, storageInfo, delHintNode,
-          numBlocksLogged < maxNumBlocksToLog);
-      numBlocksLogged++;
+
+    // find block by blockId
+    BlockInfo storedBlock = getStoredBlock(block);
+    if (storedBlock == null) {
+      // If blocksMap does not contain reported block id,
+      // the replica should be removed from the data-node.
+      blockLog.debug("BLOCK* addBlock: block {} on node {} size {} does not " +
+          "belong to any file", block, node, block.getNumBytes());
+      addToInvalidates(new Block(block), node);
+      return;
     }
-    if (numBlocksLogged > maxNumBlocksToLog) {
-      blockLog.debug("BLOCK* addBlock: logged info for {} of {} reported.",
-          maxNumBlocksToLog, numBlocksLogged);
+
+    BlockUCState ucState = storedBlock.getBlockUCState();
+    // Block is on the NN
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("In memory blockUCState = " + ucState);
     }
-    for (Block b : toInvalidate) {
-      blockLog.debug("BLOCK* addBlock: block {} on node {} size {} does not " +
-          "belong to any file", b, node, b.getNumBytes());
-      addToInvalidates(b, node);
+
+    // Ignore replicas already scheduled to be removed from the DN
+    if (invalidateBlocks.contains(node, block)) {
+      return;
     }
-    for (BlockToMarkCorrupt b : toCorrupt) {
-      markBlockAsCorrupt(b, storageInfo, node);
+
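+    // From here on this mirrors reportDiffSortedInner(), applied to a
+    // single incrementally reported block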
+    BlockToMarkCorrupt c = checkReplicaCorrupt(
+        block, reportedState, storedBlock, ucState, node);
+    if (c != null) {
+      if (shouldPostponeBlocksFromFuture) {
+        // If the block is an out-of-date generation stamp or state,
+        // but we're the standby, we shouldn't treat it as corrupt,
+        // but instead just queue it for later processing.
+        // TODO: Pretty confident this should be s/storedBlock/block below,
+        // since we should be postponing the info of the reported block, not
+        // the stored block. See HDFS-6289 for more context.
+        queueReportedBlock(storageInfo, storedBlock, reportedState,
+            QUEUE_REASON_CORRUPT_STATE);
+      } else {
+        markBlockAsCorrupt(c, storageInfo, node);
+      }
+      return;
+    }
+
+    if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) {
+      addStoredBlockUnderConstruction(
+          new StatefulBlockInfo(storedBlock, new Block(block), reportedState),
+          storageInfo);
+      return;
+    }
+
+    // Add replica if appropriate. If the replica was previously corrupt
+    // but now okay, it might need to be updated.
+    if (reportedState == ReplicaState.FINALIZED
+        && (storedBlock.findStorageInfo(storageInfo) == -1 ||
+            corruptReplicas.isReplicaCorrupt(storedBlock, node))) {
+      addStoredBlock(storedBlock, block, storageInfo, delHintNode, true);
     }
   }
 
@@ -4060,6 +4135,87 @@ public class BlockManager implements BlockStatsMXBean {
     }
   }
 
+  /**
+   * Runnable that monitors the fragmentation of the StorageInfo TreeSet and
+   * compacts it when it falls under a certain threshold.
+   */
+  private class StorageInfoDefragmenter implements Runnable {
+
+    @Override
+    public void run() {
+      while (namesystem.isRunning()) {
+        try {
+          // Check storage efficiency only when active NN is out of safe mode.
+          if (isPopulatingReplQueues()) {
+            scanAndCompactStorages();
+          }
+          Thread.sleep(storageInfoDefragmentInterval);
+        } catch (Throwable t) {
+          if (!namesystem.isRunning()) {
+            LOG.info("Stopping thread.");
+            if (!(t instanceof InterruptedException)) {
+              LOG.info("Received an exception while shutting down.", t);
+            }
+            break;
+          } else if (!checkNSRunning && t instanceof InterruptedException) {
+            LOG.info("Stopping for testing.");
+            break;
+          }
+          LOG.error("Thread received Runtime exception.", t);
+          terminate(1, t);
+        }
+      }
+    }
+
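+    // Scans every storage under the read lock and queues those whose
+    // FoldedTreeSet fill ratio is below the configured threshold, then
+    // compacts each candidate under the write lock with a pause in between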
+    private void scanAndCompactStorages() throws InterruptedException {
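+      // Flattened (datanodeUuid, storageID) pairs of compaction candidates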
+      ArrayList<String> datanodesAndStorages = new ArrayList<>();
+      for (DatanodeDescriptor node
+          : datanodeManager.getDatanodeListForReport(DatanodeReportType.ALL)) {
+        for (DatanodeStorageInfo storage : node.getStorageInfos()) {
+          try {
+            namesystem.readLock();
+            double ratio = storage.treeSetFillRatio();
+            if (ratio < storageInfoDefragmentRatio) {
+              datanodesAndStorages.add(node.getDatanodeUuid());
+              datanodesAndStorages.add(storage.getStorageID());
+            }
+            LOG.info("StorageInfo TreeSet fill ratio {} : {}{}",
+                storage.getStorageID(), ratio,
+                (ratio < storageInfoDefragmentRatio)
+                    ? " (queued for defragmentation)" : "");
+          } finally {
+            namesystem.readUnlock();
+          }
+        }
+      }
+      if (!datanodesAndStorages.isEmpty()) {
+        for (int i = 0; i < datanodesAndStorages.size(); i += 2) {
+          namesystem.writeLock();
+          try {
+            DatanodeStorageInfo storage = datanodeManager.
+                getDatanode(datanodesAndStorages.get(i)).
+                getStorageInfo(datanodesAndStorages.get(i + 1));
+            if (storage != null) {
+              boolean aborted =
+                  !storage.treeSetCompact(storageInfoDefragmentTimeout);
+              if (aborted) {
+                // Compaction timed out; back the index up to retry the same
+                // storage on the next iteration.
+                i -= 2;
+              }
+              LOG.info("StorageInfo TreeSet defragmented {} : {}{}",
+                  storage.getStorageID(), storage.treeSetFillRatio(),
+                  aborted ? " (aborted)" : "");
+            }
+          } finally {
+            namesystem.writeUnlock();
+          }
+          // Wait between each iteration
+          Thread.sleep(1000);
+        }
+      }
+    }
+  }
+
   /**
    * Compute block replication and block invalidation work that can be scheduled