@@ -43,6 +43,7 @@ import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.StorageType;
 import org.apache.hadoop.hdfs.protocol.BlockStoragePolicy;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.HAUtil;
@@ -534,9 +535,9 @@ public class BlockManager {
 
     NumberReplicas numReplicas = new NumberReplicas();
     // source node returned is not used
-    chooseSourceDatanode(block, containingNodes,
+    chooseSourceDatanodes(getStoredBlock(block), containingNodes,
         containingLiveReplicasNodes, numReplicas,
-        UnderReplicatedBlocks.LEVEL);
+        new LinkedList<Short>(), 1, UnderReplicatedBlocks.LEVEL);
 
     // containingLiveReplicasNodes can include READ_ONLY_SHARED replicas which are
     // not included in the numReplicas.liveReplicas() count
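
The call above adapts to the widened signature: the raw Block is wrapped via
getStoredBlock() because the new method needs a BlockInfo to test isStriped(),
and single-replica callers simply request one source node and discard the
index list. Abridged (parameter lists shortened for illustration):

    // old: returns the one chosen source node
    DatanodeDescriptor chooseSourceDatanode(Block block, ..., int priority);
    // new: returns up to numSourceNodes sources and also reports the
    // missing storage indices of a striped block group
    DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block, ...,
        List<Short> missingBlockIndices, int numSourceNodes, int priority);
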
@@ -1337,15 +1338,15 @@ public class BlockManager {
   }
 
   /**
-   * Scan blocks in {@link #neededReplications} and assign replication
-   * work to data-nodes they belong to.
+   * Scan blocks in {@link #neededReplications} and assign recovery
+   * (replication or erasure coding) work to data-nodes they belong to.
    *
    * The number of process blocks equals either twice the number of live
    * data-nodes or the number of under-replicated blocks whichever is less.
    *
    * @return number of blocks scheduled for replication during this iteration.
    */
-  int computeReplicationWork(int blocksToProcess) {
+  int computeBlockRecoveryWork(int blocksToProcess) {
     List<List<BlockInfo>> blocksToReplicate = null;
     namesystem.writeLock();
     try {
@@ -1355,30 +1356,32 @@ public class BlockManager {
     } finally {
       namesystem.writeUnlock();
     }
-    return computeReplicationWorkForBlocks(blocksToReplicate);
+    return computeRecoveryWorkForBlocks(blocksToReplicate);
   }
 
-  /** Replicate a set of blocks
+  /**
+   * Recover a set of blocks to full strength through replication or
+   * erasure coding.
    *
-   * @param blocksToReplicate blocks to be replicated, for each priority
+   * @param blocksToRecover blocks to be recovered, for each priority
    * @return the number of blocks scheduled for replication
    */
   @VisibleForTesting
-  int computeReplicationWorkForBlocks(List<List<BlockInfo>> blocksToReplicate) {
+  int computeRecoveryWorkForBlocks(List<List<BlockInfo>> blocksToRecover) {
     int requiredReplication, numEffectiveReplicas;
     List<DatanodeDescriptor> containingNodes;
-    DatanodeDescriptor srcNode;
     BlockCollection bc = null;
     int additionalReplRequired;
 
     int scheduledWork = 0;
-    List<ReplicationWork> work = new LinkedList<ReplicationWork>();
+    List<BlockRecoveryWork> recovWork = new LinkedList<>();
 
+    // Step 1: categorize at-risk blocks into replication and EC tasks
     namesystem.writeLock();
     try {
       synchronized (neededReplications) {
-        for (int priority = 0; priority < blocksToReplicate.size(); priority++) {
-          for (BlockInfo block : blocksToReplicate.get(priority)) {
+        for (int priority = 0; priority < blocksToRecover.size(); priority++) {
+          for (BlockInfo block : blocksToRecover.get(priority)) {
             // block should belong to a file
             bc = blocksMap.getBlockCollection(block);
             // abandoned block or block reopened for append
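
The three Step comments introduced in this method mark a locking pattern the
change preserves: placement (Step 2) runs outside the namesystem write lock
because choosing targets is comparatively slow. A comment-only outline of the
resulting flow (a sketch, not the verbatim method):

    // Step 1 (holding the write lock): scan neededReplications by priority,
    //   choose source nodes, and build one BlockRecoveryWork per block --
    //   a ReplicationWork or, for striped groups, an ErasureCodingWork.
    // Step 2 (lock released): call rw.chooseTargets(...) on every task;
    //   BlockPlacementPolicy#chooseTarget can be slow, so it must not run
    //   under the namesystem lock.
    // Step 3 (write lock re-acquired): re-validate each task against the
    //   current state and queue it on the chosen DataNode descriptor.
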
@@ -1392,25 +1395,31 @@ public class BlockManager {
             requiredReplication = bc.getPreferredBlockReplication();
 
             // get a source data-node
-            containingNodes = new ArrayList<DatanodeDescriptor>();
-            List<DatanodeStorageInfo> liveReplicaNodes = new ArrayList<DatanodeStorageInfo>();
+            containingNodes = new ArrayList<>();
+            List<DatanodeStorageInfo> liveReplicaNodes = new ArrayList<>();
             NumberReplicas numReplicas = new NumberReplicas();
-            srcNode = chooseSourceDatanode(
+            List<Short> missingBlockIndices = new LinkedList<>();
+            DatanodeDescriptor[] srcNodes;
+            int numSourceNodes = bc.isStriped() ?
+                HdfsConstants.NUM_DATA_BLOCKS : 1;
+            srcNodes = chooseSourceDatanodes(
                 block, containingNodes, liveReplicaNodes, numReplicas,
-                priority);
-            if(srcNode == null) { // block can not be replicated from any node
-              LOG.debug("Block " + block + " cannot be repl from any node");
+                missingBlockIndices, numSourceNodes, priority);
+            if(srcNodes == null || srcNodes.length == 0) {
+              // block can not be replicated from any node
+              LOG.debug("Block " + block + " cannot be recovered " +
+                  "from any node");
               continue;
             }
 
-            // liveReplicaNodes can include READ_ONLY_SHARED replicas which are
+            // liveReplicaNodes can include READ_ONLY_SHARED replicas which are
             // not included in the numReplicas.liveReplicas() count
             assert liveReplicaNodes.size() >= numReplicas.liveReplicas();
 
             // do not schedule more if enough replicas is already pending
             numEffectiveReplicas = numReplicas.liveReplicas() +
                 pendingReplications.getNumReplicas(block);
-
+
             if (numEffectiveReplicas >= requiredReplication) {
               if ( (pendingReplications.getNumReplicas(block) > 0) ||
                   (blockHasEnoughRacks(block)) ) {
@@ -1427,9 +1436,21 @@ public class BlockManager {
             } else {
               additionalReplRequired = 1; // Needed on a new rack
             }
-            work.add(new ReplicationWork(block, bc, srcNode,
-                containingNodes, liveReplicaNodes, additionalReplRequired,
-                priority));
+            if (bc.isStriped()) {
+              ErasureCodingWork ecw = new ErasureCodingWork(block, bc, srcNodes,
+                  containingNodes, liveReplicaNodes, additionalReplRequired,
+                  priority);
+              short[] missingBlockArray = new short[missingBlockIndices.size()];
+              for (int i = 0; i < missingBlockIndices.size(); i++) {
+                missingBlockArray[i] = missingBlockIndices.get(i);
+              }
+              ecw.setMissingBlockIndices(missingBlockArray);
+              recovWork.add(ecw);
+            } else {
+              recovWork.add(new ReplicationWork(block, bc, srcNodes,
+                  containingNodes, liveReplicaNodes, additionalReplRequired,
+                  priority));
+            }
           }
         }
       }
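
The manual copy loop above is needed because a List<Short> cannot be unboxed
with List#toArray. If a utility is acceptable here, Guava (already on the
HDFS classpath) offers a one-line equivalent; a possible simplification, not
what the patch does:

    import com.google.common.primitives.Shorts;

    short[] missingBlockArray = Shorts.toArray(missingBlockIndices);
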
@@ -1437,8 +1458,9 @@ public class BlockManager {
       namesystem.writeUnlock();
     }
 
+    // Step 2: choose target nodes for each recovery task
     final Set<Node> excludedNodes = new HashSet<Node>();
-    for(ReplicationWork rw : work){
+    for(BlockRecoveryWork rw : recovWork){
       // Exclude all of the containing nodes from being targets.
       // This list includes decommissioning or corrupt nodes.
       excludedNodes.clear();
@@ -1452,9 +1474,10 @@ public class BlockManager {
       rw.chooseTargets(blockplacement, storagePolicySuite, excludedNodes);
     }
 
+    // Step 3: add tasks to the DN
     namesystem.writeLock();
     try {
-      for(ReplicationWork rw : work){
+      for(BlockRecoveryWork rw : recovWork){
         final DatanodeStorageInfo[] targets = rw.targets;
         if(targets == null || targets.length == 0){
           rw.targets = null;
@@ -1493,7 +1516,7 @@ public class BlockManager {
 
           if ( (numReplicas.liveReplicas() >= requiredReplication) &&
               (!blockHasEnoughRacks(block)) ) {
-            if (rw.srcNode.getNetworkLocation().equals(
+            if (rw.srcNodes[0].getNetworkLocation().equals(
                 targets[0].getDatanodeDescriptor().getNetworkLocation())) {
               //No use continuing, unless a new rack in this case
               continue;
@@ -1501,7 +1524,17 @@ public class BlockManager {
           }
 
           // Add block to the to be replicated list
-          rw.srcNode.addBlockToBeReplicated(block, targets);
+          // Test rw's own collection: the local bc still holds the last
+          // value assigned in the Step 1 loop and may not match this task.
+          if (rw.bc.isStriped()) {
+            assert rw instanceof ErasureCodingWork;
+            assert rw.targets.length > 0;
+            rw.targets[0].getDatanodeDescriptor().addBlockToBeErasureCoded(
+                new ExtendedBlock(namesystem.getBlockPoolId(), block),
+                rw.srcNodes, rw.targets,
+                ((ErasureCodingWork) rw).getMissingBlockIndices());
+          } else {
+            rw.srcNodes[0].addBlockToBeReplicated(block, targets);
+          }
           scheduledWork++;
           DatanodeStorageInfo.incrementBlocksScheduled(targets);
 
@@ -1525,7 +1558,7 @@ public class BlockManager {
 
     if (blockLog.isInfoEnabled()) {
       // log which blocks have been scheduled for replication
-      for(ReplicationWork rw : work){
+      for(BlockRecoveryWork rw : recovWork){
         DatanodeStorageInfo[] targets = rw.targets;
         if (targets != null && targets.length != 0) {
           StringBuilder targetList = new StringBuilder("datanode(s)");
@@ -1533,7 +1566,7 @@ public class BlockManager {
             targetList.append(' ');
             targetList.append(targets[k].getDatanodeDescriptor());
           }
-          blockLog.info("BLOCK* ask {} to replicate {} to {}", rw.srcNode,
+          blockLog.info("BLOCK* ask {} to replicate {} to {}", rw.srcNodes,
               rw.block, targetList);
         }
       }
     }
@@ -1619,55 +1652,66 @@ public class BlockManager {
   }
 
   /**
-   * Parse the data-nodes the block belongs to and choose one,
-   * which will be the replication source.
+   * Parse the data-nodes the block belongs to and choose a certain number
+   * from them to be the recovery sources.
    *
    * We prefer nodes that are in DECOMMISSION_INPROGRESS state to other nodes
    * since the former do not have write traffic and hence are less busy.
    * We do not use already decommissioned nodes as a source.
-   * Otherwise we choose a random node among those that did not reach their
-   * replication limits. However, if the replication is of the highest priority
-   * and all nodes have reached their replication limits, we will choose a
-   * random node despite the replication limit.
+   * Otherwise we randomly choose nodes among those that did not reach their
+   * replication limits. However, if the recovery work is of the highest
+   * priority and all nodes have reached their replication limits, we will
+   * randomly choose the desired number of nodes despite the replication limit.
    *
    * In addition form a list of all nodes containing the block
    * and calculate its replication numbers.
    *
    * @param block Block for which a replication source is needed
-   * @param containingNodes List to be populated with nodes found to contain the
-   *                        given block
-   * @param nodesContainingLiveReplicas List to be populated with nodes found to
-   *                                    contain live replicas of the given block
-   * @param numReplicas NumberReplicas instance to be initialized with the
-   *                                   counts of live, corrupt, excess, and
-   *                                   decommissioned replicas of the given
-   *                                   block.
+   * @param containingNodes List to be populated with nodes found to contain
+   *                        the given block
+   * @param nodesContainingLiveReplicas List to be populated with nodes found
+   *                                    to contain live replicas of the given
+   *                                    block
+   * @param numReplicas NumberReplicas instance to be initialized with the
+   *                    counts of live, corrupt, excess, and decommissioned
+   *                    replicas of the given block.
+   * @param missingBlockIndices List to be populated with indices of missing
+   *                            blocks in a striped block group or missing
+   *                            replicas of a replicated block
+   * @param numSourceNodes integer specifying the number of source nodes to
+   *                       choose
    * @param priority integer representing replication priority of the given
    *                 block
-   * @return the DatanodeDescriptor of the chosen node from which to replicate
-   *         the given block
-   */
-  @VisibleForTesting
-  DatanodeDescriptor chooseSourceDatanode(Block block,
-      List<DatanodeDescriptor> containingNodes,
-      List<DatanodeStorageInfo> nodesContainingLiveReplicas,
-      NumberReplicas numReplicas,
-      int priority) {
+   * @return the array of DatanodeDescriptor of the chosen nodes from which to
+   *         recover the given block
+   */
+  @VisibleForTesting
+  DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
+      List<DatanodeDescriptor> containingNodes,
+      List<DatanodeStorageInfo> nodesContainingLiveReplicas,
+      NumberReplicas numReplicas,
+      List<Short> missingBlockIndices, int numSourceNodes, int priority) {
     containingNodes.clear();
     nodesContainingLiveReplicas.clear();
-    DatanodeDescriptor srcNode = null;
+    LinkedList<DatanodeDescriptor> srcNodes = new LinkedList<>();
    int live = 0;
    int decommissioned = 0;
    int decommissioning = 0;
    int corrupt = 0;
    int excess = 0;
-
+    missingBlockIndices.clear();
+    Set<Short> healthyIndices = new HashSet<>();
+
    Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(block);
    for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
+      if (block.isStriped()) {
+        healthyIndices.add((short) ((BlockInfoStriped) block).
+            getStorageBlockIndex(storage));
+      }
      final DatanodeDescriptor node = storage.getDatanodeDescriptor();
      LightWeightLinkedSet<Block> excessBlocks =
        excessReplicateMap.get(node.getDatanodeUuid());
-      int countableReplica = storage.getState() == State.NORMAL ? 1 : 0;
+      int countableReplica = storage.getState() == State.NORMAL ? 1 : 0;
      if ((nodesCorrupt != null) && (nodesCorrupt.contains(node)))
        corrupt += countableReplica;
      else if (node.isDecommissionInProgress()) {
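
To make the striped bookkeeping concrete: each live storage holding an
internal block of a striped group reports its index, and every index in
[0, NUM_DATA_BLOCKS + NUM_PARITY_BLOCKS) that no storage reports is recorded
as missing. A self-contained sketch, assuming the default (6, 3) schedule
that HdfsConstants defines on this branch:

    import java.util.*;

    public class MissingIndexDemo {
      // Assumed (6 data, 3 parity) schedule, mirroring
      // HdfsConstants.NUM_DATA_BLOCKS / NUM_PARITY_BLOCKS
      static final int NUM_DATA_BLOCKS = 6, NUM_PARITY_BLOCKS = 3;

      public static void main(String[] args) {
        // Indices reported by the live storages of one block group
        Set<Short> healthyIndices = new HashSet<>(Arrays.asList(
            (short) 0, (short) 1, (short) 3, (short) 4, (short) 5,
            (short) 6, (short) 8));
        List<Short> missing = new LinkedList<>();
        for (short i = 0; i < NUM_DATA_BLOCKS + NUM_PARITY_BLOCKS; i++) {
          if (!healthyIndices.contains(i)) {
            missing.add(i);
          }
        }
        System.out.println(missing); // [2, 7]: one data, one parity lost
      }
    }
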
@@ -1703,20 +1747,32 @@ public class BlockManager {
        continue;
 
      // We got this far, current node is a reasonable choice
-      if (srcNode == null) {
-        srcNode = node;
+      if(srcNodes.size() < numSourceNodes) {
+        srcNodes.add(node);
        continue;
      }
      // switch to a different node randomly
      // this to prevent from deterministically selecting the same node even
      // if the node failed to replicate the block on previous iterations
-      if(ThreadLocalRandom.current().nextBoolean())
-        srcNode = node;
+      if(ThreadLocalRandom.current().nextBoolean()) {
+        int pos = ThreadLocalRandom.current().nextInt(numSourceNodes);
+        if(!srcNodes.get(pos).isDecommissionInProgress()) {
+          srcNodes.set(pos, node);
+        }
+      }
+    }
+    if (block.isStriped()) {
+      for (short i = 0; i < HdfsConstants.NUM_DATA_BLOCKS +
+          HdfsConstants.NUM_PARITY_BLOCKS; i++) {
+        if (!healthyIndices.contains(i)) {
+          missingBlockIndices.add(i);
+        }
+      }
    }
    if(numReplicas != null)
      numReplicas.initialize(live, decommissioned, decommissioning, corrupt,
          excess, 0);
-    return srcNode;
+    return srcNodes.toArray(new DatanodeDescriptor[srcNodes.size()]);
  }
 
  /**
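
The replacement branch only runs once srcNodes already holds numSourceNodes
entries, so the nextInt(numSourceNodes) index is always in range, and
decommissioning nodes are never evicted because they are the preferred
sources. A runnable sketch of the same policy over plain strings, with the
decommission check omitted for brevity:

    import java.util.*;
    import java.util.concurrent.ThreadLocalRandom;

    public class SourcePickDemo {
      // Fill k slots first, then each later candidate replaces a random
      // slot with probability 1/2, keeping the choice non-deterministic
      // across scheduler iterations.
      static List<String> pickSources(List<String> candidates, int k) {
        List<String> picked = new LinkedList<>();
        for (String node : candidates) {
          if (picked.size() < k) {
            picked.add(node);
            continue;
          }
          if (ThreadLocalRandom.current().nextBoolean()) {
            picked.set(ThreadLocalRandom.current().nextInt(k), node);
          }
        }
        return picked;
      }

      public static void main(String[] args) {
        System.out.println(pickSources(
            Arrays.asList("dn1", "dn2", "dn3", "dn4", "dn5"), 2));
      }
    }
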
@@ -1751,7 +1807,7 @@ public class BlockManager {
      */
    }
  }
-
+
  /**
   * StatefulBlockInfo is used to build the "toUC" list, which is a list of
   * updates to the information about under-construction blocks.
@@ -3716,7 +3772,7 @@ public class BlockManager {
  }
 
  /**
-   * Periodically calls computeReplicationWork().
+   * Periodically calls computeBlockRecoveryWork().
   */
  private class ReplicationMonitor implements Runnable {
 
@@ -3774,7 +3830,7 @@ public class BlockManager {
    final int nodesToProcess = (int) Math.ceil(numlive
        * this.blocksInvalidateWorkPct);
 
-    int workFound = this.computeReplicationWork(blocksToProcess);
+    int workFound = this.computeBlockRecoveryWork(blocksToProcess);
 
    // Update counters
    namesystem.writeLock();
@@ -3814,47 +3870,117 @@ public class BlockManager {
        null);
    }
 
-  private static class ReplicationWork {
-
-    private final BlockInfo block;
-    private final BlockCollection bc;
+  /**
+   * This class is used internally by {@link #computeRecoveryWorkForBlocks}
+   * to represent a task to recover a block through replication or erasure
+   * coding. Recovery is done by transferring data from {@link #srcNodes} to
+   * {@link #targets}.
+   */
+  private static class BlockRecoveryWork {
+    protected final BlockInfo block;
+    protected final BlockCollection bc;
 
-    private final DatanodeDescriptor srcNode;
-    private final List<DatanodeDescriptor> containingNodes;
-    private final List<DatanodeStorageInfo> liveReplicaStorages;
-    private final int additionalReplRequired;
+    /**
+     * An erasure coding recovery task has multiple source nodes.
+     * A replication task has exactly one source node, stored in the first
+     * element of the array.
+     */
+    protected final DatanodeDescriptor[] srcNodes;
+    /** Nodes containing the block; avoid them in choosing new targets */
+    protected final List<DatanodeDescriptor> containingNodes;
+    /** Required by {@link BlockPlacementPolicy#chooseTarget} */
+    protected final List<DatanodeStorageInfo> liveReplicaStorages;
+    protected final int additionalReplRequired;
 
-    private DatanodeStorageInfo targets[];
-    private final int priority;
+    protected DatanodeStorageInfo[] targets;
+    protected final int priority;
 
-    public ReplicationWork(BlockInfo block,
+    public BlockRecoveryWork(BlockInfo block,
        BlockCollection bc,
-        DatanodeDescriptor srcNode,
+        DatanodeDescriptor[] srcNodes,
        List<DatanodeDescriptor> containingNodes,
        List<DatanodeStorageInfo> liveReplicaStorages,
        int additionalReplRequired,
        int priority) {
      this.block = block;
      this.bc = bc;
-      this.srcNode = srcNode;
-      this.srcNode.incrementPendingReplicationWithoutTargets();
+      this.srcNodes = srcNodes;
      this.containingNodes = containingNodes;
      this.liveReplicaStorages = liveReplicaStorages;
      this.additionalReplRequired = additionalReplRequired;
      this.priority = priority;
      this.targets = null;
    }
-
-    private void chooseTargets(BlockPlacementPolicy blockplacement,
+
+    protected void chooseTargets(BlockPlacementPolicy blockplacement,
+        BlockStoragePolicySuite storagePolicySuite,
+        Set<Node> excludedNodes) {
+    }
+  }
+
+  private static class ReplicationWork extends BlockRecoveryWork {
+
+    public ReplicationWork(BlockInfo block,
+        BlockCollection bc,
+        DatanodeDescriptor[] srcNodes,
+        List<DatanodeDescriptor> containingNodes,
+        List<DatanodeStorageInfo> liveReplicaStorages,
+        int additionalReplRequired,
+        int priority) {
+      super(block, bc, srcNodes, containingNodes,
+          liveReplicaStorages, additionalReplRequired, priority);
+      // Balances the decrement in chooseTargets(); this increment was done
+      // by the old ReplicationWork constructor and is still required here.
+      srcNodes[0].incrementPendingReplicationWithoutTargets();
+      LOG.debug("Creating a ReplicationWork to recover " + block);
+    }
+
+    protected void chooseTargets(BlockPlacementPolicy blockplacement,
+        BlockStoragePolicySuite storagePolicySuite,
+        Set<Node> excludedNodes) {
+      assert srcNodes.length > 0
+          : "At least 1 source node should have been selected";
+      try {
+        targets = blockplacement.chooseTarget(bc.getName(),
+            additionalReplRequired, srcNodes[0], liveReplicaStorages, false,
+            excludedNodes, block.getNumBytes(),
+            storagePolicySuite.getPolicy(bc.getStoragePolicyID()));
+      } finally {
+        srcNodes[0].decrementPendingReplicationWithoutTargets();
+      }
+    }
+  }
+
+  private static class ErasureCodingWork extends BlockRecoveryWork {
+
+    private short[] missingBlockIndices = null;
+
+    public ErasureCodingWork(BlockInfo block,
+        BlockCollection bc,
+        DatanodeDescriptor[] srcNodes,
+        List<DatanodeDescriptor> containingNodes,
+        List<DatanodeStorageInfo> liveReplicaStorages,
+        int additionalReplRequired,
+        int priority) {
+      super(block, bc, srcNodes, containingNodes,
+          liveReplicaStorages, additionalReplRequired, priority);
+      LOG.debug("Creating an ErasureCodingWork to recover " + block);
+    }
+
+    public short[] getMissingBlockIndices() {
+      return missingBlockIndices;
+    }
+
+    public void setMissingBlockIndices(short[] missingBlockIndices) {
+      this.missingBlockIndices = missingBlockIndices;
+    }
+
+    protected void chooseTargets(BlockPlacementPolicy blockplacement,
        BlockStoragePolicySuite storagePolicySuite,
        Set<Node> excludedNodes) {
      try {
+        // TODO: new placement policy for EC considering multiple writers
        targets = blockplacement.chooseTarget(bc.getName(),
-            additionalReplRequired, srcNode, liveReplicaStorages, false,
+            additionalReplRequired, srcNodes[0], liveReplicaStorages, false,
            excludedNodes, block.getNumBytes(),
            storagePolicySuite.getPolicy(bc.getStoragePolicyID()));
      } finally {
-        srcNode.decrementPendingReplicationWithoutTargets();
      }
    }
  }
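
Taken together, the last hunk replaces the single ReplicationWork class with
a small hierarchy; a sketch of the resulting relationships:

    // BlockRecoveryWork          common state: block, bc, srcNodes[],
    //   |                        containingNodes, liveReplicaStorages,
    //   |                        targets, priority
    //   +- ReplicationWork       copies from srcNodes[0]; placement via the
    //   |                        existing BlockPlacementPolicy
    //   +- ErasureCodingWork     decodes from all srcNodes; additionally
    //                            carries missingBlockIndices (the TODO above
    //                            notes EC still needs its own placement
    //                            policy)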