|
@@ -21,12 +21,14 @@ import static org.apache.hadoop.util.ExitUtil.terminate;
|
|
|
|
|
|
import java.io.Closeable;
|
|
|
import java.io.IOException;
|
|
|
+import java.util.ArrayList;
|
|
|
import java.util.Collection;
|
|
|
import java.util.Date;
|
|
|
import java.util.Iterator;
|
|
|
import java.util.LinkedList;
|
|
|
import java.util.List;
|
|
|
import java.util.Random;
|
|
|
+import java.util.TreeMap;
|
|
|
import java.util.concurrent.TimeUnit;
|
|
|
import java.util.concurrent.locks.Condition;
|
|
|
import java.util.concurrent.locks.ReentrantLock;
|
|
@@ -76,7 +78,7 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
|
|
/**
|
|
|
* Pseudorandom number source
|
|
|
*/
|
|
|
- private final Random random = new Random();
|
|
|
+ private static final Random random = new Random();
|
|
|
|
|
|
/**
|
|
|
* The interval at which we scan the namesystem for caching changes.
|
|
@@ -310,8 +312,6 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
|
|
FSDirectory fsDir = namesystem.getFSDirectory();
|
|
|
final long now = new Date().getTime();
|
|
|
for (CacheDirective directive : cacheManager.getCacheDirectives()) {
|
|
|
- // Reset the directive's statistics
|
|
|
- directive.resetStatistics();
|
|
|
// Skip processing this entry if it has expired
|
|
|
if (LOG.isTraceEnabled()) {
|
|
|
LOG.trace("Directive expiry is at " + directive.getExpiryTime());
|
|
@@ -461,7 +461,7 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
|
|
// there may be a period of time when incomplete blocks remain cached
|
|
|
// on the DataNodes.
|
|
|
return "not complete";
|
|
|
- } else if (cblock.getReplication() == 0) {
|
|
|
+ } else if (cblock.getReplication() == 0) {
|
|
|
// Since 0 is not a valid value for a cache directive's replication
|
|
|
// field, seeing a replication of 0 on a CacheBlock means that it
|
|
|
// has never been reached by any sweep.
|
|
@@ -469,6 +469,9 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
|
|
} else if (cblock.getMark() != mark) {
|
|
|
// Although the block was needed in the past, we didn't reach it during
|
|
|
// the current sweep. Therefore, it doesn't need to be cached any more.
|
|
|
+ // Need to set the replication to 0 so it doesn't flip back to cached
|
|
|
+ // when the mark flips on the next scan
|
|
|
+ cblock.setReplicationAndMark((short)0, mark);
|
|
|
return "no longer needed by any directives";
|
|
|
}
|
|
|
return null;
|
|
@@ -595,7 +598,7 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
|
|
* @param pendingCached A list of DataNodes that will soon cache the
|
|
|
* block.
|
|
|
*/
|
|
|
- private void addNewPendingCached(int neededCached,
|
|
|
+ private void addNewPendingCached(final int neededCached,
|
|
|
CachedBlock cachedBlock, List<DatanodeDescriptor> cached,
|
|
|
List<DatanodeDescriptor> pendingCached) {
|
|
|
// To figure out which replicas can be cached, we consult the
|
|
@@ -616,35 +619,156 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
|
|
}
|
|
|
return;
|
|
|
}
|
|
|
- List<DatanodeDescriptor> possibilities = new LinkedList<DatanodeDescriptor>();
|
|
|
+ // Filter the list of replicas to only the valid targets
|
|
|
+ List<DatanodeDescriptor> possibilities =
|
|
|
+ new LinkedList<DatanodeDescriptor>();
|
|
|
int numReplicas = blockInfo.getCapacity();
|
|
|
Collection<DatanodeDescriptor> corrupt =
|
|
|
blockManager.getCorruptReplicas(blockInfo);
|
|
|
+ int outOfCapacity = 0;
|
|
|
for (int i = 0; i < numReplicas; i++) {
|
|
|
DatanodeDescriptor datanode = blockInfo.getDatanode(i);
|
|
|
- if ((datanode != null) &&
|
|
|
- ((!pendingCached.contains(datanode)) &&
|
|
|
- ((corrupt == null) || (!corrupt.contains(datanode))))) {
|
|
|
- possibilities.add(datanode);
|
|
|
+ if (datanode == null) {
|
|
|
+ continue;
|
|
|
}
|
|
|
- }
|
|
|
- while (neededCached > 0) {
|
|
|
- if (possibilities.isEmpty()) {
|
|
|
- LOG.warn("We need " + neededCached + " more replica(s) than " +
|
|
|
- "actually exist to provide a cache replication of " +
|
|
|
- cachedBlock.getReplication() + " for " + cachedBlock);
|
|
|
- return;
|
|
|
+ if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
|
|
|
+ continue;
|
|
|
}
|
|
|
- DatanodeDescriptor datanode =
|
|
|
- possibilities.remove(random.nextInt(possibilities.size()));
|
|
|
- if (LOG.isDebugEnabled()) {
|
|
|
- LOG.debug("AddNewPendingCached: datanode " + datanode +
|
|
|
- " will now cache block " + cachedBlock);
|
|
|
+ if (corrupt != null && corrupt.contains(datanode)) {
|
|
|
+ continue;
|
|
|
}
|
|
|
+ if (pendingCached.contains(datanode) || cached.contains(datanode)) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ long pendingCapacity = datanode.getCacheRemaining();
|
|
|
+ // Subtract pending cached blocks from effective capacity
|
|
|
+ Iterator<CachedBlock> it = datanode.getPendingCached().iterator();
|
|
|
+ while (it.hasNext()) {
|
|
|
+ CachedBlock cBlock = it.next();
|
|
|
+ BlockInfo info =
|
|
|
+ blockManager.getStoredBlock(new Block(cBlock.getBlockId()));
|
|
|
+ if (info != null) {
|
|
|
+ pendingCapacity -= info.getNumBytes();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ it = datanode.getPendingUncached().iterator();
|
|
|
+ // Add pending uncached blocks from effective capacity
|
|
|
+ while (it.hasNext()) {
|
|
|
+ CachedBlock cBlock = it.next();
|
|
|
+ BlockInfo info =
|
|
|
+ blockManager.getStoredBlock(new Block(cBlock.getBlockId()));
|
|
|
+ if (info != null) {
|
|
|
+ pendingCapacity += info.getNumBytes();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (pendingCapacity < blockInfo.getNumBytes()) {
|
|
|
+ if (LOG.isTraceEnabled()) {
|
|
|
+ LOG.trace("Datanode " + datanode + " is not a valid possibility for"
|
|
|
+ + " block " + blockInfo.getBlockId() + " of size "
|
|
|
+ + blockInfo.getNumBytes() + " bytes, only has "
|
|
|
+ + datanode.getCacheRemaining() + " bytes of cache remaining.");
|
|
|
+ }
|
|
|
+ outOfCapacity++;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ possibilities.add(datanode);
|
|
|
+ }
|
|
|
+ List<DatanodeDescriptor> chosen = chooseDatanodesForCaching(possibilities,
|
|
|
+ neededCached, blockManager.getDatanodeManager().getStaleInterval());
|
|
|
+ for (DatanodeDescriptor datanode : chosen) {
|
|
|
pendingCached.add(datanode);
|
|
|
boolean added = datanode.getPendingCached().add(cachedBlock);
|
|
|
assert added;
|
|
|
- neededCached--;
|
|
|
}
|
|
|
+ // We were unable to satisfy the requested replication factor
|
|
|
+ if (neededCached > chosen.size()) {
|
|
|
+ if (LOG.isDebugEnabled()) {
|
|
|
+ LOG.debug(
|
|
|
+ "Only have " +
|
|
|
+ (cachedBlock.getReplication() - neededCached + chosen.size()) +
|
|
|
+ " of " + cachedBlock.getReplication() + " cached replicas for " +
|
|
|
+ cachedBlock + " (" + outOfCapacity + " nodes have insufficient " +
|
|
|
+ "capacity).");
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Chooses datanode locations for caching from a list of valid possibilities.
|
|
|
+ * Non-stale nodes are chosen before stale nodes.
|
|
|
+ *
|
|
|
+ * @param possibilities List of candidate datanodes
|
|
|
+ * @param neededCached Number of replicas needed
|
|
|
+ * @param staleInterval Age of a stale datanode
|
|
|
+ * @return A list of chosen datanodes
|
|
|
+ */
|
|
|
+ private static List<DatanodeDescriptor> chooseDatanodesForCaching(
|
|
|
+ final List<DatanodeDescriptor> possibilities, final int neededCached,
|
|
|
+ final long staleInterval) {
|
|
|
+ // Make a copy that we can modify
|
|
|
+ List<DatanodeDescriptor> targets =
|
|
|
+ new ArrayList<DatanodeDescriptor>(possibilities);
|
|
|
+ // Selected targets
|
|
|
+ List<DatanodeDescriptor> chosen = new LinkedList<DatanodeDescriptor>();
|
|
|
+
|
|
|
+ // Filter out stale datanodes
|
|
|
+ List<DatanodeDescriptor> stale = new LinkedList<DatanodeDescriptor>();
|
|
|
+ Iterator<DatanodeDescriptor> it = targets.iterator();
|
|
|
+ while (it.hasNext()) {
|
|
|
+ DatanodeDescriptor d = it.next();
|
|
|
+ if (d.isStale(staleInterval)) {
|
|
|
+ it.remove();
|
|
|
+ stale.add(d);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Select targets
|
|
|
+ while (chosen.size() < neededCached) {
|
|
|
+ // Try to use stale nodes if we're out of non-stale nodes, else we're done
|
|
|
+ if (targets.isEmpty()) {
|
|
|
+ if (!stale.isEmpty()) {
|
|
|
+ targets = stale;
|
|
|
+ } else {
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Select a random target
|
|
|
+ DatanodeDescriptor target =
|
|
|
+ chooseRandomDatanodeByRemainingCapacity(targets);
|
|
|
+ chosen.add(target);
|
|
|
+ targets.remove(target);
|
|
|
+ }
|
|
|
+ return chosen;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Choose a single datanode from the provided list of possible
|
|
|
+ * targets, weighted by the percentage of free space remaining on the node.
|
|
|
+ *
|
|
|
+ * @return The chosen datanode
|
|
|
+ */
|
|
|
+ private static DatanodeDescriptor chooseRandomDatanodeByRemainingCapacity(
|
|
|
+ final List<DatanodeDescriptor> targets) {
|
|
|
+ // Use a weighted probability to choose the target datanode
|
|
|
+ float total = 0;
|
|
|
+ for (DatanodeDescriptor d : targets) {
|
|
|
+ total += d.getCacheRemainingPercent();
|
|
|
+ }
|
|
|
+ // Give each datanode a portion of keyspace equal to its relative weight
|
|
|
+ // [0, w1) selects d1, [w1, w2) selects d2, etc.
|
|
|
+ TreeMap<Integer, DatanodeDescriptor> lottery =
|
|
|
+ new TreeMap<Integer, DatanodeDescriptor>();
|
|
|
+ int offset = 0;
|
|
|
+ for (DatanodeDescriptor d : targets) {
|
|
|
+ // Since we're using floats, be paranoid about negative values
|
|
|
+ int weight =
|
|
|
+ Math.max(1, (int)((d.getCacheRemainingPercent() / total) * 1000000));
|
|
|
+ offset += weight;
|
|
|
+ lottery.put(offset, d);
|
|
|
+ }
|
|
|
+ // Choose a number from [0, offset), which is the total amount of weight,
|
|
|
+ // to select the winner
|
|
|
+ DatanodeDescriptor winner =
|
|
|
+ lottery.higherEntry(random.nextInt(offset)).getValue();
|
|
|
+ return winner;
|
|
|
}
|
|
|
}
|