|
@@ -485,6 +485,7 @@ public class DatanodeAdminManager {
|
|
|
|
|
|
@Override
|
|
|
public void run() {
|
|
|
+ LOG.debug("DatanodeAdminMonitor is running.");
|
|
|
if (!namesystem.isRunning()) {
|
|
|
LOG.info("Namesystem is not running, skipping " +
|
|
|
"decommissioning/maintenance checks.");
|
|
@@ -499,6 +500,9 @@ public class DatanodeAdminManager {
|
|
|
try {
|
|
|
processPendingNodes();
|
|
|
check();
|
|
|
+ } catch (Exception e) {
|
|
|
+ LOG.warn("DatanodeAdminMonitor caught exception when processing node.",
|
|
|
+ e);
|
|
|
} finally {
|
|
|
namesystem.writeUnlock();
|
|
|
}
|
|
@@ -532,83 +536,96 @@ public class DatanodeAdminManager {
|
|
|
final Map.Entry<DatanodeDescriptor, AbstractList<BlockInfo>>
|
|
|
entry = it.next();
|
|
|
final DatanodeDescriptor dn = entry.getKey();
|
|
|
- AbstractList<BlockInfo> blocks = entry.getValue();
|
|
|
- boolean fullScan = false;
|
|
|
- if (dn.isMaintenance() && dn.maintenanceExpired()) {
|
|
|
- // If maintenance expires, stop tracking it.
|
|
|
- stopMaintenance(dn);
|
|
|
- toRemove.add(dn);
|
|
|
- continue;
|
|
|
- }
|
|
|
- if (dn.isInMaintenance()) {
|
|
|
- // The dn is IN_MAINTENANCE and the maintenance hasn't expired yet.
|
|
|
- continue;
|
|
|
- }
|
|
|
- if (blocks == null) {
|
|
|
- // This is a newly added datanode, run through its list to schedule
|
|
|
- // under-replicated blocks for replication and collect the blocks
|
|
|
- // that are insufficiently replicated for further tracking
|
|
|
- LOG.debug("Newly-added node {}, doing full scan to find " +
|
|
|
- "insufficiently-replicated blocks.", dn);
|
|
|
- blocks = handleInsufficientlyStored(dn);
|
|
|
- outOfServiceNodeBlocks.put(dn, blocks);
|
|
|
- fullScan = true;
|
|
|
- } else {
|
|
|
- // This is a known datanode, check if its # of insufficiently
|
|
|
- // replicated blocks has dropped to zero and if it can move
|
|
|
- // to the next state.
|
|
|
- LOG.debug("Processing {} node {}", dn.getAdminState(), dn);
|
|
|
- pruneReliableBlocks(dn, blocks);
|
|
|
- }
|
|
|
- if (blocks.size() == 0) {
|
|
|
- if (!fullScan) {
|
|
|
- // If we didn't just do a full scan, need to re-check with the
|
|
|
- // full block map.
|
|
|
- //
|
|
|
- // We've replicated all the known insufficiently replicated
|
|
|
- // blocks. Re-check with the full block map before finally
|
|
|
- // marking the datanode as DECOMMISSIONED or IN_MAINTENANCE.
|
|
|
- LOG.debug("Node {} has finished replicating current set of "
|
|
|
- + "blocks, checking with the full block map.", dn);
|
|
|
+ try {
|
|
|
+ AbstractList<BlockInfo> blocks = entry.getValue();
|
|
|
+ boolean fullScan = false;
|
|
|
+ if (dn.isMaintenance() && dn.maintenanceExpired()) {
|
|
|
+ // If maintenance expires, stop tracking it.
|
|
|
+ stopMaintenance(dn);
|
|
|
+ toRemove.add(dn);
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (dn.isInMaintenance()) {
|
|
|
+ // The dn is IN_MAINTENANCE and the maintenance hasn't expired yet.
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ if (blocks == null) {
|
|
|
+ // This is a newly added datanode, run through its list to schedule
|
|
|
+ // under-replicated blocks for replication and collect the blocks
|
|
|
+ // that are insufficiently replicated for further tracking
|
|
|
+ LOG.debug("Newly-added node {}, doing full scan to find " +
|
|
|
+ "insufficiently-replicated blocks.", dn);
|
|
|
blocks = handleInsufficientlyStored(dn);
|
|
|
outOfServiceNodeBlocks.put(dn, blocks);
|
|
|
+ fullScan = true;
|
|
|
+ } else {
|
|
|
+ // This is a known datanode, check if its # of insufficiently
|
|
|
+ // replicated blocks has dropped to zero and if it can move
|
|
|
+ // to the next state.
|
|
|
+ LOG.debug("Processing {} node {}", dn.getAdminState(), dn);
|
|
|
+ pruneReliableBlocks(dn, blocks);
|
|
|
}
|
|
|
- // If the full scan is clean AND the node liveness is okay,
|
|
|
- // we can finally mark as DECOMMISSIONED or IN_MAINTENANCE.
|
|
|
- final boolean isHealthy =
|
|
|
- blockManager.isNodeHealthyForDecommissionOrMaintenance(dn);
|
|
|
- if (blocks.size() == 0 && isHealthy) {
|
|
|
- if (dn.isDecommissionInProgress()) {
|
|
|
- setDecommissioned(dn);
|
|
|
- toRemove.add(dn);
|
|
|
- } else if (dn.isEnteringMaintenance()) {
|
|
|
- // IN_MAINTENANCE node remains in the outOfServiceNodeBlocks to
|
|
|
- // to track maintenance expiration.
|
|
|
- setInMaintenance(dn);
|
|
|
+ if (blocks.size() == 0) {
|
|
|
+ if (!fullScan) {
|
|
|
+ // If we didn't just do a full scan, need to re-check with the
|
|
|
+ // full block map.
|
|
|
+ //
|
|
|
+ // We've replicated all the known insufficiently replicated
|
|
|
+ // blocks. Re-check with the full block map before finally
|
|
|
+ // marking the datanode as DECOMMISSIONED or IN_MAINTENANCE.
|
|
|
+ LOG.debug("Node {} has finished replicating current set of "
|
|
|
+ + "blocks, checking with the full block map.", dn);
|
|
|
+ blocks = handleInsufficientlyStored(dn);
|
|
|
+ outOfServiceNodeBlocks.put(dn, blocks);
|
|
|
+ }
|
|
|
+ // If the full scan is clean AND the node liveness is okay,
|
|
|
+ // we can finally mark as DECOMMISSIONED or IN_MAINTENANCE.
|
|
|
+ final boolean isHealthy =
|
|
|
+ blockManager.isNodeHealthyForDecommissionOrMaintenance(dn);
|
|
|
+ if (blocks.size() == 0 && isHealthy) {
|
|
|
+ if (dn.isDecommissionInProgress()) {
|
|
|
+ setDecommissioned(dn);
|
|
|
+ toRemove.add(dn);
|
|
|
+ } else if (dn.isEnteringMaintenance()) {
|
|
|
+ // IN_MAINTENANCE node remains in the outOfServiceNodeBlocks to
|
|
|
+ // to track maintenance expiration.
|
|
|
+ setInMaintenance(dn);
|
|
|
+ } else {
|
|
|
+ Preconditions.checkState(false,
|
|
|
+ "Node %s is in an invalid state! "
|
|
|
+ + "Invalid state: %s %s blocks are on this dn.",
|
|
|
+ dn, dn.getAdminState(), blocks.size());
|
|
|
+ }
|
|
|
+ LOG.debug("Node {} is sufficiently replicated and healthy, "
|
|
|
+ + "marked as {}.", dn, dn.getAdminState());
|
|
|
} else {
|
|
|
- Preconditions.checkState(false,
|
|
|
- "A node is in an invalid state!");
|
|
|
+ LOG.debug("Node {} {} healthy."
|
|
|
+ + " It needs to replicate {} more blocks."
|
|
|
+ + " {} is still in progress.", dn,
|
|
|
+ isHealthy ? "is": "isn't", blocks.size(), dn.getAdminState());
|
|
|
}
|
|
|
- LOG.debug("Node {} is sufficiently replicated and healthy, "
|
|
|
- + "marked as {}.", dn, dn.getAdminState());
|
|
|
} else {
|
|
|
- LOG.debug("Node {} {} healthy."
|
|
|
- + " It needs to replicate {} more blocks."
|
|
|
- + " {} is still in progress.", dn,
|
|
|
- isHealthy ? "is": "isn't", blocks.size(), dn.getAdminState());
|
|
|
+ LOG.debug("Node {} still has {} blocks to replicate "
|
|
|
+ + "before it is a candidate to finish {}.",
|
|
|
+ dn, blocks.size(), dn.getAdminState());
|
|
|
}
|
|
|
- } else {
|
|
|
- LOG.debug("Node {} still has {} blocks to replicate "
|
|
|
- + "before it is a candidate to finish {}.",
|
|
|
- dn, blocks.size(), dn.getAdminState());
|
|
|
+ } catch (Exception e) {
|
|
|
+ // Log and postpone to process node when meet exception since it is in
|
|
|
+ // an invalid state.
|
|
|
+ LOG.warn("DatanodeAdminMonitor caught exception when processing node "
|
|
|
+ + "{}.", dn, e);
|
|
|
+ pendingNodes.add(dn);
|
|
|
+ toRemove.add(dn);
|
|
|
+ } finally {
|
|
|
+ iterkey = dn;
|
|
|
}
|
|
|
- iterkey = dn;
|
|
|
}
|
|
|
// Remove the datanodes that are DECOMMISSIONED or in service after
|
|
|
// maintenance expiration.
|
|
|
for (DatanodeDescriptor dn : toRemove) {
|
|
|
Preconditions.checkState(dn.isDecommissioned() || dn.isInService(),
|
|
|
- "Removing a node that is not yet decommissioned or in service!");
|
|
|
+ "Removing node %s that is not yet decommissioned or in service!",
|
|
|
+ dn);
|
|
|
outOfServiceNodeBlocks.remove(dn);
|
|
|
}
|
|
|
}
|