|
@@ -18,8 +18,12 @@
|
|
package org.apache.hadoop.hdfs.server.blockmanagement;
|
|
package org.apache.hadoop.hdfs.server.blockmanagement;
|
|
|
|
|
|
import static org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol.DNA_ERASURE_CODING_RECONSTRUCTION;
|
|
import static org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol.DNA_ERASURE_CODING_RECONSTRUCTION;
|
|
|
|
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BLOCKPLACEMENTPOLICY_EXCLUDE_SLOW_NODES_ENABLED_KEY;
|
|
|
|
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BLOCKPLACEMENTPOLICY_EXCLUDE_SLOW_NODES_ENABLED_DEFAULT;
|
|
import static org.apache.hadoop.util.Time.monotonicNow;
|
|
import static org.apache.hadoop.util.Time.monotonicNow;
|
|
|
|
|
|
|
|
+import org.apache.hadoop.thirdparty.com.google.common.collect.Sets;
|
|
|
|
+import org.apache.commons.lang3.StringUtils;
|
|
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
|
|
import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
|
|
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
|
|
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
|
|
import org.apache.hadoop.thirdparty.com.google.common.net.InetAddresses;
|
|
import org.apache.hadoop.thirdparty.com.google.common.net.InetAddresses;
|
|
@@ -53,6 +57,7 @@ import org.apache.hadoop.ipc.Server;
|
|
import org.apache.hadoop.net.*;
|
|
import org.apache.hadoop.net.*;
|
|
import org.apache.hadoop.net.NetworkTopology.InvalidTopologyException;
|
|
import org.apache.hadoop.net.NetworkTopology.InvalidTopologyException;
|
|
import org.apache.hadoop.security.token.Token;
|
|
import org.apache.hadoop.security.token.Token;
|
|
|
|
+import org.apache.hadoop.util.Daemon;
|
|
import org.apache.hadoop.util.ReflectionUtils;
|
|
import org.apache.hadoop.util.ReflectionUtils;
|
|
import org.apache.hadoop.util.Timer;
|
|
import org.apache.hadoop.util.Timer;
|
|
|
|
|
|
@@ -201,8 +206,16 @@ public class DatanodeManager {
|
|
*/
|
|
*/
|
|
private final boolean useDfsNetworkTopology;
|
|
private final boolean useDfsNetworkTopology;
|
|
|
|
|
|
|
|
+ private static final String IP_PORT_SEPARATOR = ":";
|
|
|
|
+
|
|
@Nullable
|
|
@Nullable
|
|
private final SlowPeerTracker slowPeerTracker;
|
|
private final SlowPeerTracker slowPeerTracker;
|
|
|
|
+ private static Set<Node> slowNodesSet = Sets.newConcurrentHashSet();
|
|
|
|
+ private Daemon slowPeerCollectorDaemon;
|
|
|
|
+ private final long slowPeerCollectionInterval;
|
|
|
|
+ private final int maxSlowPeerReportNodes;
|
|
|
|
+ private boolean excludeSlowNodesEnabled;
|
|
|
|
+
|
|
@Nullable
|
|
@Nullable
|
|
private final SlowDiskTracker slowDiskTracker;
|
|
private final SlowDiskTracker slowDiskTracker;
|
|
|
|
|
|
@@ -242,11 +255,22 @@ public class DatanodeManager {
|
|
DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
|
|
DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
|
|
DFSConfigKeys.
|
|
DFSConfigKeys.
|
|
DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_DEFAULT));
|
|
DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_DEFAULT));
|
|
-
|
|
|
|
final Timer timer = new Timer();
|
|
final Timer timer = new Timer();
|
|
this.slowPeerTracker = dataNodePeerStatsEnabled ?
|
|
this.slowPeerTracker = dataNodePeerStatsEnabled ?
|
|
new SlowPeerTracker(conf, timer) : null;
|
|
new SlowPeerTracker(conf, timer) : null;
|
|
-
|
|
|
|
|
|
+ this.excludeSlowNodesEnabled = conf.getBoolean(
|
|
|
|
+ DFS_NAMENODE_BLOCKPLACEMENTPOLICY_EXCLUDE_SLOW_NODES_ENABLED_KEY,
|
|
|
|
+ DFS_NAMENODE_BLOCKPLACEMENTPOLICY_EXCLUDE_SLOW_NODES_ENABLED_DEFAULT);
|
|
|
|
+ this.maxSlowPeerReportNodes = conf.getInt(
|
|
|
|
+ DFSConfigKeys.DFS_NAMENODE_MAX_SLOWPEER_COLLECT_NODES_KEY,
|
|
|
|
+ DFSConfigKeys.DFS_NAMENODE_MAX_SLOWPEER_COLLECT_NODES_DEFAULT);
|
|
|
|
+ this.slowPeerCollectionInterval = conf.getTimeDuration(
|
|
|
|
+ DFSConfigKeys.DFS_NAMENODE_SLOWPEER_COLLECT_INTERVAL_KEY,
|
|
|
|
+ DFSConfigKeys.DFS_NAMENODE_SLOWPEER_COLLECT_INTERVAL_DEFAULT,
|
|
|
|
+ TimeUnit.MILLISECONDS);
|
|
|
|
+ if (slowPeerTracker != null && excludeSlowNodesEnabled) {
|
|
|
|
+ startSlowPeerCollector();
|
|
|
|
+ }
|
|
this.slowDiskTracker = dataNodeDiskStatsEnabled ?
|
|
this.slowDiskTracker = dataNodeDiskStatsEnabled ?
|
|
new SlowDiskTracker(conf, timer) : null;
|
|
new SlowDiskTracker(conf, timer) : null;
|
|
|
|
|
|
@@ -356,6 +380,44 @@ public class DatanodeManager {
|
|
DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT);
|
|
DFSConfigKeys.DFS_NAMENODE_BLOCKS_PER_POSTPONEDBLOCKS_RESCAN_KEY_DEFAULT);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ private void startSlowPeerCollector() {
|
|
|
|
+ if (slowPeerCollectorDaemon != null) {
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+ slowPeerCollectorDaemon = new Daemon(new Runnable() {
|
|
|
|
+ @Override
|
|
|
|
+ public void run() {
|
|
|
|
+ while (true) {
|
|
|
|
+ try {
|
|
|
|
+ slowNodesSet = getSlowPeers();
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
+ LOG.error("Failed to collect slow peers", e);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ try {
|
|
|
|
+ Thread.sleep(slowPeerCollectionInterval);
|
|
|
|
+ } catch (InterruptedException e) {
|
|
|
|
+ LOG.error("Slow peers collection thread interrupted", e);
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ });
|
|
|
|
+ slowPeerCollectorDaemon.start();
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public void stopSlowPeerCollector() {
|
|
|
|
+ if (slowPeerCollectorDaemon == null) {
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+ slowPeerCollectorDaemon.interrupt();
|
|
|
|
+ try {
|
|
|
|
+ slowPeerCollectorDaemon.join();
|
|
|
|
+ } catch (InterruptedException e) {
|
|
|
|
+ LOG.error("Slow peers collection thread did not shutdown", e);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
private static long getStaleIntervalFromConf(Configuration conf,
|
|
private static long getStaleIntervalFromConf(Configuration conf,
|
|
long heartbeatExpireInterval) {
|
|
long heartbeatExpireInterval) {
|
|
long staleInterval = conf.getLong(
|
|
long staleInterval = conf.getLong(
|
|
@@ -401,6 +463,7 @@ public class DatanodeManager {
|
|
void close() {
|
|
void close() {
|
|
datanodeAdminManager.close();
|
|
datanodeAdminManager.close();
|
|
heartbeatManager.close();
|
|
heartbeatManager.close();
|
|
|
|
+ stopSlowPeerCollector();
|
|
}
|
|
}
|
|
|
|
|
|
/** @return the network topology. */
|
|
/** @return the network topology. */
|
|
@@ -2019,6 +2082,48 @@ public class DatanodeManager {
|
|
return slowPeerTracker != null ? slowPeerTracker.getJson() : null;
|
|
return slowPeerTracker != null ? slowPeerTracker.getJson() : null;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ /**
|
|
|
|
+ * Returns all tracking slow peers.
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public Set<Node> getSlowPeers() {
|
|
|
|
+ Set<Node> slowPeersSet = Sets.newConcurrentHashSet();
|
|
|
|
+ if (slowPeerTracker == null) {
|
|
|
|
+ return slowPeersSet;
|
|
|
|
+ }
|
|
|
|
+ ArrayList<String> slowNodes =
|
|
|
|
+ slowPeerTracker.getSlowNodes(maxSlowPeerReportNodes);
|
|
|
|
+ for (String slowNode : slowNodes) {
|
|
|
|
+ if (StringUtils.isBlank(slowNode)
|
|
|
|
+ || !slowNode.contains(IP_PORT_SEPARATOR)) {
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+ String ipAddr = slowNode.split(IP_PORT_SEPARATOR)[0];
|
|
|
|
+ DatanodeDescriptor datanodeByHost =
|
|
|
|
+ host2DatanodeMap.getDatanodeByHost(ipAddr);
|
|
|
|
+ if (datanodeByHost != null) {
|
|
|
|
+ slowPeersSet.add(datanodeByHost);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return slowPeersSet;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Returns all tracking slow peers.
|
|
|
|
+ * @return
|
|
|
|
+ */
|
|
|
|
+ public static Set<Node> getSlowNodes() {
|
|
|
|
+ return slowNodesSet;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Use only for testing.
|
|
|
|
+ */
|
|
|
|
+ @VisibleForTesting
|
|
|
|
+ public SlowPeerTracker getSlowPeerTracker() {
|
|
|
|
+ return slowPeerTracker;
|
|
|
|
+ }
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* Use only for testing.
|
|
* Use only for testing.
|
|
*/
|
|
*/
|