|
@@ -95,6 +95,7 @@ import org.apache.hadoop.yarn.server.api.records.MasterKey;
|
|
import org.apache.hadoop.yarn.server.api.records.NodeAction;
|
|
import org.apache.hadoop.yarn.server.api.records.NodeAction;
|
|
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
|
|
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
|
|
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
|
|
import org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
|
|
|
|
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
|
|
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
|
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
|
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManagerImpl;
|
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
|
|
@@ -486,6 +487,35 @@ public class TestNodeStatusUpdater {
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+  /**
+   * {@link NodeStatusUpdaterImpl} variant used by the NM/RM connection
+   * configuration test.
+   *
+   * The constructor stores {@code rmStartIntervalMS} and {@code rmNeverStart},
+   * but nothing inside this class reads them afterwards. {@link #stopRMProxy()}
+   * is replaced with a no-op.
+   */
+  private class MyNodeStatusUpdater6 extends NodeStatusUpdaterImpl {
+
+    public ResourceTracker resourceTracker;
+    private final long rmStartIntervalMS;
+    private final boolean rmNeverStart;
+
+    public MyNodeStatusUpdater6(Context context, Dispatcher dispatcher,
+        NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics,
+        long rmStartIntervalMS, boolean rmNeverStart) {
+      super(context, dispatcher, healthChecker, metrics);
+      this.rmNeverStart = rmNeverStart;
+      this.rmStartIntervalMS = rmStartIntervalMS;
+    }
+
+    /** Starts the service by delegating to the parent; nothing is added. */
+    @Override
+    protected void serviceStart() throws Exception {
+      super.serviceStart();
+    }
+
+    // NOTE(review): 'triggered' is not declared in this class; presumably it
+    // is inherited or belongs to the enclosing test class -- confirm.
+    private boolean isTriggered() {
+      return triggered;
+    }
+
+    /** Deliberately does nothing, so the parent's implementation never runs. */
+    @Override
+    protected void stopRMProxy() {
+      // intentionally empty
+    }
+  }
|
|
|
|
+
|
|
private class MyNodeManager extends NodeManager {
|
|
private class MyNodeManager extends NodeManager {
|
|
|
|
|
|
private MyNodeStatusUpdater3 nodeStatusUpdater;
|
|
private MyNodeStatusUpdater3 nodeStatusUpdater;
|
|
@@ -1309,6 +1339,59 @@ public class TestNodeStatusUpdater {
|
|
+ "Message from ResourceManager: RM Shutting Down Node");
|
|
+ "Message from ResourceManager: RM Shutting Down Node");
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+  /**
+   * Starts a NodeManager with both the NM-specific and the generic RM
+   * connection settings configured, and expects {@code nm.start()} to fail.
+   *
+   * The NM-specific max wait is 100 ms while the generic max wait is -1
+   * (presumably "retry forever" -- confirm against YarnConfiguration). The
+   * elapsed time until the failure must be at least the NM-specific wait and
+   * less than that wait plus {@code delta}.
+   *
+   * @throws Exception if the NM gives up outside the expected time window
+   */
+  @Test (timeout = 100000)
+  public void testNMRMConnectionConf() throws Exception {
+    final long delta = 50000;
+    final long nmRmConnectionWaitMs = 100;
+    final long nmRmRetryInterval = 100;
+    final long connectionWaitMs = -1;
+    final long connectionRetryIntervalMs = 1000;
+    // Handed to MyNodeStatusUpdater6 together with rmNeverStart = true.
+    final long rmStartIntervalMS = 2*1000;
+    conf.setLong(YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
+        nmRmConnectionWaitMs);
+    conf.setLong(
+        YarnConfiguration.NM_RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
+        nmRmRetryInterval);
+    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
+        connectionWaitMs);
+    conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
+        connectionRetryIntervalMs);
+    conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY,
+        1);
+    // The NM is expected to try connecting to the RM several times and
+    // finally give up.
+    nm = new NodeManagerWithCustomNodeStatusUpdater() {
+      @Override
+      protected NodeStatusUpdater createUpdater(Context context,
+          Dispatcher dispatcher, NodeHealthCheckerService healthChecker) {
+        NodeStatusUpdater nodeStatusUpdater = new MyNodeStatusUpdater6(
+            context, dispatcher, healthChecker, metrics,
+            rmStartIntervalMS, true);
+        return nodeStatusUpdater;
+      }
+    };
+    nm.init(conf);
+    long waitStartTime = System.currentTimeMillis();
+    try {
+      nm.start();
+      // Assert.fail raises an Error, so the catch below does not swallow it.
+      Assert.fail("NM should have failed to start due to RM connect failure");
+    } catch(Exception e) {
+      long t = System.currentTimeMillis();
+      long duration = t - waitStartTime;
+      // Both bounds derive from the NM-specific wait; the generic setting is
+      // -1 and cannot serve as a time bound.
+      boolean waitTimeValid = (duration >= nmRmConnectionWaitMs) &&
+          (duration < (nmRmConnectionWaitMs + delta));
+
+      if(!waitTimeValid) {
+        // throw exception if NM doesn't retry long enough
+        throw new Exception("NM should have tried re-connecting to RM during " +
+            "period of at least " + nmRmConnectionWaitMs + " ms, but " +
+            "stopped retrying within " + (nmRmConnectionWaitMs + delta) +
+            " ms: " + e, e);
+      }
+    }
+  }
|
|
|
|
+
|
|
@Test (timeout = 150000)
|
|
@Test (timeout = 150000)
|
|
public void testNMConnectionToRM() throws Exception {
|
|
public void testNMConnectionToRM() throws Exception {
|
|
final long delta = 50000;
|
|
final long delta = 50000;
|