|
@@ -37,15 +37,18 @@ import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest
|
|
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
|
|
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterResponse;
|
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
|
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
|
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
|
|
|
|
+import org.apache.hadoop.yarn.api.records.ApplicationId;
|
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
|
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
|
import org.apache.hadoop.yarn.api.records.ContainerStatus;
|
|
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
|
|
import org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest;
|
|
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
|
import org.apache.hadoop.yarn.api.records.ResourceRequest;
|
|
import org.apache.hadoop.yarn.api.records.UpdateContainerRequest;
|
|
import org.apache.hadoop.yarn.api.records.UpdateContainerRequest;
|
|
import org.apache.hadoop.yarn.api.records.UpdatedContainer;
|
|
import org.apache.hadoop.yarn.api.records.UpdatedContainer;
|
|
|
|
+import org.apache.hadoop.yarn.client.AMRMClientUtils;
|
|
import org.apache.hadoop.yarn.client.ClientRMProxy;
|
|
import org.apache.hadoop.yarn.client.ClientRMProxy;
|
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|
import org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException;
|
|
import org.apache.hadoop.yarn.exceptions.ApplicationMasterNotRegisteredException;
|
|
|
|
+import org.apache.hadoop.yarn.exceptions.InvalidApplicationMasterRequestException;
|
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
|
import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet;
|
|
import org.apache.hadoop.yarn.server.scheduler.ResourceRequestSet;
|
|
@@ -105,13 +108,22 @@ public class AMRMClientRelayer extends AbstractService
|
|
new HashMap<>();
|
|
new HashMap<>();
|
|
private Map<ContainerId, UpdateContainerRequest> change = new HashMap<>();
|
|
private Map<ContainerId, UpdateContainerRequest> change = new HashMap<>();
|
|
|
|
|
|
|
|
+ private ApplicationId appId;
|
|
|
|
+
|
|
|
|
+ // Normally -1, otherwise will override responseId with this value in the next
|
|
|
|
+ // heartbeat
|
|
|
|
+ private volatile int resetResponseId;
|
|
|
|
+
|
|
public AMRMClientRelayer() {
|
|
public AMRMClientRelayer() {
|
|
super(AMRMClientRelayer.class.getName());
|
|
super(AMRMClientRelayer.class.getName());
|
|
|
|
+ this.resetResponseId = -1;
|
|
}
|
|
}
|
|
|
|
|
|
- public AMRMClientRelayer(ApplicationMasterProtocol rmClient) {
|
|
|
|
|
|
+ public AMRMClientRelayer(ApplicationMasterProtocol rmClient,
|
|
|
|
+ ApplicationId appId) {
|
|
this();
|
|
this();
|
|
this.rmClient = rmClient;
|
|
this.rmClient = rmClient;
|
|
|
|
+ this.appId = appId;
|
|
}
|
|
}
|
|
|
|
|
|
@Override
|
|
@Override
|
|
@@ -161,51 +173,56 @@ public class AMRMClientRelayer extends AbstractService
|
|
try {
|
|
try {
|
|
return this.rmClient.finishApplicationMaster(request);
|
|
return this.rmClient.finishApplicationMaster(request);
|
|
} catch (ApplicationMasterNotRegisteredException e) {
|
|
} catch (ApplicationMasterNotRegisteredException e) {
|
|
- LOG.warn("Out of sync with ResourceManager, hence resyncing.");
|
|
|
|
|
|
+ LOG.warn("Out of sync with RM for " + this.appId + ", hence resyncing.");
|
|
// re register with RM
|
|
// re register with RM
|
|
registerApplicationMaster(this.amRegistrationRequest);
|
|
registerApplicationMaster(this.amRegistrationRequest);
|
|
return finishApplicationMaster(request);
|
|
return finishApplicationMaster(request);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ private void addNewAllocateRequest(AllocateRequest allocateRequest)
|
|
|
|
+ throws YarnException {
|
|
|
|
+ // update the data structures first
|
|
|
|
+ addNewAsks(allocateRequest.getAskList());
|
|
|
|
+
|
|
|
|
+ if (allocateRequest.getReleaseList() != null) {
|
|
|
|
+ this.remotePendingRelease.addAll(allocateRequest.getReleaseList());
|
|
|
|
+ this.release.addAll(allocateRequest.getReleaseList());
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (allocateRequest.getResourceBlacklistRequest() != null) {
|
|
|
|
+ if (allocateRequest.getResourceBlacklistRequest()
|
|
|
|
+ .getBlacklistAdditions() != null) {
|
|
|
|
+ this.remoteBlacklistedNodes.addAll(allocateRequest
|
|
|
|
+ .getResourceBlacklistRequest().getBlacklistAdditions());
|
|
|
|
+ this.blacklistAdditions.addAll(allocateRequest
|
|
|
|
+ .getResourceBlacklistRequest().getBlacklistAdditions());
|
|
|
|
+ }
|
|
|
|
+ if (allocateRequest.getResourceBlacklistRequest()
|
|
|
|
+ .getBlacklistRemovals() != null) {
|
|
|
|
+ this.remoteBlacklistedNodes.removeAll(allocateRequest
|
|
|
|
+ .getResourceBlacklistRequest().getBlacklistRemovals());
|
|
|
|
+ this.blacklistRemovals.addAll(allocateRequest
|
|
|
|
+ .getResourceBlacklistRequest().getBlacklistRemovals());
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (allocateRequest.getUpdateRequests() != null) {
|
|
|
|
+ for (UpdateContainerRequest update : allocateRequest
|
|
|
|
+ .getUpdateRequests()) {
|
|
|
|
+ this.remotePendingChange.put(update.getContainerId(), update);
|
|
|
|
+ this.change.put(update.getContainerId(), update);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
@Override
|
|
@Override
|
|
public AllocateResponse allocate(AllocateRequest allocateRequest)
|
|
public AllocateResponse allocate(AllocateRequest allocateRequest)
|
|
throws YarnException, IOException {
|
|
throws YarnException, IOException {
|
|
AllocateResponse allocateResponse = null;
|
|
AllocateResponse allocateResponse = null;
|
|
try {
|
|
try {
|
|
synchronized (this) {
|
|
synchronized (this) {
|
|
- // update the data structures first
|
|
|
|
- addNewAsks(allocateRequest.getAskList());
|
|
|
|
-
|
|
|
|
- if (allocateRequest.getReleaseList() != null) {
|
|
|
|
- this.remotePendingRelease.addAll(allocateRequest.getReleaseList());
|
|
|
|
- this.release.addAll(allocateRequest.getReleaseList());
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- if (allocateRequest.getResourceBlacklistRequest() != null) {
|
|
|
|
- if (allocateRequest.getResourceBlacklistRequest()
|
|
|
|
- .getBlacklistAdditions() != null) {
|
|
|
|
- this.remoteBlacklistedNodes.addAll(allocateRequest
|
|
|
|
- .getResourceBlacklistRequest().getBlacklistAdditions());
|
|
|
|
- this.blacklistAdditions.addAll(allocateRequest
|
|
|
|
- .getResourceBlacklistRequest().getBlacklistAdditions());
|
|
|
|
- }
|
|
|
|
- if (allocateRequest.getResourceBlacklistRequest()
|
|
|
|
- .getBlacklistRemovals() != null) {
|
|
|
|
- this.remoteBlacklistedNodes.removeAll(allocateRequest
|
|
|
|
- .getResourceBlacklistRequest().getBlacklistRemovals());
|
|
|
|
- this.blacklistRemovals.addAll(allocateRequest
|
|
|
|
- .getResourceBlacklistRequest().getBlacklistRemovals());
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- if (allocateRequest.getUpdateRequests() != null) {
|
|
|
|
- for (UpdateContainerRequest update : allocateRequest
|
|
|
|
- .getUpdateRequests()) {
|
|
|
|
- this.remotePendingChange.put(update.getContainerId(), update);
|
|
|
|
- this.change.put(update.getContainerId(), update);
|
|
|
|
- }
|
|
|
|
- }
|
|
|
|
|
|
+ addNewAllocateRequest(allocateRequest);
|
|
|
|
|
|
ArrayList<ResourceRequest> askList = new ArrayList<>(ask.size());
|
|
ArrayList<ResourceRequest> askList = new ArrayList<>(ask.size());
|
|
for (ResourceRequest r : ask) {
|
|
for (ResourceRequest r : ask) {
|
|
@@ -222,13 +239,23 @@ public class AMRMClientRelayer extends AbstractService
|
|
new ArrayList<>(this.blacklistAdditions),
|
|
new ArrayList<>(this.blacklistAdditions),
|
|
new ArrayList<>(this.blacklistRemovals)))
|
|
new ArrayList<>(this.blacklistRemovals)))
|
|
.updateRequests(new ArrayList<>(this.change.values())).build();
|
|
.updateRequests(new ArrayList<>(this.change.values())).build();
|
|
|
|
+
|
|
|
|
+ if (this.resetResponseId != -1) {
|
|
|
|
+ LOG.info("Override allocate responseId from "
|
|
|
|
+ + allocateRequest.getResponseId() + " to " + this.resetResponseId
|
|
|
|
+ + " for " + this.appId);
|
|
|
|
+ allocateRequest.setResponseId(this.resetResponseId);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
// Do the actual allocate call
|
|
// Do the actual allocate call
|
|
try {
|
|
try {
|
|
allocateResponse = this.rmClient.allocate(allocateRequest);
|
|
allocateResponse = this.rmClient.allocate(allocateRequest);
|
|
|
|
+
|
|
|
|
+ // Heartbeat succeeded, wipe out responseId overriding
|
|
|
|
+ this.resetResponseId = -1;
|
|
} catch (ApplicationMasterNotRegisteredException e) {
|
|
} catch (ApplicationMasterNotRegisteredException e) {
|
|
- LOG.warn("ApplicationMaster is out of sync with ResourceManager,"
|
|
|
|
|
|
+ LOG.warn("ApplicationMaster is out of sync with RM for " + this.appId
|
|
+ " hence resyncing.");
|
|
+ " hence resyncing.");
|
|
|
|
|
|
synchronized (this) {
|
|
synchronized (this) {
|
|
@@ -249,6 +276,25 @@ public class AMRMClientRelayer extends AbstractService
|
|
// Reset responseId after re-register
|
|
// Reset responseId after re-register
|
|
allocateRequest.setResponseId(0);
|
|
allocateRequest.setResponseId(0);
|
|
return allocate(allocateRequest);
|
|
return allocate(allocateRequest);
|
|
|
|
+ } catch (Throwable t) {
|
|
|
|
+
|
|
|
|
+ // If RM is complaining about responseId out of sync, force reset next
|
|
|
|
+ // time
|
|
|
|
+ if (t instanceof InvalidApplicationMasterRequestException) {
|
|
|
|
+ int responseId = AMRMClientUtils
|
|
|
|
+ .parseExpectedResponseIdFromException(t.getMessage());
|
|
|
|
+ if (responseId != -1) {
|
|
|
|
+ this.resetResponseId = responseId;
|
|
|
|
+ LOG.info("ResponseId out of sync with RM, expect " + responseId
|
|
|
|
+ + " but " + allocateRequest.getResponseId() + " used by "
|
|
|
|
+ + this.appId + ". Will override in the next allocate.");
|
|
|
|
+ } else {
|
|
|
|
+ LOG.warn("Failed to parse expected responseId out of exception for "
|
|
|
|
+ + this.appId);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ throw t;
|
|
}
|
|
}
|
|
|
|
|
|
synchronized (this) {
|
|
synchronized (this) {
|