|
@@ -117,6 +117,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|
|
import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
|
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
|
|
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
|
|
+import org.apache.hadoop.yarn.util.BoundedAppender;
|
|
|
import org.apache.hadoop.yarn.util.SystemClock;
|
|
|
import org.apache.hadoop.yarn.util.TimelineServiceHelper;
|
|
|
import org.apache.hadoop.yarn.util.resource.ResourceUtils;
|
|
@@ -345,6 +346,7 @@ public class ApplicationMaster {
|
|
|
static final String CONTAINER_ENTITY_GROUP_ID = "CONTAINERS";
|
|
|
static final String APPID_TIMELINE_FILTER_NAME = "appId";
|
|
|
static final String USER_TIMELINE_FILTER_NAME = "user";
|
|
|
+ static final String DIAGNOSTICS = "Diagnostics";
|
|
|
|
|
|
private final String linux_bash_command = "bash";
|
|
|
private final String windows_command = "cmd /c";
|
|
@@ -356,6 +358,8 @@ public class ApplicationMaster {
|
|
|
protected final Set<ContainerId> launchedContainers =
|
|
|
Collections.newSetFromMap(new ConcurrentHashMap<ContainerId, Boolean>());
|
|
|
|
|
|
+ private BoundedAppender diagnostics = new BoundedAppender(64 * 1024);
|
|
|
+
|
|
|
/**
|
|
|
* Container start times used to set id prefix while publishing entity
|
|
|
* to ATSv2.
|
|
@@ -390,7 +394,7 @@ public class ApplicationMaster {
|
|
|
LOG.info("Application Master completed successfully. exiting");
|
|
|
System.exit(0);
|
|
|
} else {
|
|
|
- LOG.info("Application Master failed. exiting");
|
|
|
+ LOG.error("Application Master failed. exiting");
|
|
|
System.exit(2);
|
|
|
}
|
|
|
}
|
|
@@ -931,28 +935,25 @@ public class ApplicationMaster {
|
|
|
LOG.info("Application completed. Signalling finish to RM");
|
|
|
|
|
|
FinalApplicationStatus appStatus;
|
|
|
- String appMessage = null;
|
|
|
boolean success = true;
|
|
|
+ String message = null;
|
|
|
if (numCompletedContainers.get() - numFailedContainers.get()
|
|
|
>= numTotalContainers) {
|
|
|
appStatus = FinalApplicationStatus.SUCCEEDED;
|
|
|
} else {
|
|
|
appStatus = FinalApplicationStatus.FAILED;
|
|
|
- appMessage = "Diagnostics." + ", total=" + numTotalContainers
|
|
|
- + ", completed=" + numCompletedContainers.get() + ", allocated="
|
|
|
- + numAllocatedContainers.get() + ", failed="
|
|
|
- + numFailedContainers.get();
|
|
|
- LOG.info(appMessage);
|
|
|
+ message = String.format("Application Failure: desired = %d, " +
|
|
|
+ "completed = %d, allocated = %d, failed = %d, " +
|
|
|
+ "diagnostics = %s", numRequestedContainers.get(),
|
|
|
+ numCompletedContainers.get(), numAllocatedContainers.get(),
|
|
|
+ numFailedContainers.get(), diagnostics);
|
|
|
success = false;
|
|
|
}
|
|
|
try {
|
|
|
- amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
|
|
|
- } catch (YarnException ex) {
|
|
|
+ amRMClient.unregisterApplicationMaster(appStatus, message, null);
|
|
|
+ } catch (YarnException | IOException ex) {
|
|
|
LOG.error("Failed to unregister application", ex);
|
|
|
- } catch (IOException e) {
|
|
|
- LOG.error("Failed to unregister application", e);
|
|
|
}
|
|
|
-
|
|
|
amRMClient.stop();
|
|
|
|
|
|
// Stop Timeline Client
|
|
@@ -974,11 +975,17 @@ public class ApplicationMaster {
|
|
|
LOG.info("Got response from RM for container ask, completedCnt="
|
|
|
+ completedContainers.size());
|
|
|
for (ContainerStatus containerStatus : completedContainers) {
|
|
|
- LOG.info(appAttemptID + " got container status for containerID="
|
|
|
+ String message = appAttemptID + " got container status for containerID="
|
|
|
+ containerStatus.getContainerId() + ", state="
|
|
|
+ containerStatus.getState() + ", exitStatus="
|
|
|
+ containerStatus.getExitStatus() + ", diagnostics="
|
|
|
- + containerStatus.getDiagnostics());
|
|
|
+ + containerStatus.getDiagnostics();
|
|
|
+ if (containerStatus.getExitStatus() != 0) {
|
|
|
+ LOG.error(message);
|
|
|
+ diagnostics.append(containerStatus.getDiagnostics());
|
|
|
+ } else {
|
|
|
+ LOG.info(message);
|
|
|
+ }
|
|
|
|
|
|
// non complete containers should not be here
|
|
|
assert (containerStatus.getState() == ContainerState.COMPLETE);
|
|
@@ -1244,10 +1251,17 @@ public class ApplicationMaster {
|
|
|
|
|
|
@Override
|
|
|
public void onStartContainerError(ContainerId containerId, Throwable t) {
|
|
|
- LOG.error("Failed to start Container " + containerId, t);
|
|
|
+ LOG.error("Failed to start Container {}", containerId, t);
|
|
|
containers.remove(containerId);
|
|
|
applicationMaster.numCompletedContainers.incrementAndGet();
|
|
|
applicationMaster.numFailedContainers.incrementAndGet();
|
|
|
+ if (timelineServiceV2Enabled) {
|
|
|
+ publishContainerStartFailedEventOnTimelineServiceV2(containerId,
|
|
|
+ t.getMessage());
|
|
|
+ }
|
|
|
+ if (timelineServiceV1Enabled) {
|
|
|
+ publishContainerStartFailedEvent(containerId, t.getMessage());
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
@Override
|
|
@@ -1525,6 +1539,7 @@ public class ApplicationMaster {
|
|
|
event.setEventType(DSEvent.DS_CONTAINER_END.toString());
|
|
|
event.addEventInfo("State", container.getState().name());
|
|
|
event.addEventInfo("Exit Status", container.getExitStatus());
|
|
|
+ event.addEventInfo(DIAGNOSTICS, container.getDiagnostics());
|
|
|
entity.addEvent(event);
|
|
|
try {
|
|
|
processTimelineResponseErrors(
|
|
@@ -1653,6 +1668,58 @@ public class ApplicationMaster {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ private void publishContainerStartFailedEventOnTimelineServiceV2(
|
|
|
+ final ContainerId containerId, String diagnostics) {
|
|
|
+ final org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity
|
|
|
+ entity = new org.apache.hadoop.yarn.api.records.timelineservice.
|
|
|
+ TimelineEntity();
|
|
|
+ entity.setId(containerId.toString());
|
|
|
+ entity.setType(DSEntity.DS_CONTAINER.toString());
|
|
|
+ entity.addInfo("user", appSubmitterUgi.getShortUserName());
|
|
|
+ org.apache.hadoop.yarn.api.records.timelineservice.TimelineEvent event =
|
|
|
+ new org.apache.hadoop.yarn.api.records.timelineservice
|
|
|
+ .TimelineEvent();
|
|
|
+ event.setTimestamp(System.currentTimeMillis());
|
|
|
+ event.setId(DSEvent.DS_CONTAINER_END.toString());
|
|
|
+ event.addInfo(DIAGNOSTICS, diagnostics);
|
|
|
+ entity.addEvent(event);
|
|
|
+ try {
|
|
|
+ appSubmitterUgi.doAs((PrivilegedExceptionAction<Object>) () -> {
|
|
|
+ timelineV2Client.putEntitiesAsync(entity);
|
|
|
+ return null;
|
|
|
+ });
|
|
|
+ } catch (Exception e) {
|
|
|
+ LOG.error("Container start failed event could not be published for {}",
|
|
|
+ containerId,
|
|
|
+ e instanceof UndeclaredThrowableException ? e.getCause() : e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void publishContainerStartFailedEvent(final ContainerId containerId,
|
|
|
+ String diagnostics) {
|
|
|
+ final TimelineEntity entityV1 = new TimelineEntity();
|
|
|
+ entityV1.setEntityId(containerId.toString());
|
|
|
+ entityV1.setEntityType(DSEntity.DS_CONTAINER.toString());
|
|
|
+ entityV1.setDomainId(domainId);
|
|
|
+ entityV1.addPrimaryFilter(USER_TIMELINE_FILTER_NAME, appSubmitterUgi
|
|
|
+ .getShortUserName());
|
|
|
+ entityV1.addPrimaryFilter(APPID_TIMELINE_FILTER_NAME,
|
|
|
+ containerId.getApplicationAttemptId().getApplicationId().toString());
|
|
|
+
|
|
|
+ TimelineEvent eventV1 = new TimelineEvent();
|
|
|
+ eventV1.setTimestamp(System.currentTimeMillis());
|
|
|
+ eventV1.setEventType(DSEvent.DS_CONTAINER_END.toString());
|
|
|
+ eventV1.addEventInfo(DIAGNOSTICS, diagnostics);
|
|
|
+ entityV1.addEvent(eventV1);
|
|
|
+ try {
|
|
|
+ processTimelineResponseErrors(putContainerEntity(timelineClient,
|
|
|
+ containerId.getApplicationAttemptId(), entityV1));
|
|
|
+ } catch (YarnException | IOException | ClientHandlerException e) {
|
|
|
+ LOG.error("Container end event could not be published for {}",
|
|
|
+ containerId, e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
private void publishContainerEndEventOnTimelineServiceV2(
|
|
|
final ContainerStatus container, long containerStartTime) {
|
|
|
final org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity
|
|
@@ -1669,6 +1736,7 @@ public class ApplicationMaster {
|
|
|
event.setId(DSEvent.DS_CONTAINER_END.toString());
|
|
|
event.addInfo("State", container.getState().name());
|
|
|
event.addInfo("Exit Status", container.getExitStatus());
|
|
|
+ event.addInfo(DIAGNOSTICS, container.getDiagnostics());
|
|
|
entity.addEvent(event);
|
|
|
entity.setIdPrefix(TimelineServiceHelper.invertLong(containerStartTime));
|
|
|
|