Browse Source

YARN-4267. Add additional logging to container launch implementations in container-executor. Contributed by Sidharta Seethana.

Varun Vasudev 9 năm trước cách đây
mục cha
commit
151eab2a12

+ 4 - 0
hadoop-yarn-project/CHANGES.txt

@@ -519,6 +519,10 @@ Release 2.8.0 - UNRELEASED
     YARN-4170. AM need to be notified with priority in AllocateResponse.
     (Sunil G via jianhe)
 
+    YARN-4267. Add additional logging to container launch implementations in
+    container-executor. (Sidharta Seethana via vvasudev)
+
+
   OPTIMIZATIONS
 
     YARN-3339. TestDockerContainerExecutor should pull a single image and not

+ 22 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c

@@ -1225,6 +1225,8 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
   if (docker_binary == NULL) {
     docker_binary = "docker";
   }
+
+  fprintf(LOGFILE, "Creating script paths...\n");
   exit_code = create_script_paths(
     work_dir, script_name, cred_file, &script_file_dest, &cred_file_dest,
     &container_file_source, &cred_file_source);
@@ -1235,6 +1237,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
   }
   gid_t user_gid = getegid();
 
+  fprintf(LOGFILE, "Creating local dirs...\n");
   exit_code = create_local_dirs(user, app_id, container_id,
     work_dir, script_name, cred_file, local_dirs, log_dirs,
     1, script_file_dest, cred_file_dest,
@@ -1245,6 +1248,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     goto cleanup;
   }
 
+  fprintf(LOGFILE, "Getting exit code file...\n");
   exit_code_file = get_exit_code_file(pid_file);
   if (NULL == exit_code_file) {
     exit_code = OUT_OF_MEMORY;
@@ -1253,6 +1257,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     goto cleanup;
   }
 
+  fprintf(LOGFILE, "Changing effective user to root...\n");
   if (change_effective_user(0, user_gid) != 0) {
     fprintf(ERRORFILE, "Could not change to effective users %d, %d\n", 0, user_gid);
     fflush(ERRORFILE);
@@ -1261,6 +1266,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
 
   snprintf(docker_command_with_binary, EXECUTOR_PATH_MAX, "%s %s", docker_binary, docker_command);
 
+  fprintf(LOGFILE, "Launching docker container...\n");
   FILE* start_docker = popen(docker_command_with_binary, "r");
   if (pclose (start_docker) != 0)
   {
@@ -1275,6 +1281,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     "%s inspect --format {{.State.Pid}} %s",
     docker_binary, container_id);
 
+  fprintf(LOGFILE, "Inspecting docker container...\n");
   FILE* inspect_docker = popen(docker_inspect_command, "r");
   int pid = 0;
   int res = fscanf (inspect_docker, "%d", &pid);
@@ -1288,6 +1295,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
   }
 
   if (pid != 0) {
+    fprintf(LOGFILE, "Writing to cgroup task files...\n");
     // cgroups-based resource enforcement
     if (resources_key != NULL && ! strcmp(resources_key, "cgroups")) {
      // write pid to cgroups
@@ -1301,7 +1309,9 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
        }
      }
     }
+
     // write pid to pidfile
+    fprintf(LOGFILE, "Writing pid file...\n");
     if (pid_file == NULL
         || write_pid_to_file_as_nm(pid_file, (pid_t)pid) != 0) {
       exit_code = WRITE_PIDFILE_FAILED;
@@ -1313,6 +1323,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     snprintf(docker_wait_command, EXECUTOR_PATH_MAX,
       "%s wait %s", docker_binary, container_id);
 
+    fprintf(LOGFILE, "Waiting for docker container to finish...\n");
     FILE* wait_docker = popen(docker_wait_command, "r");
     res = fscanf (wait_docker, "%d", &exit_code);
     if (pclose (wait_docker) != 0 || res <= 0) {
@@ -1321,6 +1332,8 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
       fflush(ERRORFILE);
     }
     if(exit_code != 0) {
+      fprintf(ERRORFILE, "Docker container exit code was not zero: %d\n",
+      exit_code);
       snprintf(docker_logs_command, EXECUTOR_PATH_MAX, "%s logs --tail=250 %s",
         docker_binary, container_id);
       FILE* logs = popen(docker_logs_command, "r");
@@ -1350,6 +1363,7 @@ int launch_docker_container_as_user(const char * user, const char *app_id,
     }
   }
 
+  fprintf(LOGFILE, "Removing docker container post-exit...\n");
   snprintf(docker_rm_command, EXECUTOR_PATH_MAX,
     "%s rm %s", docker_binary, container_id);
   FILE* rm_docker = popen(docker_rm_command, "w");
@@ -1399,7 +1413,7 @@ int launch_container_as_user(const char *user, const char *app_id,
   char *cred_file_dest = NULL;
   char *exit_code_file = NULL;
 
-
+  fprintf(LOGFILE, "Getting exit code file...\n");
   exit_code_file = get_exit_code_file(pid_file);
   if (NULL == exit_code_file) {
     exit_code = OUT_OF_MEMORY;
@@ -1408,6 +1422,8 @@ int launch_container_as_user(const char *user, const char *app_id,
 
   int container_file_source =-1;
   int cred_file_source = -1;
+
+  fprintf(LOGFILE, "Creating script paths...\n");
   exit_code = create_script_paths(
     work_dir, script_name, cred_file, &script_file_dest, &cred_file_dest,
     &container_file_source, &cred_file_source);
@@ -1431,6 +1447,7 @@ int launch_container_as_user(const char *user, const char *app_id,
     goto cleanup;
   }
 
+  fprintf(LOGFILE, "Writing pid file...\n");
   // write pid to pidfile
   if (pid_file == NULL
       || write_pid_to_file_as_nm(pid_file, pid) != 0) {
@@ -1438,6 +1455,7 @@ int launch_container_as_user(const char *user, const char *app_id,
     goto cleanup;
   }
 
+  fprintf(LOGFILE, "Writing to cgroup task files...\n");
   // cgroups-based resource enforcement
   if (resources_key != NULL && ! strcmp(resources_key, "cgroups")) {
     // write pid to cgroups
@@ -1452,6 +1470,7 @@ int launch_container_as_user(const char *user, const char *app_id,
     }
   }
 
+  fprintf(LOGFILE, "Creating local dirs...\n");
   exit_code = create_local_dirs(user, app_id, container_id,
     work_dir, script_name, cred_file, local_dirs, log_dirs,
     0, script_file_dest, cred_file_dest,
@@ -1462,6 +1481,8 @@ int launch_container_as_user(const char *user, const char *app_id,
     goto cleanup;
   }
 
+  fprintf(LOGFILE, "Launching container...\n");
+
 #if HAVE_FCLOSEALL
   fcloseall();
 #else