|
@@ -230,7 +230,6 @@ None: directories are created on demand.
|
|
Rename task attempt path to task committed path.
|
|
Rename task attempt path to task committed path.
|
|
|
|
|
|
```python
|
|
```python
|
|
-
|
|
|
|
def needsTaskCommit(fs, jobAttemptPath, taskAttemptPath, dest):
|
|
def needsTaskCommit(fs, jobAttemptPath, taskAttemptPath, dest):
|
|
return fs.exists(taskAttemptPath)
|
|
return fs.exists(taskAttemptPath)
|
|
|
|
|
|
@@ -276,12 +275,12 @@ def commitJob(fs, jobAttemptDir, dest):
|
|
(See below for details on `mergePaths()`)
|
|
(See below for details on `mergePaths()`)
|
|
|
|
|
|
|
|
|
|
-A failure during job abort cannot be recovered from except by re-executing
|
|
|
|
|
|
+A failure during job commit cannot be recovered from except by re-executing
|
|
the entire query:
|
|
the entire query:
|
|
|
|
|
|
```python
|
|
```python
|
|
def isCommitJobRepeatable() :
|
|
def isCommitJobRepeatable() :
|
|
- return True
|
|
|
|
|
|
+ return False
|
|
```
|
|
```
|
|
|
|
|
|
Accordingly, it is a failure point in the protocol. With a low number of files
|
|
Accordingly, it is a failure point in the protocol. With a low number of files
|
|
@@ -307,12 +306,28 @@ def cleanupJob(fs, dest):
|
|
```
|
|
```
|
|
|
|
|
|
|
|
|
|
-### Job Recovery
|
|
|
|
|
|
+### Job Recovery Before `commitJob()`
|
|
|
|
|
|
-1. Data under task committed paths is retained
|
|
|
|
-1. All directories under `$dest/_temporary/$appAttemptId/_temporary/` are deleted.
|
|
|
|
|
|
+For all committers, the recovery process takes place in the application
|
|
|
|
+master.
|
|
|
|
+1. The job history file of the previous attempt is loaded and scanned
|
|
|
|
+to determine which tasks were recorded as having succeeded.
|
|
|
|
+1. For each successful task, the job committer has its `recoverTask()` method
|
|
|
|
+invoked with a `TaskAttemptContext` built from the previous attempt's details.
|
|
|
|
+1. If the method does not raise an exception, it is considered to have been
|
|
|
|
+recovered, and not to be re-executed.
|
|
|
|
+1. All other tasks are queued for execution.
|
|
|
|
|
|
-Uncommitted/unexecuted tasks are (re)executed.
|
|
|
|
|
|
+For the v1 committer, task recovery is straightforward.
|
|
|
|
+The directory of the committed task from the previous attempt is
|
|
|
|
+moved under the directory of the current application attempt.
|
|
|
|
+
|
|
|
|
+```python
|
|
|
|
+def recoverTask(tac):
|
|
|
|
+ oldAttemptId = appAttemptId - 1
|
|
|
|
+ fs.rename('$dest/_temporary/oldAttemptId/${tac.taskId}',
|
|
|
|
+ '$dest/_temporary/appAttemptId/${tac.taskId}')
|
|
|
|
+```
|
|
|
|
|
|
This significantly improves time to recover from Job driver (here MR AM) failure.
|
|
This significantly improves time to recover from Job driver (here MR AM) failure.
|
|
The only lost work is that of all tasks in progress — those which had generated
|
|
The only lost work is that of all tasks in progress — those which had generated
|
|
@@ -330,6 +345,11 @@ failure simply by rerunning the entire job. This is implicitly the strategy
|
|
in Spark, which does not attempt to recover any in-progress jobs. The faster
|
|
in Spark, which does not attempt to recover any in-progress jobs. The faster
|
|
your queries, the simpler your recovery strategy needs to be.
|
|
your queries, the simpler your recovery strategy needs to be.
|
|
|
|
|
|
|
|
+### Job Recovery During `commitJob()`
|
|
|
|
+
|
|
|
|
+This is not possible; a failure during job commit requires the entire job
|
|
|
|
+to be re-executed after cleaning up the destination directory.
|
|
|
|
+
|
|
### `mergePaths(FileSystem fs, FileStatus src, Path dest)` Algorithm
|
|
### `mergePaths(FileSystem fs, FileStatus src, Path dest)` Algorithm
|
|
|
|
|
|
`mergePaths()` is the core algorithm to merge data; it is somewhat confusing
|
|
`mergePaths()` is the core algorithm to merge data; it is somewhat confusing
|
|
@@ -352,24 +372,23 @@ def mergePathsV1(fs, src, dest) :
|
|
fs.delete(dest, recursive = True)
|
|
fs.delete(dest, recursive = True)
|
|
fs.rename(src.getPath, dest)
|
|
fs.rename(src.getPath, dest)
|
|
else :
|
|
else :
|
|
- # destination is directory, choose action on source type
|
|
|
|
- if src.isDirectory :
|
|
|
|
- if not toStat is None :
|
|
|
|
- if not toStat.isDirectory :
|
|
|
|
- # Destination exists and is not a directory
|
|
|
|
- fs.delete(dest)
|
|
|
|
- fs.rename(src.getPath(), dest)
|
|
|
|
- else :
|
|
|
|
- # Destination exists and is a directory
|
|
|
|
- # merge all children under destination directory
|
|
|
|
- for child in fs.listStatus(src.getPath) :
|
|
|
|
- mergePathsV1(fs, child, dest + child.getName)
|
|
|
|
- else :
|
|
|
|
- # destination does not exist
|
|
|
|
|
|
+ # src is directory, choose action on dest type
|
|
|
|
+ if not toStat is None :
|
|
|
|
+ if not toStat.isDirectory :
|
|
|
|
+ # Destination exists and is not a directory
|
|
|
|
+ fs.delete(dest)
|
|
fs.rename(src.getPath(), dest)
|
|
fs.rename(src.getPath(), dest)
|
|
|
|
+ else :
|
|
|
|
+ # Destination exists and is a directory
|
|
|
|
+ # merge all children under destination directory
|
|
|
|
+ for child in fs.listStatus(src.getPath) :
|
|
|
|
+ mergePathsV1(fs, child, dest + child.getName)
|
|
|
|
+ else :
|
|
|
|
+ # destination does not exist
|
|
|
|
+ fs.rename(src.getPath(), dest)
|
|
```
|
|
```
|
|
|
|
|
|
-## v2 commit algorithm
|
|
|
|
|
|
+## The v2 Commit Algorithm
|
|
|
|
|
|
|
|
|
|
The v2 algorithm directly commits task output into the destination directory.
|
|
The v2 algorithm directly commits task output into the destination directory.
|
|
@@ -506,12 +525,31 @@ Cost: `O(1)` for normal filesystems, `O(files)` for object stores.
|
|
As no data is written to the destination directory, a task can be cleaned up
|
|
As no data is written to the destination directory, a task can be cleaned up
|
|
by deleting the task attempt directory.
|
|
by deleting the task attempt directory.
|
|
|
|
|
|
-### v2 Job Recovery
|
|
|
|
|
|
+### v2 Job Recovery Before `commitJob()`
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+Because the data has been renamed into the destination directory, all tasks
|
|
|
|
+recorded as having being committed have no recovery needed at all:
|
|
|
|
+
|
|
|
|
+```python
|
|
|
|
+def recoverTask(tac):
|
|
|
|
+```
|
|
|
|
+
|
|
|
|
+All active and queued tasks are scheduled for execution.
|
|
|
|
+
|
|
|
|
+There is a weakness here, the same one on a failure during `commitTask()`:
|
|
|
|
+it is only safe to repeat a task which failed during that commit operation
|
|
|
|
+if the name of all generated files are constant across all task attempts.
|
|
|
|
+
|
|
|
|
+If the Job AM fails while a task attempt has been instructed to commit,
|
|
|
|
+and that commit is not recorded as having completed, the state of that
|
|
|
|
+in-progress task is unknown...really it isn't be safe to recover the
|
|
|
|
+job at this point.
|
|
|
|
+
|
|
|
|
|
|
-Because the data has been renamed into the destination directory, it is nominally
|
|
|
|
-recoverable. However, this assumes that the number and name of generated
|
|
|
|
-files are constant on retried tasks.
|
|
|
|
|
|
+### v2 Job Recovery During `commitJob()`
|
|
|
|
|
|
|
|
+This is straightforward: `commitJob()` is re-invoked.
|
|
|
|
|
|
## How MapReduce uses the committer in a task
|
|
## How MapReduce uses the committer in a task
|
|
|
|
|
|
@@ -896,7 +934,7 @@ and metadata.
|
|
|
|
|
|
POST bucket.s3.aws.com/path?uploads
|
|
POST bucket.s3.aws.com/path?uploads
|
|
|
|
|
|
- An UploadId is returned
|
|
|
|
|
|
+ An `UploadId` is returned
|
|
|
|
|
|
1. Caller uploads one or more parts.
|
|
1. Caller uploads one or more parts.
|
|
|
|
|
|
@@ -994,7 +1032,7 @@ Task outputs are directed to the local FS by `getTaskAttemptPath` and `getWorkPa
|
|
|
|
|
|
The single-directory and partitioned committers handle conflict resolution by
|
|
The single-directory and partitioned committers handle conflict resolution by
|
|
checking whether target paths exist in S3 before uploading any data.
|
|
checking whether target paths exist in S3 before uploading any data.
|
|
-There are 3 conflict resolution modes, controlled by setting `fs.s3a.committer.staging.conflict-mode`:
|
|
|
|
|
|
+There are three conflict resolution modes, controlled by setting `fs.s3a.committer.staging.conflict-mode`:
|
|
|
|
|
|
* `fail`: Fail a task if an output directory or partition already exists. (Default)
|
|
* `fail`: Fail a task if an output directory or partition already exists. (Default)
|
|
* `append`: Upload data files without checking whether directories or partitions already exist.
|
|
* `append`: Upload data files without checking whether directories or partitions already exist.
|