|
@@ -0,0 +1,308 @@
|
|
|
+/**
|
|
|
+ * Licensed to the Apache Software Foundation (ASF) under one
|
|
|
+ * or more contributor license agreements. See the NOTICE file
|
|
|
+ * distributed with this work for additional information
|
|
|
+ * regarding copyright ownership. The ASF licenses this file
|
|
|
+ * to you under the Apache License, Version 2.0 (the
|
|
|
+ * "License"); you may not use this file except in compliance
|
|
|
+ * with the License. You may obtain a copy of the License at
|
|
|
+ *
|
|
|
+ * http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+ *
|
|
|
+ * Unless required by applicable law or agreed to in writing, software
|
|
|
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+ * See the License for the specific language governing permissions and
|
|
|
+ * limitations under the License.
|
|
|
+ */
|
|
|
+
|
|
|
+package org.apache.hadoop.yarn.server.sharedcachemanager;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.concurrent.locks.Lock;
|
|
|
+
|
|
|
+import org.apache.commons.logging.Log;
|
|
|
+import org.apache.commons.logging.LogFactory;
|
|
|
+import org.apache.hadoop.classification.InterfaceAudience.Private;
|
|
|
+import org.apache.hadoop.classification.InterfaceStability.Evolving;
|
|
|
+import org.apache.hadoop.conf.Configuration;
|
|
|
+import org.apache.hadoop.fs.FileStatus;
|
|
|
+import org.apache.hadoop.fs.FileSystem;
|
|
|
+import org.apache.hadoop.fs.Path;
|
|
|
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|
|
+import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
|
+import org.apache.hadoop.yarn.server.sharedcache.SharedCacheUtil;
|
|
|
+import org.apache.hadoop.yarn.server.sharedcachemanager.metrics.CleanerMetrics;
|
|
|
+import org.apache.hadoop.yarn.server.sharedcachemanager.store.SCMStore;
|
|
|
+
|
|
|
+/**
|
|
|
+ * The task that runs and cleans up the shared cache area for stale entries and
|
|
|
+ * orphaned files. It is expected that only one cleaner task runs at any given
|
|
|
+ * point in time.
|
|
|
+ */
|
|
|
+@Private
|
|
|
+@Evolving
|
|
|
+class CleanerTask implements Runnable {
|
|
|
+ private static final String RENAMED_SUFFIX = "-renamed";
|
|
|
+ private static final Log LOG = LogFactory.getLog(CleanerTask.class);
|
|
|
+
|
|
|
+ private final String location;
|
|
|
+ private final long sleepTime;
|
|
|
+ private final int nestedLevel;
|
|
|
+ private final Path root;
|
|
|
+ private final FileSystem fs;
|
|
|
+ private final SCMStore store;
|
|
|
+ private final CleanerMetrics metrics;
|
|
|
+ private final Lock cleanerTaskLock;
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Creates a cleaner task based on the configuration. This is provided for
|
|
|
+ * convenience.
|
|
|
+ *
|
|
|
+ * @param conf
|
|
|
+ * @param store
|
|
|
+ * @param metrics
|
|
|
+ * @param cleanerTaskLock lock that ensures a serial execution of cleaner
|
|
|
+ * task
|
|
|
+ * @return an instance of a CleanerTask
|
|
|
+ */
|
|
|
+ public static CleanerTask create(Configuration conf, SCMStore store,
|
|
|
+ CleanerMetrics metrics, Lock cleanerTaskLock) {
|
|
|
+ try {
|
|
|
+ // get the root directory for the shared cache
|
|
|
+ String location =
|
|
|
+ conf.get(YarnConfiguration.SHARED_CACHE_ROOT,
|
|
|
+ YarnConfiguration.DEFAULT_SHARED_CACHE_ROOT);
|
|
|
+
|
|
|
+ long sleepTime =
|
|
|
+ conf.getLong(YarnConfiguration.SCM_CLEANER_RESOURCE_SLEEP_MS,
|
|
|
+ YarnConfiguration.DEFAULT_SCM_CLEANER_RESOURCE_SLEEP_MS);
|
|
|
+ int nestedLevel = SharedCacheUtil.getCacheDepth(conf);
|
|
|
+ FileSystem fs = FileSystem.get(conf);
|
|
|
+
|
|
|
+ return new CleanerTask(location, sleepTime, nestedLevel, fs, store,
|
|
|
+ metrics, cleanerTaskLock);
|
|
|
+ } catch (IOException e) {
|
|
|
+ LOG.error("Unable to obtain the filesystem for the cleaner service", e);
|
|
|
+ throw new ExceptionInInitializerError(e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Creates a cleaner task based on the root directory location and the
|
|
|
+ * filesystem.
|
|
|
+ */
|
|
|
+ CleanerTask(String location, long sleepTime, int nestedLevel, FileSystem fs,
|
|
|
+ SCMStore store, CleanerMetrics metrics, Lock cleanerTaskLock) {
|
|
|
+ this.location = location;
|
|
|
+ this.sleepTime = sleepTime;
|
|
|
+ this.nestedLevel = nestedLevel;
|
|
|
+ this.root = new Path(location);
|
|
|
+ this.fs = fs;
|
|
|
+ this.store = store;
|
|
|
+ this.metrics = metrics;
|
|
|
+ this.cleanerTaskLock = cleanerTaskLock;
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public void run() {
|
|
|
+ if (!this.cleanerTaskLock.tryLock()) {
|
|
|
+ // there is already another task running
|
|
|
+ LOG.warn("A cleaner task is already running. "
|
|
|
+ + "This scheduled cleaner task will do nothing.");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ try {
|
|
|
+ if (!fs.exists(root)) {
|
|
|
+ LOG.error("The shared cache root " + location + " was not found. "
|
|
|
+ + "The cleaner task will do nothing.");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // we're now ready to process the shared cache area
|
|
|
+ process();
|
|
|
+ } catch (Throwable e) {
|
|
|
+ LOG.error("Unexpected exception while initializing the cleaner task. "
|
|
|
+ + "This task will do nothing,", e);
|
|
|
+ } finally {
|
|
|
+ // this is set to false regardless of if it is a scheduled or on-demand
|
|
|
+ // task
|
|
|
+ this.cleanerTaskLock.unlock();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Sweeps and processes the shared cache area to clean up stale and orphaned
|
|
|
+ * files.
|
|
|
+ */
|
|
|
+ void process() {
|
|
|
+ // mark the beginning of the run in the metrics
|
|
|
+ metrics.reportCleaningStart();
|
|
|
+ try {
|
|
|
+ // now traverse individual directories and process them
|
|
|
+ // the directory structure is specified by the nested level parameter
|
|
|
+ // (e.g. 9/c/d/<checksum>)
|
|
|
+ String pattern = SharedCacheUtil.getCacheEntryGlobPattern(nestedLevel);
|
|
|
+ FileStatus[] resources =
|
|
|
+ fs.globStatus(new Path(root, pattern));
|
|
|
+ int numResources = resources == null ? 0 : resources.length;
|
|
|
+ LOG.info("Processing " + numResources + " resources in the shared cache");
|
|
|
+ long beginMs = System.currentTimeMillis();
|
|
|
+ if (resources != null) {
|
|
|
+ for (FileStatus resource : resources) {
|
|
|
+ // check for interruption so it can abort in a timely manner in case
|
|
|
+ // of shutdown
|
|
|
+ if (Thread.currentThread().isInterrupted()) {
|
|
|
+ LOG.warn("The cleaner task was interrupted. Aborting.");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (resource.isDirectory()) {
|
|
|
+ processSingleResource(resource);
|
|
|
+ } else {
|
|
|
+ LOG.warn("Invalid file at path " + resource.getPath().toString()
|
|
|
+ +
|
|
|
+ " when a directory was expected");
|
|
|
+ }
|
|
|
+ // add sleep time between cleaning each directory if it is non-zero
|
|
|
+ if (sleepTime > 0) {
|
|
|
+ Thread.sleep(sleepTime);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ long endMs = System.currentTimeMillis();
|
|
|
+ long durationMs = endMs - beginMs;
|
|
|
+ LOG.info("Processed " + numResources + " resource(s) in " + durationMs +
|
|
|
+ " ms.");
|
|
|
+ } catch (IOException e1) {
|
|
|
+ LOG.error("Unable to complete the cleaner task", e1);
|
|
|
+ } catch (InterruptedException e2) {
|
|
|
+ Thread.currentThread().interrupt(); // restore the interrupt
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Returns a path for the root directory for the shared cache.
|
|
|
+ */
|
|
|
+ Path getRootPath() {
|
|
|
+ return root;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Processes a single shared cache resource directory.
|
|
|
+ */
|
|
|
+ void processSingleResource(FileStatus resource) {
|
|
|
+ Path path = resource.getPath();
|
|
|
+ // indicates the processing status of the resource
|
|
|
+ ResourceStatus resourceStatus = ResourceStatus.INIT;
|
|
|
+
|
|
|
+ // first, if the path ends with the renamed suffix, it indicates the
|
|
|
+ // directory was moved (as stale) but somehow not deleted (probably due to
|
|
|
+ // SCM failure); delete the directory
|
|
|
+ if (path.toString().endsWith(RENAMED_SUFFIX)) {
|
|
|
+ LOG.info("Found a renamed directory that was left undeleted at " +
|
|
|
+ path.toString() + ". Deleting.");
|
|
|
+ try {
|
|
|
+ if (fs.delete(path, true)) {
|
|
|
+ resourceStatus = ResourceStatus.DELETED;
|
|
|
+ }
|
|
|
+ } catch (IOException e) {
|
|
|
+ LOG.error("Error while processing a shared cache resource: " + path, e);
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ // this is the path to the cache resource directory
|
|
|
+ // the directory name is the resource key (i.e. a unique identifier)
|
|
|
+ String key = path.getName();
|
|
|
+
|
|
|
+ try {
|
|
|
+ store.cleanResourceReferences(key);
|
|
|
+ } catch (YarnException e) {
|
|
|
+ LOG.error("Exception thrown while removing dead appIds.", e);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (store.isResourceEvictable(key, resource)) {
|
|
|
+ try {
|
|
|
+ /*
|
|
|
+ * TODO See YARN-2663: There is a race condition between
|
|
|
+ * store.removeResource(key) and
|
|
|
+ * removeResourceFromCacheFileSystem(path) operations because they do
|
|
|
+ * not happen atomically and resources can be uploaded with different
|
|
|
+ * file names by the node managers.
|
|
|
+ */
|
|
|
+ // remove the resource from scm (checks for appIds as well)
|
|
|
+ if (store.removeResource(key)) {
|
|
|
+ // remove the resource from the file system
|
|
|
+ boolean deleted = removeResourceFromCacheFileSystem(path);
|
|
|
+ if (deleted) {
|
|
|
+ resourceStatus = ResourceStatus.DELETED;
|
|
|
+ } else {
|
|
|
+ LOG.error("Failed to remove path from the file system."
|
|
|
+ + " Skipping this resource: " + path);
|
|
|
+ resourceStatus = ResourceStatus.ERROR;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ // we did not delete the resource because it contained application
|
|
|
+ // ids
|
|
|
+ resourceStatus = ResourceStatus.PROCESSED;
|
|
|
+ }
|
|
|
+ } catch (IOException e) {
|
|
|
+ LOG.error(
|
|
|
+ "Failed to remove path from the file system. Skipping this resource: "
|
|
|
+ + path, e);
|
|
|
+ resourceStatus = ResourceStatus.ERROR;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ resourceStatus = ResourceStatus.PROCESSED;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // record the processing
|
|
|
+ switch (resourceStatus) {
|
|
|
+ case DELETED:
|
|
|
+ metrics.reportAFileDelete();
|
|
|
+ break;
|
|
|
+ case PROCESSED:
|
|
|
+ metrics.reportAFileProcess();
|
|
|
+ break;
|
|
|
+ case ERROR:
|
|
|
+ metrics.reportAFileError();
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ LOG.error("Cleaner encountered an invalid status (" + resourceStatus
|
|
|
+ + ") while processing resource: " + path.getName());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private boolean removeResourceFromCacheFileSystem(Path path)
|
|
|
+ throws IOException {
|
|
|
+ // rename the directory to make the delete atomic
|
|
|
+ Path renamedPath = new Path(path.toString() + RENAMED_SUFFIX);
|
|
|
+ if (fs.rename(path, renamedPath)) {
|
|
|
+ // the directory can be removed safely now
|
|
|
+ // log the original path
|
|
|
+ LOG.info("Deleting " + path.toString());
|
|
|
+ return fs.delete(renamedPath, true);
|
|
|
+ } else {
|
|
|
+ // we were unable to remove it for some reason: it's best to leave
|
|
|
+ // it at that
|
|
|
+ LOG.error("We were not able to rename the directory to "
|
|
|
+ + renamedPath.toString() + ". We will leave it intact.");
|
|
|
+ }
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * A status indicating what happened with the processing of a given cache
|
|
|
+ * resource.
|
|
|
+ */
|
|
|
+ private enum ResourceStatus {
|
|
|
+ INIT,
|
|
|
+ /** Resource was successfully processed, but not deleted **/
|
|
|
+ PROCESSED,
|
|
|
+ /** Resource was successfully deleted **/
|
|
|
+ DELETED,
|
|
|
+ /** The cleaner task ran into an error while processing the resource **/
|
|
|
+ ERROR
|
|
|
+ }
|
|
|
+}
|