
MAPREDUCE-2981. Backport FairScheduler from trunk. Contributed by Matei Zaharia.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.20-security@1169585 13f79535-47bb-0310-9956-ffa450edef68
Arun Murthy 13 years ago
parent
commit
1daa814588
37 changed files with 4566 additions and 989 deletions
  1. CHANGES.txt (+3 -0)
  2. conf/fair-scheduler.xml.template (+12 -0)
  3. src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java (+3 -2)
  4. src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java (+8 -2)
  5. src/contrib/fairscheduler/designdoc/fair_scheduler_design_doc.tex (+253 -0)
  6. src/contrib/fairscheduler/ivy.xml (+8 -0)
  7. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/CapBasedLoadManager.java (+18 -1)
  8. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/DefaultTaskSelector.java (+13 -4)
  9. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FairScheduler.java (+506 -427)
  10. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FairSchedulerEventLog.java (+142 -0)
  11. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FairSchedulerServlet.java (+113 -86)
  12. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FifoJobComparator.java (+2 -1)
  13. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/JobSchedulable.java (+185 -0)
  14. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/LoadManager.java (+22 -0)
  15. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/LocalityLevel.java (+65 -0)
  16. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/NewJobWeightBooster.java (+1 -0)
  17. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/Pool.java (+41 -1)
  18. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/PoolManager.java (+228 -23)
  19. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/PoolSchedulable.java (+221 -0)
  20. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/Schedulable.java (+171 -0)
  21. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/SchedulingAlgorithms.java (+209 -0)
  22. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/SchedulingMode.java (+26 -0)
  23. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/TaskSelector.java (+2 -1)
  24. src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/WeightAdjuster.java (+1 -0)
  25. src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/FakeSchedulable.java (+124 -0)
  26. src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestCapBasedLoadManager.java (+150 -0)
  27. src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestComputeFairShares.java (+184 -0)
  28. src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestFairScheduler.java (+719 -224)
  29. src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestFairSchedulerSystem.java (+199 -0)
  30. src/docs/src/documentation/content/xdocs/fair_scheduler.xml (+434 -211)
  31. src/mapred/org/apache/hadoop/mapred/JobInProgress.java (+54 -2)
  32. src/mapred/org/apache/hadoop/mapred/JobQueueTaskScheduler.java (+2 -2)
  33. src/mapred/org/apache/hadoop/mapred/TaskTrackerManager.java (+11 -1)
  34. src/test/org/apache/hadoop/mapred/TestJobQueueTaskScheduler.java (+7 -1)
  35. src/test/org/apache/hadoop/mapred/TestLinuxTaskControllerLaunchArgs.java (+156 -0)
  36. src/test/org/apache/hadoop/mapred/TestParallelInitialization.java (+7 -0)
  37. src/test/org/apache/hadoop/mapreduce/TestSleepJob.java (+266 -0)

+ 3 - 0
CHANGES.txt

@@ -202,6 +202,9 @@ Release 0.20.205.0 - unreleased
     HADOOP-7599. Script improvements to setup a secure Hadoop cluster 
     (Eric Yang via ddas)
 
+    MAPREDUCE-2981. Backport FairScheduler from trunk. (Matei Zaharia via
+    acmurthy) 
+
 Release 0.20.204.0 - 2011-8-25
 
   NEW FEATURES

+ 12 - 0
conf/fair-scheduler.xml.template

@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+
+<!--
+  This file contains pool and user allocations for the Fair Scheduler.
+  Its format is explained in the Fair Scheduler documentation at
+  http://hadoop.apache.org/common/docs/r0.20.205.0/fair_scheduler.html.
+  The documentation also includes a sample config file.
+-->
+
+<allocations>
+
+</allocations>

+ 3 - 2
src/contrib/capacity-scheduler/src/java/org/apache/hadoop/mapred/CapacityTaskScheduler.java

@@ -509,8 +509,9 @@ class CapacityTaskScheduler extends TaskScheduler {
       job.schedulingOpportunity();
       
       // First, try to get a 'local' task
-      Task t = 
-        job.obtainNewLocalMapTask(taskTracker, numTaskTrackers, numUniqueHosts);
+      Task t = job.obtainNewNodeOrRackLocalMapTask(taskTracker,
+                                                   numTaskTrackers,
+                                                   numUniqueHosts);
       
       if (t != null) {
         return TaskLookupResult.getTaskFoundResult(t, job); 

+ 8 - 2
src/contrib/capacity-scheduler/src/test/org/apache/hadoop/mapred/TestCapacityScheduler.java

@@ -195,8 +195,8 @@ public class TestCapacityScheduler extends TestCase {
     }
 
     @Override
-    public Task obtainNewLocalMapTask(final TaskTrackerStatus tts, int clusterSize,
-        int ignored) throws IOException {
+    public Task obtainNewNodeOrRackLocalMapTask(final TaskTrackerStatus tts,
+        int clusterSize, int ignored) throws IOException {
       return obtainNewMapTask(tts, clusterSize, ignored);
     }
     
@@ -553,6 +553,12 @@ public class TestCapacityScheduler extends TestCase {
       return statuses;
     }
 
+    @Override
+    public boolean killTask(TaskAttemptID taskid, boolean shouldFail)
+      throws IOException {
+      return false;
+    }
+
 
     public void addJobInProgressListener(JobInProgressListener listener) {
       mylisteners.add(listener);

+ 253 - 0
src/contrib/fairscheduler/designdoc/fair_scheduler_design_doc.tex

@@ -0,0 +1,253 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+% or more contributor license agreements.  See the NOTICE file
+% distributed with this work for additional information
+% regarding copyright ownership.  The ASF licenses this file
+% to you under the Apache License, Version 2.0 (the
+% "License"); you may not use this file except in compliance
+% with the License.  You may obtain a copy of the License at
+% 
+%     http://www.apache.org/licenses/LICENSE-2.0
+% 
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+% See the License for the specific language governing permissions and
+% limitations under the License.
+
+\documentclass[11pt]{article}
+\usepackage{geometry}
+\geometry{letterpaper}
+
+\begin{document}
+
+\title{Hadoop Fair Scheduler Design Document}
+\author{}
+\maketitle
+\tableofcontents
+
+\section{Introduction}
+
+The Hadoop Fair Scheduler started as a simple means to share MapReduce clusters. Over time, it has grown in functionality to support hierarchical scheduling, preemption, and multiple ways of organizing and weighing jobs. This document explains the goals and features of the Fair Scheduler and its internal design.
+
+\section{Fair Scheduler Goals}
+
+The Fair Scheduler was designed with four main goals:
+\begin{enumerate}
+  \item Run small jobs quickly even if they are sharing a cluster with large jobs. Unlike Hadoop's built-in FIFO scheduler, fair scheduling lets small jobs make progress even if a large job is running, without starving the large job.
+  \item Provide guaranteed service levels to ``production" jobs, to let them run alongside experimental jobs in a shared cluster.
+  \item Be simple to administer and configure. The scheduler should do something reasonable ``out of the box," and users should only need to configure it as they discover that they want to use more advanced features.
+  \item Support reconfiguration at runtime, without requiring a cluster restart.
+\end{enumerate}
+
+\section{Scheduler Features}
+
+This section provides a quick overview of the features of the Fair Scheduler. A detailed usage guide is available in the Hadoop documentation in {\tt build/docs/fair\_scheduler.html}.
+
+\subsection{Pools}
+
+The Fair Scheduler groups jobs into ``pools" and performs fair sharing between these pools. Each pool can use either FIFO or fair sharing to schedule jobs internal to the pool. The pool that a job is placed in is determined by a JobConf property, the ``pool name property". By default, this is {\tt user.name}, so that there is one pool per user. However, different properties can be used, e.g.~{\tt group.name} to have one pool per Unix group.
+
+A common trick is to set the pool name property to an unused property name such as {\tt pool.name} and make this default to {\tt user.name}, so that there is one pool per user but it is also possible to place jobs into ``special" pools by setting their {\tt pool.name} directly. The {\tt mapred-site.xml} snippet below shows how to do this:
+
+\begin{verbatim}
+<property>
+  <name>mapred.fairscheduler.poolnameproperty</name>
+  <value>pool.name</value>
+</property>
+
+<property>
+  <name>pool.name</name>
+  <value>${user.name}</value>
+</property>
+\end{verbatim}
+
+\subsection{Minimum Shares}
+
+Normally, active pools (those that contain jobs) will get equal shares of the map and reduce task slots in the cluster. However, it is also possible to set a \emph{minimum share} of map and reduce slots on a given pool, which is a number of slots that it will always get when it is active, even if its fair share would be below this number. This is useful for guaranteeing that production jobs get a certain desired level of service when sharing a cluster with non-production jobs. Minimum shares have three effects:
+\begin{enumerate}
+  \item The pool's fair share will always be at least as large as its minimum share. Slots are taken from the share of other pools to achieve this. The only exception is if the minimum shares of the active pools add up to more than the total number of slots in the cluster; in this case, each pool's share will be scaled down proportionally.
+  \item Pools whose running task count is below their minimum share get assigned slots first when slots are available.
+  \item It is possible to set a \emph{preemption timeout} on the pool after which, if it has not received enough task slots to meet its minimum share, it is allowed to kill tasks in other jobs to meet its share. Minimum shares with preemption timeouts thus act like SLAs.
+\end{enumerate}
+
+Note that when a pool is inactive (contains no jobs), its minimum share is not ``reserved" for it -- the slots are split up among the other pools.
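+
+For illustration, an allocations file that gives a ``production" pool a minimum share, a weight, and a preemption timeout might look like the following (the values are made up for this example; the documentation referenced above lists the full set of elements):
+
+\begin{verbatim}
+<?xml version="1.0"?>
+<allocations>
+  <pool name="production">
+    <minMaps>20</minMaps>
+    <minReduces>10</minReduces>
+    <weight>2.0</weight>
+    <!-- seconds the pool may stay below its min share before preempting -->
+    <minSharePreemptionTimeout>600</minSharePreemptionTimeout>
+  </pool>
+</allocations>
+\end{verbatim}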
+
+\subsection{Preemption}
+
+As explained above, the scheduler may kill tasks from a job in one pool in order to meet the minimum share of another pool. We call this preemption, although this usage of the word is somewhat strange given the normal definition of preemption as pausing; really it is the \emph{job} that gets preempted, while the task gets killed. The feature explained above is called \emph{min share preemption}. In addition, the scheduler supports \emph{fair share preemption}, to kill tasks when a pool's fair share is not being met. Fair share preemption is much more conservative than min share preemption, because pools without min shares are expected to contain non-production jobs where some amount of unfairness is tolerable. In particular, fair share preemption activates if a pool has been below \emph{half} of its fair share for a configurable fair share preemption timeout, which is recommended to be set fairly high (e.g. 10 minutes).
+
+In both types of preemption, the scheduler kills the most recently launched tasks from over-scheduled pools, to minimize the amount of computation wasted by preemption.
+
+\subsection{Running Job Limits}
+
+The fair scheduler can limit the number of concurrently running jobs from each user and from each pool. This is useful for limiting the amount of intermediate data generated on the cluster. The jobs that will run are chosen in order of submit time and priority. Jobs submitted beyond the limit wait for one of the running jobs to finish.
+
+\subsection{Job Priorities}
+
+Within a pool, job priorities can be used to control the scheduling of jobs, whether the pool's internal scheduling mode is FIFO or fair sharing:
+\begin{itemize}
+  \item In FIFO pools, jobs are ordered first by priority and then by submit time, as in Hadoop's default scheduler.
+  \item In fair sharing pools, job priorities are used as weights to control how much share a job gets. The normal priority corresponds to a weight of 1.0, and each level gives 2x more weight. For example, a high-priority job gets a weight of 2.0, and will therefore get 2x the share of a normal-priority job. 
+\end{itemize}
+
+\subsection{Pool Weights}
+
+Pools can be given weights to achieve unequal sharing of the cluster. For example, a pool with weight 2.0 gets 2x the share of a pool with weight 1.0.
+
+\subsection{Delay Scheduling}
+
+The Fair Scheduler contains an algorithm called delay scheduling to improve data locality. Jobs that cannot launch a data-local map task wait for some period of time before they are allowed to launch non-data-local tasks, ensuring that they will run locally if some node in the cluster has the relevant data. Delay scheduling is described in detail in Section \ref{sec:delay-scheduling}.
+
+\subsection{Administration}
+
+The Fair Scheduler includes a web UI that displays the active pools and jobs along with their fair shares, and that lets administrators move jobs between pools and change job priorities.
+In addition, the Fair Scheduler's allocation file (specifying min shares and preemption timeouts for the pools) is automatically reloaded if it is modified on disk, to allow runtime reconfiguration.
+
+\section{Implementation}
+
+\subsection{Hadoop Scheduling Background}
+
+Hadoop jobs consist of a number of map and reduce \emph{tasks}. These tasks run in \emph{slots} on the nodes of the cluster. Each node is configured with a number of map slots and reduce slots based on its computational resources (typically one slot per core). The role of the scheduler is to assign tasks to any slots that are free.
+
+All schedulers in Hadoop, including the Fair Scheduler, inherit from the {\tt TaskScheduler} abstract class. This class provides access to a {\tt TaskTrackerManager} -- an interface to the JobTracker -- as well as a {\tt Configuration} instance. It also requires the scheduler to implement three abstract methods: the lifecycle methods {\tt start} and {\tt terminate}, and a method called {\tt assignTasks} to launch tasks on a given TaskTracker.
+Task assignment in Hadoop is reactive. TaskTrackers periodically send heartbeats to the JobTracker with their {\tt TaskTrackerStatus}, which contains a list of running tasks, the number of slots on the node, and other information. The JobTracker then calls {\tt assignTasks} on the scheduler to obtain tasks to launch. These are returned with the heartbeat response.
+
+Apart from reacting to heartbeats through {\tt assignTasks}, schedulers can also be notified when jobs have been submitted to the cluster, killed, or removed by adding listeners to the {\tt TaskTrackerManager}. The Fair Scheduler sets up these listeners in its {\tt start} method. An important role of the listeners is to initialize jobs that are submitted -- until a job is initialized, it cannot launch tasks. The Fair Scheduler currently initializes all jobs right away, but it may also be desirable to hold off initializing jobs if too many are submitted to limit memory usage on the JobTracker.
+
+Selection of tasks \emph{within} a job is mostly done by the {\tt JobInProgress} class, and not by individual schedulers. {\tt JobInProgress} exposes two methods, {\tt obtainNewMapTask} and {\tt obtainNewReduceTask}, to launch a task of either type. Both methods may either return a {\tt Task} object or {\tt null} if the job does not wish to launch a task. Whether a job wishes to launch a task may change back and forth during its lifetime. Even after all tasks in the job have been started, the job may wish to run another task for speculative execution. In addition, if the node containing a map task failed, the job will wish to re-run it to rebuild its output for use in the reduce tasks. Schedulers may therefore need to poll multiple jobs until they find one with a task to run.
+
+Finally, for map tasks, an important scheduling criterion is data locality: running the task on a node or rack that contains its input data. Normally, {\tt JobInProgress.obtainNewMapTask} returns the ``closest" map task to a given node. However, to give schedulers slightly more control over data locality, there is also a version of {\tt obtainNewMapTask} that allows the scheduler to cap the level of non-locality allowed for the task (e.g.~request a task only on the same node, or {\tt null} if none is available). The Fair Scheduler uses this method with an algorithm called delay scheduling (Section \ref{sec:delay-scheduling}) to optimize data locality.
+
+\subsection{Fair Scheduler Basics}
+
+At a high level, the Fair Scheduler uses hierarchical scheduling to assign tasks. First it selects a pool to assign a task to according to the fair sharing algorithm in Section \ref{sec:fair-sharing-alg}. Then it asks the pool to obtain a task. The pool chooses among its jobs according to its internal scheduling order (FIFO or fair sharing).
+
+In fact, because jobs might not have tasks to launch ({\tt obtainNew(Map|Reduce)Task} can return null), the scheduler actually establishes an ordering on jobs and asks them for tasks in turn. Within a pool, jobs are sorted either by priority and start time (for FIFO) or by distance below fair share. If the first job in the ordering does not have a task to launch, the pool asks the second job, then the third, and so on. Pools themselves are sorted by distance below min share and fair share, so if the first pool does not have any jobs that can launch tasks, the second pool is asked, and so on. This makes it straightforward to implement features like delay scheduling (Section \ref{sec:delay-scheduling}) that may cause jobs to ``pass" on a slot.
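+
+To make the control flow concrete, the hierarchical assignment can be sketched roughly as follows (signatures are simplified for this sketch; the real logic lives in {\tt FairScheduler.assignTasks} and the {\tt Schedulable} subclasses):
+
+\begin{verbatim}
+// Sketch only: sort pools, then ask each one for a task in turn.
+Task assignTaskSketch(TaskTrackerStatus tts, long now,
+    List<PoolSchedulable> pools, Collection<JobInProgress> visited)
+    throws IOException {
+  // Pools furthest below their min share / fair share come first.
+  Collections.sort(pools, new SchedulingAlgorithms.FairShareComparator());
+  for (PoolSchedulable pool : pools) {
+    Task task = pool.assignTask(tts, now, visited);
+    if (task != null) {
+      return task;   // first pool that can launch a task wins
+    }
+    // Otherwise all of this pool's jobs passed (e.g. due to delay
+    // scheduling), so fall through to the next pool.
+  }
+  return null;       // nothing to launch on this heartbeat
+}
+\end{verbatim}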
+
+Apart from the assign tasks code path, the Fair Scheduler also has a periodic update thread that calls {\tt update} every few seconds. This thread is responsible for recomputing fair shares to display them on the UI (Section \ref{sec:fair-share-computation}), checking whether jobs need to be preempted (Section \ref{sec:preemption}), and checking whether the allocations file has changed to reload pool allocations (through {\tt PoolManager}).
+
+\subsection{The {\tt Schedulable} Class}
+
+To allow the same fair sharing algorithm to be used both between pools and within a pool, the Fair Scheduler uses an abstract class called {\tt Schedulable} to represent both pools and jobs. Its subclasses for these roles are {\tt PoolSchedulable} and {\tt JobSchedulable}. A {\tt Schedulable} is responsible for three roles:
+\begin{enumerate}
+  \item It can be asked to obtain a task through {\tt assignTask}. This may return {\tt null} if the {\tt Schedulable} has no tasks to launch.
+  \item It can be queried for information about the pool/job to use in scheduling, such as:
+  \begin{itemize}
+    \item Number of running tasks.
+    \item Demand (number of tasks the {\tt Schedulable} \emph{wants} to run; this is equal to number of running tasks + number of unlaunched tasks).
+    \item Min share assigned through config file.
+    \item Weight (for fair sharing).
+    \item Priority and start time (for FIFO scheduling).
+  \end{itemize}
+  \item It can be assigned a fair share through {\tt setFairShare}.
+\end{enumerate}
+
+There are separate {\tt Schedulable}s for map and reduce tasks, to make it possible to use the same algorithm on both types of tasks.
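+
+For reference, the interface can be abridged as follows (a sketch that mirrors the roles listed above rather than the exact declarations in {\tt Schedulable.java}):
+
+\begin{verbatim}
+// Abridged sketch of the Schedulable abstraction (illustrative only).
+abstract class SchedulableSketch {
+  abstract String getName();           // pool name or job ID
+  abstract int getRunningTasks();      // tasks currently running
+  abstract int getDemand();            // running + unlaunched tasks
+  abstract int getMinShare();          // min share from the allocation file
+  abstract double getWeight();         // weight for fair sharing
+  abstract JobPriority getPriority();  // FIFO ordering
+  abstract long getStartTime();        // FIFO ordering
+  abstract void updateDemand();        // recompute demand from the job(s)
+
+  // Try to launch a task on the given tracker; may return null.
+  abstract Task assignTask(TaskTrackerStatus tts, long currentTime,
+      Collection<JobInProgress> visited) throws IOException;
+
+  private double fairShare;            // assigned by the scheduler
+  void setFairShare(double share) { fairShare = share; }
+  double getFairShare() { return fairShare; }
+}
+\end{verbatim}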
+
+\subsection{Fair Sharing Algorithm}
+\label{sec:fair-sharing-alg}
+
+A simple way to achieve fair sharing is the following: whenever a slot is available, assign it to the pool that has the fewest running tasks. This will ensure that all pools get an equal number of slots, unless a pool's demand is less than its fair share, in which case the extra slots are divided evenly among the other pools. Two features of the Fair Scheduler complicate this algorithm a little:
+\begin{itemize}
+  \item Pool weights mean that some pools should get more slots than others. For example, a pool with weight 2 should get 2x more slots than a pool with weight 1. This is accomplished by changing the scheduling rule to ``assign the slot to the pool whose value of $runningTasks/weight$ is smallest."
+  \item Minimum shares mean that pools below their min share should get slots first. When we sort pools to choose which ones to schedule next, we place pools below their min share ahead of pools above their min share. We order the pools below their min share by how far they are below it as a percentage of the share.
+\end{itemize}
+
+This fair sharing algorithm is implemented in {\tt FairShareComparator} in the {\tt SchedulingAlgorithms} class. The comparator orders jobs by distance below min share and then by $runningTasks/weight$.
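+
+The comparison can be sketched as follows (a simplification of the real comparator, which also handles remaining ties):
+
+\begin{verbatim}
+// Rough sketch: negative result means s1 should be scheduled before s2.
+int compareSketch(Schedulable s1, Schedulable s2) {
+  int need1 = Math.min(s1.getMinShare(), s1.getDemand());
+  int need2 = Math.min(s2.getMinShare(), s2.getDemand());
+  boolean starved1 = s1.getRunningTasks() < need1;
+  boolean starved2 = s2.getRunningTasks() < need2;
+  if (starved1 && !starved2) return -1;   // below min share goes first
+  if (starved2 && !starved1) return 1;
+  if (starved1 && starved2) {             // both below: furthest below first
+    double r1 = s1.getRunningTasks() / (double) Math.max(need1, 1);
+    double r2 = s2.getRunningTasks() / (double) Math.max(need2, 1);
+    return Double.compare(r1, r2);
+  }
+  // Neither is below its min share: order by runningTasks / weight.
+  return Double.compare(s1.getRunningTasks() / s1.getWeight(),
+                        s2.getRunningTasks() / s2.getWeight());
+}
+\end{verbatim}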
+
+\subsection{Preemption}
+\label{sec:preemption}
+
+To determine when to preempt tasks, the Fair Scheduler maintains two values for each {\tt PoolSchedulable}: the last time when the pool was at its min share, and the last time when the pool was at half its fair share. These conditions are checked periodically by the update thread in {\tt FairScheduler.updatePreemptionVariables}, using the methods {\tt isStarvedForMinShare} and {\tt isStarvedForFairShare}. These methods also take into account the demand of the pool, so that a pool is not counted as starving if its demand is below its min/fair share but is otherwise met.
+
+When preempting tasks, the scheduler kills the most recently launched tasks from over-scheduled pools. This minimizes the amount of computation wasted by preemption and ensures that all jobs can eventually finish (it is as if the preempted jobs just never got their last few slots). The tasks are chosen and preempted in {\tt preemptTasks}.
+
+Note that for min share preemption, it is clear when a pool is below its min share because the min share is given as a number of slots, but for fair share preemption, we must be able to compute a pool's fair share to determine when it is being starved. This computation is trickier than dividing the number of slots by the number of pools due to weights, min shares and demands. Section \ref{sec:fair-share-computation} explains how fair shares are computed.
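+
+As an illustration only, the two starvation tests can be sketched as below; this folds in the timeout check, which the real code applies separately, and uses made-up helper names for the tracked timestamps:
+
+\begin{verbatim}
+// Sketch: should min share preemption fire for this pool?
+boolean minSharePreemptionDue(PoolSchedulable p, long now, long timeout) {
+  int target = Math.min(p.getMinShare(), p.getDemand());
+  return p.getRunningTasks() < target
+      && now - lastTimeAtMinShare(p) > timeout;
+}
+
+// Sketch: should fair share preemption fire? Note the factor of 1/2.
+boolean fairSharePreemptionDue(PoolSchedulable p, long now, long timeout) {
+  double target = Math.min(p.getFairShare() / 2, p.getDemand());
+  return p.getRunningTasks() < target
+      && now - lastTimeAtHalfFairShare(p) > timeout;
+}
+\end{verbatim}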
+
+\subsection{Fair Share Computation}
+\label{sec:fair-share-computation}
+
+The scheduling algorithm in Section \ref{sec:fair-sharing-alg} achieves fair shares without actually needing to compute pools' numerical shares beforehand. However, for preemption and for displaying shares in the Web UI, we want to know what a pool's fair share is even if the pool is not currently at its share. That is, we want to know how many slots the pool \emph{would} get if we started with all slots being empty and ran the algorithm in Section \ref{sec:fair-sharing-alg} until we filled them.
+One way to compute these shares would be to simulate starting out with empty slots and calling {\tt assignTasks} repeatedly until they were all filled, but this is expensive, because each scheduling decision takes $O(numJobs)$ time and we need to make $O(numSlots)$ decisions.
+
+To compute fair shares efficiently, the Fair Scheduler includes an algorithm based on binary search in {\tt SchedulingAlgorithms.computeFairShares}. This algorithm is based on the following observation. If all slots had been assigned according to weighted fair sharing respecting pools' demands and min shares, then there would exist a ratio $r$ such that:
+\begin{enumerate}
+  \item Pools whose demand $d_i$ is less than $r w_i$ (where $w_i$ is the weight of the pool) are assigned $d_i$ slots.
+  \item Pools whose min share $m_i$ is more than $r w_i$ are assigned $\min(m_i, d_i)$ slots.
+  \item All other pools are assigned $r w_i$ slots.
+  \item The pools' shares sum up to the total number of slots $t$.
+\end{enumerate}
+
+The Fair Scheduler uses binary search to compute the correct $r$. We define a function $f(r)$ as the number of slots that would be used for a given $r$ if conditions 1-3 above were met, and then find a value of $r$ that makes $f(r)=t$. More precisely, $f(r)$ is defined as:
+$$f(r) = \sum_i{\min(d_i, \max(r w_i, m_i)).}$$
+
+Note that $f(r)$ is increasing in $r$ because every term of the sum is increasing, so the equation $f(r) = t$ can be solved by binary search. We choose 0 as a lower bound of our binary search because with $r=0$, only min shares are assigned. (An earlier check in {\tt computeFairShares} checks whether the min shares add up to more than the total number of slots, and if so, computes fair shares by scaling down the min shares proportionally and returns.) To compute an upper bound for the binary search, we try $r=1,2,4,8,\dots$ until we find a value large enough that either more than $t$ slots are used or all pools' demands are met (in case the demands added up to less than $t$).
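+
+As a small worked example (numbers chosen purely for illustration): suppose there are two pools with weights $w_1 = 1$ and $w_2 = 2$, demands $d_1 = 10$ and $d_2 = 100$, no min shares, and $t = 30$ total slots. Then $f(r) = \min(10, r) + \min(100, 2r)$, so $f(10) = 10 + 20 = 30 = t$, and the fair shares are 10 slots for the first pool (capped by its demand) and 20 slots for the second.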
+
+The steps of the algorithm are explained in detail in {\tt SchedulingAlgorithms.java}.
+
+This algorithm runs in time $O(NP)$, where $N$ is the number of jobs/pools and $P$ is the desired number of bits of precision in the computed values (number of iterations of binary search), which we've set to 25. It thus scales linearly in the number of jobs and pools.
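+
+The computation can be sketched as below; the min-share-overcommit case and various details are omitted, and {\tt SchedulingAlgorithms.computeFairShares} remains the authoritative version:
+
+\begin{verbatim}
+// Sketch of the binary search for r. slotsUsed(r) is f(r) from above.
+static double slotsUsed(Collection<Schedulable> scheds, double r) {
+  double total = 0;
+  for (Schedulable s : scheds) {
+    total += Math.min(s.getDemand(),
+                      Math.max(r * s.getWeight(), s.getMinShare()));
+  }
+  return total;
+}
+
+static void computeFairSharesSketch(Collection<Schedulable> scheds,
+    double totalSlots) {
+  double totalDemand = 0;
+  for (Schedulable s : scheds) totalDemand += s.getDemand();
+  double target = Math.min(totalSlots, totalDemand);
+  double rMax = 1.0;
+  while (slotsUsed(scheds, rMax) < target) rMax *= 2;  // find upper bound
+  double left = 0, right = rMax;
+  for (int i = 0; i < 25; i++) {          // ~25 bits of precision
+    double mid = (left + right) / 2;
+    if (slotsUsed(scheds, mid) < target) left = mid; else right = mid;
+  }
+  for (Schedulable s : scheds) {          // each schedulable's share at r
+    s.setFairShare(Math.min(s.getDemand(),
+        Math.max(right * s.getWeight(), s.getMinShare())));
+  }
+}
+\end{verbatim}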
+
+\subsection{Running Job Limits}
+
+Running job limits are implemented by marking jobs as not runnable if there are too many jobs submitted by the same user or pool. This is done in {\tt FairScheduler.updateRunnability}. A job that is not runnable declares its demand as 0 and always returns {\tt null} from {\tt assignTasks}.
+
+\subsection{Delay Scheduling}
+\label{sec:delay-scheduling}
+
+In Hadoop, running map tasks on the nodes or racks that contain their input data is critical for performance, because it avoids shipping the data over the network. However, always assigning slots to the first job in order of pool shares and in-pool ordering (the ``head-of-line job") can sometimes lead to poor locality:
+\begin{itemize}
+  \item If the head-of-line job is small, the chance of it having data on the node that a heartbeat was received from is small. Therefore, locality would be poor in a small-job workload if we always assigned slots to the head-of-line job.
+  \item When fair sharing is used, there is a strong bias for a job to be reassigned into a slot that it just finished a task in, because when it finishes the task, the job falls below its fair share. This can mean that jobs have a difficult time running in slots that other jobs have taken and thus achieve poor locality.
+\end{itemize}
+
+To deal with both of these situations, the Fair Scheduler can sacrifice fairness temporarily to improve locality through an algorithm called delay scheduling. If the head-of-line job cannot launch a local task on the TaskTracker that sent a heartbeat, then it is skipped, and other running jobs are looked at in order of pool shares and in-pool scheduling rules to find a job with a local task. However, if the head-of-line job has been skipped for a sufficiently long time, it is allowed to launch rack-local tasks. Then, if it is skipped for a longer time, it is also allowed to launch off-rack tasks. These skip times are called locality delays. Delays of a few seconds are sufficient to drastically increase locality.
+
+The Fair Scheduler allows locality delays to be set through {\tt mapred-site.xml} or to be turned off by setting them to zero. However, by default, it computes the delay automatically based on the heartbeat interval of the cluster. The delay is set to 1.5x the heartbeat interval.
+
+When a job that has been allowed to launch non-local tasks ends up launching a local task again, its ``locality level" resets and it must wait again before launching non-local tasks. This is done so that a job that gets ``unlucky" early in its lifetime does not continue to launch non-local tasks throughout its life.
+
+Delay scheduling is implemented by keeping track of two variables on each job: the locality level of the last map it launched (0 for node-local, 1 for rack-local and 2 for off-rack) and the time it has spent being skipped for a task. These are kept in a {\tt JobInfo} structure associated with each job in {\tt FairScheduler.java}. Whenever a job is asked for tasks, it checks the locality level it is allowed to launch them at through {\tt FairScheduler.getAllowedLocalityLevel}. If it does not launch a task, it is marked as ``visited" on that heartbeat by appending itself to a {\tt visited} job list that is passed around between calls to {\tt assignTasks} on the same heartbeat. Jobs that are visited on a heartbeat but do not launch any tasks during it are considered as skipped for the time interval between this heartbeat and the next. Code at the beginning of {\tt FairScheduler.assignTasks} increments the wait time of each skipped job by the time elapsed since the last heartbeat. Once a job has been skipped for more than the locality delay, {\tt getAllowedLocalityLevel} starts returning higher locality so that it is allowed to launch less-local tasks. Whenever the job launches a task, its wait time is reset, but we remember the locality level of the launched task so that the job is allowed to launch more tasks at this level without further waiting.
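+
+A simplified sketch of this decision follows; it mirrors the behavior described above, while the real {\tt getAllowedLocalityLevel} works with the {\tt LocalityLevel} enum and the {\tt JobInfo} fields rather than plain numbers:
+
+\begin{verbatim}
+// Sketch: 0 = node-local, 1 = rack-local, 2 = off-rack ("any").
+// lastLevel = locality level of the job's last launched map task;
+// waited    = how long the job has been skipped, in milliseconds.
+int allowedLocalityLevelSketch(int lastLevel, long waited,
+    long nodeDelay, long rackDelay) {
+  switch (lastLevel) {
+    case 0:
+      if (waited >= nodeDelay + rackDelay) return 2;  // may go off-rack
+      if (waited >= nodeDelay) return 1;              // may go rack-local
+      return 0;                                       // node-local only
+    case 1:
+      return (waited >= rackDelay) ? 2 : 1;
+    default:
+      return 2;                                       // already off-rack
+  }
+}
+\end{verbatim}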
+
+\subsection{Locking Order}
+
+Fair Scheduler data structures can be touched by several threads. Most commonly, the JobTracker invokes {\tt assignTasks}. This happens inside a block of code where the JobTracker has locked itself already. Therefore, to prevent deadlocks, we always ensure that \emph{if both the FairScheduler and the JobTracker must be locked, the JobTracker is locked first}. Other threads that can lock the FairScheduler include the update thread and the web UI.
+
+\subsection{Unit Tests}
+
+The Fair Scheduler contains extensive unit tests using mock {\tt TaskTrackerManager}, {\tt JobInProgress}, {\tt TaskInProgress}, and {\tt Schedulable} objects. Scheduler tests are in {\tt TestFairScheduler.java}. The {\tt computeFairShares} algorithm is tested separately in {\tt TestComputeFairShares.java}. All tests use accelerated time via a fake {\tt Clock} class.
+
+\pagebreak
+\section{Code Guide}
+
+The following table lists some key source files in the Fair Scheduler:
+
+\begin{center}
+\begin{tabular}{|l|p{0.7\columnwidth}|}
+  \hline
+  {\bf File} & {\bf Contents} 
+  \\ \hline
+  {\tt FairScheduler.java} & Scheduler entry point. Also contains update thread, and logic for preemption, delay scheduling, and running job limits.
+  \\ \hline
+  {\tt Schedulable.java} & Definition of the {\tt Schedulable} class. Extended by {\tt JobSchedulable} and {\tt PoolSchedulable}.
+  \\ \hline
+  {\tt SchedulingAlgorithms.java} & Contains FIFO and fair sharing comparators, as well as the {\tt computeFairShares} algorithm in Section \ref{sec:fair-share-computation}.
+  \\ \hline
+  {\tt PoolManager.java} & Reads pool properties from the allocation file and maintains a collection of {\tt Pool} objects. Pools are created on demand.
+  \\ \hline
+  {\tt Pool.java} & Represents a pool and stores its map and reduce {\tt Schedulables}.
+  \\ \hline
+  {\tt FairSchedulerServlet.java} & Implements the scheduler's web UI.
+  \\ \hline
+  {\tt FairSchedulerEventLog.java} & An easy-to-parse event log for debugging. Must be enabled through {\tt mapred.fairscheduler.eventlog.enabled}.
+  If enabled, logs are placed in {\tt \$HADOOP\_LOG\_DIR/fairscheduler}.
+  \\ \hline
+  {\tt TaskSelector.java} & A pluggable class responsible for picking tasks within a job. Currently, {\tt DefaultTaskSelector} delegates to {\tt JobInProgress}, but this would be a useful place to experiment with new algorithms for speculative execution and locality.
+  \\ \hline
+  {\tt LoadManager.java} & A pluggable class responsible for determining when to launch more tasks on a TaskTracker. Currently, {\tt CapBasedLoadManager} uses slot counts, but this would be a useful place to experiment with scheduling based on machine load.
+  \\ \hline
+  {\tt WeightAdjuster.java} & A pluggable class responsible for setting job weights. An example, {\tt NewJobWeightBooster}, is provided, which increases weight temporarily for new jobs.
+  \\ \hline
+\end{tabular}
+\end{center}
+
+\end{document}

+ 8 - 0
src/contrib/fairscheduler/ivy.xml

@@ -26,6 +26,14 @@
       name="commons-logging"
       rev="${commons-logging.version}"
       conf="common->default"/>
+    <dependency org="commons-collections"
+      name="commons-collections"
+      rev="${commons-collections.version}"
+      conf="common->default"/>
+    <dependency org="commons-cli"
+      name="commons-cli"
+      rev="${commons-cli.version}"
+      conf="common->default"/>
     <dependency org="log4j"
       name="log4j"
       rev="${log4j.version}"

+ 18 - 1
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/CapBasedLoadManager.java

@@ -18,12 +18,23 @@
 
 package org.apache.hadoop.mapred;
 
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.conf.Configuration;
+
 /**
  * A {@link LoadManager} for use by the {@link FairScheduler} that allocates
  * tasks evenly across nodes up to their per-node maximum, using the default
  * load management algorithm in Hadoop.
  */
 public class CapBasedLoadManager extends LoadManager {
+  
+  float maxDiff = 0.0f;
+  
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    maxDiff = conf.getFloat("mapred.fairscheduler.load.max.diff", 0.0f);
+  }
+  
   /**
    * Determine how many tasks of a given type we want to run on a TaskTracker. 
    * This cap is chosen based on how many tasks of that type are outstanding in
@@ -32,7 +43,7 @@ public class CapBasedLoadManager extends LoadManager {
    * machines sent out heartbeats earliest.
    */
   int getCap(int totalRunnableTasks, int localMaxTasks, int totalSlots) {
-    double load = ((double)totalRunnableTasks) / totalSlots;
+    double load = maxDiff + ((double)totalRunnableTasks) / totalSlots;
     return (int) Math.ceil(localMaxTasks * Math.min(1.0, load));
   }
 
@@ -49,4 +60,10 @@ public class CapBasedLoadManager extends LoadManager {
     return tracker.countReduceTasks() < getCap(totalRunnableReduces,
         tracker.getMaxReduceSlots(), totalReduceSlots);
   }
+
+  @Override
+  public boolean canLaunchTask(TaskTrackerStatus tracker,
+      JobInProgress job,  TaskType type) {
+    return true;
+  }
 }

+ 13 - 4
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/DefaultTaskSelector.java

@@ -56,12 +56,21 @@ public class DefaultTaskSelector extends TaskSelector {
   }
 
   @Override
-  public Task obtainNewMapTask(TaskTrackerStatus taskTracker, JobInProgress job)
-      throws IOException {
+  public Task obtainNewMapTask(TaskTrackerStatus taskTracker, JobInProgress job,
+      int localityLevel) throws IOException {
     ClusterStatus clusterStatus = taskTrackerManager.getClusterStatus();
     int numTaskTrackers = clusterStatus.getTaskTrackers();
-    return job.obtainNewMapTask(taskTracker, numTaskTrackers,
-        taskTrackerManager.getNumberOfUniqueHosts());
+    switch (localityLevel) {
+      case 1:
+        return job.obtainNewNodeLocalMapTask(taskTracker, numTaskTrackers,
+          taskTrackerManager.getNumberOfUniqueHosts());
+      case 2:
+        return job.obtainNewNodeOrRackLocalMapTask(taskTracker, numTaskTrackers,
+          taskTrackerManager.getNumberOfUniqueHosts());
+      default:
+        return job.obtainNewMapTask(taskTracker, numTaskTrackers,
+          taskTrackerManager.getNumberOfUniqueHosts());
+    }
   }
 
   @Override

File diff suppressed because it is too large
+ 506 - 427
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FairScheduler.java


+ 142 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FairSchedulerEventLog.java

@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.log4j.DailyRollingFileAppender;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.log4j.PatternLayout;
+import org.apache.log4j.spi.LoggingEvent;
+
+/**
+ * Event log used by the fair scheduler for machine-readable debug info.
+ * This class uses a log4j rolling file appender to write the log, but uses
+ * a custom tab-separated event format of the form:
+ * <pre>
+ * DATE    EVENT_TYPE   PARAM_1   PARAM_2   ...
+ * </pre>
+ * Various event types are used by the fair scheduler. The purpose of logging
+ * in this format is to enable tools to parse the history log easily and read
+ * internal scheduler variables, rather than trying to make the log human
+ * readable. The fair scheduler also logs human readable messages in the
+ * JobTracker's main log.
+ * 
+ * Constructing this class creates a disabled log. It must be initialized
+ * using {@link FairSchedulerEventLog#init(Configuration, String)} to begin
+ * writing to the file.
+ */
+class FairSchedulerEventLog {
+  private static final Log LOG = LogFactory.getLog(
+    "org.apache.hadoop.mapred.FairSchedulerEventLog");
+  
+  /** Set to true if logging is disabled due to an error. */
+  private boolean logDisabled = true;
+  
+  /**
+   * Log directory, set by mapred.fairscheduler.eventlog.location in conf file;
+   * defaults to {hadoop.log.dir}/fairscheduler.
+   */
+  private String logDir;
+  
+  /** 
+   * Active log file, which is {LOG_DIR}/hadoop-{user}-fairscheduler.{host}.log.
+   * Older files are also stored as {LOG_FILE}.date (date format YYYY-MM-DD).
+   */ 
+  private String logFile;
+  
+  /** Log4j appender used to write to the log file */
+  private DailyRollingFileAppender appender;
+
+  boolean init(Configuration conf, String jobtrackerHostname) {
+    try {
+      logDir = conf.get("mapred.fairscheduler.eventlog.location",
+          new File(System.getProperty("hadoop.log.dir")).getAbsolutePath()
+          + File.separator + "fairscheduler");
+      Path logDirPath = new Path(logDir);
+      FileSystem fs = logDirPath.getFileSystem(conf);
+      if (!fs.exists(logDirPath)) {
+        if (!fs.mkdirs(logDirPath)) {
+          throw new IOException(
+              "Mkdirs failed to create " + logDirPath.toString());
+        }
+      }
+      String username = System.getProperty("user.name");
+      logFile = String.format("%s%shadoop-%s-fairscheduler-%s.log",
+          logDir, File.separator, username, jobtrackerHostname);
+      logDisabled = false;
+      PatternLayout layout = new PatternLayout("%d{ISO8601}\t%m%n");
+      appender = new DailyRollingFileAppender(layout, logFile, "'.'yyyy-MM-dd");
+      appender.activateOptions();
+      LOG.info("Initialized fair scheduler event log, logging to " + logFile);
+    } catch (IOException e) {
+      LOG.error(
+          "Failed to initialize fair scheduler event log. Disabling it.", e);
+      logDisabled = true;
+    }
+    return !(logDisabled);
+  }
+  
+  /**
+   * Log an event, writing a line in the log file of the form
+   * <pre>
+   * DATE    EVENT_TYPE   PARAM_1   PARAM_2   ...
+   * </pre>
+   */
+  synchronized void log(String eventType, Object... params) {
+    try {
+      if (logDisabled)
+        return;
+      StringBuffer buffer = new StringBuffer();
+      buffer.append(eventType);
+      for (Object param: params) {
+        buffer.append("\t");
+        buffer.append(param);
+      }
+      String message = buffer.toString();
+      Logger logger = Logger.getLogger(getClass());
+      appender.append(new LoggingEvent("", logger, Level.INFO, message, null));
+    } catch (Exception e) {
+      LOG.error("Failed to append to fair scheduler event log", e);
+      logDisabled = true;
+    }
+  }
+  
+  /**
+   * Flush and close the log.
+   */
+  void shutdown() {
+    try {
+      if (appender != null)
+        appender.close();
+    } catch (Exception e) {}
+    logDisabled = true;
+  }
+
+  boolean isEnabled() {
+    return !logDisabled;
+  }
+}

+ 113 - 86
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FairSchedulerServlet.java

@@ -30,6 +30,7 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.Date;
+import java.util.Iterator;
 import java.util.List;
 
 import javax.servlet.ServletContext;
@@ -39,16 +40,15 @@ import javax.servlet.http.HttpServletRequest;
 import javax.servlet.http.HttpServletResponse;
 
 import org.apache.hadoop.mapred.FairScheduler.JobInfo;
+import org.apache.hadoop.mapreduce.TaskType;
 import org.apache.hadoop.util.StringUtils;
 
 /**
  * Servlet for displaying fair scheduler information, installed at
  * [job tracker URL]/scheduler when the {@link FairScheduler} is in use.
  * 
- * The main features are viewing each job's task count and fair share, ability
- * to change job priorities and pools from the UI, and ability to switch the
- * scheduler to FIFO mode without restarting the JobTracker if this is required
- * for any reason.
+ * The main features are viewing each job's task count and fair share,
+ * and admin controls to change job priorities and pools from the UI.
  * 
  * There is also an "advanced" view for debugging that can be turned on by
  * going to [job tracker URL]/scheduler?advanced.
@@ -82,13 +82,9 @@ public class FairSchedulerServlet extends HttpServlet {
     // If the request has a set* param, handle that and redirect to the regular
     // view page so that the user won't resubmit the data if they hit refresh.
     boolean advancedView = request.getParameter("advanced") != null;
-    if (request.getParameter("setFifo") != null) {
-      scheduler.setUseFifo(request.getParameter("setFifo").equals("true"));
-      response.sendRedirect("/scheduler" + (advancedView ? "?advanced" : ""));
-      return;
-    }
-    if (request.getParameter("setPool") != null) {
-      Collection<JobInProgress> runningJobs = jobTracker.getRunningJobs();
+    if (JSPUtil.privateActionsAllowed(jobTracker.conf)
+        && request.getParameter("setPool") != null) {
+      Collection<JobInProgress> runningJobs = getInitedJobs();
       PoolManager poolMgr = null;
       synchronized (scheduler) {
         poolMgr = scheduler.getPoolManager();
@@ -107,8 +103,9 @@ public class FairSchedulerServlet extends HttpServlet {
       response.sendRedirect("/scheduler" + (advancedView ? "?advanced" : ""));
       return;
     }
-    if (request.getParameter("setPriority") != null) {
-      Collection<JobInProgress> runningJobs = jobTracker.getRunningJobs();      
+    if (JSPUtil.privateActionsAllowed(jobTracker.conf)
+        && request.getParameter("setPriority") != null) {
+      Collection<JobInProgress> runningJobs = getInitedJobs();
       JobPriority priority = JobPriority.valueOf(request.getParameter(
           "setPriority"));
       String jobId = request.getParameter("jobid");
@@ -126,22 +123,21 @@ public class FairSchedulerServlet extends HttpServlet {
     response.setContentType("text/html");
 
     // Because the client may read arbitrarily slow, and we hold locks while
-    // the servlet output, we want to write to our own buffer which we know
+    // the servlet outputs, we want to write to our own buffer which we know
     // won't block.
     ByteArrayOutputStream baos = new ByteArrayOutputStream();
     PrintWriter out = new PrintWriter(baos);
     String hostname = StringUtils.simpleHostname(
         jobTracker.getJobTrackerMachine());
     out.print("<html><head>");
-    out.printf("<title>%s Job Scheduler Admininstration</title>\n", hostname);
+    out.printf("<title>%s Fair Scheduler Administration</title>\n", hostname);
     out.print("<link rel=\"stylesheet\" type=\"text/css\" " + 
         "href=\"/static/hadoop.css\">\n");
     out.print("</head><body>\n");
     out.printf("<h1><a href=\"/jobtracker.jsp\">%s</a> " + 
-        "Job Scheduler Administration</h1>\n", hostname);
+        "Fair Scheduler Administration</h1>\n", hostname);
     showPools(out, advancedView);
     showJobs(out, advancedView);
-    showAdminForm(out, advancedView);
     out.print("</body></html>\n");
     out.close();
 
@@ -156,12 +152,17 @@ public class FairSchedulerServlet extends HttpServlet {
    */
   private void showPools(PrintWriter out, boolean advancedView) {
     synchronized(scheduler) {
+      boolean warnInverted = false;
       PoolManager poolManager = scheduler.getPoolManager();
       out.print("<h2>Pools</h2>\n");
       out.print("<table border=\"2\" cellpadding=\"5\" cellspacing=\"2\">\n");
-      out.print("<tr><th>Pool</th><th>Running Jobs</th>" + 
-          "<th>Min Maps</th><th>Min Reduces</th>" + 
-          "<th>Running Maps</th><th>Running Reduces</th></tr>\n");
+      out.print("<tr><th rowspan=2>Pool</th>" +
+          "<th rowspan=2>Running Jobs</th>" + 
+          "<th colspan=4>Map Tasks</th>" + 
+          "<th colspan=4>Reduce Tasks</th>" +
+          "<th rowspan=2>Scheduling Mode</th></tr>\n<tr>" + 
+          "<th>Min Share</th><th>Max Share</th><th>Running</th><th>Fair Share</th>" + 
+          "<th>Min Share</th><th>Max Share</th><th>Running</th><th>Fair Share</th></tr>\n");
       List<Pool> pools = new ArrayList<Pool>(poolManager.getPools());
       Collections.sort(pools, new Comparator<Pool>() {
         public int compare(Pool p1, Pool p2) {
@@ -172,27 +173,54 @@ public class FairSchedulerServlet extends HttpServlet {
           else return p1.getName().compareTo(p2.getName());
         }});
       for (Pool pool: pools) {
-        int runningMaps = 0;
-        int runningReduces = 0;
-        for (JobInProgress job: pool.getJobs()) {
-          JobInfo info = scheduler.infos.get(job);
-          if (info != null) {
-            runningMaps += info.runningMaps;
-            runningReduces += info.runningReduces;
-          }
-        }
-        out.print("<tr>\n");
-        out.printf("<td>%s</td>\n", pool.getName());
-        out.printf("<td>%s</td>\n", pool.getJobs().size());
-        out.printf("<td>%s</td>\n", poolManager.getAllocation(pool.getName(),
+        String name = pool.getName();
+        int runningMaps = pool.getMapSchedulable().getRunningTasks();
+        int runningReduces = pool.getReduceSchedulable().getRunningTasks();
+        int maxMaps = poolManager.getMaxSlots(name, TaskType.MAP);
+        int maxReduces = poolManager.getMaxSlots(name, TaskType.REDUCE);
+        boolean invertedMaps = poolManager.invertedMinMax(TaskType.MAP, name);
+        boolean invertedReduces = poolManager.invertedMinMax(TaskType.REDUCE, name);
+        warnInverted = warnInverted || invertedMaps || invertedReduces;
+        out.print("<tr>");
+        out.printf("<td>%s</td>", name);
+        out.printf("<td>%d</td>", pool.getJobs().size());
+        // Map Tasks
+        out.printf("<td>%d</td>", poolManager.getAllocation(name,
             TaskType.MAP));
-        out.printf("<td>%s</td>\n", poolManager.getAllocation(pool.getName(), 
+        out.print("<td>");
+        if(maxMaps == Integer.MAX_VALUE) {
+          out.print("-");
+        } else {
+          out.print(maxMaps);
+        }
+        if(invertedMaps) {
+          out.print("*");
+        }
+        out.print("</td>");
+        out.printf("<td>%d</td>", runningMaps);
+        out.printf("<td>%.1f</td>", pool.getMapSchedulable().getFairShare());
+        // Reduce Tasks
+        out.printf("<td>%d</td>", poolManager.getAllocation(name,
             TaskType.REDUCE));
-        out.printf("<td>%s</td>\n", runningMaps);
-        out.printf("<td>%s</td>\n", runningReduces);
+        out.print("<td>");
+        if(maxReduces == Integer.MAX_VALUE) {
+          out.print("-");
+        } else {
+          out.print(maxReduces);
+        }
+        if(invertedReduces) {
+          out.print("*");
+        }
+        out.print("</td>");
+        out.printf("<td>%d</td>", runningReduces);
+        out.printf("<td>%.1f</td>", pool.getReduceSchedulable().getFairShare());
+        out.printf("<td>%s</td>", pool.getSchedulingMode());
         out.print("</tr>\n");
       }
       out.print("</table>\n");
+      if(warnInverted) {
+        out.print("<p>* One or more pools have max share set lower than min share. Max share will be used and minimum will be treated as if set equal to max.</p>");
+      }
     }
   }
 
@@ -202,66 +230,70 @@ public class FairSchedulerServlet extends HttpServlet {
   private void showJobs(PrintWriter out, boolean advancedView) {
     out.print("<h2>Running Jobs</h2>\n");
     out.print("<table border=\"2\" cellpadding=\"5\" cellspacing=\"2\">\n");
-    int colsPerTaskType = advancedView ? 6 : 3;
+    int colsPerTaskType = advancedView ? 4 : 3;
     out.printf("<tr><th rowspan=2>Submitted</th>" + 
         "<th rowspan=2>JobID</th>" +
         "<th rowspan=2>User</th>" +
         "<th rowspan=2>Name</th>" +
         "<th rowspan=2>Pool</th>" +
         "<th rowspan=2>Priority</th>" +
-        "<th colspan=%d>Maps</th>" +
-        "<th colspan=%d>Reduces</th>",
+        "<th colspan=%d>Map Tasks</th>" +
+        "<th colspan=%d>Reduce Tasks</th>",
         colsPerTaskType, colsPerTaskType);
     out.print("</tr><tr>\n");
     out.print("<th>Finished</th><th>Running</th><th>Fair Share</th>" +
-        (advancedView ? "<th>Weight</th><th>Deficit</th><th>minMaps</th>" : ""));
+        (advancedView ? "<th>Weight</th>" : ""));
     out.print("<th>Finished</th><th>Running</th><th>Fair Share</th>" +
-        (advancedView ? "<th>Weight</th><th>Deficit</th><th>minReduces</th>" : ""));
+        (advancedView ? "<th>Weight</th>" : ""));
     out.print("</tr>\n");
     synchronized (jobTracker) {
-      Collection<JobInProgress> runningJobs = jobTracker.getRunningJobs();
+      Collection<JobInProgress> runningJobs = getInitedJobs();
       synchronized (scheduler) {
         for (JobInProgress job: runningJobs) {
           JobProfile profile = job.getProfile();
           JobInfo info = scheduler.infos.get(job);
           if (info == null) { // Job finished, but let's show 0's for info
-            info = new JobInfo();
+            info = new JobInfo(null, null);
           }
           out.print("<tr>\n");
           out.printf("<td>%s</td>\n", DATE_FORMAT.format(
-                       new Date(job.getStartTime())));
+              new Date(job.getStartTime())));
           out.printf("<td><a href=\"jobdetails.jsp?jobid=%s\">%s</a></td>",
-                     profile.getJobID(), profile.getJobID());
+              profile.getJobID(), profile.getJobID());
           out.printf("<td>%s</td>\n", profile.getUser());
           out.printf("<td>%s</td>\n", profile.getJobName());
-          out.printf("<td>%s</td>\n", generateSelect(
-                       scheduler.getPoolManager().getPoolNames(),
-                       scheduler.getPoolManager().getPoolName(job),
-                       "/scheduler?setPool=<CHOICE>&jobid=" + profile.getJobID() +
-                       (advancedView ? "&advanced" : "")));
-          out.printf("<td>%s</td>\n", generateSelect(
-                       Arrays.asList(new String[]
-                         {"VERY_LOW", "LOW", "NORMAL", "HIGH", "VERY_HIGH"}),
-                       job.getPriority().toString(),
-                       "/scheduler?setPriority=<CHOICE>&jobid=" + profile.getJobID() +
-                       (advancedView ? "&advanced" : "")));
-          out.printf("<td>%d / %d</td><td>%d</td><td>%8.1f</td>\n",
-                     job.finishedMaps(), job.desiredMaps(), info.runningMaps,
-                     info.mapFairShare);
+          if (JSPUtil.privateActionsAllowed(jobTracker.conf)) {
+            out.printf("<td>%s</td>\n", generateSelect(scheduler
+                .getPoolManager().getPoolNames(), scheduler.getPoolManager()
+                .getPoolName(job), "/scheduler?setPool=<CHOICE>&jobid="
+                + profile.getJobID() + (advancedView ? "&advanced" : "")));
+            out.printf("<td>%s</td>\n", generateSelect(Arrays
+                .asList(new String[] { "VERY_LOW", "LOW", "NORMAL", "HIGH",
+                    "VERY_HIGH" }), job.getPriority().toString(),
+                "/scheduler?setPriority=<CHOICE>&jobid=" + profile.getJobID()
+                    + (advancedView ? "&advanced" : "")));
+          } else {
+            out.printf("<td>%s</td>\n", scheduler.getPoolManager().getPoolName(job));
+            out.printf("<td>%s</td>\n", job.getPriority().toString());
+          }
+          Pool pool = scheduler.getPoolManager().getPool(job);
+          String mapShare = (pool.getSchedulingMode() == SchedulingMode.FAIR) ?
+              String.format("%.1f", info.mapSchedulable.getFairShare()) : "NA";
+          out.printf("<td>%d / %d</td><td>%d</td><td>%s</td>\n",
+              job.finishedMaps(), job.desiredMaps(), 
+              info.mapSchedulable.getRunningTasks(),
+              mapShare);
           if (advancedView) {
-            out.printf("<td>%8.1f</td>\n", info.mapWeight);
-            out.printf("<td>%s</td>\n", info.neededMaps > 0 ?
-                       (info.mapDeficit / 1000) + "s" : "--");
-            out.printf("<td>%d</td>\n", info.minMaps);
+            out.printf("<td>%.1f</td>\n", info.mapSchedulable.getWeight());
           }
-          out.printf("<td>%d / %d</td><td>%d</td><td>%8.1f</td>\n",
-                     job.finishedReduces(), job.desiredReduces(), info.runningReduces,
-                     info.reduceFairShare);
+          String reduceShare = (pool.getSchedulingMode() == SchedulingMode.FAIR) ?
+              String.format("%.1f", info.reduceSchedulable.getFairShare()) : "NA";
+          out.printf("<td>%d / %d</td><td>%d</td><td>%s</td>\n",
+              job.finishedReduces(), job.desiredReduces(), 
+              info.reduceSchedulable.getRunningTasks(),
+              reduceShare);
           if (advancedView) {
-            out.printf("<td>%8.1f</td>\n", info.reduceWeight);
-            out.printf("<td>%s</td>\n", info.neededReduces > 0 ?
-                       (info.reduceDeficit / 1000) + "s" : "--");
-            out.printf("<td>%d</td>\n", info.minReduces);
+            out.printf("<td>%.1f</td>\n", info.reduceSchedulable.getWeight());
           }
           out.print("</tr>\n");
         }
@@ -294,22 +326,17 @@ public class FairSchedulerServlet extends HttpServlet {
   }
 
   /**
-   * Print the administration form at the bottom of the page, which currently
-   * only includes the button for switching between FIFO and Fair Scheduling.
+   * Obtain all initialized jobs
    */
-  private void showAdminForm(PrintWriter out, boolean advancedView) {
-    out.print("<h2>Scheduling Mode</h2>\n");
-    String curMode = scheduler.getUseFifo() ? "FIFO" : "Fair Sharing";
-    String otherMode = scheduler.getUseFifo() ? "Fair Sharing" : "FIFO";
-    String advParam = advancedView ? "?advanced" : "";
-    out.printf("<form method=\"post\" action=\"/scheduler%s\">\n", advParam);
-    out.printf("<p>The scheduler is currently using <b>%s mode</b>. " +
-        "<input type=\"submit\" value=\"Switch to %s mode.\" " + 
-        "onclick=\"return confirm('Are you sure you want to change " +
-        "scheduling mode to %s?')\" />\n",
-        curMode, otherMode, otherMode);
-    out.printf("<input type=\"hidden\" name=\"setFifo\" value=\"%s\" />",
-        !scheduler.getUseFifo());
-    out.print("</form>\n");
+  private Collection<JobInProgress> getInitedJobs() {
+    Collection<JobInProgress> runningJobs = jobTracker.getRunningJobs();
+    for (Iterator<JobInProgress> it = runningJobs.iterator(); it.hasNext();) {
+      JobInProgress job = it.next();
+      if (!job.inited()) {
+        it.remove();
+      }
+    }
+    return runningJobs;
   }
+
 }

+ 2 - 1
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/FifoJobComparator.java

@@ -35,7 +35,8 @@ public class FifoJobComparator implements Comparator<JobInProgress> {
       }
     }
     if (res == 0) {
-      res = j1.hashCode() - j2.hashCode();
+      // If there is a tie, break it by job ID to get a deterministic order
+      res = j1.getJobID().compareTo(j2.getJobID());
     }
     return res;
   }

+ 185 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/JobSchedulable.java

@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.hadoop.mapred.FairScheduler.JobInfo;
+import org.apache.hadoop.mapreduce.TaskType;
+
+public class JobSchedulable extends Schedulable {
+  private FairScheduler scheduler;
+  private JobInProgress job;
+  private TaskType taskType;
+  private int demand = 0;
+
+  public JobSchedulable(FairScheduler scheduler, JobInProgress job, 
+      TaskType taskType) {
+    this.scheduler = scheduler;
+    this.job = job;
+    this.taskType = taskType;
+    
+    initMetrics();
+  }
+  
+  @Override
+  public TaskType getTaskType() {
+    return taskType;
+  }
+  
+  @Override
+  public String getName() {
+    return job.getJobID().toString();
+  }
+
+  public JobInProgress getJob() {
+    return job;
+  }
+  
+  @Override
+  public void updateDemand() {
+    demand = 0;
+    if (isRunnable()) {
+      // For reduces, make sure enough maps are done that reduces can launch
+      if (taskType == TaskType.REDUCE && !job.scheduleReduces())
+        return;
+      // Add up demand from each TaskInProgress; each TIP can either
+      // - have no attempts running, in which case it demands 1 slot
+      // - have N attempts running, in which case it demands N slots, and may
+      //   potentially demand one more slot if it needs to be speculated
+      TaskInProgress[] tips = (taskType == TaskType.MAP ? 
+          job.getTasks(TaskType.MAP) : job.getTasks(TaskType.REDUCE));
+      boolean speculationEnabled = (taskType == TaskType.MAP ?
+          job.getMapSpeculativeExecution() : job.getReduceSpeculativeExecution());
+      double avgProgress = (taskType == TaskType.MAP ?
+          job.getStatus().mapProgress() : job.getStatus().reduceProgress());
+      long time = scheduler.getClock().getTime();
+      for (TaskInProgress tip: tips) {
+        if (!tip.isComplete()) {
+          if (tip.isRunning()) {
+            // Count active tasks and any speculative task we want to launch
+            demand += tip.getActiveTasks().size();
+            if (speculationEnabled && tip.hasSpeculativeTask(time, avgProgress))
+              demand += 1;
+          } else {
+            // Need to launch 1 task
+            demand += 1;
+          }
+        }
+      }
+    }
+  }
+
+  private boolean isRunnable() {
+    JobInfo info = scheduler.getJobInfo(job);
+    int runState = job.getStatus().getRunState();
+    return (info != null && info.runnable && runState == JobStatus.RUNNING);
+  }
+
+  @Override
+  public int getDemand() {
+    return demand;
+  }
+  
+  @Override
+  public void redistributeShare() {}
+
+  @Override
+  public JobPriority getPriority() {
+    return job.getPriority();
+  }
+
+  @Override
+  public int getRunningTasks() {
+    if (!job.inited()) {
+      return 0;
+    }
+    return taskType == TaskType.MAP ? job.runningMaps() : job.runningReduces();
+  }
+
+  @Override
+  public long getStartTime() {
+    return job.startTime;
+  }
+  
+  @Override
+  public double getWeight() {
+    return scheduler.getJobWeight(job, taskType);
+  }
+  
+  @Override
+  public int getMinShare() {
+    return 0;
+  }
+
+  @Override
+  public Task assignTask(TaskTrackerStatus tts, long currentTime,
+      Collection<JobInProgress> visited) throws IOException {
+    if (isRunnable()) {
+      visited.add(job);
+      TaskTrackerManager ttm = scheduler.taskTrackerManager;
+      ClusterStatus clusterStatus = ttm.getClusterStatus();
+      int numTaskTrackers = clusterStatus.getTaskTrackers();
+
+      // check with the load manager whether it is safe to 
+      // launch this task on this taskTracker.
+      LoadManager loadMgr = scheduler.getLoadManager();
+      if (!loadMgr.canLaunchTask(tts, job, taskType)) {
+        return null;
+      }
+      if (taskType == TaskType.MAP) {
+        LocalityLevel localityLevel = scheduler.getAllowedLocalityLevel(
+            job, currentTime);
+        scheduler.getEventLog().log(
+            "ALLOWED_LOC_LEVEL", job.getJobID(), localityLevel);
+        switch (localityLevel) {
+          case NODE:
+            return job.obtainNewNodeLocalMapTask(tts, numTaskTrackers,
+                ttm.getNumberOfUniqueHosts());
+          case RACK:
+            return job.obtainNewNodeOrRackLocalMapTask(tts, numTaskTrackers,
+                ttm.getNumberOfUniqueHosts());
+          default:
+            return job.obtainNewMapTask(tts, numTaskTrackers,
+                ttm.getNumberOfUniqueHosts());
+        }
+      } else {
+        return job.obtainNewReduceTask(tts, numTaskTrackers,
+            ttm.getNumberOfUniqueHosts());
+      }
+    } else {
+      return null;
+    }
+  }
+
+  
+  @Override
+  protected String getMetricsContextName() {
+    return "jobs";
+  }
+  
+  @Override
+  void updateMetrics() {
+    assert metrics != null;
+    
+    super.setMetricValues(metrics);
+    metrics.update();
+  }
+}
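
A minimal standalone sketch of the demand rule implemented in updateDemand() above: one slot for each unlaunched TIP, one per running attempt, and one extra for each attempt worth speculating. TipState is a hypothetical stand-in for the relevant bits of TaskInProgress and is not part of this patch.

import java.util.Arrays;
import java.util.List;

public class DemandRuleSketch {
  /** Hypothetical stand-in for the TaskInProgress state the rule looks at. */
  static class TipState {
    final boolean complete;
    final int activeAttempts;    // attempts currently running
    final boolean speculatable;  // would hasSpeculativeTask() return true?
    TipState(boolean complete, int activeAttempts, boolean speculatable) {
      this.complete = complete;
      this.activeAttempts = activeAttempts;
      this.speculatable = speculatable;
    }
  }

  /** Mirrors the per-TIP accounting done in JobSchedulable.updateDemand(). */
  static int computeDemand(List<TipState> tips, boolean speculationEnabled) {
    int demand = 0;
    for (TipState tip : tips) {
      if (tip.complete) {
        continue;                      // finished TIPs demand nothing
      }
      if (tip.activeAttempts > 0) {
        demand += tip.activeAttempts;  // one slot per running attempt
        if (speculationEnabled && tip.speculatable) {
          demand += 1;                 // plus one for a speculative attempt
        }
      } else {
        demand += 1;                   // unlaunched TIP needs one slot
      }
    }
    return demand;
  }

  public static void main(String[] args) {
    List<TipState> tips = Arrays.asList(
        new TipState(true, 0, false),   // complete: contributes 0
        new TipState(false, 0, false),  // not yet launched: contributes 1
        new TipState(false, 1, true),   // running and speculatable: contributes 2
        new TipState(false, 2, false)); // two attempts running: contributes 2
    System.out.println(computeDemand(tips, true)); // prints 5
  }
}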

+ 22 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/LoadManager.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskType;
 
 /**
  * A pluggable object that manages the load on each {@link TaskTracker}, telling
@@ -30,6 +31,7 @@ import org.apache.hadoop.conf.Configuration;
 public abstract class LoadManager implements Configurable {
   protected Configuration conf;
   protected TaskTrackerManager taskTrackerManager;
+  protected FairSchedulerEventLog schedulingLog;
   
   public Configuration getConf() {
     return conf;
@@ -43,6 +45,10 @@ public abstract class LoadManager implements Configurable {
       TaskTrackerManager taskTrackerManager) {
     this.taskTrackerManager = taskTrackerManager;
   }
+
+  public void setEventLog(FairSchedulerEventLog schedulingLog) {
+    this.schedulingLog = schedulingLog;
+  }
   
   /**
    * Lifecycle method to allow the LoadManager to start any work in separate
@@ -61,6 +67,8 @@ public abstract class LoadManager implements Configurable {
   
   /**
    * Can a given {@link TaskTracker} run another map task?
+   * This method may check whether the specified tracker has
+   * enough resources to run another map task.
    * @param tracker The machine we wish to run a new map on
    * @param totalRunnableMaps Set of running jobs in the cluster
    * @param totalMapSlots The total number of map slots in the cluster
@@ -71,6 +79,8 @@ public abstract class LoadManager implements Configurable {
 
   /**
    * Can a given {@link TaskTracker} run another reduce task?
+   * This method may check whether the specified tracker has
+   * enough resources to run another reduce task.
    * @param tracker The machine we wish to run a new map on
    * @param totalRunnableReduces Set of running jobs in the cluster
    * @param totalReduceSlots The total number of reduce slots in the cluster
@@ -78,4 +88,16 @@ public abstract class LoadManager implements Configurable {
    */
   public abstract boolean canAssignReduce(TaskTrackerStatus tracker,
       int totalRunnableReduces, int totalReduceSlots);
+
+  /**
+   * Can a given {@link TaskTracker} run another new task from a given job? 
+   * This method is provided for use by LoadManagers that take into 
+   * account jobs' individual resource needs when placing tasks.
+   * @param tracker The machine we wish to run a new task on
+   * @param job The job from which we want to run a task on this machine
+   * @param type The type of task that we want to run
+   * @return true if this task can be launched on <code>tracker</code>
+   */
+  public abstract boolean canLaunchTask(TaskTrackerStatus tracker,
+      JobInProgress job,  TaskType type);
 }
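
A hedged sketch of what a LoadManager subclass using the new canLaunchTask() hook might look like; it is not the CapBasedLoadManager shipped with this patch. It assumes canAssignMap() mirrors canAssignReduce()'s signature, that TaskTrackerStatus exposes countMapTasks()/countReduceTasks() and getMaxMapSlots()/getMaxReduceSlots() on this branch, and that LoadManager's lifecycle methods have default implementations.

package org.apache.hadoop.mapred;

import org.apache.hadoop.mapreduce.TaskType;

/**
 * Hypothetical load manager: fills each tracker up to its configured slot
 * counts and places no per-job restriction in canLaunchTask().
 */
public class SlotFillingLoadManager extends LoadManager {
  @Override
  public boolean canAssignMap(TaskTrackerStatus tracker,
      int totalRunnableMaps, int totalMapSlots) {
    // Assumed accessors: countMapTasks() and getMaxMapSlots().
    return tracker.countMapTasks() < tracker.getMaxMapSlots();
  }

  @Override
  public boolean canAssignReduce(TaskTrackerStatus tracker,
      int totalRunnableReduces, int totalReduceSlots) {
    return tracker.countReduceTasks() < tracker.getMaxReduceSlots();
  }

  @Override
  public boolean canLaunchTask(TaskTrackerStatus tracker,
      JobInProgress job, TaskType type) {
    // A resource-aware implementation would inspect the job's needs here;
    // this sketch accepts any task that passed the slot checks above.
    return true;
  }
}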

+ 65 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/LocalityLevel.java

@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+/**
+ * Represents the level of data-locality at which a job in the fair scheduler
+ * is allowed to launch tasks. By default, jobs are not allowed to launch
+ * non-data-local tasks until they have waited a small number of seconds to
+ * find a slot on a node that they have data on. If a job has waited this
+ * long, it is allowed to launch rack-local tasks as well (on nodes that may
+ * not have the task's input data, but share a rack with a node that does).
+ * Finally, after a further wait, jobs are allowed to launch tasks anywhere
+ * in the cluster.
+ * 
+ * This enum defines three levels - NODE, RACK and ANY (for allowing tasks
+ * to be launched on any node). A map task's level can be obtained from
+ * its job through {@link #fromTask(JobInProgress, Task, TaskTrackerStatus)}. In
+ * addition, for any locality level, it is possible to get a "level cap" to pass
+ * to {@link JobInProgress#obtainNewMapTask(TaskTrackerStatus, int, int, int)}
+ * to ensure that only tasks at this level or lower are launched, through
+ * the {@link #toCacheLevelCap()} method.
+ */
+public enum LocalityLevel {
+  NODE, RACK, ANY;
+  
+  public static LocalityLevel fromTask(JobInProgress job, Task mapTask,
+      TaskTrackerStatus tracker) {
+    TaskID tipID = mapTask.getTaskID().getTaskID();
+    TaskInProgress tip = job.getTaskInProgress(tipID);
+    switch (job.getLocalityLevel(tip, tracker)) {
+    case 0: return LocalityLevel.NODE;
+    case 1: return LocalityLevel.RACK;
+    default: return LocalityLevel.ANY;
+    }
+  }
+  
+  /**
+   * Obtain a JobInProgress cache level cap to pass to
+   * {@link JobInProgress#obtainNewMapTask(TaskTrackerStatus, int, int, int)}
+   * to ensure that only tasks of this locality level and lower are launched.
+   */
+  public int toCacheLevelCap() {
+    switch(this) {
+    case NODE: return 1;
+    case RACK: return 2;
+    default: return Integer.MAX_VALUE;
+    }
+  }
+}
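
A minimal sketch of the delay-scheduling escalation described in the class comment above: the longer a job has waited for a slot, the farther from its data it is allowed to run. The two wait thresholds are hypothetical parameters, not the scheduler's actual configuration keys.

import org.apache.hadoop.mapred.LocalityLevel;

public class LocalityWaitSketch {
  /** Map how long a job has waited to the locality level it may use. */
  static LocalityLevel allowedLevel(long waitedMs, long nodeWaitMs,
      long rackWaitMs) {
    if (waitedMs < nodeWaitMs) {
      return LocalityLevel.NODE;   // keep waiting for a node-local slot
    } else if (waitedMs < nodeWaitMs + rackWaitMs) {
      return LocalityLevel.RACK;   // rack-local slots are now acceptable
    } else {
      return LocalityLevel.ANY;    // run anywhere in the cluster
    }
  }

  public static void main(String[] args) {
    long nodeWait = 5000, rackWait = 5000;
    System.out.println(allowedLevel(1000, nodeWait, rackWait));  // NODE
    System.out.println(allowedLevel(7000, nodeWait, rackWait));  // RACK
    System.out.println(allowedLevel(15000, nodeWait, rackWait)); // ANY
    // The level converts to a cache-level cap for obtainNewMapTask():
    System.out.println(LocalityLevel.RACK.toCacheLevelCap());    // prints 2
  }
}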

+ 1 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/NewJobWeightBooster.java

@@ -20,6 +20,7 @@ package org.apache.hadoop.mapred;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.mapreduce.TaskType;
 
 /**
  * A {@link WeightAdjuster} implementation that gives a weight boost to new jobs

+ 41 - 1
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/Pool.java

@@ -21,6 +21,9 @@ package org.apache.hadoop.mapred;
 import java.util.ArrayList;
 import java.util.Collection;
 
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.metrics.MetricsContext;
+
 /**
  * A schedulable pool of jobs.
  */
@@ -33,9 +36,17 @@ public class Pool {
   
   /** Jobs in this specific pool; does not include children pools' jobs. */
   private Collection<JobInProgress> jobs = new ArrayList<JobInProgress>();
+  
+  /** Scheduling mode for jobs inside the pool (fair or FIFO) */
+  private SchedulingMode schedulingMode;
 
-  public Pool(String name) {
+  private PoolSchedulable mapSchedulable;
+  private PoolSchedulable reduceSchedulable;
+
+  public Pool(FairScheduler scheduler, String name) {
     this.name = name;
+    mapSchedulable = new PoolSchedulable(scheduler, this, TaskType.MAP);
+    reduceSchedulable = new PoolSchedulable(scheduler, this, TaskType.REDUCE);
   }
   
   public Collection<JobInProgress> getJobs() {
@@ -44,17 +55,46 @@ public class Pool {
   
   public void addJob(JobInProgress job) {
     jobs.add(job);
+    mapSchedulable.addJob(job);
+    reduceSchedulable.addJob(job);
   }
   
   public void removeJob(JobInProgress job) {
     jobs.remove(job);
+    mapSchedulable.removeJob(job);
+    reduceSchedulable.removeJob(job);
   }
   
   public String getName() {
     return name;
   }
 
+  public SchedulingMode getSchedulingMode() {
+    return schedulingMode;
+  }
+  
+  public void setSchedulingMode(SchedulingMode schedulingMode) {
+    this.schedulingMode = schedulingMode;
+  }
+
   public boolean isDefaultPool() {
     return Pool.DEFAULT_POOL_NAME.equals(name);
   }
+  
+  public PoolSchedulable getMapSchedulable() {
+    return mapSchedulable;
+  }
+  
+  public PoolSchedulable getReduceSchedulable() {
+    return reduceSchedulable;
+  }
+  
+  public PoolSchedulable getSchedulable(TaskType type) {
+    return type == TaskType.MAP ? mapSchedulable : reduceSchedulable;
+  }
+
+  public void updateMetrics() {
+    mapSchedulable.updateMetrics();
+    reduceSchedulable.updateMetrics();
+  }
 }

+ 228 - 23
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/PoolManager.java

@@ -20,6 +20,8 @@ package org.apache.hadoop.mapred;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.URL;
+import java.net.URLConnection;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -34,6 +36,8 @@ import javax.xml.parsers.ParserConfigurationException;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.metrics.MetricsContext;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -42,7 +46,8 @@ import org.w3c.dom.Text;
 import org.xml.sax.SAXException;
 
 /**
- * Maintains a hierarchy of pools.
+ * Maintains a list of pools as well as scheduling parameters for each pool,
+ * such as guaranteed share allocations, from the fair scheduler config file.
  */
 public class PoolManager {
   public static final Log LOG = LogFactory.getLog(
@@ -56,11 +61,19 @@ public class PoolManager {
    * (this is done to prevent loading a file that hasn't been fully written).
    */
   public static final long ALLOC_RELOAD_WAIT = 5 * 1000; 
+
+  public static final String EXPLICIT_POOL_PROPERTY = "mapred.fairscheduler.pool";
+
+  private final FairScheduler scheduler;
   
   // Map and reduce minimum allocations for each pool
   private Map<String, Integer> mapAllocs = new HashMap<String, Integer>();
   private Map<String, Integer> reduceAllocs = new HashMap<String, Integer>();
 
+  // If set, cap number of map and reduce tasks in a pool
+  private Map<String, Integer> poolMaxMaps = new HashMap<String, Integer>();
+  private Map<String, Integer> poolMaxReduces = new HashMap<String, Integer>();
+
   // Sharing weights for each pool
   private Map<String, Double> poolWeights = new HashMap<String, Double>();
   
@@ -69,10 +82,31 @@ public class PoolManager {
   private Map<String, Integer> poolMaxJobs = new HashMap<String, Integer>();
   private Map<String, Integer> userMaxJobs = new HashMap<String, Integer>();
   private int userMaxJobsDefault = Integer.MAX_VALUE;
+  private int poolMaxJobsDefault = Integer.MAX_VALUE;
 
-  private String allocFile; // Path to XML file containing allocations
+  // Min share preemption timeout for each pool in seconds. If a job in the pool
+  // waits this long without receiving its guaranteed share, it is allowed to
+  // preempt other jobs' tasks.
+  private Map<String, Long> minSharePreemptionTimeouts =
+    new HashMap<String, Long>();
+  
+  // Default min share preemption timeout for pools where it is not set
+  // explicitly.
+  private long defaultMinSharePreemptionTimeout = Long.MAX_VALUE;
+  
+  // Preemption timeout for jobs below fair share in seconds. If a job remains
+  // below half its fair share for this long, it is allowed to preempt tasks.
+  private long fairSharePreemptionTimeout = Long.MAX_VALUE;
+  
+  SchedulingMode defaultSchedulingMode = SchedulingMode.FAIR;
+  
+  private Object allocFile; // Path to XML file containing allocations. This
+                            // is either a URL to specify a classpath resource
+                            // (if the fair-scheduler.xml on the classpath is
+                            // used) or a String to specify an absolute path (if
+                            // mapred.fairscheduler.allocation.file is used).
   private String poolNameProperty; // Jobconf property to use for determining a
-                                   // job's pool name (default: mapred.job.queue.name)
+                                   // job's pool name (default: user.name)
   
   private Map<String, Pool> pools = new HashMap<String, Pool>();
   
@@ -80,14 +114,25 @@ public class PoolManager {
   private long lastSuccessfulReload; // Last time we successfully reloaded pools
   private boolean lastReloadAttemptFailed = false;
 
-  public PoolManager(Configuration conf) throws IOException, SAXException,
+  public PoolManager(FairScheduler scheduler) {
+    this.scheduler = scheduler;
+  }
+  
+  public void initialize() throws IOException, SAXException,
       AllocationConfigurationException, ParserConfigurationException {
+    Configuration conf = scheduler.getConf();
     this.poolNameProperty = conf.get(
         "mapred.fairscheduler.poolnameproperty", "user.name");
     this.allocFile = conf.get("mapred.fairscheduler.allocation.file");
     if (allocFile == null) {
-      LOG.warn("No mapred.fairscheduler.allocation.file given in jobconf - " +
-          "the fair scheduler will not use any queues.");
+      // No allocation file specified in jobconf. Use the default allocation
+      // file, fair-scheduler.xml, looking for it on the classpath.
+      allocFile = new Configuration().getResource("fair-scheduler.xml");
+      if (allocFile == null) {
+        LOG.error("The fair scheduler allocation file fair-scheduler.xml was "
+            + "not found on the classpath, and no other config file is given "
+            + "through mapred.fairscheduler.allocation.file.");
+      }
     }
     reloadAllocs();
     lastSuccessfulReload = System.currentTimeMillis();
@@ -102,11 +147,19 @@ public class PoolManager {
   public synchronized Pool getPool(String name) {
     Pool pool = pools.get(name);
     if (pool == null) {
-      pool = new Pool(name);
+      pool = new Pool(scheduler, name);
+      pool.setSchedulingMode(defaultSchedulingMode);
       pools.put(name, pool);
     }
     return pool;
   }
+  
+  /**
+   * Get the pool that a given job is in.
+   */
+  public Pool getPool(JobInProgress job) {
+    return getPool(getPoolName(job));
+  }
 
   /**
    * Reload allocations file if it hasn't been loaded in a while
@@ -115,9 +168,20 @@ public class PoolManager {
     long time = System.currentTimeMillis();
     if (time > lastReloadAttempt + ALLOC_RELOAD_INTERVAL) {
       lastReloadAttempt = time;
+      if (null == allocFile) {
+        return;
+      }
       try {
-        File file = new File(allocFile);
-        long lastModified = file.lastModified();
+        // Get last modified time of alloc file depending on whether it's a
+        // String (for a path name) or a URL (for a classloader resource)
+        long lastModified;
+        if (allocFile instanceof String) {
+          File file = new File((String) allocFile);
+          lastModified = file.lastModified();
+        } else { // allocFile is a URL
+          URLConnection conn = ((URL) allocFile).openConnection();
+          lastModified = conn.getLastModified();
+        }
         if (lastModified > lastSuccessfulReload &&
             time > lastModified + ALLOC_RELOAD_WAIT) {
           reloadAllocs();
@@ -131,7 +195,7 @@ public class PoolManager {
         // We log the error only on the first failure so we don't fill up the
         // JobTracker's log with these messages.
         if (!lastReloadAttemptFailed) {
-          LOG.error("Failed to reload allocations file - " +
+          LOG.error("Failed to reload fair scheduler config file - " +
               "will use existing allocations.", e);
         }
         lastReloadAttemptFailed = true;
@@ -165,8 +229,16 @@ public class PoolManager {
     Map<String, Integer> reduceAllocs = new HashMap<String, Integer>();
     Map<String, Integer> poolMaxJobs = new HashMap<String, Integer>();
     Map<String, Integer> userMaxJobs = new HashMap<String, Integer>();
+    Map<String, Integer> poolMaxMaps = new HashMap<String, Integer>();
+    Map<String, Integer> poolMaxReduces = new HashMap<String, Integer>();
     Map<String, Double> poolWeights = new HashMap<String, Double>();
+    Map<String, SchedulingMode> poolModes = new HashMap<String, SchedulingMode>();
+    Map<String, Long> minSharePreemptionTimeouts = new HashMap<String, Long>();
     int userMaxJobsDefault = Integer.MAX_VALUE;
+    int poolMaxJobsDefault = Integer.MAX_VALUE;
+    long fairSharePreemptionTimeout = Long.MAX_VALUE;
+    long defaultMinSharePreemptionTimeout = Long.MAX_VALUE;
+    SchedulingMode defaultSchedulingMode = SchedulingMode.FAIR;
     
     // Remember all pool names so we can display them on web UI, etc.
     List<String> poolNamesInAllocFile = new ArrayList<String>();
@@ -176,11 +248,16 @@ public class PoolManager {
       DocumentBuilderFactory.newInstance();
     docBuilderFactory.setIgnoringComments(true);
     DocumentBuilder builder = docBuilderFactory.newDocumentBuilder();
-    Document doc = builder.parse(new File(allocFile));
+    Document doc;
+    if (allocFile instanceof String) {
+      doc = builder.parse(new File((String) allocFile));
+    } else {
+      doc = builder.parse(allocFile.toString());
+    }
     Element root = doc.getDocumentElement();
     if (!"allocations".equals(root.getTagName()))
-      throw new AllocationConfigurationException("Bad allocations file: " + 
-          "top-level element not <allocations>");
+      throw new AllocationConfigurationException("Bad fair scheduler config " + 
+          "file: top-level element not <allocations>");
     NodeList elements = root.getChildNodes();
     for (int i = 0; i < elements.getLength(); i++) {
       Node node = elements.item(i);
@@ -204,6 +281,14 @@ public class PoolManager {
             String text = ((Text)field.getFirstChild()).getData().trim();
             int val = Integer.parseInt(text);
             reduceAllocs.put(poolName, val);
+          } else if ("maxMaps".equals(field.getTagName())) {
+            String text = ((Text)field.getFirstChild()).getData().trim();
+            int val = Integer.parseInt(text);
+            poolMaxMaps.put(poolName, val);
+          } else if ("maxReduces".equals(field.getTagName())) {
+            String text = ((Text)field.getFirstChild()).getData().trim();
+            int val = Integer.parseInt(text);
+            poolMaxReduces.put(poolName, val);
           } else if ("maxRunningJobs".equals(field.getTagName())) {
             String text = ((Text)field.getFirstChild()).getData().trim();
             int val = Integer.parseInt(text);
@@ -212,8 +297,25 @@ public class PoolManager {
             String text = ((Text)field.getFirstChild()).getData().trim();
             double val = Double.parseDouble(text);
             poolWeights.put(poolName, val);
+          } else if ("minSharePreemptionTimeout".equals(field.getTagName())) {
+            String text = ((Text)field.getFirstChild()).getData().trim();
+            long val = Long.parseLong(text) * 1000L;
+            minSharePreemptionTimeouts.put(poolName, val);
+          } else if ("schedulingMode".equals(field.getTagName())) {
+            String text = ((Text)field.getFirstChild()).getData().trim();
+            poolModes.put(poolName, parseSchedulingMode(text));
           }
         }
+        if (poolMaxMaps.containsKey(poolName) && mapAllocs.containsKey(poolName)
+            && poolMaxMaps.get(poolName) < mapAllocs.get(poolName)) {
+          LOG.warn(String.format("Pool %s has max maps %d less than min maps %d",
+              poolName, poolMaxMaps.get(poolName), mapAllocs.get(poolName)));        
+        }
+        if(poolMaxReduces.containsKey(poolName) && reduceAllocs.containsKey(poolName)
+            && poolMaxReduces.get(poolName) < reduceAllocs.get(poolName)) {
+          LOG.warn(String.format("Pool %s has max reduces %d less than min reduces %d",
+              poolName, poolMaxReduces.get(poolName), reduceAllocs.get(poolName)));        
+        }
       } else if ("user".equals(element.getTagName())) {
         String userName = element.getAttribute("name");
         NodeList fields = element.getChildNodes();
@@ -232,6 +334,21 @@ public class PoolManager {
         String text = ((Text)element.getFirstChild()).getData().trim();
         int val = Integer.parseInt(text);
         userMaxJobsDefault = val;
+      } else if ("poolMaxJobsDefault".equals(element.getTagName())) {
+        String text = ((Text)element.getFirstChild()).getData().trim();
+        int val = Integer.parseInt(text);
+        poolMaxJobsDefault = val;
+      } else if ("fairSharePreemptionTimeout".equals(element.getTagName())) {
+        String text = ((Text)element.getFirstChild()).getData().trim();
+        long val = Long.parseLong(text) * 1000L;
+        fairSharePreemptionTimeout = val;
+      } else if ("defaultMinSharePreemptionTimeout".equals(element.getTagName())) {
+        String text = ((Text)element.getFirstChild()).getData().trim();
+        long val = Long.parseLong(text) * 1000L;
+        defaultMinSharePreemptionTimeout = val;
+      } else if ("defaultPoolSchedulingMode".equals(element.getTagName())) {
+        String text = ((Text)element.getFirstChild()).getData().trim();
+        defaultSchedulingMode = parseSchedulingMode(text);
       } else {
         LOG.warn("Bad element in allocations file: " + element.getTagName());
       }
@@ -242,16 +359,60 @@ public class PoolManager {
     synchronized(this) {
       this.mapAllocs = mapAllocs;
       this.reduceAllocs = reduceAllocs;
+      this.poolMaxMaps = poolMaxMaps;
+      this.poolMaxReduces = poolMaxReduces;
       this.poolMaxJobs = poolMaxJobs;
       this.userMaxJobs = userMaxJobs;
-      this.userMaxJobsDefault = userMaxJobsDefault;
       this.poolWeights = poolWeights;
+      this.minSharePreemptionTimeouts = minSharePreemptionTimeouts;
+      this.userMaxJobsDefault = userMaxJobsDefault;
+      this.poolMaxJobsDefault = poolMaxJobsDefault;
+      this.fairSharePreemptionTimeout = fairSharePreemptionTimeout;
+      this.defaultMinSharePreemptionTimeout = defaultMinSharePreemptionTimeout;
+      this.defaultSchedulingMode = defaultSchedulingMode;
       for (String name: poolNamesInAllocFile) {
-        getPool(name);
+        Pool pool = getPool(name);
+        if (poolModes.containsKey(name)) {
+          pool.setSchedulingMode(poolModes.get(name));
+        } else {
+          pool.setSchedulingMode(defaultSchedulingMode);
+        }
       }
     }
   }
 
+  /**
+   * Does the pool have incompatible max and min allocations set?
+   * 
+   * @param type
+   *          {@link TaskType#MAP} or {@link TaskType#REDUCE}
+   * @param pool
+   *          the pool name
+   * @return true if the max is less than the min
+   */
+  boolean invertedMinMax(TaskType type, String pool) {
+    Map<String, Integer> max = TaskType.MAP == type ? poolMaxMaps : poolMaxReduces;
+    Map<String, Integer> min = TaskType.MAP == type ? mapAllocs : reduceAllocs;
+    if (max.containsKey(pool) && min.containsKey(pool)
+        && max.get(pool) < min.get(pool)) {
+      return true;
+    }
+    return false;
+  }
+
+  private SchedulingMode parseSchedulingMode(String text)
+      throws AllocationConfigurationException {
+    text = text.toLowerCase();
+    if (text.equals("fair")) {
+      return SchedulingMode.FAIR;
+    } else if (text.equals("fifo")) {
+      return SchedulingMode.FIFO;
+    } else {
+      throw new AllocationConfigurationException(
+          "Unknown scheduling mode : " + text + "; expected 'fifo' or 'fair'");
+    }
+  }
+
   /**
    * Get the allocation for a particular pool
    */
@@ -261,7 +422,20 @@ public class PoolManager {
     Integer alloc = allocationMap.get(pool);
     return (alloc == null ? 0 : alloc);
   }
-  
+
+  /**
+   * Get the maximum map or reduce slots for the given pool.
+   * @return the cap set on this pool, or Integer.MAX_VALUE if not set.
+   */
+  int getMaxSlots(String poolName, TaskType taskType) {
+    Map<String, Integer> maxMap = (taskType == TaskType.MAP ? poolMaxMaps : poolMaxReduces);
+    if (maxMap.containsKey(poolName)) {
+      return maxMap.get(poolName);
+    } else {
+      return Integer.MAX_VALUE;
+    }
+  }
+ 
   /**
    * Add a job in the appropriate pool
    */
@@ -281,7 +455,7 @@ public class PoolManager {
    */
   public synchronized void setPool(JobInProgress job, String pool) {
     removeJob(job);
-    job.getJobConf().set(poolNameProperty, pool);
+    job.getJobConf().set(EXPLICIT_POOL_PROPERTY, pool);
     addJob(job);
   }
 
@@ -293,13 +467,16 @@ public class PoolManager {
   }
   
   /**
-   * Get the pool name for a JobInProgress from its configuration. This uses
-   * the "project" property in the jobconf by default, or the property set with
-   * "mapred.fairscheduler.poolnameproperty".
+   * Get the pool name for a JobInProgress from its configuration. This uses
+   * the value of mapred.fairscheduler.pool if specified, otherwise the value
+   * of the property named in mapred.fairscheduler.poolnameproperty if that is
+   * specified. If neither is specified, it falls back to the "user.name"
+   * property in the jobconf.
    */
   public String getPoolName(JobInProgress job) {
-    JobConf conf = job.getJobConf();
-    return conf.get(poolNameProperty, Pool.DEFAULT_POOL_NAME).trim();
+    Configuration conf = job.getJobConf();
+    return conf.get(EXPLICIT_POOL_PROPERTY,
+      conf.get(poolNameProperty, Pool.DEFAULT_POOL_NAME)).trim();
   }
 
   /**
@@ -327,7 +504,7 @@ public class PoolManager {
     if (poolMaxJobs.containsKey(pool)) {
       return poolMaxJobs.get(pool);
     } else {
-      return Integer.MAX_VALUE;
+      return poolMaxJobsDefault;
     }
   }
 
@@ -338,4 +515,32 @@ public class PoolManager {
       return 1.0;
     }
   }
+
+  /**
+   * Get a pool's min share preemption timeout, in milliseconds. This is the
+   * time after which jobs in the pool may kill other pools' tasks if they
+   * are below their min share.
+   */
+  public long getMinSharePreemptionTimeout(String pool) {
+    if (minSharePreemptionTimeouts.containsKey(pool)) {
+      return minSharePreemptionTimeouts.get(pool);
+    } else {
+      return defaultMinSharePreemptionTimeout;
+    }
+  }
+  
+  /**
+   * Get the fair share preemption timeout, in milliseconds. This is the time
+   * after which any job may kill other jobs' tasks if it is below half
+   * its fair share.
+   */
+  public long getFairSharePreemptionTimeout() {
+    return fairSharePreemptionTimeout;
+  }
+
+  synchronized void updateMetrics() {
+    for (Pool pool : pools.values()) {
+      pool.updateMetrics();
+    }
+  }
 }
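
A small sketch of the pool-name precedence implemented by getPoolName() above: an explicit mapred.fairscheduler.pool setting wins, then the property named by mapred.fairscheduler.poolnameproperty (user.name by default). "default" is assumed here as the name of the fallback pool.

import org.apache.hadoop.conf.Configuration;

public class PoolNamePrecedenceSketch {
  /** Mirrors PoolManager.getPoolName() with an assumed fallback pool name. */
  static String resolvePool(Configuration jobConf, String poolNameProperty) {
    return jobConf.get("mapred.fairscheduler.pool",
        jobConf.get(poolNameProperty, "default")).trim();
  }

  public static void main(String[] args) {
    String poolNameProperty = "user.name"; // the configured default
    Configuration jobConf = new Configuration(false);

    // Nothing set at all: the job lands in the fallback pool.
    System.out.println(resolvePool(jobConf, poolNameProperty)); // default

    // Only the pool-name property is set: the pool is named after the user.
    jobConf.set("user.name", "alice");
    System.out.println(resolvePool(jobConf, poolNameProperty)); // alice

    // An explicit pool overrides the pool-name property.
    jobConf.set("mapred.fairscheduler.pool", "research");
    System.out.println(resolvePool(jobConf, poolNameProperty)); // research
  }
}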

+ 221 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/PoolSchedulable.java

@@ -0,0 +1,221 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.FairScheduler.JobInfo;
+import org.apache.hadoop.mapreduce.TaskType;
+
+public class PoolSchedulable extends Schedulable {
+  public static final Log LOG = LogFactory.getLog(
+      PoolSchedulable.class.getName());
+  
+  private FairScheduler scheduler;
+  private Pool pool;
+  private TaskType taskType;
+  private PoolManager poolMgr;
+  private List<JobSchedulable> jobScheds = new LinkedList<JobSchedulable>();
+  private int demand = 0;
+  
+  // Variables used for preemption
+  long lastTimeAtMinShare;
+  long lastTimeAtHalfFairShare;
+
+  public PoolSchedulable(FairScheduler scheduler, Pool pool, TaskType type) {
+    this.scheduler = scheduler;
+    this.pool = pool;
+    this.taskType = type;
+    this.poolMgr = scheduler.getPoolManager();
+    long currentTime = scheduler.getClock().getTime();
+    this.lastTimeAtMinShare = currentTime;
+    this.lastTimeAtHalfFairShare = currentTime;
+    
+    initMetrics();
+  }
+
+  public void addJob(JobInProgress job) {
+    JobInfo info = scheduler.getJobInfo(job);
+    jobScheds.add(taskType == TaskType.MAP ?
+        info.mapSchedulable : info.reduceSchedulable);
+  }
+  
+  public void removeJob(JobInProgress job) {
+    for (Iterator<JobSchedulable> it = jobScheds.iterator(); it.hasNext();) {
+      JobSchedulable jobSched = it.next();
+      if (jobSched.getJob() == job) {
+        it.remove();
+        break;
+      }
+    }
+  }
+
+  /**
+   * Update demand by asking jobs in the pool to update
+   */
+  @Override
+  public void updateDemand() {
+    demand = 0;
+    for (JobSchedulable sched: jobScheds) {
+      sched.updateDemand();
+      demand += sched.getDemand();
+    }
+    // if demand exceeds the cap for this pool, limit to the max
+    int maxTasks = poolMgr.getMaxSlots(pool.getName(), taskType);
+    if(demand > maxTasks) {
+      demand = maxTasks;
+    }
+  }
+  
+  /**
+   * Distribute the pool's fair share among its jobs
+   */
+  @Override
+  public void redistributeShare() {
+    if (pool.getSchedulingMode() == SchedulingMode.FAIR) {
+      SchedulingAlgorithms.computeFairShares(jobScheds, getFairShare());
+    } else {
+      for (JobSchedulable sched: jobScheds) {
+        sched.setFairShare(0);
+      }
+    } 
+  }
+
+  @Override
+  public int getDemand() {
+    return demand;
+  }
+
+  @Override
+  public int getMinShare() {
+    return poolMgr.getAllocation(pool.getName(), taskType);
+  }
+
+  @Override
+  public double getWeight() {
+    return poolMgr.getPoolWeight(pool.getName());
+  }
+
+  @Override
+  public JobPriority getPriority() {
+    return JobPriority.NORMAL;
+  }
+
+  @Override
+  public int getRunningTasks() {
+    int ans = 0;
+    for (JobSchedulable sched: jobScheds) {
+      ans += sched.getRunningTasks();
+    }
+    return ans;
+  }
+
+  @Override
+  public long getStartTime() {
+    return 0;
+  }
+
+  @Override
+  public Task assignTask(TaskTrackerStatus tts, long currentTime,
+      Collection<JobInProgress> visited) throws IOException {
+    int runningTasks = getRunningTasks();
+    if (runningTasks >= poolMgr.getMaxSlots(pool.getName(), taskType)) {
+      return null;
+    }
+    SchedulingMode mode = pool.getSchedulingMode();
+    Comparator<Schedulable> comparator;
+    if (mode == SchedulingMode.FIFO) {
+      comparator = new SchedulingAlgorithms.FifoComparator();
+    } else if (mode == SchedulingMode.FAIR) {
+      comparator = new SchedulingAlgorithms.FairShareComparator();
+    } else {
+      throw new RuntimeException("Unsupported pool scheduling mode " + mode);
+    }
+    Collections.sort(jobScheds, comparator);
+    for (JobSchedulable sched: jobScheds) {
+      Task task = sched.assignTask(tts, currentTime, visited);
+      if (task != null)
+        return task;
+    }
+    return null;
+  }
+  
+  @Override
+  public String getName() {
+    return pool.getName();
+  }
+
+  Pool getPool() {
+    return pool;
+  }
+
+  @Override
+  public TaskType getTaskType() {
+    return taskType;
+  }
+  
+  public Collection<JobSchedulable> getJobSchedulables() {
+    return jobScheds;
+  }
+  
+  public long getLastTimeAtMinShare() {
+    return lastTimeAtMinShare;
+  }
+  
+  public void setLastTimeAtMinShare(long lastTimeAtMinShare) {
+    this.lastTimeAtMinShare = lastTimeAtMinShare;
+  }
+  
+  public long getLastTimeAtHalfFairShare() {
+    return lastTimeAtHalfFairShare;
+  }
+  
+  public void setLastTimeAtHalfFairShare(long lastTimeAtHalfFairShare) {
+    this.lastTimeAtHalfFairShare = lastTimeAtHalfFairShare;
+  }
+
+  protected String getMetricsContextName() {
+    return "pools";
+  }
+  
+  @Override
+  public void updateMetrics() {
+    super.setMetricValues(metrics);
+    
+    if (scheduler.isPreemptionEnabled()) {
+      // These won't be set if preemption is off
+      long lastCheck = scheduler.getLastPreemptionUpdateTime();
+      metrics.setMetric("millisSinceAtMinShare", lastCheck - lastTimeAtMinShare);
+      metrics.setMetric("millisSinceAtHalfFairShare", lastCheck - lastTimeAtHalfFairShare);
+    }
+    metrics.update();
+
+    for (JobSchedulable job : jobScheds) {
+      job.updateMetrics();
+    }
+  }
+}

+ 171 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/Schedulable.java

@@ -0,0 +1,171 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.hadoop.mapreduce.TaskType;
+import org.apache.hadoop.metrics.MetricsContext;
+import org.apache.hadoop.metrics.MetricsRecord;
+import org.apache.hadoop.metrics.MetricsUtil;
+
+/**
+ * A Schedulable represents an entity that can launch tasks, such as a job
+ * or a pool. It provides a common interface so that algorithms such as fair
+ * sharing can be applied both within a pool and across pools. There are 
+ * currently two types of Schedulables: JobSchedulables, which represent a
+ * single job, and PoolSchedulables, which allocate among jobs in their pool.
+ * 
+ * Separate sets of Schedulables are used for maps and reduces. Each pool has
+ * both a mapSchedulable and a reduceSchedulable, and so does each job.
+ * 
+ * A Schedulable is responsible for three roles:
+ * 1) It can launch tasks through assignTask().
+ * 2) It provides information about the job/pool to the scheduler, including:
+ *    - Demand (maximum number of tasks required)
+ *    - Number of currently running tasks
+ *    - Minimum share (for pools)
+ *    - Job/pool weight (for fair sharing)
+ *    - Start time and priority (for FIFO)
+ * 3) It can be assigned a fair share, for use with fair scheduling.
+ * 
+ * Schedulable also contains two methods for performing scheduling computations:
+ * - updateDemand() is called periodically to compute the demand of the various
+ *   jobs and pools, which may be expensive (e.g. jobs must iterate through all
+ *   their tasks to count failed tasks, tasks that can be speculated, etc).
+ * - redistributeShare() is called after demands are updated and a Schedulable's
+ *   fair share has been set by its parent to let it distribute its share among
+ *   the other Schedulables within it (e.g. for pools that want to perform fair
+ *   sharing among their jobs).
+ */
+abstract class Schedulable {
+  /** Fair share assigned to this Schedulable */
+  private double fairShare = 0;
+  protected MetricsRecord metrics;
+  
+  /**
+   * Name of job/pool, used for debugging as well as for breaking ties in
+   * scheduling order deterministically. 
+   */
+  public abstract String getName();
+  
+  /**
+   * @return the type of tasks that this pool schedules
+   */
+  public abstract TaskType getTaskType();
+  
+  /**
+   * Maximum number of tasks required by this Schedulable. This is defined as
+   * number of currently running tasks + number of unlaunched tasks (tasks that
+   * are either not yet launched or need to be speculated).
+   */
+  public abstract int getDemand();
+  
+  /** Number of tasks the schedulable is currently running. */
+  public abstract int getRunningTasks();
+  
+  /** Minimum share slots assigned to the schedulable. */
+  public abstract int getMinShare();
+  
+  /** Job/pool weight in fair sharing. */
+  public abstract double getWeight();
+  
+  /** Job priority for jobs in FIFO pools; meaningless for PoolSchedulables. */
+  public abstract JobPriority getPriority();
+  
+  /** Start time for jobs in FIFO pools; meaningless for PoolSchedulables. */
+  public abstract long getStartTime();
+  
+  /** Refresh the Schedulable's demand and those of its children if any. */
+  public abstract void updateDemand();
+  
+  /** 
+   * Distribute the fair share assigned to this Schedulable among its 
+   * children (used in pools where the internal scheduler is fair sharing). 
+   */
+  public abstract void redistributeShare();
+  
+  /**
+   * Obtain a task for a given TaskTracker, or null if the Schedulable has
+   * no tasks to launch at this moment or does not wish to launch a task on
+   * this TaskTracker (e.g. is waiting for a TaskTracker with local data). 
+   * In addition, if a job is skipped during this search because it is waiting
+   * for a TaskTracker with local data, this method is expected to add it to
+   * the <tt>visited</tt> collection passed in, so that the scheduler can
+   * properly mark it as skipped during this heartbeat. Please see
+   * {@link FairScheduler#getAllowedLocalityLevel(JobInProgress, long)}
+   * for details of delay scheduling (waiting for trackers with local data).
+   * 
+   * @param tts      TaskTracker that the task will be launched on
+   * @param currentTime Cached time (to prevent excessive calls to gettimeofday)
+   * @param visited  A Collection to which this method must add all jobs that
+   *                 were considered during the search for a job to assign.
+   * @return Task to launch, or null if Schedulable cannot currently launch one.
+   * @throws IOException Possible if obtainNew(Map|Reduce)Task throws exception.
+   */
+  public abstract Task assignTask(TaskTrackerStatus tts, long currentTime,
+      Collection<JobInProgress> visited) throws IOException;
+
+  /** Assign a fair share to this Schedulable. */
+  public void setFairShare(double fairShare) {
+    this.fairShare = fairShare;
+  }
+  
+  /** Get the fair share assigned to this Schedulable. */
+  public double getFairShare() {
+    return fairShare;
+  }
+  
+  /** Return the name of the metrics context for this schedulable */
+  protected abstract String getMetricsContextName();
+  
+  /**
+   * Set up metrics context
+   */
+  protected void initMetrics() {
+    MetricsContext metricsContext = MetricsUtil.getContext("fairscheduler");
+    this.metrics = MetricsUtil.createRecord(metricsContext,
+        getMetricsContextName());
+    metrics.setTag("name", getName());
+    metrics.setTag("taskType", getTaskType().toString());
+  }
+
+  void cleanupMetrics() {
+    metrics.remove();
+    metrics = null;
+  }
+
+  protected void setMetricValues(MetricsRecord metrics) {
+    metrics.setMetric("fairShare", (float)getFairShare());
+    metrics.setMetric("minShare", getMinShare());
+    metrics.setMetric("demand", getDemand());
+    metrics.setMetric("weight", (float)getWeight());
+    metrics.setMetric("runningTasks", getRunningTasks());
+  }
+  
+  abstract void updateMetrics();
+  
+  /** Convenient toString implementation for debugging. */
+  @Override
+  public String toString() {
+    return String.format("[%s, demand=%d, running=%d, share=%.1f, w=%.1f]",
+        getName(), getDemand(), getRunningTasks(), fairShare, getWeight());
+  }
+}

+ 209 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/SchedulingAlgorithms.java

@@ -0,0 +1,209 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.util.Collection;
+import java.util.Comparator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Utility class containing scheduling algorithms used in the fair scheduler.
+ */
+class SchedulingAlgorithms {
+  public static final Log LOG = LogFactory.getLog(
+      SchedulingAlgorithms.class.getName());
+  
+  /**
+   * Compare Schedulables in order of priority and then submission time, as in
+   * the default FIFO scheduler in Hadoop.
+   */
+  public static class FifoComparator implements Comparator<Schedulable> {
+    @Override
+    public int compare(Schedulable s1, Schedulable s2) {
+      int res = s1.getPriority().compareTo(s2.getPriority());
+      if (res == 0) {
+        res = (int) Math.signum(s1.getStartTime() - s2.getStartTime());
+      }
+      if (res == 0) {
+        // In the rare case where jobs were submitted at the exact same time,
+        // compare them by name (which will be the JobID) to get a deterministic
+        // ordering, so we don't alternately launch tasks from different jobs.
+        res = s1.getName().compareTo(s2.getName());
+      }
+      return res;
+    }
+  }
+
+  /**
+   * Compare Schedulables via weighted fair sharing. In addition, Schedulables
+   * below their min share get priority over those whose min share is met. 
+   * 
+   * Schedulables below their min share are compared by how far below it they
+   * are as a ratio. For example, if job A has 8 out of a min share of 10 tasks
+   * and job B has 50 out of a min share of 100, then job B is scheduled next, 
+   * because B is at 50% of its min share and A is at 80% of its min share.
+   * 
+   * Schedulables above their min share are compared by (runningTasks / weight).
+   * If all weights are equal, slots are given to the job with the fewest tasks;
+   * otherwise, jobs with more weight get proportionally more slots.
+   */
+  public static class FairShareComparator implements Comparator<Schedulable> {
+    @Override
+    public int compare(Schedulable s1, Schedulable s2) {
+      double minShareRatio1, minShareRatio2;
+      double tasksToWeightRatio1, tasksToWeightRatio2;
+      int minShare1 = Math.min(s1.getMinShare(), s1.getDemand());
+      int minShare2 = Math.min(s2.getMinShare(), s2.getDemand());
+      boolean s1Needy = s1.getRunningTasks() < minShare1;
+      boolean s2Needy = s2.getRunningTasks() < minShare2;
+      minShareRatio1 = s1.getRunningTasks() / Math.max(minShare1, 1.0);
+      minShareRatio2 = s2.getRunningTasks() / Math.max(minShare2, 1.0);
+      tasksToWeightRatio1 = s1.getRunningTasks() / s1.getWeight();
+      tasksToWeightRatio2 = s2.getRunningTasks() / s2.getWeight();
+      int res = 0;
+      if (s1Needy && !s2Needy)
+        res = -1;
+      else if (s2Needy && !s1Needy)
+        res = 1;
+      else if (s1Needy && s2Needy)
+        res = (int) Math.signum(minShareRatio1 - minShareRatio2);
+      else // Neither schedulable is needy
+        res = (int) Math.signum(tasksToWeightRatio1 - tasksToWeightRatio2);
+      if (res == 0) {
+        // Jobs are tied in fairness ratio. Break the tie by submit time and job 
+        // name to get a deterministic ordering, which is useful for unit tests.
+        res = (int) Math.signum(s1.getStartTime() - s2.getStartTime());
+        if (res == 0)
+          res = s1.getName().compareTo(s2.getName());
+      }
+      return res;
+    }
+  }
+
+  /** 
+   * Number of iterations for the binary search in computeFairShares. This is 
+   * equivalent to the number of bits of precision in the output. 25 iterations 
+   * gives precision better than 0.1 slots in clusters with one million slots.
+   */
+  private static final int COMPUTE_FAIR_SHARES_ITERATIONS = 25;
+  
+  /**
+   * Given a set of Schedulables and a number of slots, compute their weighted
+   * fair shares. The min shares and demands of the Schedulables are assumed to
+   * be set beforehand. We compute the fairest possible allocation of shares 
+   * to the Schedulables that respects their min shares and demands.
+   * 
+   * To understand what this method does, we must first define what weighted
+   * fair sharing means in the presence of minimum shares and demands. If there
+   * were no minimum shares and every Schedulable had an infinite demand (i.e.
+   * could launch infinitely many tasks), then weighted fair sharing would be
+   * achieved if the ratio of slotsAssigned / weight was equal for each
+   * Schedulable and all slots were assigned. Minimum shares and demands add
+   * two further twists:
+   * - Some Schedulables may not have enough tasks to fill all their share.
+   * - Some Schedulables may have a min share higher than their assigned share.
+   * 
+   * To deal with these possibilities, we define an assignment of slots as
+   * being fair if there exists a ratio R such that:
+   * - Schedulables S where S.demand < R * S.weight are assigned share S.demand
+   * - Schedulables S where S.minShare > R * S.weight are given share S.minShare
+   * - All other Schedulables S are assigned share R * S.weight
+   * - The sum of all the shares is totalSlots.
+   * 
+   * We call R the weight-to-slots ratio because it converts a Schedulable's
+   * weight to the number of slots it is assigned.
+   * 
+   * We compute a fair allocation by finding a suitable weight-to-slot ratio R.
+   * To do this, we use binary search. Given a ratio R, we compute the number
+   * of slots that would be used in total with this ratio (the sum of the shares
+   * computed using the conditions above). If this number of slots is less than
+   * totalSlots, then R is too small and more slots could be assigned. If the
+   * number of slots is more than totalSlots, then R is too large. 
+   * 
+   * We begin the binary search with a lower bound on R of 0 (which means that
+   * all Schedulables are only given their minShare) and an upper bound computed
+   * to be large enough that too many slots are given (by doubling R until we
+   * either use more than totalSlots slots or we fulfill all jobs' demands).
+   * The helper method slotsUsedWithWeightToSlotRatio computes the total number
+   * of slots used with a given value of R.
+   * 
+   * The running time of this algorithm is linear in the number of Schedulables,
+   * because slotsUsedWithWeightToSlotRatio is linear-time and the number of
+   * iterations of binary search is a constant (dependent on desired precision).
+   */
+  public static void computeFairShares(
+      Collection<? extends Schedulable> schedulables, double totalSlots) {
+    // Find an upper bound on R that we can use in our binary search. We start 
+    // at R = 1 and double it until we have either used totalSlots slots or we
+    // have met all Schedulables' demands (if total demand < totalSlots).
+    double totalDemand = 0;
+    for (Schedulable sched: schedulables) {
+      totalDemand += sched.getDemand();
+    }
+    double cap = Math.min(totalDemand, totalSlots);
+    double rMax = 1.0;
+    while (slotsUsedWithWeightToSlotRatio(rMax, schedulables) < cap) {
+      rMax *= 2.0;
+    }
+    // Perform the binary search for up to COMPUTE_FAIR_SHARES_ITERATIONS steps
+    double left = 0;
+    double right = rMax;
+    for (int i = 0; i < COMPUTE_FAIR_SHARES_ITERATIONS; i++) {
+      double mid = (left + right) / 2.0;
+      if (slotsUsedWithWeightToSlotRatio(mid, schedulables) < cap) {
+        left = mid;
+      } else {
+        right = mid;
+      }
+    }
+    // Set the fair shares based on the value of R we've converged to
+    for (Schedulable sched: schedulables) {
+      sched.setFairShare(computeShare(sched, right));
+    }
+  }
+  
+  /**
+   * Compute the number of slots that would be used given a weight-to-slot
+   * ratio w2sRatio, for use in the computeFairShares algorithm as described
+   * in #{@link SchedulingAlgorithms#computeFairShares(Collection, double)}.
+   */
+  private static double slotsUsedWithWeightToSlotRatio(double w2sRatio,
+      Collection<? extends Schedulable> schedulables) {
+    double slotsTaken = 0;
+    for (Schedulable sched: schedulables) {
+      double share = computeShare(sched, w2sRatio);
+      slotsTaken += share;
+    }
+    return slotsTaken;
+  }
+
+  /**
+   * Compute the number of slots assigned to a Schedulable given a particular
+   * weight-to-slot ratio w2sRatio, for use in computeFairShares as described
+   * in #{@link SchedulingAlgorithms#computeFairShares(Collection, double)}.
+   */
+  private static double computeShare(Schedulable sched, double w2sRatio) {
+    double share = sched.getWeight() * w2sRatio;
+    share = Math.max(share, sched.getMinShare());
+    share = Math.min(share, sched.getDemand());
+    return share;
+  }
+}
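
A minimal usage sketch of computeFairShares(), reusing the FakeSchedulable helper added by this patch's tests. Because Schedulable and SchedulingAlgorithms are package-private, the sketch has to live in org.apache.hadoop.mapred and be compiled against the test classes.

package org.apache.hadoop.mapred;

import java.util.Arrays;
import java.util.List;

public class ComputeFairSharesSketch {
  public static void main(String[] args) {
    // Three equally weighted schedulables with demands 50, 10 and 50 share
    // 40 slots. The middle one is capped by its demand, so the remaining
    // 30 slots split evenly between the other two: shares of roughly
    // 15, 10 and 15.
    List<FakeSchedulable> scheds = Arrays.asList(
        new FakeSchedulable(50),
        new FakeSchedulable(10),
        new FakeSchedulable(50));
    SchedulingAlgorithms.computeFairShares(scheds, 40);
    for (FakeSchedulable sched : scheds) {
      System.out.println(sched.getFairShare());
    }
  }
}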

+ 26 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/SchedulingMode.java

@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+/**
+ * Internal scheduling modes for pools.
+ */
+public enum SchedulingMode {
+  FAIR, FIFO
+}

+ 2 - 1
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/TaskSelector.java

@@ -22,6 +22,7 @@ import java.io.IOException;
 
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.server.jobtracker.TaskTracker;
 
 /**
  * A pluggable object for selecting tasks to run from a {@link JobInProgress} on
@@ -86,7 +87,7 @@ public abstract class TaskSelector implements Configurable {
    * @throws IOException 
    */
   public abstract Task obtainNewMapTask(TaskTrackerStatus taskTracker,
-      JobInProgress job) throws IOException;
+      JobInProgress job, int localityLevel) throws IOException;
 
   /**
    * Choose a reduce task to run from the given job on the given TaskTracker.

+ 1 - 0
src/contrib/fairscheduler/src/java/org/apache/hadoop/mapred/WeightAdjuster.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.mapred;
 
 import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.mapreduce.TaskType;
 
 /**
  * A pluggable object for altering the weights of jobs in the fair scheduler,

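The TaskType import added above suggests that weight adjustment is now applied per task type. As a rough illustration of this extension point (not part of the patch, and assuming the adjustWeight(JobInProgress, TaskType, double) signature that NewJobWeightBooster implements), a custom adjuster could look like this:

package org.apache.hadoop.mapred;

import org.apache.hadoop.mapreduce.TaskType;

// Hypothetical adjuster: double the weight of very-high-priority jobs' map tasks.
public class PriorityBoostWeightAdjuster implements WeightAdjuster {
  public double adjustWeight(JobInProgress job, TaskType taskType,
      double curWeight) {
    if (job.getPriority() == JobPriority.VERY_HIGH && taskType == TaskType.MAP) {
      return 2.0 * curWeight;
    }
    return curWeight;  // leave all other jobs and task types unchanged
  }
}

Such a class would be enabled by setting mapred.fairscheduler.weightadjuster to its fully qualified class name, as described in the fair_scheduler.xml documentation updated later in this diff.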
+ 124 - 0
src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/FakeSchedulable.java

@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.hadoop.mapreduce.TaskType;
+
+/**
+ * Dummy implementation of Schedulable for unit testing.
+ */
+public class FakeSchedulable extends Schedulable {
+  private int demand;
+  private int runningTasks;
+  private int minShare;
+  private double weight;
+  private JobPriority priority;
+  private long startTime;
+  
+  public FakeSchedulable() {
+    this(0, 0, 1, 0, 0, JobPriority.NORMAL, 0);
+  }
+  
+  public FakeSchedulable(int demand) {
+    this(demand, 0, 1, 0, 0, JobPriority.NORMAL, 0);
+  }
+  
+  public FakeSchedulable(int demand, int minShare) {
+    this(demand, minShare, 1, 0, 0, JobPriority.NORMAL, 0);
+  }
+  
+  public FakeSchedulable(int demand, int minShare, double weight) {
+    this(demand, minShare, weight, 0, 0, JobPriority.NORMAL, 0);
+  }
+  
+  public FakeSchedulable(int demand, int minShare, double weight, int fairShare,
+      int runningTasks, JobPriority priority, long startTime) {
+    this.demand = demand;
+    this.minShare = minShare;
+    this.weight = weight;
+    setFairShare(fairShare);
+    this.runningTasks = runningTasks;
+    this.priority = priority;
+    this.startTime = startTime;
+  }
+  
+  @Override
+  public Task assignTask(TaskTrackerStatus tts, long currentTime,
+      Collection<JobInProgress> visited) throws IOException {
+    return null;
+  }
+
+  @Override
+  public int getDemand() {
+    return demand;
+  }
+
+  @Override
+  public String getName() {
+    return "FakeSchedulable" + this.hashCode();
+  }
+
+  @Override
+  public JobPriority getPriority() {
+    return priority;
+  }
+
+  @Override
+  public int getRunningTasks() {
+    return runningTasks;
+  }
+
+  @Override
+  public long getStartTime() {
+    return startTime;
+  }
+  
+  @Override
+  public double getWeight() {
+    return weight;
+  }
+  
+  @Override
+  public int getMinShare() {
+    return minShare;
+  }
+
+  @Override
+  public void redistributeShare() {}
+
+  @Override
+  public void updateDemand() {}
+
+  @Override
+  public TaskType getTaskType() {
+    return TaskType.MAP;
+  }
+
+  @Override
+  protected String getMetricsContextName() {
+    return "fake";
+  }
+
+  @Override
+  void updateMetrics() {
+  }
+}

+ 150 - 0
src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestCapBasedLoadManager.java

@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.TaskStatus.State;
+
+import junit.framework.TestCase;
+
+/**
+ * Exercise the canAssignMap and canAssignReduce methods in 
+ * CapBasedLoadManager.
+ */
+public class TestCapBasedLoadManager extends TestCase {
+  
+  /**
+   * Returns a running MapTaskStatus.
+   */
+  private TaskStatus getRunningMapTaskStatus() {
+    TaskStatus ts = new MapTaskStatus();
+    ts.setRunState(State.RUNNING);
+    return ts;
+  }
+
+  /**
+   * Returns a running ReduceTaskStatus.
+   */
+  private TaskStatus getRunningReduceTaskStatus() {
+    TaskStatus ts = new ReduceTaskStatus();
+    ts.setRunState(State.RUNNING);
+    return ts;
+  }
+  
+  /**
+   * Returns a TaskTrackerStatus with the specified statistics. 
+   * @param mapCap        The capacity of map tasks 
+   * @param reduceCap     The capacity of reduce tasks
+   * @param runningMap    The number of running map tasks
+   * @param runningReduce The number of running reduce tasks
+   */
+  private TaskTrackerStatus getTaskTrackerStatus(int mapCap, int reduceCap, 
+      int runningMap, int runningReduce) {
+    List<TaskStatus> ts = new ArrayList<TaskStatus>();
+    for (int i = 0; i < runningMap; i++) {
+      ts.add(getRunningMapTaskStatus());
+    }
+    for (int i = 0; i < runningReduce; i++) {
+      ts.add(getRunningReduceTaskStatus());
+    }
+    TaskTrackerStatus tracker = new TaskTrackerStatus("tracker", 
+        "tracker_host", 1234, ts, 0, mapCap, reduceCap);
+    return tracker;
+  }
+
+  /**
+   * A single test of canAssignMap.
+   */
+  private void oneTestCanAssignMap(float maxDiff, int mapCap, int runningMap,
+      int totalMapSlots, int totalRunnableMap, boolean expected) {
+    
+    CapBasedLoadManager manager = new CapBasedLoadManager();
+    Configuration conf = new Configuration();
+    conf.setFloat("mapred.fairscheduler.load.max.diff", maxDiff);
+    manager.setConf(conf);
+    
+    TaskTrackerStatus ts = getTaskTrackerStatus(mapCap, 1, runningMap, 1);
+    
+    assertEquals( "When maxDiff=" + maxDiff + ", with totalRunnableMap=" 
+        + totalRunnableMap + " and totalMapSlots=" + totalMapSlots
+        + ", a tracker with runningMap=" + runningMap + " and mapCap="
+        + mapCap + " should " + (expected ? "" : "not ")
+        + "be able to take more Maps.",
+        expected,
+        manager.canAssignMap(ts, totalRunnableMap, totalMapSlots)
+        );
+  }
+  
+  
+  /** 
+   * Test canAssignMap method.
+   */
+  public void testCanAssignMap() {
+    oneTestCanAssignMap(0.0f, 5, 0, 50, 1, true);
+    oneTestCanAssignMap(0.0f, 5, 1, 50, 10, false);
+    oneTestCanAssignMap(0.2f, 5, 1, 50, 10, true);
+    oneTestCanAssignMap(0.0f, 5, 1, 50, 11, true);
+    oneTestCanAssignMap(0.0f, 5, 2, 50, 11, false);
+    oneTestCanAssignMap(0.3f, 5, 2, 50, 6, true);
+    oneTestCanAssignMap(1.0f, 5, 5, 50, 50, false);
+  }
+  
+  
+  /**
+   * A single test of canAssignReduce.
+   */
+  private void oneTestCanAssignReduce(float maxDiff, int ReduceCap,
+      int runningReduce, int totalReduceSlots, int totalRunnableReduce,
+      boolean expected) {
+    
+    CapBasedLoadManager manager = new CapBasedLoadManager();
+    Configuration conf = new Configuration();
+    conf.setFloat("mapred.fairscheduler.load.max.diff", maxDiff);
+    manager.setConf(conf);
+    
+    TaskTrackerStatus ts = getTaskTrackerStatus(1, ReduceCap, 1,
+        runningReduce);
+    
+    assertEquals( "When maxDiff=" + maxDiff + ", with totalRunnableReduce=" 
+        + totalRunnableReduce + " and totalReduceSlots=" + totalReduceSlots
+        + ", a tracker with runningReduce=" + runningReduce
+        + " and ReduceCap=" + ReduceCap + " should "
+        + (expected ? "" : "not ") + "be able to take more Reduces.",
+        expected,
+        manager.canAssignReduce(ts, totalRunnableReduce, totalReduceSlots)
+        );
+  }
+    
+  /** 
+   * Test canAssignReduce method.
+   */
+  public void testCanAssignReduce() {
+    oneTestCanAssignReduce(0.0f, 5, 0, 50, 1, true);
+    oneTestCanAssignReduce(0.0f, 5, 1, 50, 10, false);
+    oneTestCanAssignReduce(0.2f, 5, 1, 50, 10, true);
+    oneTestCanAssignReduce(0.0f, 5, 1, 50, 11, true);
+    oneTestCanAssignReduce(0.0f, 5, 2, 50, 11, false);
+    oneTestCanAssignReduce(0.3f, 5, 2, 50, 6, true);
+    oneTestCanAssignReduce(1.0f, 5, 5, 50, 50, false);
+  }
+  
+}
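The expected answers in the tests above are consistent with CapBasedLoadManager scaling each tracker's slot cap by the fraction of the cluster's slots currently needed, plus the mapred.fairscheduler.load.max.diff slack. A rough sketch of that rule (an approximation inferred from the test cases, not the patched class itself):

public class CapRuleSketch {
  // Approximate per-tracker cap implied by the test expectations above.
  static int approxCap(double maxDiff, int trackerSlots, int totalRunnable,
      int totalSlots) {
    double load = maxDiff + ((double) totalRunnable) / totalSlots;
    return (int) Math.ceil(trackerSlots * Math.min(1.0, load));
  }

  public static void main(String[] args) {
    // maxDiff = 0.2, 5 tracker slots, 10 runnable maps, 50 cluster-wide slots:
    // load = 0.4 and the cap is ceil(5 * 0.4) = 2, so a tracker already running
    // one map may take another -- matching oneTestCanAssignMap(0.2f, 5, 1, 50, 10, true).
    System.out.println(approxCap(0.2, 5, 10, 50));  // prints 2
  }
}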
+ 184 - 0
src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestComputeFairShares.java

@@ -0,0 +1,184 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.mapred;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import junit.framework.TestCase;
+
+/**
+ * Exercise the computeFairShares method in SchedulingAlgorithms.
+ */
+public class TestComputeFairShares extends TestCase {
+  private List<Schedulable> scheds;
+  
+  @Override
+  protected void setUp() throws Exception {
+    scheds = new ArrayList<Schedulable>();
+  }
+  
+  /** 
+   * Basic test - pools with different demands that are all higher than their
+   * fair share (of 10 slots) should each get their fair share.
+   */
+  public void testEqualSharing() {
+    scheds.add(new FakeSchedulable(100));
+    scheds.add(new FakeSchedulable(50));
+    scheds.add(new FakeSchedulable(30));
+    scheds.add(new FakeSchedulable(20));
+    SchedulingAlgorithms.computeFairShares(scheds, 40);
+    verifyShares(10, 10, 10, 10);
+  }
+  
+  /**
+   * In this test, pool 4 has a smaller demand than the 40 / 4 = 10 slots that
+   * it would be assigned with equal sharing. It should only get the 3 slots
+   * it demands. The other pools must then split the remaining 37 slots, but
+   * pool 3, with 11 slots demanded, is now below its share of 37/3 ~= 12.3,
+   * so it only gets 11 slots. Pools 1 and 2 split the rest and get 13 each. 
+   */
+  public void testLowDemands() {
+    scheds.add(new FakeSchedulable(100));
+    scheds.add(new FakeSchedulable(50));
+    scheds.add(new FakeSchedulable(11));
+    scheds.add(new FakeSchedulable(3));
+    SchedulingAlgorithms.computeFairShares(scheds, 40);
+    verifyShares(13, 13, 11, 3);
+  }
+  
+  /**
+   * In this test, some pools have minimum shares set. Pool 1 has a min share
+   * of 20 so it gets 20 slots. Pool 2 also has a min share of 20, but its
+   * demand is only 10 so it can only get 10 slots. The remaining pools have
+   * 10 slots to split between them. Pool 4 gets 3 slots because its demand is
+   * only 3, and pool 3 gets the remaining 7 slots. Pool 4 also had a min share
+   * of 2 slots but this should not affect the outcome.
+   */
+  public void testMinShares() {
+    scheds.add(new FakeSchedulable(100, 20));
+    scheds.add(new FakeSchedulable(10, 20));
+    scheds.add(new FakeSchedulable(10, 0));
+    scheds.add(new FakeSchedulable(3, 2));
+    SchedulingAlgorithms.computeFairShares(scheds, 40);
+    verifyShares(20, 10, 7, 3);
+  }
+  
+  /**
+   * Basic test for weighted shares with no minimum shares and no low demands.
+   * Each pool should get slots in proportion to its weight.
+   */
+  public void testWeightedSharing() {
+    scheds.add(new FakeSchedulable(100, 0, 2.0));
+    scheds.add(new FakeSchedulable(50,  0, 1.0));
+    scheds.add(new FakeSchedulable(30,  0, 1.0));
+    scheds.add(new FakeSchedulable(20,  0, 0.5));
+    SchedulingAlgorithms.computeFairShares(scheds, 45);
+    verifyShares(20, 10, 10, 5);
+  }
+
+  /**
+   * Weighted sharing test where pools 1 and 2 are now given lower demands than
+   * above. Pool 1 stops at 10 slots, leaving 35. If the remaining pools split
+   * this into a 1:1:0.5 ratio, they would get 14:14:7 slots respectively, but
+   * pool 2's demand is only 11, so it only gets 11. The remaining 2 pools split
+   * the 24 slots left into a 1:0.5 ratio, getting 16 and 8 slots respectively.
+   */
+  public void testWeightedSharingWithLowDemands() {
+    scheds.add(new FakeSchedulable(10, 0, 2.0));
+    scheds.add(new FakeSchedulable(11, 0, 1.0));
+    scheds.add(new FakeSchedulable(30, 0, 1.0));
+    scheds.add(new FakeSchedulable(20, 0, 0.5));
+    SchedulingAlgorithms.computeFairShares(scheds, 45);
+    verifyShares(10, 11, 16, 8);
+  }
+
+  /**
+   * Weighted fair sharing test with min shares. As in the min share test above,
+   * pool 1 has a min share greater than its demand so it only gets its demand.
+   * Pool 3 has a min share of 15 even though its weight is very small, so it
+   * gets 15 slots. The remaining pools share the remaining 20 slots equally,
+   * getting 10 each. Pool 3's min share of 5 slots doesn't affect this.
+   */
+  public void testWeightedSharingWithMinShares() {
+    scheds.add(new FakeSchedulable(10, 20, 2.0));
+    scheds.add(new FakeSchedulable(11, 0, 1.0));
+    scheds.add(new FakeSchedulable(30, 5, 1.0));
+    scheds.add(new FakeSchedulable(20, 15, 0.5));
+    SchedulingAlgorithms.computeFairShares(scheds, 45);
+    verifyShares(10, 10, 10, 15);
+  }
+
+  /**
+   * Test that shares are computed accurately even when there are many more
+   * pools than available slots.
+   */
+  public void testSmallShares() {
+    scheds.add(new FakeSchedulable(10));
+    scheds.add(new FakeSchedulable(5));
+    scheds.add(new FakeSchedulable(3));
+    scheds.add(new FakeSchedulable(2));
+    SchedulingAlgorithms.computeFairShares(scheds, 1);
+    verifyShares(0.25, 0.25, 0.25, 0.25);
+  }
+
+  /**
+   * Test that shares are computed accurately even when the number of slots is
+   * very large.
+   */  
+  public void testLargeShares() {
+    int million = 1000 * 1000;
+    scheds.add(new FakeSchedulable(100 * million));
+    scheds.add(new FakeSchedulable(50 * million));
+    scheds.add(new FakeSchedulable(30 * million));
+    scheds.add(new FakeSchedulable(20 * million));
+    SchedulingAlgorithms.computeFairShares(scheds, 40 * million);
+    verifyShares(10 * million, 10 * million, 10 * million, 10 * million);
+  }
+
+  /**
+   * Test that having a pool with 0 demand doesn't confuse the algorithm.
+   */
+  public void testZeroDemand() {
+    scheds.add(new FakeSchedulable(100));
+    scheds.add(new FakeSchedulable(50));
+    scheds.add(new FakeSchedulable(30));
+    scheds.add(new FakeSchedulable(0));
+    SchedulingAlgorithms.computeFairShares(scheds, 30);
+    verifyShares(10, 10, 10, 0);
+  }
+  
+  /**
+   * Test that being called on an empty list doesn't confuse the algorithm.
+   */
+  public void testEmptyList() {
+    SchedulingAlgorithms.computeFairShares(scheds, 40);
+    verifyShares();
+  }
+  
+  /**
+   * Check that a given list of shares have been assigned to this.scheds.
+   */
+  private void verifyShares(double... shares) {
+    assertEquals(scheds.size(), shares.length);
+    for (int i = 0; i < shares.length; i++) {
+      assertEquals(shares[i], scheds.get(i).getFairShare(), 0.01);
+    }
+  }
+}

The diff view for this file has been truncated because it is too large.
+ 719 - 224
src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestFairScheduler.java


+ 199 - 0
src/contrib/fairscheduler/src/test/org/apache/hadoop/mapred/TestFairSchedulerSystem.java

@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred;
+
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.mapreduce.TestSleepJob;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.hadoop.conf.Configuration;
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URL;
+import java.net.HttpURLConnection;
+import java.util.concurrent.Callable;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.TimeUnit;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+import org.junit.BeforeClass;
+import org.junit.AfterClass;
+import static org.junit.Assert.*;
+
+/**
+ * System tests for the fair scheduler. These run slower than the
+ * mock-based tests in TestFairScheduler but have a better chance
+ * of catching synchronization bugs with the real JT.
+ *
+ * This test suite will often be run inside JCarder in order to catch
+ * deadlock bugs which have plagued the scheduler in the past - hence
+ * it is a bit of a "grab-bag" of system tests, since it's important
+ * that they all run as part of the same JVM instantiation.
+ */
+public class TestFairSchedulerSystem {
+  static final int NUM_THREADS=2;
+
+  static MiniMRCluster mr;
+  static JobConf conf;
+
+  @BeforeClass
+  public static void setUp() throws Exception {
+    conf = new JobConf();
+    final int taskTrackers = 1;
+
+    // Bump up the frequency of preemption updates to test against
+    // deadlocks, etc.
+    conf.set("mapred.jobtracker.taskScheduler", FairScheduler.class.getCanonicalName());
+    conf.set("mapred.fairscheduler.update.interval", "1");
+    conf.set("mapred.fairscheduler.preemption.interval", "1");
+    conf.set("mapred.fairscheduler.preemption", "true");
+    conf.set("mapred.fairscheduler.eventlog.enabled", "true");
+    conf.set("mapred.fairscheduler.poolnameproperty", "group.name");
+    conf.set("mapred.job.tracker.persist.jobstatus.active", "false");
+    mr = new MiniMRCluster(taskTrackers, "file:///", 1, null, null, conf);
+  }
+
+  @AfterClass
+  public static void tearDown() throws Exception {
+    if (mr != null) {
+      mr.shutdown();
+    }
+  }
+
+  private void runSleepJob(JobConf conf) throws Exception {
+    String[] args = { "-m", "1", "-r", "1", "-mt", "1", "-rt", "1" };
+    ToolRunner.run(conf, new TestSleepJob(), args);
+  }
+
+  /**
+   * Submit some concurrent sleep jobs, and visit the scheduler servlet
+   * while they're running.
+   */
+  @Test
+  public void testFairSchedulerSystem() throws Exception {
+    ExecutorService exec = Executors.newFixedThreadPool(NUM_THREADS);
+    List<Future<Void>> futures = new ArrayList<Future<Void>>(NUM_THREADS);
+    for (int i = 0; i < NUM_THREADS; i++) {
+      futures.add(exec.submit(new Callable<Void>() {
+            public Void call() throws Exception {
+              JobConf jobConf = mr.createJobConf();
+              runSleepJob(jobConf);
+              return null;
+            }
+          }));
+    }
+
+    JobClient jc = new JobClient(mr.createJobConf(null));
+
+    // Wait for the tasks to finish, and visit the scheduler servlet
+    // every few seconds while waiting.
+    for (Future<Void> future : futures) {
+      while (true) {
+        try {
+          future.get(3, TimeUnit.SECONDS);
+          break;
+        } catch (TimeoutException te) {
+          // It's OK
+        }
+        checkServlet(true);
+        checkServlet(false);
+
+        JobStatus jobs[] = jc.getAllJobs();
+        if (jobs == null) {
+          System.err.println("No jobs running, not checking tasklog servlet");
+          continue;
+        }
+        for (JobStatus j : jobs) {
+          System.err.println("Checking task graph for " + j.getJobID());
+          try {
+            checkTaskGraphServlet(j.getJobID());
+          } catch (AssertionError err) {
+            // The task graph servlet will be empty if the job has retired.
+            // This is OK.
+            RunningJob rj = jc.getJob(j.getJobID());
+            if (!rj.isComplete()) {
+              throw err;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Check the fair scheduler servlet for good status code and smoke test
+   * for contents.
+   */
+  private void checkServlet(boolean advanced) throws Exception {
+    String jtURL = "http://localhost:" +
+      mr.getJobTrackerRunner().getJobTrackerInfoPort();
+    URL url = new URL(jtURL + "/scheduler" +
+                      (advanced ? "?advanced" : ""));
+    HttpURLConnection connection = (HttpURLConnection)url.openConnection();
+    connection.setRequestMethod("GET");
+    connection.connect();
+    assertEquals(200, connection.getResponseCode());
+
+    // Just to be sure, slurp the content and make sure it looks like the scheduler
+    BufferedReader reader = new BufferedReader(
+      new InputStreamReader(connection.getInputStream()));
+    StringBuilder sb = new StringBuilder();
+
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      sb.append(line).append('\n');
+    }
+
+    String contents = sb.toString();
+    assertTrue("Bad contents for fair scheduler servlet: " + contents,
+      contents.contains("Fair Scheduler Administration"));
+  }
+
+  private void checkTaskGraphServlet(JobID job) throws Exception {
+    String jtURL = "http://localhost:" +
+      mr.getJobTrackerRunner().getJobTrackerInfoPort();
+    URL url = new URL(jtURL + "/taskgraph?jobid=" + job.toString() + "&type=map");
+    HttpURLConnection connection = (HttpURLConnection)url.openConnection();
+    connection.setRequestMethod("GET");
+    connection.connect();
+    assertEquals(200, connection.getResponseCode());
+
+    // Just to be sure, slurp the content and make sure it looks like the task graph
+    String contents = slurpContents(connection);
+    assertTrue("Bad contents for job " + job + ":\n" + contents,
+      contents.contains("</svg>"));
+  }
+
+  private String slurpContents(HttpURLConnection connection) throws Exception {
+    BufferedReader reader = new BufferedReader(
+      new InputStreamReader(connection.getInputStream()));
+    StringBuilder sb = new StringBuilder();
+
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      sb.append(line).append('\n');
+    }
+
+    return sb.toString();
+  }
+}

+ 434 - 211
src/docs/src/documentation/content/xdocs/fair_scheduler.xml

@@ -18,16 +18,15 @@
 <!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
 <document>
   <header>
-    <title>Fair Scheduler Guide</title>
+    <title>Fair Scheduler</title>
   </header>
   <body>
 
     <section>
       <title>Purpose</title>
 
-      <p>This document describes the Fair Scheduler, a pluggable
-        MapReduce scheduler for Hadoop which provides a way to share
-        large clusters.</p>
+      <p>This document describes the Fair Scheduler, a pluggable MapReduce
+        scheduler that provides a way to share large clusters.</p>
     </section>
 
     <section>
@@ -39,52 +38,62 @@
         free up are assigned to the new jobs, so that each job gets
         roughly the same amount of CPU time. Unlike the default Hadoop
         scheduler, which forms a queue of jobs, this lets short jobs finish
-        in reasonable time while not starving long jobs. It is also a 
-        reasonable way to share a cluster between a number of users. Finally, 
-        fair sharing can also work with job priorities - the priorities are
+        in reasonable time while not starving long jobs. It is also an easy
+        way to share a cluster between multiple users.
+        Fair sharing can also work with job priorities - the priorities are
         used as weights to determine the fraction of total compute time that
-        each job should get.
+        each job gets.
       </p>
       <p>
-        The scheduler actually organizes jobs further into "pools", and 
-        shares resources fairly between these pools. By default, there is a 
-        separate pool for each user, so that each user gets the same share 
-        of the cluster no matter how many jobs they submit. However, it is 
-        also possible to set a job's pool based on the user's Unix group or
-        any other jobconf property, such as the queue name property used by 
-        <a href="capacity_scheduler.html">Capacity Scheduler</a>. 
-        Within each pool, fair sharing is used to share capacity between 
-        the running jobs. Pools can also be given weights to share the 
-        cluster non-proportionally in the config file.
+        The fair scheduler organizes jobs into <em>pools</em>, and 
+        divides resources fairly between these pools. By default, there is a 
+        separate pool for each user, so that each user gets an equal share 
+        of the cluster. It is also possible to set a job's pool based on the
+        user's Unix group or any jobconf property. 
+        Within each pool, jobs can be scheduled using either fair sharing or 
+        first-in-first-out (FIFO) scheduling.
       </p>
       <p>
         In addition to providing fair sharing, the Fair Scheduler allows
-        assigning guaranteed minimum shares to pools, which is useful for
-        ensuring that certain users, groups or production applications
+        assigning guaranteed <em>minimum shares</em> to pools, which is useful
+        for ensuring that certain users, groups or production applications
         always get sufficient resources. When a pool contains jobs, it gets
         at least its minimum share, but when the pool does not need its full
-        guaranteed share, the excess is split between other running jobs.
-        This lets the scheduler guarantee capacity for pools while utilizing
-        resources efficiently when these pools don't contain jobs.       
+        guaranteed share, the excess is split between other pools.
       </p>
       <p>
-        The Fair Scheduler lets all jobs run by default, but it is also
-        possible to limit the number of running jobs per user and per pool
-        through the config file. This can be useful when a user must submit
-        hundreds of jobs at once, or in general to improve performance if
-        running too many jobs at once would cause too much intermediate data
-        to be created or too much context-switching. Limiting the jobs does
-        not cause any subsequently submitted jobs to fail, only to wait in the
-        sheduler's queue until some of the user's earlier jobs finish. Jobs to
-        run from each user/pool are chosen in order of priority and then
-        submit time, as in the default FIFO scheduler in Hadoop.
+        If a pool's minimum share is not met for some period of time, the
+        scheduler optionally supports <em>preemption</em> of jobs in other
+        pools. The pool will be allowed to kill tasks from other pools to make
+        room to run. Preemption can be used to guarantee
+        that "production" jobs are not starved while also allowing
+        the Hadoop cluster to be used for experimental and research jobs.
+        In addition, a pool can also be allowed to preempt tasks if it is
+        below half of its fair share for a configurable timeout (generally
+        set larger than the minimum share preemption timeout).
+        When choosing tasks to kill, the fair scheduler picks the
+        most-recently-launched tasks from over-allocated jobs, 
+        to minimize wasted computation.
+        Preemption does not cause the preempted jobs to fail, because Hadoop
+        jobs tolerate losing tasks; it only makes them take longer to finish.
       </p>
       <p>
-        Finally, the fair scheduler provides several extension points where
-        the basic functionality can be extended. For example, the weight
-        calculation can be modified to give a priority boost to new jobs,
-        implementing a "shortest job first" policy which reduces response
-        times for interactive jobs even further.
+        The Fair Scheduler can limit the number of concurrent
+        running jobs per user and per pool. This can be useful when a 
+        user must submit hundreds of jobs at once, or for ensuring that
+        intermediate data does not fill up disk space on a cluster when too many
+        concurrent jobs are running.
+        Setting job limits causes jobs submitted beyond the limit to wait
+        until some of the user/pool's earlier jobs finish.
+        Jobs to run from each user/pool are chosen in order of priority and then
+        submit time.
+      </p>
+      <p>
+        Finally, the Fair Scheduler can limit the number of concurrent
+        running tasks per pool. This can be useful when jobs have a
+        dependency on an external service like a database or web
+        service that could be overloaded if too many map or reduce
+        tasks are run at once.
       </p>
     </section>
 
@@ -93,184 +102,367 @@
       <p>
         To run the fair scheduler in your Hadoop installation, you need to put
         it on the CLASSPATH. The easiest way is to copy the 
-        <em>hadoop-fairscheduler-*.jar</em> from
-        <em>HADOOP_HOME/contrib/fairscheduler</em> to <em>HADOOP_HOME/lib</em>.
+        <em>hadoop-*-fairscheduler.jar</em> from
+        <em>HADOOP_HOME/build/contrib/fairscheduler</em> to <em>HADOOP_HOME/lib</em>.
         Alternatively you can modify <em>HADOOP_CLASSPATH</em> to include this jar, in
         <em>HADOOP_CONF_DIR/hadoop-env.sh</em>
       </p>
-      <p>
-        In order to compile fair scheduler, from sources execute <em> ant 
-        package</em> in source folder and copy the 
-        <em>build/contrib/fair-scheduler/hadoop-fairscheduler-*.jar</em> 
-        to <em>HADOOP_HOME/lib</em>
-      </p>
       <p>
        You will also need to set the following property in the Hadoop config 
        file  <em>HADOOP_CONF_DIR/mapred-site.xml</em> to have Hadoop use 
-       the fair scheduler: <br/>
-       <code>&lt;property&gt;</code><br/> 
-       <code>&nbsp;&nbsp;&lt;name&gt;mapred.jobtracker.taskScheduler&lt;/name&gt;</code><br/>
-       <code>&nbsp;&nbsp;&lt;value&gt;org.apache.hadoop.mapred.FairScheduler&lt;/value&gt;</code><br/>
-       <code>&lt;/property&gt;</code>
+       the fair scheduler:
       </p>
+<source>
+&lt;property&gt;
+  &lt;name&gt;mapred.jobtracker.taskScheduler&lt;/name&gt;
+  &lt;value&gt;org.apache.hadoop.mapred.FairScheduler&lt;/value&gt;
+&lt;/property&gt;
+</source>
       <p>
         Once you restart the cluster, you can check that the fair scheduler 
-        is running by going to http://&lt;jobtracker URL&gt;/scheduler 
+        is running by going to <em>http://&lt;jobtracker URL&gt;/scheduler</em> 
         on the JobTracker's web UI. A &quot;job scheduler administration&quot; page should 
         be visible there. This page is described in the Administration section.
       </p>
+      <p>
+        If you wish to compile the fair scheduler from source, run <em> ant 
+        package</em> in your HADOOP_HOME directory. This will build
+        <em>build/contrib/fair-scheduler/hadoop-*-fairscheduler.jar</em>.
+      </p>
     </section>
     
     <section>
-      <title>Configuring the Fair scheduler</title>
+      <title>Configuration</title>
       <p>
-      The following properties can be set in mapred-site.xml to configure 
-      the fair scheduler:
+        The Fair Scheduler contains configuration in two places -- algorithm
+        parameters are set in <em>HADOOP_CONF_DIR/mapred-site.xml</em>, while 
+        a separate XML file called the <em>allocation file</em>, 
+        located by default in
+        <em>HADOOP_CONF_DIR/fair-scheduler.xml</em>, is used to configure
+        pools, minimum shares, running job limits and preemption timeouts.
+        The allocation file is reloaded periodically at runtime, 
+        allowing you to change pool settings without restarting 
+        your Hadoop cluster.
       </p>
-      <table>
-        <tr>
-        <th>Name</th><th>Description</th>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.allocation.file
-        </td>
-        <td>
-          Specifies an absolute path to an XML file which contains the 
-          allocations for each pool, as well as the per-pool and per-user 
-          limits on number of running jobs. If this property is not 
-          provided, allocations are not used.<br/>
-          This file must be in XML format, and can contain three types of 
-          elements:
+      <p>
+        For a minimal installation, to just get equal sharing between users,
+        you will not need to edit the allocation file.
+      </p>
+      <section>
+      <title>Scheduler Parameters in mapred-site.xml</title>
+        <p>
+          The following parameters can be set in <em>mapred-site.xml</em>
+          to affect the behavior of the fair scheduler:
+        </p>
+        <p><strong>Basic Parameters</strong></p>
+        <table>
+          <tr>
+          <th>Name</th><th>Description</th>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.preemption
+          </td>
+          <td>
+            Boolean property for enabling preemption. Default: false.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.pool
+          </td>
+          <td>
+            Specify the pool that a job belongs in.  
+            If this is specified then mapred.fairscheduler.poolnameproperty is ignored.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.poolnameproperty
+          </td>
+          <td>
+            Specify which jobconf property is used to determine the pool that a
+            job belongs in. String, default: <em>user.name</em>
+            (i.e. one pool for each user). 
+            Another useful value is <em>mapred.job.queue.name</em> to use MapReduce's "queue"
+            system for access control lists (see below).
+            mapred.fairscheduler.poolnameproperty is used only for jobs in which 
+            mapred.fairscheduler.pool is not explicitly set.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.allocation.file
+          </td>
+          <td>
+            Can be used to have the scheduler use a different allocation file
+            than the default one (<em>HADOOP_CONF_DIR/fair-scheduler.xml</em>).
+            Must be an absolute path to the allocation file.
+          </td>
+          </tr>
+        </table>
+        <p> <br></br></p>
+        <p><strong>Advanced Parameters</strong> </p>
+        <table>
+          <tr>
+          <th>Name</th><th>Description</th>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.sizebasedweight
+          </td>
+          <td>
+            Take into account job sizes in calculating their weights for fair 
+            sharing. By default, weights are only based on job priorities. 
+            Setting this flag to true will make them based on the size of the 
+            job (number of tasks needed) as well, though not linearly 
+            (the weight will be proportional to the log of the number of tasks 
+            needed). This lets larger jobs get larger fair shares while still 
+            providing enough of a share to small jobs to let them finish fast. 
+            Boolean value, default: false.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.preemption.only.log
+          </td>
+          <td>
+            This flag will cause the scheduler to run through the preemption
+            calculations but simply log when it wishes to preempt a task,
+            without actually preempting the task. 
+            Boolean property, default: false.
+            This property can be useful for
+            doing a "dry run" of preemption before enabling it to make sure
+            that you have not set timeouts too aggressively.
+            You will see preemption log messages in your JobTracker's output
+            log (<em>HADOOP_LOG_DIR/hadoop-jobtracker-*.log</em>).
+            The messages look as follows:<br/>
+            <code>Should preempt 2 tasks for job_20090101337_0001: tasksDueToMinShare = 2, tasksDueToFairShare = 0</code>
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.update.interval
+          </td>
+          <td>
+            Interval at which to update fair share calculations. The default
+            of 500ms works well for clusters with fewer than 500 nodes, 
+            but larger values reduce load on the JobTracker for larger clusters.
+            Integer value in milliseconds, default: 500.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.preemption.interval
+          </td>
+          <td>
+            Interval at which to check for tasks to preempt. The default
+            of 15s works well for timeouts on the order of minutes.
+            It is not recommended to set timeouts much smaller than this
+            amount, but you can use this value to make preemption computations
+            run more often if you do set such timeouts. A value of less than
+            5s will probably be too small, however, as it becomes less than
+            the inter-heartbeat interval.
+            Integer value in milliseconds, default: 15000.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.weightadjuster
+          </td>
+          <td>
+          An extension point that lets you specify a class to adjust the 
+          weights of running jobs. This class should implement the 
+          <em>WeightAdjuster</em> interface. There is currently one example 
+          implementation - <em>NewJobWeightBooster</em>, which increases the 
+          weight of jobs for the first 5 minutes of their lifetime to let 
+          short jobs finish faster. To use it, set the weightadjuster 
+          property to the full class name, 
+          <code>org.apache.hadoop.mapred.NewJobWeightBooster</code>.
+          NewJobWeightBooster itself provides two parameters for setting the 
+          duration and boost factor.
           <ul>
-          <li>pool elements, which may contain elements for minMaps, 
-          minReduces, maxRunningJobs (limit the number of jobs from the 
-          pool to run at once),and weight (to share the cluster 
-          non-proportionally with other pools).
-          </li>
-          <li>user elements, which may contain a maxRunningJobs to limit 
-          jobs. Note that by default, there is a separate pool for each 
-          user, so these may not be necessary; they are useful, however, 
-          if you create a pool per user group or manually assign jobs 
-          to pools.</li>
-          <li>A userMaxJobsDefault element, which sets the default running 
-          job limit for any users whose limit is not specified.</li>
+          <li><em>mapred.newjobweightbooster.factor</em>
+            Factor by which new jobs weight should be boosted. 
+            Default is 3.</li>
+          <li><em>mapred.newjobweightbooster.duration</em>
+            Boost duration in milliseconds. Default is 300000 for 5 minutes.</li>
           </ul>
-          <br/>
-          Example Allocation file is listed below :<br/>
-          <code>&lt;?xml version="1.0"?&gt; </code> <br/>
-          <code>&lt;allocations&gt;</code> <br/> 
-          <code>&nbsp;&nbsp;&lt;pool name="sample_pool"&gt;</code><br/>
-          <code>&nbsp;&nbsp;&nbsp;&nbsp;&lt;minMaps&gt;5&lt;/minMaps&gt;</code><br/>
-          <code>&nbsp;&nbsp;&nbsp;&nbsp;&lt;minReduces&gt;5&lt;/minReduces&gt;</code><br/>
-          <code>&nbsp;&nbsp;&nbsp;&nbsp;&lt;weight&gt;2.0&lt;/weight&gt;</code><br/>
-          <code>&nbsp;&nbsp;&lt;/pool&gt;</code><br/>
-          <code>&nbsp;&nbsp;&lt;user name="sample_user"&gt;</code><br/>
-          <code>&nbsp;&nbsp;&nbsp;&nbsp;&lt;maxRunningJobs&gt;6&lt;/maxRunningJobs&gt;</code><br/>
-          <code>&nbsp;&nbsp;&lt;/user&gt;</code><br/>
-          <code>&nbsp;&nbsp;&lt;userMaxJobsDefault&gt;3&lt;/userMaxJobsDefault&gt;</code><br/>
-          <code>&lt;/allocations&gt;</code>
-          <br/>
-          This example creates a pool sample_pool with a guarantee of 5 map 
-          slots and 5 reduce slots. The pool also has a weight of 2.0, meaning 
-          it has a 2x higher share of the cluster than other pools (the default 
-          weight is 1). Finally, the example limits the number of running jobs 
-          per user to 3, except for sample_user, who can run 6 jobs concurrently. 
-          Any pool not defined in the allocations file will have no guaranteed 
-          capacity and a weight of 1.0. Also, any pool or user with no max 
-          running jobs set in the file will be allowed to run an unlimited 
-          number of jobs.
-        </td>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.assignmultiple
-        </td>
-        <td>
-          Allows the scheduler to assign both a map task and a reduce task 
-          on each heartbeat, which improves cluster throughput when there 
-          are many small tasks to run. Boolean value, default: false.
-        </td>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.sizebasedweight
-        </td>
-        <td>
-          Take into account job sizes in calculating their weights for fair 
-          sharing.By default, weights are only based on job priorities. 
-          Setting this flag to true will make them based on the size of the 
-          job (number of tasks needed) as well,though not linearly 
-          (the weight will be proportional to the log of the number of tasks 
-          needed). This lets larger jobs get larger fair shares while still 
-          providing enough of a share to small jobs to let them finish fast. 
-          Boolean value, default: false.
-        </td>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.poolnameproperty
-        </td>
-        <td>
-          Specify which jobconf property is used to determine the pool that a
-          job belongs in. String, default: user.name (i.e. one pool for each 
-          user). Some other useful values to set this to are: <br/>
-          <ul> 
-            <li> group.name (to create a pool per Unix group).</li>
-            <li>mapred.job.queue.name (the same property as the queue name in 
-            <a href="capacity_scheduler.html">Capacity Scheduler</a>).</li>
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.loadmanager
+          </td>
+          <td>
+            An extension point that lets you specify a class that determines 
+            how many maps and reduces can run on a given TaskTracker. This class 
+            should implement the LoadManager interface. By default the task caps 
+            in the Hadoop config file are used, but this option could be used to 
+            make the load based on available memory and CPU utilization for example.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.taskselector
+          </td>
+          <td>
+          An extension point that lets you specify a class that determines 
+          which task from within a job to launch on a given tracker. This can be 
+          used to change either the locality policy (e.g. keep some jobs within 
+          a particular rack) or the speculative execution algorithm (select 
+          when to launch speculative tasks). The default implementation uses 
+          Hadoop's default algorithms from JobInProgress.
+          </td>
+          </tr>
+          <!--
+          <tr>
+          <td>
+            mapred.fairscheduler.eventlog.enabled
+          </td>
+          <td>
+            Enable a detailed log of fair scheduler events, useful for
+            debugging.
+            This log is stored in <em>HADOOP_LOG_DIR/fairscheduler</em>.
+            Boolean value, default: false.
+          </td>
+          </tr>
+          <tr>
+          <td>
+            mapred.fairscheduler.dump.interval
+          </td>
+          <td>
+            If using the event log, this is the interval at which to dump
+            complete scheduler state (list of pools and jobs) to the log.
+            Integer value in milliseconds, default: 10000.
+          </td>
+          </tr>
+          -->
+        </table>
+      </section>  
+      <section>
+        <title>Allocation File (fair-scheduler.xml)</title>
+        <p>
+        The allocation file configures minimum shares, running job
+        limits, weights and preemption timeouts for each pool.
+        Only users/pools whose values differ from the defaults need to be
+        explicitly configured in this file.
+        The allocation file is located in
+        <em>HADOOP_HOME/conf/fair-scheduler.xml</em>.
+        It can contain the following types of elements:
+        </p>
+        <ul>
+        <li><em>pool</em> elements, which configure each pool.
+        These may contain the following sub-elements:
+          <ul>
+          <li><em>minMaps</em> and <em>minReduces</em>,
+            to set the pool's minimum share of task slots.</li>
+          <li><em>maxMaps</em> and <em>maxReduces</em>, to set the
+            pool's maximum concurrent task slots.</li>
+          <li><em>schedulingMode</em>, the pool's internal scheduling mode,
+          which can be <em>fair</em> for fair sharing or <em>fifo</em> for
+          first-in-first-out.</li>
+          <li><em>maxRunningJobs</em>, 
+          to limit the number of jobs from the 
+          pool to run at once (defaults to infinite).</li>
+          <li><em>weight</em>, to share the cluster 
+          non-proportionally with other pools. For example, a pool with weight 2.0 will get a 2x higher share than other pools. The default weight is 1.0.</li>
+          <li><em>minSharePreemptionTimeout</em>, the
+            number of seconds the pool will wait before
+            killing other pools' tasks if it is below its minimum share
+            (defaults to infinite).</li>
           </ul>
-        </td>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.weightadjuster
-        </td>
-        <td>
-        An extensibility point that lets you specify a class to adjust the 
-        weights of running jobs. This class should implement the 
-        <em>WeightAdjuster</em> interface. There is currently one example 
-        implementation - <em>NewJobWeightBooster</em>, which increases the 
-        weight of jobs for the first 5 minutes of their lifetime to let 
-        short jobs finish faster. To use it, set the weightadjuster 
-        property to the full class name, 
-        <code>org.apache.hadoop.mapred.NewJobWeightBooster</code> 
-        NewJobWeightBooster itself provides two parameters for setting the 
-        duration and boost factor. <br/>
-        <ol>
-        <li> <em>mapred.newjobweightbooster.factor</em>
-          Factor by which new jobs weight should be boosted. Default is 3</li>
-        <li><em>mapred.newjobweightbooster.duration</em>
-          Duration in milliseconds, default 300000 for 5 minutes</li>
-        </ol>
-        </td>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.loadmanager
-        </td>
-        <td>
-          An extensibility point that lets you specify a class that determines 
-          how many maps and reduces can run on a given TaskTracker. This class 
-          should implement the LoadManager interface. By default the task caps 
-          in the Hadoop config file are used, but this option could be used to 
-          make the load based on available memory and CPU utilization for example.
-        </td>
-        </tr>
-        <tr>
-        <td>
-          mapred.fairscheduler.taskselector:
-        </td>
-        <td>
-        An extensibility point that lets you specify a class that determines 
-        which task from within a job to launch on a given tracker. This can be 
-        used to change either the locality policy (e.g. keep some jobs within 
-        a particular rack) or the speculative execution algorithm (select 
-        when to launch speculative tasks). The default implementation uses 
-        Hadoop's default algorithms from JobInProgress.
-        </td>
-        </tr>
-      </table>      
+        </li>
+        <li><em>user</em> elements, which may contain a 
+        <em>maxRunningJobs</em> element to limit 
+        jobs. Note that by default, there is a pool for each 
+        user, so per-user limits are not necessary.</li>
+        <li><em>poolMaxJobsDefault</em>, which sets the default running 
+        job limit for any pools whose limit is not specified.</li>
+        <li><em>userMaxJobsDefault</em>, which sets the default running 
+        job limit for any users whose limit is not specified.</li>
+        <li><em>defaultMinSharePreemptionTimeout</em>, 
+        which sets the default minimum share preemption timeout 
+        for any pools where it is not specified.</li>
+        <li><em>fairSharePreemptionTimeout</em>, 
+        which sets the preemption timeout used when jobs are below half
+        their fair share.</li>
+        <li><em>defaultPoolSchedulingMode</em>, which sets the default scheduling 
+        mode (<em>fair</em> or <em>fifo</em>) for pools whose mode is
+        not specified.</li>
+        </ul>
+        <p>
+        Pool and user elements are only required if you are setting
+        non-default values for the pool/user. That is, you do not need to
+        declare all users and all pools in your config file before running
+        the fair scheduler. If a user or pool is not listed in the config file,
+        the default values for limits, preemption timeouts, etc. will be used.
+        </p>
+        <p>
+        An example allocation file is given below:</p>
+<source>
+&lt;?xml version="1.0"?&gt;  
+&lt;allocations&gt;  
+  &lt;pool name="sample_pool"&gt;
+    &lt;minMaps&gt;5&lt;/minMaps&gt;
+    &lt;minReduces&gt;5&lt;/minReduces&gt;
+    &lt;maxMaps&gt;25&lt;/maxMaps&gt;
+    &lt;maxReduces&gt;25&lt;/maxReduces&gt;
+    &lt;minSharePreemptionTimeout&gt;300&lt;/minSharePreemptionTimeout&gt;
+  &lt;/pool&gt;
+  &lt;user name="sample_user"&gt;
+    &lt;maxRunningJobs&gt;6&lt;/maxRunningJobs&gt;
+  &lt;/user&gt;
+  &lt;userMaxJobsDefault&gt;3&lt;/userMaxJobsDefault&gt;
+  &lt;fairSharePreemptionTimeout&gt;600&lt;/fairSharePreemptionTimeout&gt;
+&lt;/allocations&gt;
+</source>
+        <p>
+        This example creates a pool sample_pool with a guarantee of 5 map 
+        slots and 5 reduce slots. The pool also has a minimum share preemption
+        timeout of 300 seconds (5 minutes), meaning that if it does not get its
+        guaranteed share within this time, it is allowed to kill tasks from
+        other pools to achieve its share. The pool has a cap of 25 map and 25
+        reduce slots, which means that once 25 tasks are running, no more will
+        be scheduled even if the pool's fair share is higher.
+        The example also limits the number of running jobs 
+        per user to 3, except for sample_user, who can run 6 jobs concurrently. 
+        Finally, the example sets a fair share preemption timeout of 600 seconds
+        (10 minutes). If a job is below half its fair share for 10 minutes, it
+        will be allowed to kill tasks from other jobs to achieve its share.
+        Note that the preemption settings require preemption to be
+        enabled in <em>mapred-site.xml</em> as described earlier.
+        </p>
+        <p>
+        Any pool not defined in the allocation file will have no guaranteed 
+        capacity and no preemption timeout. Also, any pool or user with no max 
+        running jobs set in the file will be allowed to run an unlimited 
+        number of jobs.
+        </p>
+      </section>
+      <section>
+        <title>Access Control Lists (ACLs)</title>
+        <p>
+          The fair scheduler can be used in tandem with the "queue" based access
+          control system in MapReduce to restrict which pools each user can access.
+          To do this, first enable ACLs and set up some queues as described in the
+          <a href="mapred_tutorial.html#Job+Authorization">MapReduce usage guide</a>,
+          then set the fair scheduler to use one pool per queue by adding
+          the following property in <em>HADOOP_CONF_DIR/mapred-site.xml</em>:
+        </p>
+<source>
+&lt;property&gt;
+  &lt;name&gt;mapred.fairscheduler.poolnameproperty&lt;/name&gt;
+  &lt;value&gt;mapred.job.queue.name&lt;/value&gt;
+&lt;/property&gt;
+</source>
+        <p>
+          You can then set the minimum share, weight, and internal scheduling mode
+          for each pool as described earlier.
+          In addition, make sure that users submit jobs to the right queue by setting
+          the <em>mapred.job.queue.name</em> property in their jobs.
+        </p>
+      </section>
     </section>
     <section>
     <title> Administration</title>
@@ -280,14 +472,15 @@
     </p> 
     <ol>
     <li>
-      It is possible to modify pools' allocations 
-      and user and pool running job limits at runtime by editing the allocation 
-      config file. The scheduler will reload this file 10-15 seconds after it 
+      It is possible to modify minimum shares, limits, weights, preemption
+      timeouts and pool scheduling modes at runtime by editing the allocation
+      file. The scheduler will reload this file 10-15 seconds after it 
       sees that it was modified.
      </li>
      <li>
      Current jobs, pools, and fair shares  can be examined through the 
-     JobTracker's web interface, at  http://&lt;jobtracker URL&gt;/scheduler. 
+     JobTracker's web interface, at
+     <em>http://&lt;JobTracker URL&gt;/scheduler</em>. 
      On this interface, it is also possible to modify jobs' priorities or 
      move jobs from one pool to another and see the effects on the fair 
      shares (this requires JavaScript).
@@ -312,24 +505,36 @@
      the job has had, but on average it will get its fair share amount.</li>
      </ul>
      <p>
-     In addition, it is possible to turn on an "advanced" view for the web UI,
-     by going to http://&lt;jobtracker URL&gt;/scheduler?advanced. This view shows 
-     four more columns used for calculations internally:
+     In addition, it is possible to view an "advanced" version of the web 
+     UI by going to <em>http://&lt;JobTracker URL&gt;/scheduler?advanced</em>. 
+     This view shows two more columns:
      </p>
      <ul>
      <li><em>Maps/Reduce Weight</em>: Weight of the job in the fair sharing 
      calculations. This depends on priority and potentially also on 
      job size and job age if the <em>sizebasedweight</em> and 
      <em>NewJobWeightBooster</em> are enabled.</li>
-     <li><em>Map/Reduce Deficit</em>: The job's scheduling deficit in machine-
-     seconds - the amount of resources it should have gotten according to 
-     its fair share, minus how many it actually got. Positive deficit means
-      the job will be scheduled again in the near future because it needs to 
-      catch up to its fair share. The scheduler schedules jobs with higher 
-      deficit ahead of others. Please see the Implementation section of 
-      this document for details.</li>
      </ul>
     </section>
+    <section>
+      <title>Metrics</title>
+      <p>
+        The fair scheduler can export metrics using the Hadoop metrics interface.
+        This is enabled by adding an entry to <code>hadoop-metrics.properties</code>
+        that configures the <code>fairscheduler</code> metrics context. For example,
+        to simply retain the metrics in memory so that they can be viewed in the
+        <code>/metrics</code> servlet:
+      </p>
+      <p>
+        <code>fairscheduler.class=org.apache.hadoop.metrics.spi.NoEmitMetricsContext</code>
+      </p>
+      <p>
+        Metrics are generated for each pool and job, and contain the same information that
+        is visible on the <code>/scheduler</code> web page.
+      </p>
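+      <p>
+        Alternatively (a sketch assuming the standard file-based metrics context
+        shipped with Hadoop), the same metrics can be written to a log file by
+        configuring <code>hadoop-metrics.properties</code> along these lines:
+      </p>
+<source>
+# Sketch: dump fair scheduler metrics to a file every 10 seconds
+fairscheduler.class=org.apache.hadoop.metrics.file.FileContext
+fairscheduler.period=10
+fairscheduler.fileName=/tmp/fairscheduler-metrics.log
+</source>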
+    </section>
+
+    <!--
     <section>
     <title>Implementation</title>
     <p>There are two aspects to implementing fair scheduling: Calculating 
@@ -359,13 +564,31 @@
      This capacity is divided among the jobs in that pool according again to 
      their weights.
      </p>
-     <p>Finally, when limits on a user's running jobs or a pool's running jobs 
+     <p>When limits on a user's running jobs or a pool's running jobs 
      are in place, we choose which jobs get to run by sorting all jobs in order 
      of priority and then submit time, as in the standard Hadoop scheduler. Any 
      jobs that fall after the user/pool's limit in this ordering are queued up 
      and wait idle until they can be run. During this time, they are ignored 
      from the fair sharing calculations and do not gain or lose deficit (their 
      fair share is set to zero).</p>
+     <p>
+     Preemption is implemented by periodically checking whether jobs are
+     below their minimum share or below half their fair share. If a job has
+     been below its share for sufficiently long, it is allowed to kill
+     other jobs' tasks. The tasks chosen are the most-recently-launched
+     tasks from over-allocated jobs, to minimize the amount of wasted
+     computation.
+     </p>
+     <p>
+     Finally, the fair scheduler provides several extension points where
+     the basic functionality can be extended. For example, the weight
+     calculation can be modified to give a priority boost to new jobs,
+     implementing a "shortest job first" policy which reduces response
+     times for interactive jobs even further.
+     These extension points are listed in
+     <a href="#Scheduler+Parameters+in+mapred-site.xml">Advanced Parameters</a>.
+     </p>
     </section>
+    -->
   </body>  
 </document>

+ 54 - 2
src/mapred/org/apache/hadoop/mapred/JobInProgress.java

@@ -1367,8 +1367,8 @@ public class JobInProgress {
     }
   }
   
-  public synchronized Task obtainNewLocalMapTask(TaskTrackerStatus tts,
-                                                     int clusterSize, 
+  public synchronized Task obtainNewNodeLocalMapTask(TaskTrackerStatus tts,
+                                                     int clusterSize,
                                                      int numUniqueHosts)
   throws IOException {
     if (!tasksInited) {
@@ -1378,6 +1378,31 @@ public class JobInProgress {
       return null;
     }
 
+    int target = findNewMapTask(tts, clusterSize, numUniqueHosts, 1, 
+                                status.mapProgress());
+    if (target == -1) {
+      return null;
+    }
+
+    Task result = maps[target].getTaskToRun(tts.getTrackerName());
+    if (result != null) {
+      addRunningTaskToTIP(maps[target], result.getTaskID(), tts, true);
+      resetSchedulingOpportunities();
+    }
+
+    return result;
+  }
+  
+  public synchronized Task obtainNewNodeOrRackLocalMapTask(
+      TaskTrackerStatus tts, int clusterSize, int numUniqueHosts)
+  throws IOException {
+    if (!tasksInited) {
+      LOG.info("Cannot create task split for " + profile.getJobID());
+      try { throw new IOException("state = " + status.getRunState()); }
+      catch (IOException ioe) {ioe.printStackTrace();}
+      return null;
+    }
+
     int target = findNewMapTask(tts, clusterSize, numUniqueHosts, maxLevel, 
                                 status.mapProgress());
     if (target == -1) {
@@ -3504,4 +3529,31 @@ public class JobInProgress {
     LOG.info("jobToken generated and stored with users keys in "
         + keysFile.toUri().getPath());
   }
+
+  /**
+   * Get the level of locality that a given task would have if launched on
+   * a particular TaskTracker. Returns 0 if the task has data on that machine,
+   * 1 if it has data on the same rack, etc (depending on number of levels in
+   * the network hierarchy).
+   */
+  int getLocalityLevel(TaskInProgress tip, TaskTrackerStatus tts) {
+    Node tracker = jobtracker.getNode(tts.getHost());
+    int level = this.maxLevel;
+    // find the right level across split locations
+    for (String local : maps[tip.getIdWithinJob()].getSplitLocations()) {
+      Node datanode = jobtracker.getNode(local);
+      int newLevel = this.maxLevel;
+      if (tracker != null && datanode != null) {
+        newLevel = getMatchingLevelForNodes(tracker, datanode);
+      }
+      if (newLevel < level) {
+        level = newLevel;
+        // an optimization
+        if (level == 0) {
+          break;
+        }
+      }
+    }
+    return level;
+  }
 }

+ 2 - 2
src/mapred/org/apache/hadoop/mapred/JobQueueTaskScheduler.java

@@ -168,8 +168,8 @@ class JobQueueTaskScheduler extends TaskScheduler {
           
           // Try to schedule a node-local or rack-local Map task
           t = 
-            job.obtainNewLocalMapTask(taskTrackerStatus, numTaskTrackers,
-                                      taskTrackerManager.getNumberOfUniqueHosts());
+            job.obtainNewNodeOrRackLocalMapTask(taskTrackerStatus, 
+                numTaskTrackers, taskTrackerManager.getNumberOfUniqueHosts());
           if (t != null) {
             assignedTasks.add(t);
             ++numLocalMaps;

+ 11 - 1
src/mapred/org/apache/hadoop/mapred/TaskTrackerManager.java

@@ -88,7 +88,17 @@ interface TaskTrackerManager {
    * @return jobInProgress object
    */
   public JobInProgress getJob(JobID jobid);
-  
+
+  /**
+   * Mark the task attempt identified by taskid to be killed
+   * 
+   * @param taskid task to kill
+   * @param shouldFail whether to count the task as failed
+   * @return true if the task was found and successfully marked to kill
+   */
+  public boolean killTask(TaskAttemptID taskid, boolean shouldFail)
+      throws IOException;  
+
   /**
    * Initialize the Job
    * 

+ 7 - 1
src/test/org/apache/hadoop/mapred/TestJobQueueTaskScheduler.java

@@ -60,7 +60,7 @@ public class TestJobQueueTaskScheduler extends TestCase {
     }
 
     @Override
-    public Task obtainNewLocalMapTask(TaskTrackerStatus tts, int clusterSize, 
+    public Task obtainNewNodeOrRackLocalMapTask(TaskTrackerStatus tts, int clusterSize, 
                                       int ignored) 
     throws IOException {
       return obtainNewMapTask(tts, clusterSize, ignored);
@@ -204,6 +204,12 @@ public class TestJobQueueTaskScheduler extends TestCase {
     public void failJob(JobInProgress job) {
       // do nothing
     }
+
+    @Override
+    public boolean killTask(TaskAttemptID taskid, boolean shouldFail)
+      throws IOException {
+      return false;
+    }
     
     // Test methods
     

+ 156 - 0
src/test/org/apache/hadoop/mapred/TestLinuxTaskControllerLaunchArgs.java

@@ -0,0 +1,156 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred;
+
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.net.InetSocketAddress;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.LocalDirAllocator;
+import org.apache.hadoop.mapred.TaskTracker.LocalStorage;
+import org.apache.hadoop.mapred.Task;
+
+import junit.framework.TestCase;
+
+/**
+ * This test case verifies that {@link LinuxTaskController} passes all required
+ * JVM properties in its initializeJob() and truncateLogsAsUser() methods,
+ * which launch a JVM through the native task-controller.
+ */
+public class TestLinuxTaskControllerLaunchArgs extends TestCase {
+  private static final Log LOG = LogFactory.getLog(
+                                   TestLinuxTaskControllerLaunchArgs.class);
+  private static File testDir = new File(System.getProperty("test.build.data",
+                "/tmp"), TestLinuxTaskControllerLaunchArgs.class.getName());
+  private static File fakeTaskController = new File(testDir, "faketc.sh");
+  private static File mapredLocal = new File(testDir, "mapred/local");
+  private TaskController ltc;
+  private boolean initialized = false;
+  private String user = new String("testuser");
+  private InetSocketAddress addr = new InetSocketAddress("localhost", 3209);
+
+  Configuration conf = new Configuration();
+
+  // Do-nothing fake {@link MapTask} class
+  public static class MyMapTask extends MapTask {
+    @Override
+    public void write(DataOutput out) throws IOException {
+      // nothing
+    }
+  }
+
+  
+  // The shell script is used to fake the native task-controller.
+  // It checks the arguments for required java properties and args.
+  protected void createFakeTCScript() throws Exception {
+    FileWriter out = new FileWriter(fakeTaskController);
+    out.write("#!/bin/bash\n");
+    // setup() invokes the task-controller with zero args and expects exit code 1.
+    out.write("if [ $# -eq 0 ]; then exit 1; fi\n");
+
+    // Check for java, classpath, h.log.dir, h.root.logger and java.library.path
+    out.write("for LARG in \"$@\"\n");
+    out.write("do case \"$LARG\" in\n");
+    out.write("*/java) LTC_ARG1=1;;\n");
+    out.write("-classpath) LTC_ARG2=1;;\n");
+    out.write("-Dhadoop.log.dir*) LTC_ARG3=1;;\n");
+    out.write("-Dhadoop.root.logger*) LTC_ARG4=1;;\n");
+    out.write("-Djava.library.path*) LTC_ARG5=1;;\n");
+    out.write("esac; done\n");
+    out.write("LTC_ARGS=$((LTC_ARG1+LTC_ARG2+LTC_ARG3+LTC_ARG4+LTC_ARG5))\n");
+    out.write("if [ $LTC_ARGS -eq 5 ]; then exit 0; else exit 22; fi\n");
+    out.close();
+    fakeTaskController.setExecutable(true);
+  }
+
+  protected void initMyTest() throws Exception {
+    testDir.mkdirs();
+    mapredLocal.mkdirs();
+    createFakeTCScript();
+    conf.set(JobConf.MAPRED_LOCAL_DIR_PROPERTY, mapredLocal.toString());
+
+    // Set the task-controller binary path.
+    conf.set("mapreduce.tasktracker.task-controller.exe", fakeTaskController.toString());
+    ltc = new LinuxTaskController();
+    ltc.setConf(conf);
+
+    // LinuxTaskController runs task-controller in setup() with no 
+    // argument and expects 1 in return
+    try {
+      ltc.setup(new LocalDirAllocator(mapredLocal.toString()),
+                 new LocalStorage(new String[]{mapredLocal.toString()}));
+    } catch (IOException ie) {
+      fail("Error running task-controller from setup().");
+    }
+
+    initialized = true;
+  }
+
+
+  /**
+   * LinuxTaskController runs the task-controller, which in turn runs
+   * JobLocalizer in initializeJob(). The task-controller must be provided
+   * with all of the java properties needed to launch JobLocalizer successfully.
+   */
+  public void testLTCCallInitializeJob() throws Exception {
+    if (!initialized) {
+      initMyTest();
+    }
+    
+    try {
+      ltc.initializeJob(user, new String("jobid"), new Path("/cred.xml"),
+                                       new Path("/job.xml"), null, addr);
+    } catch (IOException ie) {
+      fail("Missing argument when running task-controller from " +
+                                                   "initializeJob().\n");
+    }
+  }
+
+  /**
+   * LinuxTaskController runs the task-controller, which in turn runs
+   * TaskLogsTruncater in truncateLogsAsUser(). The task-controller must be
+   * provided with all of the java properties needed to launch
+   * TaskLogsTruncater successfully.
+   */
+  public void testLTCCallTruncateLogsAsUser() throws Exception {
+    if (!initialized) {
+      initMyTest();
+    }
+
+    List<Task> tasks = new ArrayList<Task>();
+    tasks.add(new MyMapTask());
+
+    try {
+      ltc.truncateLogsAsUser(user, tasks);
+    } catch (IOException ie) {
+      fail("Missing argument when running task-controller from " +
+                                               "truncateLogsAsUser()\n");
+    }
+  }
+}

+ 7 - 0
src/test/org/apache/hadoop/mapred/TestParallelInitialization.java

@@ -155,6 +155,13 @@ public class TestParallelInitialization extends TestCase {
         failJob(job);
       }
     }
+    
+    @Override
+    public boolean killTask(TaskAttemptID taskid, boolean shouldFail)
+      throws IOException {
+      return false;
+    }
+
     // Test methods
     
     public synchronized void failJob(JobInProgress job) {

+ 266 - 0
src/test/org/apache/hadoop/mapreduce/TestSleepJob.java

@@ -0,0 +1,266 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapreduce;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Dummy class for testing the MR framework. Sleeps for a defined period
+ * of time in the mapper and the reducer. Generates fake input for map/reduce
+ * jobs. Note that the generated number of input pairs is on the order
+ * of <code>numMappers * mapSleepTime / 100</code>, so the job uses
+ * some disk space.
+ */
+public class TestSleepJob extends Configured implements Tool {
+  public static String MAP_SLEEP_COUNT = "mapreduce.sleepjob.map.sleep.count";
+  public static String REDUCE_SLEEP_COUNT = 
+    "mapreduce.sleepjob.reduce.sleep.count";
+  public static String MAP_SLEEP_TIME = "mapreduce.sleepjob.map.sleep.time";
+  public static String REDUCE_SLEEP_TIME = 
+    "mapreduce.sleepjob.reduce.sleep.time";
+
+  public static class TestSleepJobPartitioner extends 
+      Partitioner<IntWritable, NullWritable> {
+    public int getPartition(IntWritable k, NullWritable v, int numPartitions) {
+      return k.get() % numPartitions;
+    }
+  }
+  
+  public static class EmptySplit extends InputSplit implements Writable {
+    public void write(DataOutput out) throws IOException { }
+    public void readFields(DataInput in) throws IOException { }
+    public long getLength() { return 0L; }
+    public String[] getLocations() { return new String[0]; }
+  }
+
+  public static class SleepInputFormat 
+      extends InputFormat<IntWritable,IntWritable> {
+    
+    public List<InputSplit> getSplits(JobContext jobContext) {
+      List<InputSplit> ret = new ArrayList<InputSplit>();
+      int numSplits = jobContext.getConfiguration().
+                        getInt("mapred.map.tasks", 1);
+      for (int i = 0; i < numSplits; ++i) {
+        ret.add(new EmptySplit());
+      }
+      return ret;
+    }
+    
+    public RecordReader<IntWritable,IntWritable> createRecordReader(
+        InputSplit ignored, TaskAttemptContext taskContext)
+        throws IOException {
+      Configuration conf = taskContext.getConfiguration();
+      final int count = conf.getInt(MAP_SLEEP_COUNT, 1);
+      if (count < 0) throw new IOException("Invalid map count: " + count);
+      final int redcount = conf.getInt(REDUCE_SLEEP_COUNT, 1);
+      if (redcount < 0)
+        throw new IOException("Invalid reduce count: " + redcount);
+      final int emitPerMapTask = (redcount * taskContext.getNumReduceTasks());
+      
+      return new RecordReader<IntWritable,IntWritable>() {
+        private int records = 0;
+        private int emitCount = 0;
+        private IntWritable key = null;
+        private IntWritable value = null;
+        public void initialize(InputSplit split, TaskAttemptContext context) {
+        }
+
+        public boolean nextKeyValue()
+            throws IOException {
+          if (count == 0) {
+            return false;
+          }
+          key = new IntWritable();
+          key.set(emitCount);
+          int emit = emitPerMapTask / count;
+          if ((emitPerMapTask) % count > records) {
+            ++emit;
+          }
+          emitCount += emit;
+          value = new IntWritable();
+          value.set(emit);
+          return records++ < count;
+        }
+        public IntWritable getCurrentKey() { return key; }
+        public IntWritable getCurrentValue() { return value; }
+        public void close() throws IOException { }
+        public float getProgress() throws IOException {
+          return count == 0 ? 100 : records / ((float)count);
+        }
+      };
+    }
+  }
+
+  public static class SleepMapper 
+      extends Mapper<IntWritable, IntWritable, IntWritable, NullWritable> {
+    private long mapSleepDuration = 100;
+    private int mapSleepCount = 1;
+    private int count = 0;
+
+    protected void setup(Context context) 
+      throws IOException, InterruptedException {
+      Configuration conf = context.getConfiguration();
+      this.mapSleepCount =
+        conf.getInt(MAP_SLEEP_COUNT, mapSleepCount);
+      this.mapSleepDuration = mapSleepCount == 0 ? 0 :
+        conf.getLong(MAP_SLEEP_TIME , 100) / mapSleepCount;
+    }
+
+    public void map(IntWritable key, IntWritable value, Context context
+               ) throws IOException, InterruptedException {
+      //it is expected that every map processes mapSleepCount number of records. 
+      try {
+        context.setStatus("Sleeping... (" +
+          (mapSleepDuration * (mapSleepCount - count)) + ") ms left");
+        Thread.sleep(mapSleepDuration);
+      }
+      catch (InterruptedException ex) {
+        throw (IOException)new IOException(
+            "Interrupted while sleeping").initCause(ex);
+      }
+      ++count;
+      // output reduceSleepCount * numReduce number of random values, so that
+      // each reducer will get reduceSleepCount number of keys.
+      int k = key.get();
+      for (int i = 0; i < value.get(); ++i) {
+        context.write(new IntWritable(k + i), NullWritable.get());
+      }
+    }
+  }
+  
+  public static class SleepReducer  
+      extends Reducer<IntWritable, NullWritable, NullWritable, NullWritable> {
+    private long reduceSleepDuration = 100;
+    private int reduceSleepCount = 1;
+    private int count = 0;
+
+    protected void setup(Context context) 
+      throws IOException, InterruptedException {
+      Configuration conf = context.getConfiguration();
+      this.reduceSleepCount =
+        conf.getInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
+      this.reduceSleepDuration = reduceSleepCount == 0 ? 0 : 
+        conf.getLong(REDUCE_SLEEP_TIME , 100) / reduceSleepCount;
+    }
+
+    public void reduce(IntWritable key, Iterable<NullWritable> values,
+                       Context context)
+      throws IOException {
+      try {
+        context.setStatus("Sleeping... (" +
+            (reduceSleepDuration * (reduceSleepCount - count)) + ") ms left");
+        Thread.sleep(reduceSleepDuration);
+      
+      }
+      catch (InterruptedException ex) {
+        throw (IOException)new IOException(
+          "Interrupted while sleeping").initCause(ex);
+      }
+      count++;
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new TestSleepJob(), args);
+    System.exit(res);
+  }
+
+  public Job createJob(int numMapper, int numReducer, 
+                       long mapSleepTime, int mapSleepCount, 
+                       long reduceSleepTime, int reduceSleepCount) 
+      throws IOException {
+    Configuration conf = getConf();
+    conf.setLong(MAP_SLEEP_TIME, mapSleepTime);
+    conf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
+    conf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
+    conf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
+    conf.setInt("mapred.map.tasks", numMapper);
+    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
+    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
+    Job job = new Job(conf, "sleep");
+    job.setNumReduceTasks(numReducer);
+    job.setJarByClass(TestSleepJob.class);
+    job.setNumReduceTasks(numReducer);
+    job.setMapperClass(SleepMapper.class);
+    job.setMapOutputKeyClass(IntWritable.class);
+    job.setMapOutputValueClass(NullWritable.class);
+    job.setReducerClass(SleepReducer.class);
+    job.setOutputFormatClass(NullOutputFormat.class);
+    job.setInputFormatClass(SleepInputFormat.class);
+    job.setPartitionerClass(TestSleepJobPartitioner.class);
+    job.setJobName("Sleep job");
+    FileInputFormat.addInputPath(job, new Path("ignored"));
+    return job;
+  }
+
+  public int run(String[] args) throws Exception {
+
+    if(args.length < 1) {
+      System.err.println("TestSleepJob [-m numMapper] [-r numReducer]" +
+          " [-mt mapSleepTime (msec)] [-rt reduceSleepTime (msec)]" +
+          " [-recordt recordSleepTime (msec)]");
+      ToolRunner.printGenericCommandUsage(System.err);
+      return 2;
+    }
+
+    int numMapper = 1, numReducer = 1;
+    long mapSleepTime = 100, reduceSleepTime = 100, recSleepTime = 100;
+    int mapSleepCount = 1, reduceSleepCount = 1;
+
+    for(int i=0; i < args.length; i++ ) {
+      if(args[i].equals("-m")) {
+        numMapper = Integer.parseInt(args[++i]);
+      }
+      else if(args[i].equals("-r")) {
+        numReducer = Integer.parseInt(args[++i]);
+      }
+      else if(args[i].equals("-mt")) {
+        mapSleepTime = Long.parseLong(args[++i]);
+      }
+      else if(args[i].equals("-rt")) {
+        reduceSleepTime = Long.parseLong(args[++i]);
+      }
+      else if (args[i].equals("-recordt")) {
+        recSleepTime = Long.parseLong(args[++i]);
+      }
+    }
+    
+    // each task sleeps for a total of *SleepTime, in increments of recSleepTime per record
+    mapSleepCount = (int)Math.ceil(mapSleepTime / ((double)recSleepTime));
+    reduceSleepCount = (int)Math.ceil(reduceSleepTime / ((double)recSleepTime));
+    Job job = createJob(numMapper, numReducer, mapSleepTime,
+                mapSleepCount, reduceSleepTime, reduceSleepCount);
+    return job.waitForCompletion(true) ? 0 : 1;
+  }
+
+}

Not all files are shown because too many files changed in this diff