
Branching for 0.21 releases

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.21@939914 13f79535-47bb-0310-9956-ffa450edef68
Thomas White 15 years ago
Parent
Commit
c98a440a61
100 changed files with 9157 additions and 1748 deletions
  1. 0 41
      .eclipse.templates/.classpath
  2. 0 27
      .eclipse.templates/.externalToolBuilders/Hadoop_Ant_Builder.launch
  3. 0 27
      .eclipse.templates/.project
  4. 0 6
      .eclipse.templates/README.txt
  5. 1 0
      .gitignore
  6. 391 25
      CHANGES.txt
  7. 2 1
      bin/hadoop
  8. 5 2
      bin/hadoop-config.sh
  9. 1 1
      bin/hadoop-daemon.sh
  10. 1 1
      bin/hadoop-daemons.sh
  11. 5 2
      bin/hdfs
  12. 1 1
      bin/rcc
  13. 1 1
      bin/slaves.sh
  14. 1 1
      bin/start-all.sh
  15. 1 1
      bin/start-balancer.sh
  16. 1 1
      bin/start-dfs.sh
  17. 1 1
      bin/start-mapred.sh
  18. 1 1
      bin/stop-all.sh
  19. 1 1
      bin/stop-balancer.sh
  20. 1 1
      bin/stop-dfs.sh
  21. 1 1
      bin/stop-mapred.sh
  22. 116 37
      build.xml
  23. 1 1
      conf/hadoop-env.sh.template
  24. 13 0
      conf/log4j.properties
  25. 6 6
      ivy.xml
  26. 2 2
      ivy/hadoop-core-template.xml
  27. 18 3
      ivy/ivysettings.xml
  28. 5 6
      ivy/libraries.properties
  29. 11 0
      lib/jdiff/hadoop_0.20.2.xml
  30. 497 0
      src/contrib/cloud/README.txt
  31. 45 0
      src/contrib/cloud/build.xml
  32. 202 0
      src/contrib/cloud/lib/pyAntTasks-1.3-LICENSE.txt
  33. BIN
      src/contrib/cloud/lib/pyAntTasks-1.3.jar
  34. 52 0
      src/contrib/cloud/src/integration-test/create-ebs-snapshot.sh
  35. 30 0
      src/contrib/cloud/src/integration-test/ebs-storage-spec.json
  36. 122 0
      src/contrib/cloud/src/integration-test/persistent-cluster.sh
  37. 112 0
      src/contrib/cloud/src/integration-test/transient-cluster.sh
  38. 21 0
      src/contrib/cloud/src/py/hadoop-cloud
  39. 21 0
      src/contrib/cloud/src/py/hadoop-ec2
  40. 14 0
      src/contrib/cloud/src/py/hadoop/__init__.py
  41. 15 0
      src/contrib/cloud/src/py/hadoop/cloud/__init__.py
  42. 438 0
      src/contrib/cloud/src/py/hadoop/cloud/cli.py
  43. 187 0
      src/contrib/cloud/src/py/hadoop/cloud/cluster.py
  44. 459 0
      src/contrib/cloud/src/py/hadoop/cloud/data/boot-rackspace.sh
  45. 548 0
      src/contrib/cloud/src/py/hadoop/cloud/data/hadoop-ec2-init-remote.sh
  46. 22 0
      src/contrib/cloud/src/py/hadoop/cloud/data/hadoop-rackspace-init-remote.sh
  47. 112 0
      src/contrib/cloud/src/py/hadoop/cloud/data/zookeeper-ec2-init-remote.sh
  48. 14 0
      src/contrib/cloud/src/py/hadoop/cloud/providers/__init__.py
  49. 61 0
      src/contrib/cloud/src/py/hadoop/cloud/providers/dummy.py
  50. 479 0
      src/contrib/cloud/src/py/hadoop/cloud/providers/ec2.py
  51. 239 0
      src/contrib/cloud/src/py/hadoop/cloud/providers/rackspace.py
  52. 640 0
      src/contrib/cloud/src/py/hadoop/cloud/service.py
  53. 173 0
      src/contrib/cloud/src/py/hadoop/cloud/storage.py
  54. 84 0
      src/contrib/cloud/src/py/hadoop/cloud/util.py
  55. 30 0
      src/contrib/cloud/src/py/setup.py
  56. 37 0
      src/contrib/cloud/src/test/py/testcluster.py
  57. 74 0
      src/contrib/cloud/src/test/py/testrackspace.py
  58. 143 0
      src/contrib/cloud/src/test/py/teststorage.py
  59. 44 0
      src/contrib/cloud/src/test/py/testuserdata.py
  60. 81 0
      src/contrib/cloud/src/test/py/testutil.py
  61. 46 0
      src/contrib/cloud/tools/rackspace/remote-setup.sh
  62. 1 1
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/Anonymizer.java
  63. 1 1
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/CPUParser.java
  64. 39 11
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/Environment.java
  65. 1 1
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/HadoopLogParser.java
  66. 17 3
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/LocalStore.java
  67. 1 1
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/NICParser.java
  68. 5 5
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/SMARTParser.java
  69. 4 4
      src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/SensorsParser.java
  70. 2 2
      src/contrib/test/core-site.xml
  71. 2 2
      src/contrib/test/hadoop-site.xml
  72. 2 2
      src/contrib/test/hdfs-site.xml
  73. 2 2
      src/contrib/test/mapred-site.xml
  74. 0 192
      src/docs/src/documentation/content/xdocs/hdfs_permissions_guide.xml
  75. 0 97
      src/docs/src/documentation/content/xdocs/libhdfs.xml
  76. 0 670
      src/docs/src/documentation/content/xdocs/streaming.xml
  77. 31 1
      src/java/core-default.xml
  78. 35 0
      src/java/org/apache/hadoop/HadoopIllegalArgumentException.java
  79. 3 5
      src/java/org/apache/hadoop/classification/InterfaceAudience.java
  80. 59 0
      src/java/org/apache/hadoop/classification/tools/ExcludePrivateAnnotationsJDiffDoclet.java
  81. 58 0
      src/java/org/apache/hadoop/classification/tools/ExcludePrivateAnnotationsStandardDoclet.java
  82. 234 0
      src/java/org/apache/hadoop/classification/tools/RootDocProcessor.java
  83. 69 0
      src/java/org/apache/hadoop/classification/tools/StabilityOptions.java
  84. 22 0
      src/java/org/apache/hadoop/classification/tools/package-info.java
  85. 104 0
      src/java/org/apache/hadoop/conf/ConfServlet.java
  86. 120 151
      src/java/org/apache/hadoop/conf/Configuration.java
  87. 809 0
      src/java/org/apache/hadoop/fs/AbstractFileSystem.java
  88. 63 0
      src/java/org/apache/hadoop/fs/AvroFSInput.java
  89. 27 10
      src/java/org/apache/hadoop/fs/ChecksumFileSystem.java
  90. 481 0
      src/java/org/apache/hadoop/fs/ChecksumFs.java
  91. 17 1
      src/java/org/apache/hadoop/fs/CommonConfigurationKeys.java
  92. 46 41
      src/java/org/apache/hadoop/fs/DF.java
  93. 210 0
      src/java/org/apache/hadoop/fs/DelegateToFileSystem.java
  94. 90 39
      src/java/org/apache/hadoop/fs/FSInputChecker.java
  95. 498 261
      src/java/org/apache/hadoop/fs/FileContext.java
  96. 53 2
      src/java/org/apache/hadoop/fs/FileStatus.java
  97. 77 25
      src/java/org/apache/hadoop/fs/FileSystem.java
  98. 43 20
      src/java/org/apache/hadoop/fs/FileUtil.java
  99. 51 0
      src/java/org/apache/hadoop/fs/FilterFileSystem.java
  100. 250 0
      src/java/org/apache/hadoop/fs/FilterFs.java

+ 0 - 41
.eclipse.templates/.classpath

@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<classpath>
-	<classpathentry kind="src" path="src/java"/>
-	<classpathentry kind="src" path="src/test/aop"/>
-	<classpathentry kind="src" path="src/test/core"/>
-	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-	<classpathentry kind="var" path="ANT_HOME/lib/ant.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/avro-1.2.0.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-cli-1.2.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-codec-1.3.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-el-1.0.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-httpclient-3.0.1.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-logging-1.0.4.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-logging-api-1.0.4.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/commons-net-1.4.1.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/core-3.1.1.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/hsqldb-1.8.0.10.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jasper-compiler-5.5.12.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jasper-runtime-5.5.12.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jets3t-0.7.1.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jetty-6.1.14.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jetty-util-6.1.14.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jsp-2.1-6.1.14.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/jsp-api-2.1-6.1.14.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/junit-4.5.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/kfs-0.3.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/log4j-1.2.15.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/oro-2.0.8.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/servlet-api-2.5-6.1.14.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/slf4j-api-1.4.3.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/slf4j-log4j12-1.4.3.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/common/xmlenc-0.52.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/test/ftplet-api-1.0.0.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/test/ftpserver-core-1.0.0.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/test/ftpserver-deprecated-1.0.0-M2.jar"/>
-	<classpathentry kind="lib" path="build/ivy/lib/Hadoop-Core/test/mina-core-2.0.0-M5.jar"/>
-	<classpathentry kind="lib" path="build/test/core/classes"/>
-	<classpathentry kind="lib" path="build/classes"/>
-	<classpathentry kind="lib" path="conf"/>
-	<classpathentry kind="output" path="build/eclipse-classes"/>
-</classpath>

+ 0 - 27
.eclipse.templates/.externalToolBuilders/Hadoop_Ant_Builder.launch

@@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<launchConfiguration type="org.eclipse.ant.AntBuilderLaunchConfigurationType">
-<stringAttribute key="org.eclipse.ant.ui.ATTR_ANT_AFTER_CLEAN_TARGETS" value="compile-core-test,"/>
-<stringAttribute key="org.eclipse.ant.ui.ATTR_ANT_CLEAN_TARGETS" value="clean,"/>
-<stringAttribute key="org.eclipse.ant.ui.ATTR_ANT_MANUAL_TARGETS" value="compile-core-test,"/>
-<booleanAttribute key="org.eclipse.ant.ui.ATTR_TARGETS_UPDATED" value="true"/>
-<booleanAttribute key="org.eclipse.ant.ui.DEFAULT_VM_INSTALL" value="false"/>
-<stringAttribute key="org.eclipse.debug.core.ATTR_REFRESH_SCOPE" value="${project}"/>
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
-<listEntry value="/@PROJECT@/build.xml"/>
-</listAttribute>
-<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
-<listEntry value="1"/>
-</listAttribute>
-<booleanAttribute key="org.eclipse.debug.core.appendEnvironmentVariables" value="true"/>
-<booleanAttribute key="org.eclipse.debug.ui.ATTR_LAUNCH_IN_BACKGROUND" value="false"/>
-<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.ant.ui.AntClasspathProvider"/>
-<booleanAttribute key="org.eclipse.jdt.launching.DEFAULT_CLASSPATH" value="true"/>
-<!-- Use Java 1.6 for ant; Eclipse may be running on 1.5. -->
-<stringAttribute key="org.eclipse.jdt.launching.JRE_CONTAINER" value="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
-<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.eclipse.ant.internal.ui.antsupport.InternalAntRunner"/>
-<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="@PROJECT@"/>
-<stringAttribute key="org.eclipse.ui.externaltools.ATTR_LOCATION" value="${workspace_loc:/@PROJECT@/build.xml}"/>
-<stringAttribute key="org.eclipse.ui.externaltools.ATTR_RUN_BUILD_KINDS" value="full,incremental,clean"/>
-<booleanAttribute key="org.eclipse.ui.externaltools.ATTR_TRIGGERS_CONFIGURED" value="true"/>
-<stringAttribute key="process_factory_id" value="org.eclipse.ant.ui.remoteAntProcessFactory"/>
-</launchConfiguration>

+ 0 - 27
.eclipse.templates/.project

@@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<projectDescription>
-	<name>@PROJECT@</name>
-	<comment></comment>
-	<projects>
-	</projects>
-	<buildSpec>
-		<buildCommand>
-			<name>org.eclipse.jdt.core.javabuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
-		<buildCommand>
-			<name>org.eclipse.ui.externaltools.ExternalToolBuilder</name>
-			<triggers>full,incremental,</triggers>
-			<arguments>
-				<dictionary>
-					<key>LaunchConfigHandle</key>
-					<value>&lt;project&gt;/.externalToolBuilders/Hadoop_Ant_Builder.launch</value>
-				</dictionary>
-			</arguments>
-		</buildCommand>
-	</buildSpec>
-	<natures>
-		<nature>org.eclipse.jdt.core.javanature</nature>
-	</natures>
-</projectDescription>

+ 0 - 6
.eclipse.templates/README.txt

@@ -1,6 +0,0 @@
-This directory contains templates for generating Eclipse files to configure
-Eclipse for Hadoop development.
-
-For further information please consult
-
-http://wiki.apache.org/hadoop/EclipseEnvironment 

+ 1 - 0
.gitignore

@@ -20,6 +20,7 @@
 .svn
 build/
 build-fi/
+build.properties
 conf/masters
 conf/slaves
 conf/hadoop-env.sh

+ 391 - 25
CHANGES.txt

@@ -85,8 +85,20 @@ Release 0.21.0 - Unreleased
     and suggests using -skpTrash, when moving to trash fails.
     (Boris Shkolnik via suresh)
 
+    HADOOP-6303. Eclipse .classpath template has outdated jar files and is
+    missing some new ones.  (cos)
+
     HADOOP-6396. Fix uninformative exception message when unable to parse
-    umask (jghoman)
+    umask. (jghoman)
+
+    HADOOP-6299. Reimplement the UserGroupInformation to use the OS
+    specific and Kerberos JAAS login. (omalley)
+
+    HADOOP-6686. Remove redundant exception class name from the exception
+    message for the exceptions thrown at RPC client. (suresh)
+
+    HADOOP-6701. Fix incorrect exit codes returned from chmod, chown and chgrp
+    commands from FsShell. (Ravi Phulari via suresh)
 
   NEW FEATURES
 
@@ -176,8 +188,6 @@ Release 0.21.0 - Unreleased
     the io package and makes it available to other users (MAPREDUCE-318). 
     (Jothi Padmanabhan via ddas)
 
-    HADOOP-6165. Add metadata to Serializations. (tomwhite)
-
     HADOOP-6105. Adds support for automatically handling deprecation of
     configuration keys. (V.V.Chaitanya Krishna via yhemanth)
     
@@ -215,6 +225,70 @@ Release 0.21.0 - Unreleased
     HADOOP-6313. Implement Syncable interface in FSDataOutputStream to expose
     flush APIs to application users. (Hairong Kuang via suresh)
 
+    HADOOP-6284. Add a new parameter, HADOOP_JAVA_PLATFORM_OPTS, to
+    hadoop-config.sh so that it allows setting java command options for
+    JAVA_PLATFORM.  (Koji Noguchi via szetszwo)
+
+    HADOOP-6337. Updates FilterInitializer class to be more visible,
+    and the init of the class is made to take a Configuration argument.
+    (Jakob Homan via ddas)
+
+    HADOOP-6108. Add support for EBS storage on EC2. (tomwhite)
+
+    Hadoop-6223. Add new file system interface AbstractFileSystem with
+    implementation of some file systems that delegate to old FileSystem.
+    (Sanjay Radia via suresh)
+
+    HADOOP-6392. Run namenode and jobtracker on separate EC2 instances.
+    (tomwhite)
+
+    HADOOP-6433. Introduce asychronous deletion of files via a pool of
+    threads. This can be used to delete files in the Distributed
+    Cache. (Zheng Shao via dhruba)
+
+    HADOOP-6415. Adds a common token interface for both job token and 
+    delegation token. (Kan Zhang via ddas)
+
+    HADOOP-6466. Add a ZooKeeper service to the cloud scripts. (tomwhite)
+
+    HADOOP-6408. Add a /conf servlet to dump running configuration.
+    (Todd Lipcon via tomwhite)
+
+    HADOOP-6464. Write a Rackspace cloud provider. (tomwhite)
+
+    HADOOP-6520. Adds APIs to read/write Token and secret keys. Also
+    adds the automatic loading of tokens into UserGroupInformation
+    upon login. The tokens are read from a file specified in the
+    environment variable. (ddas)
+
+    HADOOP-6419. Adds SASL based authentication to RPC.
+    (Kan Zhang via ddas)
+
+    HADOOP-6510. Adds a way for superusers to impersonate other users
+    in a secure environment. (Jitendra Nath Pandey via ddas)
+
+    HADOOP-6421. Adds Symbolic links to FileContext, AbstractFileSystem.
+    It also adds a limited implementation for the local file system
+     (RawLocalFs) that allows local symlinks. (Eli Collins via Sanjay Radia)
+
+    HADOOP-6577. Add hidden configuration option "ipc.server.max.response.size"
+    to change the default 1 MB, the maximum size when large IPC handler 
+    response buffer is reset. (suresh)
+
+    HADOOP-6568. Adds authorization for the default servlets. 
+    (Vinod Kumar Vavilapalli via ddas)
+
+    HADOOP-6586. Log authentication and authorization failures and successes
+    for RPC (boryas)
+
+    HADOOP-6580. UGI should contain authentication method. (jnp via boryas)
+    
+    HADOOP-6657. Add a capitalization method to StringUtils for MAPREDUCE-1545.
+    (Luke Lu via Steve Loughran)
+
+    HADOOP-6692. Add FileContext#listStatus that returns an iterator.
+    (hairong)
+
   IMPROVEMENTS
 
     HADOOP-4565. Added CombineFileInputFormat to use data locality information
@@ -593,12 +667,6 @@ Release 0.21.0 - Unreleased
     HADOOP-6260. Add additional unit tests for FileContext util methods.
     (Gary Murry via suresh).
 
-    HADOOP-6261. Add URI based tests for FileContext.
-    (Ravi Phulari via suresh).
-
-    HADOOP-6305. Unify build property names to facilitate cross-projects
-    modifications. Contributed by Konstantin Boudnik
-
     HADOOP-6309. Change build.xml to run tests with java asserts.  (Eli
     Collins via szetszwo)
 
@@ -623,6 +691,163 @@ Release 0.21.0 - Unreleased
 
     HADOOP-6413. Move TestReflectionUtils to Common. (Todd Lipcon via tomwhite)
 
+    HADOOP-6283. Improve the exception messages thrown by
+    FileUtil$HardLink.getLinkCount(..).  (szetszwo)
+
+    HADOOP-6279. Add Runtime::maxMemory to JVM metrics. (Todd Lipcon via
+    cdouglas)
+
+    HADOOP-6305. Unify build property names to facilitate cross-projects
+    modifications (cos)
+
+    HADOOP-6312. Remove unnecessary debug logging in Configuration constructor.
+    (Aaron Kimball via cdouglas)
+
+    HADOOP-6366. Reduce ivy console output to ovservable level (cos)
+
+    HADOOP-6400. Log errors getting Unix UGI. (Todd Lipcon via tomwhite)
+
+    HADOOP-6346. Add support for specifying unpack pattern regex to
+    RunJar.unJar. (Todd Lipcon via tomwhite)
+
+    HADOOP-6422. Make RPC backend plugable, protocol-by-protocol, to
+    ease evolution towards Avro.  (cutting)
+
+    HADOOP-5958. Use JDK 1.6 File APIs in DF.java wherever possible.
+    (Aaron Kimball via tomwhite)
+
+    HADOOP-6222. Core doesn't have TestCommonCLI facility. (cos)
+
+    HADOOP-6394. Add a helper class to simplify FileContext related tests and
+    improve code reusability. (Jitendra Nath Pandey via suresh)
+
+    HADOOP-6426. Create ant build for running EC2 unit tests. (tomwhite)
+
+    HADOOP-4656. Add a user to groups mapping service. (boryas, acmurthy)
+
+    HADOOP-6444. Support additional security group option in hadoop-ec2 script.
+    (Paul Egan via tomwhite)
+
+    HADOOP-6454. Create setup.py for EC2 cloud scripts. (tomwhite)
+
+    HADOOP-6435. Make RPC.waitForProxy with timeout public. (Steve Loughran
+    via tomwhite)
+  
+    HADOOP-6472. add tokenCache option to GenericOptionsParser for passing
+     file with secret keys to a map reduce job. (boryas)
+
+    HADOOP-3205. Read multiple chunks directly from FSInputChecker subclass
+    into user buffers. (Todd Lipcon via tomwhite)
+
+    HADOOP-6479. TestUTF8 assertions could fail with better text.
+    (Steve Loughran via tomwhite)
+
+    HADOOP-6155. Deprecate RecordIO anticipating Avro. (Tom White via cdouglas)
+
+    HADOOP-6492. Make some Avro serialization APIs public.
+    (Aaron Kimball via cutting)
+
+    HADOOP-6497. Add an adapter for Avro's SeekableInput interface, so
+    that Avro can read FileSystem data.
+    (Aaron Kimball via cutting)
+
+    HADOOP-6495.  Identifier should be serialized after the password is
+     created In Token constructor (jnp via boryas)
+
+    HADOOP-6518. Makes the UGI honor the env var KRB5CCNAME. 
+    (Owen O'Malley via ddas)
+
+    HADOOP-6531. Enhance FileUtil with an API to delete all contents of a
+    directory. (Amareshwari Sriramadasu via yhemanth)
+
+    HADOOP-6547. Move DelegationToken into Common, so that it can be used by
+    MapReduce also. (devaraj via omalley)
+
+    HADOOP-6552. Puts renewTGT=true and useTicketCache=true for the keytab
+    kerberos options. (ddas)
+
+    HADOOP-6534. Trim whitespace from directory lists initializing
+    LocalDirAllocator. (Todd Lipcon via cdouglas)
+
+    HADOOP-6559. Makes the RPC client automatically re-login when the SASL 
+    connection setup fails. This is applicable only to keytab based logins.
+    (Devaraj Das)
+
+    HADOOP-6551. Delegation token renewing and cancelling should provide
+    meaningful exceptions when there are failures instead of returning 
+    false. (omalley)
+
+    HADOOP-6583. Captures authentication and authorization metrics. (ddas)
+
+    HADOOP-6543. Allows secure clients to talk to unsecure clusters. 
+    (Kan Zhang via ddas)
+
+    HADOOP-6579. Provide a mechanism for encoding/decoding Tokens from
+    a url-safe string and change the commons-code library to 1.4. (omalley)
+
+    HADOOP-6596. Add a version field to the AbstractDelegationTokenIdentifier's
+    serialized value. (omalley)
+
+    HADOOP-6573. Support for persistent delegation tokens.
+    (Jitendra Pandey via shv)
+
+    HADOOP-6594. Provide a fetchdt tool via bin/hdfs. (jhoman via acmurthy) 
+
+    HADOOP-6589. Provide better error messages when RPC authentication fails.
+    (Kan Zhang via omalley)
+
+    HADOOP-6599  Split existing RpcMetrics into RpcMetrics & RpcDetailedMetrics.
+    (Suresh Srinivas via Sanjay Radia)
+
+    HADOOP-6537 Declare more detailed exceptions in FileContext and 
+    AbstractFileSystem (Suresh Srinivas via Sanjay Radia)
+
+    HADOOP-6486. fix common classes to work with Avro 1.3 reflection.
+    (cutting via tomwhite)
+
+    HADOOP-6591. HarFileSystem can handle paths with the whitespace characters.
+    (Rodrigo Schmidt via dhruba)
+
+    HADOOP-6407. Have a way to automatically update Eclipse .classpath file
+    when new libs are added to the classpath through Ivy. (tomwhite)
+
+    HADOOP-3659. Patch to allow hadoop native to compile on Mac OS X.
+    (Colin Evans and Allen Wittenauer via tomwhite)
+
+    HADOOP-6471. StringBuffer -> StringBuilder - conversion of references
+    as necessary. (Kay Kay via tomwhite)
+
+    HADOOP-6646. Move HarfileSystem out of Hadoop Common. (mahadev)
+
+    HADOOP-6566. Add methods supporting, enforcing narrower permissions on
+    local daemon directories. (Arun Murthy and Luke Lu via cdouglas)
+
+    HADOOP-6705. Fix to work with 1.5 version of jiracli
+    (Giridharan Kesavan)
+
+    HADOOP-6658. Exclude Private elements from generated Javadoc. (tomwhite)
+
+    HADOOP-6635. Install/deploy source jars to Maven repo. 
+    (Patrick Angeles via jghoman)
+
+    HADOOP-6717. Log levels in o.a.h.security.Groups too high 
+    (Todd Lipcon via jghoman)
+
+    HADOOP-6667. RPC.waitForProxy should retry through NoRouteToHostException.
+    (Todd Lipcon via tomwhite)
+
+    HADOOP-6677. InterfaceAudience.LimitedPrivate should take a string not an
+    enum. (tomwhite)
+
+    HADOOP-678. Remove FileContext#isFile, isDirectory, and exists.
+    (Eli Collins via hairong)
+
+    HADOOP-6515. Make maximum number of http threads configurable.
+    (Scott Chen via zshao)
+
+    HADOOP-6563. Add more symlink tests to cover intermediate symlinks
+    in paths. (Eli Collins via suresh)
+
   OPTIMIZATIONS
 
     HADOOP-5595. NameNode does not need to run a replicator to choose a
@@ -641,9 +866,24 @@ Release 0.21.0 - Unreleased
     HADOOP-6271. Add recursive and non recursive create and mkdir to 
     FileContext. (Sanjay Radia via suresh)
 
+    HADOOP-6261. Add URI based tests for FileContext. 
+    (Ravi Pulari via suresh).
+
     HADOOP-6307. Add a new SequenceFile.Reader constructor in order to support
     reading on un-closed file.  (szetszwo)
 
+    HADOOP-6467. Improve the performance on HarFileSystem.listStatus(..).
+    (mahadev via szetszwo)
+
+    HADOOP-6569. FsShell#cat should avoid calling unecessary getFileStatus
+    before opening a file to read. (hairong)
+
+    HADOOP-6689. Add directory renaming test to existing FileContext tests.
+    (Eli Collins via suresh)
+
+    HADOOP-6713. The RPC server Listener thread is a scalability bottleneck.
+    (Dmytro Molkov via hairong)
+
   BUG FIXES
     
     HADOOP-5379. CBZip2InputStream to throw IOException on data crc error.
@@ -823,9 +1063,6 @@ Release 0.21.0 - Unreleased
     HADOOP-5715. Add conf/mapred-queue-acls.xml to the ignore lists.
     (szetszwo)
 
-    HADOOP-5612. Some c++ scripts are not chmodded before ant execution.
-    (Todd Lipcon via tomwhite)
-
     HADOOP-5592. Fix typo in Streaming doc in reference to GzipCodec.
     (Corinne Chandel via tomwhite)
 
@@ -1097,7 +1334,8 @@ Release 0.21.0 - Unreleased
     html characters in the parameters, to prevent cross site scripting 
     attacks. (omalley)
 
-    HADOOP-6274. Fix TestLocalFSFileContextMainOperations test failure. (Gary Murry via suresh).
+    HADOOP-6274. Fix TestLocalFSFileContextMainOperations test failure.
+    (Gary Murry via suresh).
 
     HADOOP-6281. Avoid null pointer exceptions when the jsps don't have 
     paramaters (omalley)
@@ -1128,11 +1366,6 @@ Release 0.21.0 - Unreleased
     HADOOP-6375. Sync documentation for FsShell du with its implementation.
     (Todd Lipcon via cdouglas)
 
-    HADOOP-6386. NameNode's HttpServer can't instantiate InetSocketAddress:
-    IllegalArgumentException is thrown (cos)
-
-    HADOOP-6428. HttpServer sleeps with negative values. (cos)
-
     HADOOP-6441. Protect web ui from cross site scripting attacks (XSS) on
     the host http header and using encoded utf-7. (omalley)
 
@@ -1144,23 +1377,150 @@ Release 0.21.0 - Unreleased
     HADOOP-6290. Prevent duplicate slf4j-simple jar via Avro's classpath.
     (Owen O'Malley via cdouglas)
 
+    HADOOP-6293. Fix FsShell -text to work on filesystems other than the
+    default. (cdouglas)
+
+    HADOOP-6341. Fix test-patch.sh for checkTests function. (gkesavan)
+
+    HADOOP-6314. Fix "fs -help" for the "-count" commond.  (Ravi Phulari via
+    szetszwo)
+
+    HADOOP-6405. Update Eclipse configuration to match changes to Ivy
+    configuration (Edwin Chan via cos)
+
+    HADOOP-6411. Remove deprecated file src/test/hadoop-site.xml. (cos)
+
+    HADOOP-6386. NameNode's HttpServer can't instantiate InetSocketAddress:
+    IllegalArgumentException is thrown (cos)
+
+    HADOOP-6254. Slow reads cause s3n to fail with SocketTimeoutException.
+    (Andrew Hitchcock via tomwhite)
+
+    HADOOP-6428. HttpServer sleeps with negative values. (cos)
+
+    HADOOP-6414. Add command line help for -expunge command.
+    (Ravi Phulari via tomwhite)
+
+    HADOOP-6391. Classpath should not be part of command line arguments.
+    (Cristian Ivascu via tomwhite)
+
+    HADOOP-6462. Target "compile" does not exist in contrib/cloud. (tomwhite)
+
+    HADOOP-6402. testConf.xsl is not well-formed XML. (Steve Loughran
+    via tomwhite)
+
+    HADOOP-6489. Fix 3 findbugs warnings. (Erik Steffl via suresh)
+
+    HADOOP-6517. Fix UserGroupInformation so that tokens are saved/retrieved
+    to/from the embedded Subject (Owen O'Malley & Kan Zhang via ddas)
+
+    HADOOP-6538. Sets hadoop.security.authentication to simple by default.
+    (ddas)
+
+    HADOOP-6540. Contrib unit tests have invalid XML for core-site, etc.
+    (Aaron Kimball via tomwhite)
+
+    HADOOP-6521. User specified umask using deprecated dfs.umask must override
+    server configured using new dfs.umaskmode for backward compatibility.
+    (suresh)
+    
     HADOOP-6522. Fix decoding of codepoint zero in UTF8. (cutting)
 
-Release 0.20.3 - Unreleased
+    HADOOP-6505. Use tr rather than sed to effect literal substitution in the
+    build script. (Allen Wittenauer via cdouglas)
 
-  NEW FEATURES
+    HADOOP-6548. Replace mortbay imports with commons logging. (cdouglas)
 
-    HADOOP-6637. Benchmark for establishing RPC session. (shv)
+    HADOOP-6560. Handle invalid har:// uri in HarFileSystem.  (szetszwo)
 
-  BUG FIXES
+    HADOOP-6549. TestDoAsEffectiveUser should use ip address of the host
+     for superuser ip check(jnp via boryas)
 
-    HADOOP-5611. Fix C++ libraries to build on Debian Lenny. (Todd Lipcon
+    HADOOP-6570. RPC#stopProxy throws NPE if getProxyEngine(proxy) returns
+    null. (hairong)
+
+    HADOOP-6558. Return null in HarFileSystem.getFileChecksum(..) since no
+    checksum algorithm is implemented.  (szetszwo)
+
+    HADOOP-6572. Makes sure that SASL encryption and push to responder
+    queue for the RPC response happens atomically. (Kan Zhang via ddas)
+
+    HADOOP-6545. Changes the Key for the FileSystem cache to be UGI (ddas)
+
+    HADOOP-6609. Fixed deadlock in RPC by replacing shared static 
+    DataOutputBuffer in the UTF8 class with a thread local variable. (omalley)
+
+    HADOOP-6504. Invalid example in the documentation of
+    org.apache.hadoop.util.Tool. (Benoit Sigoure via tomwhite)
+
+    HADOOP-6546. BloomMapFile can return false negatives. (Clark Jefcoat
     via tomwhite)
 
-    HADOOP-5612. Some c++ scripts are not chmodded before ant execution.
+    HADOOP-6593. TextRecordInputStream doesn't close SequenceFile.Reader.
+    (Chase Bradford via tomwhite)
+
+    HADOOP-6175. Incorrect version compilation with es_ES.ISO8859-15 locale
+    on Solaris 10. (Urko Benito via tomwhite)
+
+    HADOOP-6645.  Bugs on listStatus for HarFileSystem (rodrigo via mahadev)
+
+    HADOOP-6645. Re: Bugs on listStatus for HarFileSystem (rodrigo via
+    mahadev)
+
+    HADOOP-6654. Fix code example in WritableComparable javadoc.  (Tom White
+    via szetszwo)
+
+    HADOOP-6640. FileSystem.get() does RPC retries within a static
+    synchronized block. (hairong)
+
+    HADOOP-6680. hadoop-cloud push command invokes proxy creation.
+    (Andrew Klochkov via tomwhite)
+
+    HADOOP-6691. TestFileSystemCaching sometimes hangs. (hairong)
+
+    HADOOP-6507. Hadoop Common Docs - delete 3 doc files that do not belong
+    under Common. (Corinne Chandel via tomwhite)
+
+    HADOOP-6439. Fixes handling of deprecated keys to follow order in which
+    keys are defined. (V.V.Chaitanya Krishna via yhemanth)
+
+    HADOOP-6690. FilterFileSystem correctly handles setTimes call.
+    (Rodrigo Schmidt via dhruba)
+
+    HADOOP-6703. Prevent renaming a file, directory or symbolic link to
+    itself. (Eli Collins via suresh)
+
+    HADOOP-6710. Symbolic umask for file creation is not conformant with posix.
+    (suresh)
+    
+    HADOOP-6719. Insert all missing methods in FilterFs.
+    (Rodrigo Schmidt via dhruba)
+
+    HADOOP-6724. IPC doesn't properly handle IOEs thrown by socket factory.
     (Todd Lipcon via tomwhite)
 
-Release 0.20.2 - 2010-2-10
+    HADOOP-6722. NetUtils.connect should check that it hasn't connected a socket
+    to itself. (Todd Lipcon via tomwhite)
+
+    HADOOP-6634. Fix AccessControlList to use short names to verify access 
+    control. (Vinod Kumar Vavilapalli via sharad)
+
+    HADOOP-6709. Re-instate deprecated FileSystem methods that were removed
+    after 0.20. (tomwhite)
+ 
+    HADOOP-6630. hadoop-config.sh fails to get executed if hadoop wrapper
+    scripts are in path. (Allen Wittenauer via tomwhite)
+
+    HADOOP-6742. Add methods HADOOP-6709 from to TestFilterFileSystem.
+    (Eli Collins via tomwhite)
+
+Release 0.20.3 - Unreleased
+
+  NEW FEATURES
+
+    HADOOP-6637. Benchmark for establishing RPC session. (shv)
+
+Release 0.20.2 - 2010-2-16
 
   NEW FEATURES
 
@@ -1193,6 +1553,12 @@ Release 0.20.2 - 2010-2-10
 
   IMPROVEMENTS
 
+    HADOOP-5611. Fix C++ libraries to build on Debian Lenny. (Todd Lipcon
+    via tomwhite)
+
+    HADOOP-5612. Some c++ scripts are not chmodded before ant execution.
+    (Todd Lipcon via tomwhite)
+
     HADOOP-1849. Add undocumented configuration parameter for per handler 
     call queue size in IPC Server. (shv)
 

+ 2 - 1
bin/hadoop

@@ -112,7 +112,8 @@ case $COMMAND in
     if $cygwin; then
       CLASSPATH=`cygpath -p -w "$CLASSPATH"`
     fi
-    exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+    export CLASSPATH=$CLASSPATH
+    exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS $CLASS "$@"
     ;;
 
 esac
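
The change above drops the explicit -classpath argument and exports CLASSPATH instead (HADOOP-6391 in CHANGES.txt), relying on the JVM reading the CLASSPATH environment variable when no -classpath/-cp flag is given. A minimal sketch of the equivalence, with a hypothetical classpath value and org.apache.hadoop.util.VersionInfo as the example class:

  CLASSPATH=/opt/hadoop/conf:/opt/hadoop/hadoop-core-0.21.0.jar   # hypothetical value
  # old form: classpath passed on the java command line
  java -classpath "$CLASSPATH" org.apache.hadoop.util.VersionInfo
  # new form: java falls back to the exported CLASSPATH environment variable
  export CLASSPATH
  java org.apache.hadoop.util.VersionInfo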

+ 5 - 2
bin/hadoop-config.sh

@@ -19,7 +19,7 @@
 
 # resolve links - $0 may be a softlink
 
-this="$0"
+this="${BASH_SOURCE-$0}"
 while [ -h "$this" ]; do
   ls=`ls -ld "$this"`
   link=`expr "$ls" : '.*-> \(.*\)$'`
@@ -115,6 +115,9 @@ fi
 if [ -d "$HADOOP_CORE_HOME/build/test/classes" ]; then
   CLASSPATH=${CLASSPATH}:$HADOOP_CORE_HOME/build/test/classes
 fi
+if [ -d "$HADOOP_CORE_HOME/build/test/core/classes" ]; then
+  CLASSPATH=${CLASSPATH}:$HADOOP_CORE_HOME/build/test/core/classes
+fi
 
 # so that filenames w/ spaces are handled correctly in loops below
 IFS=
@@ -183,7 +186,7 @@ fi
 # setup 'java.library.path' for native-hadoop code if necessary
 JAVA_LIBRARY_PATH=''
 if [ -d "${HADOOP_CORE_HOME}/build/native" -o -d "${HADOOP_CORE_HOME}/lib/native" ]; then
-  JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} -Xmx32m org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
+  JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} -Xmx32m ${HADOOP_JAVA_PLATFORM_OPTS} org.apache.hadoop.util.PlatformName | sed -e "s/ /_/g"`
   
   if [ -d "$HADOOP_CORE_HOME/build/native" ]; then
     JAVA_LIBRARY_PATH=${HADOOP_CORE_HOME}/build/native/${JAVA_PLATFORM}/lib
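
The ${BASH_SOURCE-$0} substitution above (repeated in the daemon and start/stop scripts below) lets a script locate itself even when it is sourced rather than executed, since $0 then refers to the caller instead of the sourced file. A small stand-alone sketch of the idiom, not part of this commit:

  # locate.sh -- hypothetical example
  this="${BASH_SOURCE-$0}"   # BASH_SOURCE names this file even when sourced; $0 is the fallback
  bin=`dirname "$this"`
  bin=`cd "$bin"; pwd`
  echo "script directory: $bin"

% bash locate.sh     # executed: $0 and BASH_SOURCE both point at locate.sh
% . ./locate.sh      # sourced: $0 is the calling shell, BASH_SOURCE still points at locate.sh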

+ 1 - 1
bin/hadoop-daemon.sh

@@ -36,7 +36,7 @@ if [ $# -le 1 ]; then
   exit 1
 fi
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh

+ 1 - 1
bin/hadoop-daemons.sh

@@ -26,7 +26,7 @@ if [ $# -le 1 ]; then
   exit 1
 fi
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . $bin/hadoop-config.sh

+ 5 - 2
bin/hdfs

@@ -32,6 +32,7 @@ function print_usage(){
   echo "  balancer             run a cluster balancing utility"
   echo "  jmxget               get JMX exported values from NameNode or DataNode."
   echo "  oiv                  apply the offline fsimage viewer to an fsimage"
+  echo "  fetchdt              fetch a delegation token from the NameNode"
   echo "						Use -help to see options"
   echo ""
   echo "Most commands print help when invoked w/o parameters."
@@ -70,6 +71,8 @@ elif [ "$COMMAND" = "jmxget" ] ; then
   CLASS=org.apache.hadoop.hdfs.tools.JMXGet
 elif [ "$COMMAND" = "oiv" ] ; then
   CLASS=org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageViewer
+elif [ "$COMMAND" = "fetchdt" ] ; then
+  CLASS=org.apache.hadoop.hdfs.tools.DelegationTokenFetcher
 else
   echo $COMMAND - invalid command
   print_usage
@@ -106,5 +109,5 @@ done
 if $cygwin; then
   CLASSPATH=`cygpath -p -w "$CLASSPATH"`
 fi
-
-exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+export CLASSPATH=$CLASSPATH
+exec "$JAVA" $JAVA_HEAP_MAX $HADOOP_OPTS $CLASS "$@"

+ 1 - 1
bin/rcc

@@ -27,7 +27,7 @@
 #   HADOOP_CONF_DIR  Alternate conf dir. Default is ${HADOOP_HOME}/conf.
 #
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh

+ 1 - 1
bin/slaves.sh

@@ -35,7 +35,7 @@ if [ $# -le 0 ]; then
   exit 1
 fi
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh

+ 1 - 1
bin/start-all.sh

@@ -20,7 +20,7 @@
 
 echo "This script is Deprecated. Instead use start-dfs.sh and start-mapred.sh"
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh

+ 1 - 1
bin/start-balancer.sh

@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hdfs-config.sh

+ 1 - 1
bin/start-dfs.sh

@@ -22,7 +22,7 @@
 
 usage="Usage: start-dfs.sh [-upgrade|-rollback]"
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hdfs-config.sh

+ 1 - 1
bin/start-mapred.sh

@@ -18,7 +18,7 @@
 
 # Start hadoop map reduce daemons.  Run this on master node.
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . $bin/mapred-config.sh

+ 1 - 1
bin/stop-all.sh

@@ -20,7 +20,7 @@
 
 echo "This script is Deprecated. Instead use stop-dfs.sh and stop-mapred.sh"
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hadoop-config.sh

+ 1 - 1
bin/stop-balancer.sh

@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hdfs-config.sh

+ 1 - 1
bin/stop-dfs.sh

@@ -18,7 +18,7 @@
 
 # Stop hadoop DFS daemons.  Run this on master node.
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . "$bin"/hdfs-config.sh

+ 1 - 1
bin/stop-mapred.sh

@@ -18,7 +18,7 @@
 
 # Stop hadoop map reduce daemons.  Run this on master node.
 
-bin=`dirname "$0"`
+bin=`dirname "${BASH_SOURCE-$0}"`
 bin=`cd "$bin"; pwd`
 
 . $bin/mapred-config.sh

+ 116 - 37
build.xml

@@ -28,7 +28,7 @@
  
   <property name="Name" value="Hadoop-core"/>
   <property name="name" value="hadoop-core"/>
-  <property name="version" value="0.21.0-SNAPSHOT"/>
+  <property name="version" value="0.22.0-SNAPSHOT"/>
   <property name="final.name" value="${name}-${version}"/>
   <property name="test.final.name" value="${name}-test-${version}"/>
   <property name="year" value="2009"/>
@@ -50,9 +50,10 @@
   <property name="build.webapps" value="${build.dir}/webapps"/>
 
   <!-- convert spaces to _ so that mac os doesn't break things -->
-  <exec executable="sed" inputstring="${os.name}" 
+  <exec executable="tr" inputstring="${os.name}" 
         outputproperty="nonspace.os">
-     <arg value="s/ /_/g"/>
+     <arg value="[:space:]"/>
+     <arg value="_"/>
   </exec>
   <property name="build.platform" 
             value="${nonspace.os}-${os.arch}-${sun.arch.data.model}"/>
@@ -113,7 +114,9 @@
 
   <property name="jdiff.build.dir" value="${build.docs}/jdiff"/>
   <property name="jdiff.xml.dir" value="${lib.dir}/jdiff"/>
-  <property name="jdiff.stable" value="0.20.1"/>
+  <property name="jdiff.stability" value="-unstable"/>
+  <property name="jdiff.compatibility" value=""/>
+  <property name="jdiff.stable" value="0.20.2"/>
   <property name="jdiff.stable.javadoc" 
             value="http://hadoop.apache.org/core/docs/r${jdiff.stable}/api/"/>
 
@@ -158,6 +161,8 @@
   <property name="ivy.publish.pattern" value="[artifact]-[revision].[ext]"/>
   <property name="hadoop-core.jar" location="${build.dir}/${final.name}.jar" />
   <property name="hadoop-core-test.jar" location="${build.dir}/${test.final.name}.jar" />
+  <property name="hadoop-core-sources.jar" location="${build.dir}/${final.name}-sources.jar" />
+  <property name="hadoop-core-test-sources.jar" location="${build.dir}/${test.final.name}-sources.jar" />
 
   <!-- jdiff.home property set -->
   <property name="jdiff.home" value="${build.ivy.lib.dir}/${ant.project.name}/jdiff"/>
@@ -166,6 +171,12 @@
 
   <property name="clover.jar" location="${clover.home}/lib/clover.jar"/>
   <available property="clover.present" file="${clover.jar}" />
+	
+  <!-- Eclipse properties -->
+  <property name="build.dir.eclipse" value="build/eclipse"/>
+  <property name="build.dir.eclipse-main-classes" value="${build.dir.eclipse}/classes-main"/>
+  <property name="build.dir.eclipse-test-classes" value="${build.dir.eclipse}/classes-test"/>
+  <property name="build.dir.eclipse-test-generated-classes" value="${build.dir.eclipse}/classes-test-generated"/>
 
   <!-- check if clover reports should be generated -->
   <condition property="clover.enabled">
@@ -415,6 +426,11 @@
       <fileset dir="${conf.dir}" includes="${jar.properties.list}" />
       <fileset file="${jar.extra.properties.list}" />
     </jar>
+
+    <jar jarfile="${hadoop-core-sources.jar}">
+      <fileset dir="${java.src.dir}" includes="org/apache/hadoop/**/*.java"/>
+      <fileset dir="${build.src}" includes="org/apache/hadoop/**/*.java"/>
+    </jar>
   </target>
 
   <!-- ================================================================== -->
@@ -519,6 +535,11 @@
           </section>
          </manifest>
     </jar>
+
+    <jar jarfile="${hadoop-core-test-sources.jar}">
+      <fileset dir="${test.generated.dir}" includes="org/apache/hadoop/**/*.java"/>
+      <fileset dir="${test.src.dir}/core" includes="org/apache/hadoop/**/*.java"/>
+    </jar>
   </target>
 
   <!-- ================================================================== -->
@@ -860,7 +881,7 @@
     </uptodate>
   </target>
  
-  <target name="javadoc" description="Generate javadoc" depends="javadoc-uptodate"
+  <target name="javadoc" description="Generate javadoc" depends="jar, javadoc-uptodate"
        unless="javadoc.is.uptodate">
     <mkdir dir="${build.javadoc}"/>
     <javadoc
@@ -891,15 +912,19 @@
 
        <group title="Core" packages="org.apache.*"/>
        <group title="contrib: FailMon" packages="org.apache.hadoop.contrib.failmon*"/>
+       <doclet name="org.apache.hadoop.classification.tools.ExcludePrivateAnnotationsStandardDoclet"
+               path="${build.dir}/${final.name}.jar"/>
     </javadoc>
   </target>	
 
   <target name="api-xml" depends="ivy-retrieve-jdiff,javadoc,write-null">
     <javadoc maxmemory="${javadoc.maxmemory}">
-       <doclet name="jdiff.JDiff"
-               path="${jdiff.jar}:${xerces.jar}">
+       <doclet name="org.apache.hadoop.classification.tools.ExcludePrivateAnnotationsJDiffDoclet"
+               path="${build.dir}/${final.name}.jar:${jdiff.jar}:${xerces.jar}">
          <param name="-apidir" value="${jdiff.xml.dir}"/>
          <param name="-apiname" value="hadoop-core ${version}"/>
+         <param name="${jdiff.stability}"/>
+         <param name="${jdiff.compatibility}"/>
        </doclet>
        <packageset dir="src/java"/>
        <classpath >
@@ -922,8 +947,8 @@
              destdir="${jdiff.build.dir}"
 	     sourceFiles="${jdiff.home}/Null.java"
 	     maxmemory="${javadoc.maxmemory}">
-       <doclet name="jdiff.JDiff"
-               path="${jdiff.jar}:${xerces.jar}">
+       <doclet name="org.apache.hadoop.classification.tools.ExcludePrivateAnnotationsJDiffDoclet"
+              path="${build.dir}/${final.name}.jar:${jdiff.jar}:${xerces.jar}">
          <param name="-oldapi" value="hadoop-core ${jdiff.stable}"/>
          <param name="-newapi" value="hadoop-core ${version}"/>
          <param name="-oldapidir" value="${jdiff.xml.dir}"/>
@@ -931,6 +956,8 @@
          <param name="-javadocold" value="${jdiff.stable.javadoc}"/>
          <param name="-javadocnew" value="../../api/"/>
          <param name="-stats"/>
+         <param name="${jdiff.stability}"/>
+         <param name="${jdiff.compatibility}"/>
        </doclet>
        <classpath >
          <path refid="classpath" />
@@ -1154,9 +1181,11 @@
      <artifact:pom file="${hadoop-core-test.pom}" id="hadoop.core.test"/>
      <artifact:install file="${hadoop-core.jar}">
         <pom refid="hadoop.core"/>
+	<attach file="${hadoop-core-sources.jar}" classifier="sources" />
      </artifact:install>
      <artifact:install file="${hadoop-core-test.jar}">
         <pom refid="hadoop.core.test"/>
+	<attach file="${hadoop-core-test-sources.jar}" classifier="sources" />
      </artifact:install>
   </target>
 
@@ -1169,10 +1198,12 @@
      <artifact:deploy file="${hadoop-core.jar}">
          <remoteRepository id="apache.snapshots.https" url="${asfrepo}"/>
          <pom refid="hadoop.core"/>
+	<attach file="${hadoop-core-sources.jar}" classifier="sources" />
      </artifact:deploy>
      <artifact:deploy file="${hadoop-core-test.jar}">
          <remoteRepository id="apache.snapshots.https" url="${asfrepo}"/>
          <pom refid="hadoop.core.test"/>
+	<attach file="${hadoop-core-test-sources.jar}" classifier="sources" />
      </artifact:deploy>
   </target>
   
@@ -1209,6 +1240,7 @@
         <exclude name="**/native/*"/>
         <exclude name="**/native/config/*"/>
         <exclude name="**/VERSION"/>
+        <exclude name="**/*.json"/>
         <exclude name="**/hod/*.txt"/>
       </fileset>
     </rat:report>
@@ -1330,20 +1362,51 @@
   </exec>
 </target>
 	
-  <target name="eclipse-files" depends="init"
-          description="Generate files for Eclipse">
-    <pathconvert property="eclipse.project">
-      <path path="${basedir}"/>
-      <regexpmapper from="^.*/([^/]+)$$" to="\1" handledirsep="yes"/>
-    </pathconvert>
-    <copy todir="." overwrite="true">
-      <fileset dir=".eclipse.templates">
-      	<exclude name="**/README.txt"/>
-      </fileset>
-      <filterset>
-        <filter token="PROJECT" value="${eclipse.project}"/>
-      </filterset>
-    </copy>
+  <condition property="ant-eclipse.jar.exists">
+    <available file="${build.dir}/lib/ant-eclipse-1.0-jvm1.2.jar"/>
+  </condition>
+
+  <target name="ant-eclipse-download" unless="ant-eclipse.jar.exists"
+          description="Downloads the ant-eclipse binary.">
+    <get src="http://downloads.sourceforge.net/project/ant-eclipse/ant-eclipse/1.0/ant-eclipse-1.0.bin.tar.bz2"
+         dest="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" usetimestamp="false" />
+
+    <untar src="${build.dir}/ant-eclipse-1.0.bin.tar.bz2"
+           dest="${build.dir}" compression="bzip2">
+      <patternset>
+        <include name="lib/ant-eclipse-1.0-jvm1.2.jar"/>
+      </patternset>
+    </untar>
+    <delete file="${build.dir}/ant-eclipse-1.0.bin.tar.bz2" />
+  </target>
+
+  <target name="eclipse" 
+          depends="init,ant-eclipse-download,ivy-retrieve-common,ivy-retrieve-test,compile-core-test"
+          description="Create eclipse project files">
+	     <pathconvert property="eclipse.project">
+	       <path path="${basedir}"/>
+	       <regexpmapper from="^.*/([^/]+)$$" to="\1" handledirsep="yes"/>
+	     </pathconvert>
+    <taskdef name="eclipse"
+             classname="prantl.ant.eclipse.EclipseTask"
+             classpath="${build.dir}/lib/ant-eclipse-1.0-jvm1.2.jar" />
+    <eclipse updatealways="true">
+      <project name="${eclipse.project}" />
+      <classpath>
+        <source path="${java.src.dir}"
+                output="${build.dir.eclipse-main-classes}" />
+        <source path="${test.src.dir}/core"
+                output="${build.dir.eclipse-test-classes}" />
+        <source path="${test.src.dir}/aop"
+                output="${build.dir.eclipse-test-classes}" />
+        <source path="${test.generated.dir}"
+                output="${build.dir.eclipse-test-generated-classes}" />
+        <output path="${build.dir.eclipse-main-classes}" />
+        <library pathref="ivy-common.classpath" exported="true" />
+        <library pathref="ivy-test.classpath" exported="false" />
+        <library path="${conf.dir}" exported="false" />
+      </classpath>
+    </eclipse>
   </target>
 
   <target name="ivy-init-dirs">
@@ -1385,6 +1448,8 @@
     </fail>
   </target>
 
+  <property name="ivyresolvelog" value="download-only"/>
+  <property name="ivyretrievelog" value="quiet"/>
 
   <target name="ivy-init" depends="ivy-init-antlib" >
 
@@ -1397,78 +1462,92 @@
   </target>
 
   <target name="ivy-resolve" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings"/>
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings"
+    	log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-resolve-javadoc" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="javadoc"/>
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="javadoc"
+    	log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-resolve-releaseaudit" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="releaseaudit"/>
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="releaseaudit"
+  		log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-resolve-test" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="test" />
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="test"
+    	log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-resolve-common" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="common" />
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="common"
+    	log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-resolve-jdiff" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="jdiff" />
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="jdiff"
+    	log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-resolve-checkstyle" depends="ivy-init">
-    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="checkstyle"/>
+    <ivy:resolve settingsRef="${ant.project.name}.ivy.settings" conf="checkstyle"
+  		log="${ivyresolvelog}"/>
   </target>
 
   <target name="ivy-retrieve" depends="ivy-resolve"
     description="Retrieve Ivy-managed artifacts">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"/>
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+    		log="${ivyretrievelog}"/>
   </target>
 
   <target name="ivy-retrieve-checkstyle" depends="ivy-resolve-checkstyle"
     description="Retrieve Ivy-managed artifacts for the checkstyle configurations">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"/>
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+  			log="${ivyretrievelog}"/>
     <ivy:cachepath pathid="checkstyle-classpath" conf="checkstyle"/>
   </target>
 
   <target name="ivy-retrieve-jdiff" depends="ivy-resolve-jdiff"
     description="Retrieve Ivy-managed artifacts for the jdiff configurations">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"/>
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+  			log="${ivyretrievelog}"/>
     <ivy:cachepath pathid="jdiff-classpath" conf="jdiff"/>
   </target>
 
   <target name="ivy-retrieve-javadoc" depends="ivy-resolve-javadoc"
     description="Retrieve Ivy-managed artifacts for the javadoc configurations">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"/>
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+  			log="${ivyretrievelog}"/>
     <ivy:cachepath pathid="javadoc-classpath" conf="javadoc"/>
   </target>
 
   <target name="ivy-retrieve-test" depends="ivy-resolve-test"
     description="Retrieve Ivy-managed artifacts for the test configurations">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"/>
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+    		log="${ivyretrievelog}"/>
     <ivy:cachepath pathid="ivy-test.classpath" conf="test"/>
   </target>
 
   <target name="ivy-retrieve-common" depends="ivy-resolve-common"
     description="Retrieve Ivy-managed artifacts for the compile configurations">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"/>
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+    		log="${ivyretrievelog}"/>
     <ivy:cachepath pathid="ivy-common.classpath" conf="common"/>
   </target>
 
   <target name="ivy-retrieve-releaseaudit" depends="ivy-resolve-releaseaudit"
     description="Retrieve Ivy-managed artifacts for the compile configurations">
     <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings"
-      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}" />
+      pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}"
+    		log="${ivyretrievelog}"/>
     <ivy:cachepath pathid="releaseaudit-classpath" conf="releaseaudit"/>
   </target>
 

+ 1 - 1
conf/hadoop-env.sh.template

@@ -6,7 +6,7 @@
 # remote nodes.
 
 # The java implementation to use.  Required.
-# export JAVA_HOME=/usr/lib/j2sdk1.5-sun
+# export JAVA_HOME=/usr/lib/j2sdk1.6-sun
 
 # Extra Java CLASSPATH elements.  Optional.
 # export HADOOP_CLASSPATH=

+ 13 - 0
conf/log4j.properties

@@ -57,6 +57,19 @@ log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
 log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
 log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
 
+
+#
+#Security appender
+#
+hadoop.security.log.file=SecurityAuth.audit
+log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender 
+log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
+
+log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
+log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
+#new logger
+log4j.category.SecurityLogger=INFO,DRFAS
+
 #
 # Rolling File Appender
 #

+ 6 - 6
ivy.xml

@@ -50,7 +50,7 @@
               extends="mandatory"/>
    <conf name="jetty" description="Jetty provides the in-VM HTTP daemon" extends="commons-logging"/>
 
-   <conf name="common" extends="runtime,mandatory,httpclient,ftp,jetty"
+   <conf name="common" extends="runtime,mandatory,httpclient,ftp,jetty,jdiff"
 		      description="common artifacts"/>
     <!--Testing pulls in everything-->
    <conf name="test" extends="master,common" description="the classpath needed to run tests"/>
@@ -91,11 +91,6 @@
       name="jdiff"
       rev="${jdiff.version}"
       conf="jdiff->default"/>
-    <dependency org="xerces"
-      name="xerces"
-      rev="${xerces.version}"
-      conf="jdiff->default">
-    </dependency>
 
     <dependency org="xmlenc"
       name="xmlenc"
@@ -294,5 +289,10 @@
       rev="${aspectj.version}"
       conf="common->default">
     </dependency>
+    <dependency org="org.mockito" 
+      name="mockito-all" 
+      rev="${mockito-all.version}" 
+      conf="common->default">
+    </dependency> 
     </dependencies>
 </ivy-module>

+ 2 - 2
ivy/hadoop-core-template.xml

@@ -41,7 +41,7 @@
     <dependency>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
-      <version>1.3</version>
+      <version>1.4</version>
     </dependency>
     <dependency>
       <groupId>commons-net</groupId>
@@ -121,7 +121,7 @@
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>avro</artifactId>
-      <version>1.2.0</version>
+      <version>1.3.0</version>
     </dependency>
   </dependencies>
 </project>

+ 18 - 3
ivy/ivysettings.xml

@@ -19,17 +19,32 @@
   <property name="repo.maven.org" value="http://repo1.maven.org/maven2/" override="false"/>
 
   <property name="maven2.pattern" value="[organisation]/[module]/[revision]/[module]-[revision]"/>
+  <property name="repo.dir" value="${user.home}/.m2/repository"/>
   <property name="maven2.pattern.ext" value="${maven2.pattern}.[ext]"/>
       <!-- pull in the local repository -->
  <include url="${ivy.default.conf.dir}/ivyconf-local.xml"/> 
- <settings defaultResolver="default"/>
-  <resolvers>
-    <!--ibiblio resolvers-->
+
+ <property name="resolvers" value="default" override="false"/>
+ <settings defaultResolver="${resolvers}"/>
+
+ <resolvers>
+   <!--ibiblio resolvers-->
     <ibiblio name="maven2" root="${repo.maven.org}" m2compatible="true"/>
 
+    <filesystem name="fs" m2compatible="true" force="true">
+       <artifact pattern="${repo.dir}/[organisation]/[module]/[revision]/[module]-[revision].[ext]"/>
+       <ivy pattern="${repo.dir}/[organisation]/[module]/[revision]/[module]-[revision].pom"/>
+    </filesystem>
+
     <chain name="default" dual="true">
       <resolver ref="maven2"/>
     </chain>
+
+    <chain name="internal" dual="true">
+      <resolver ref="fs"/>
+      <resolver ref="maven2"/>
+    </chain>
+
   </resolvers>
 
 </ivysettings>

+ 5 - 6
ivy/libraries.properties

@@ -17,13 +17,13 @@
 apacheant.version=1.7.1
 ant-task.version=2.0.10
 
-avro.version=1.2.0
+avro.version=1.3.0
 
 checkstyle.version=4.2
 
 commons-cli.version=1.2
 commons-cli2.version=2.0-mahout
-commons-codec.version=1.3
+commons-codec.version=1.4
 commons-collections.version=3.1
 commons-httpclient.version=3.0.1
 commons-lang.version=2.4
@@ -44,8 +44,6 @@ hsqldb.version=1.8.0.10
 
 ivy.version=2.1.0-rc1
 
-jackson.version=1.0.1
-
 jasper.version=5.5.12
 jsp.version=2.1
 jsp-api.version=5.5.12
@@ -65,8 +63,6 @@ mina-core.version=2.0.0-M5
 
 oro.version=2.0.8
 
-paranamer.version=1.5
-
 rats-lib.version=0.6
 
 servlet.version=4.0.6
@@ -79,3 +75,6 @@ xmlenc.version=0.52
 xerces.version=1.4.4
 
 aspectj.version=1.6.5
+
+mockito-all.version=1.8.0
+

Changes are not shown because the file is too large.
+ 11 - 0
lib/jdiff/hadoop_0.20.2.xml


+ 497 - 0
src/contrib/cloud/README.txt

@@ -0,0 +1,497 @@
+Hadoop Cloud Scripts
+====================
+
+These scripts allow you to run Hadoop on cloud providers. These instructions
+assume you are running on Amazon EC2; the differences for other providers are
+noted at the end of this document.
+
+Getting Started
+===============
+
+First, unpack the scripts on your system. For convenience, you may like to put
+the top-level directory on your path.
+
+You'll also need python (version 2.5 or newer) and the boto and simplejson
+libraries. After you download boto and simplejson, you can install each in turn
+by running the following in the directory where you unpacked the distribution:
+
+% sudo python setup.py install
+
+Alternatively, you might like to use the python-boto and python-simplejson RPM
+and Debian packages.
+
+You need to tell the scripts your AWS credentials. The simplest way to do this
+is to set the environment variables (but see
+http://code.google.com/p/boto/wiki/BotoConfig for other options):
+
+    * AWS_ACCESS_KEY_ID - Your AWS Access Key ID
+    * AWS_SECRET_ACCESS_KEY - Your AWS Secret Access Key
+
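+For example (the values below are placeholders for your own credentials):
+
+% export AWS_ACCESS_KEY_ID=your-access-key-id
+% export AWS_SECRET_ACCESS_KEY=your-secret-access-key
+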
+To configure the scripts, create a directory called .hadoop-cloud (note the
+leading ".") in your home directory. In it, create a file called
+clusters.cfg with a section for each cluster you want to control, e.g.:
+
+[my-hadoop-cluster]
+image_id=ami-6159bf08
+instance_type=c1.medium
+key_name=tom
+availability_zone=us-east-1c
+private_key=PATH_TO_PRIVATE_KEY
+ssh_options=-i %(private_key)s -o StrictHostKeyChecking=no
+
+The image chosen here is one with an i386 Fedora OS. For a list of suitable AMIs
+see http://wiki.apache.org/hadoop/AmazonEC2.
+
+The architecture must be compatible with the instance type. For m1.small and
+c1.medium instances use the i386 AMIs, while for m1.large, m1.xlarge, and
+c1.xlarge instances use the x86_64 AMIs. One of the high CPU instances
+(c1.medium or c1.xlarge) is recommended.
+
+Then you can run the hadoop-ec2 script. It will display usage instructions when
+invoked without arguments.
+
+You can test that it can connect to AWS by typing:
+
+% hadoop-ec2 list
+
+LAUNCHING A CLUSTER
+===================
+
+To launch a cluster called "my-hadoop-cluster" with 10 worker (slave) nodes
+type:
+
+% hadoop-ec2 launch-cluster my-hadoop-cluster 10
+
+This will boot the master node and 10 worker nodes. The master node runs the
+namenode, secondary namenode, and jobtracker, and each worker node runs a
+datanode and a tasktracker. Equivalently the cluster could be launched as:
+
+% hadoop-ec2 launch-cluster my-hadoop-cluster 1 nn,snn,jt 10 dn,tt
+
+Note that using this notation you can launch a split namenode/jobtracker cluster:
+
+% hadoop-ec2 launch-cluster my-hadoop-cluster 1 nn,snn 1 jt 10 dn,tt
+
+When the nodes have started and the Hadoop cluster has come up, the console will
+display a message like
+
+  Browse the cluster at http://ec2-xxx-xxx-xxx-xxx.compute-1.amazonaws.com/
+
+You can access Hadoop's web UI by visiting this URL. By default, port 80 is
+opened for access from your client machine. You may change the firewall settings
+(to allow access from a network, rather than just a single machine, for example)
+by using the Amazon EC2 command line tools, or by using a tool like ElasticFox.
+There is a security group for each node's role. The one for the namenode
+is <cluster-name>-nn, for example.
+
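+For example, using the Amazon EC2 command line tools you could open the
+namenode web UI port to a wider network (the group name, port, and CIDR below
+are only illustrative):
+
+% ec2-authorize my-hadoop-cluster-nn -P tcp -p 50070 -s 203.0.113.0/24
+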
+For security reasons, traffic from the network your client is running on is
+proxied through the master node of the cluster using an SSH tunnel (a SOCKS
+proxy on port 6666). To set up the proxy run the following command:
+
+% hadoop-ec2 proxy my-hadoop-cluster
+
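+The proxy command prints shell variable assignments, including the proxy's
+process ID in HADOOP_CLOUD_PROXY_PID, so you can capture them with eval (as the
+integration tests under src/integration-test do):
+
+% eval `hadoop-ec2 proxy my-hadoop-cluster`
+% kill $HADOOP_CLOUD_PROXY_PID   # stops the proxy when you are done
+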
+Web browsers need to be configured to use this proxy too, so you can view pages
+served by worker nodes in the cluster. The most convenient way to do this is to
+use a proxy auto-config (PAC) file, such as this one:
+
+  http://apache-hadoop-ec2.s3.amazonaws.com/proxy.pac
+
+If you are using Firefox, then you may find
+FoxyProxy useful for managing PAC files. (If you use FoxyProxy, then you need to
+get it to use the proxy for DNS lookups. To do this, go to Tools -> FoxyProxy ->
+Options, and then under "Miscellaneous" in the bottom left, choose "Use SOCKS
+proxy for DNS lookups".)
+
+PERSISTENT CLUSTERS
+===================
+
+Hadoop clusters running on EC2 that use local EC2 storage (the default) will not
+retain data once the cluster has been terminated. It is possible to use EBS for
+persistent data, which allows a cluster to be shut down while it is not being
+used.
+
+Note: EBS support is a Beta feature.
+
+First create a new section called "my-ebs-cluster" in the
+.hadoop-cloud/clusters.cfg file.
+
+Now we need to create storage for the new cluster. Create a temporary EBS volume
+of size 100GiB, format it, and save it as a snapshot in S3. This way, we only
+have to do the formatting once.
+
+% hadoop-ec2 create-formatted-snapshot my-ebs-cluster 100
+
+We create storage for a single namenode and for two datanodes. The volumes to
+create are described in a JSON spec file, which references the snapshot we just
+created. Here are the contents of a JSON file, called
+my-ebs-cluster-storage-spec.json:
+
+{
+  "nn": [
+    {
+      "device": "/dev/sdj",
+      "mount_point": "/ebs1",
+      "size_gb": "100",
+      "snapshot_id": "snap-268e704f"
+    },
+    {
+      "device": "/dev/sdk",
+      "mount_point": "/ebs2",
+      "size_gb": "100",
+      "snapshot_id": "snap-268e704f"
+    }
+  ],
+  "dn": [
+    {
+      "device": "/dev/sdj",
+      "mount_point": "/ebs1",
+      "size_gb": "100",
+      "snapshot_id": "snap-268e704f"
+    },
+    {
+      "device": "/dev/sdk",
+      "mount_point": "/ebs2",
+      "size_gb": "100",
+      "snapshot_id": "snap-268e704f"
+    }
+  ]
+}
+
+
+Each role (here "nn" and "dn") is the key to an array of volume
+specifications. In this example, the "dn" (slave) role has two devices ("/dev/sdj"
+and "/dev/sdk"), each with its own mount point and size, generated from an EBS
+snapshot. The snapshot is the formatted snapshot created earlier, so that the
+volumes we create are pre-formatted. The size of the drives must match the size
+of the snapshot created earlier.
+
+Let's create actual volumes using this file.
+
+% hadoop-ec2 create-storage my-ebs-cluster nn 1 \
+    my-ebs-cluster-storage-spec.json
+% hadoop-ec2 create-storage my-ebs-cluster dn 2 \
+    my-ebs-cluster-storage-spec.json
+
+Now let's start the cluster with 2 slave nodes:
+
+% hadoop-ec2 launch-cluster my-ebs-cluster 2
+
+Log in and run a job which creates some output.
+
+% hadoop-ec2 login my-ebs-cluster
+
+# hadoop fs -mkdir input
+# hadoop fs -put /etc/hadoop/conf/*.xml input
+# hadoop jar /usr/lib/hadoop/hadoop-*-examples.jar grep input output \
+    'dfs[a-z.]+'
+
+Look at the output:
+
+# hadoop fs -cat output/part-00000 | head
+
+Now let's shut down the cluster.
+
+% hadoop-ec2 terminate-cluster my-ebs-cluster
+
+A little while later we restart the cluster and log in.
+
+% hadoop-ec2 launch-cluster my-ebs-cluster 2
+% hadoop-ec2 login my-ebs-cluster
+
+The output from the job we ran before should still be there:
+
+# hadoop fs -cat output/part-00000 | head
+
+RUNNING JOBS
+============
+
+When you launched the cluster, a hadoop-site.xml file was created in the
+directory ~/.hadoop-cloud/<cluster-name>. You can use this to connect to the
+cluster by setting the HADOOP_CONF_DIR environment variable (it is also possible
+to set the configuration file to use by passing it as a -conf option to Hadoop
+Tools):
+
+% export HADOOP_CONF_DIR=~/.hadoop-cloud/my-hadoop-cluster
+
+Let's try browsing HDFS:
+
+% hadoop fs -ls /
+
+Running a job is straightforward:
+
+% hadoop fs -mkdir input # create an input directory
+% hadoop fs -put $HADOOP_HOME/LICENSE.txt input # copy a file there
+% hadoop jar $HADOOP_HOME/hadoop-*-examples.jar wordcount input output
+% hadoop fs -cat output/part-00000 | head
+
+Of course, these examples assume that you have installed Hadoop on your local
+machine. It is also possible to launch jobs from within the cluster. First log
+into the namenode:
+
+% hadoop-ec2 login my-hadoop-cluster
+
+Then run a job as before:
+
+# hadoop fs -mkdir input
+# hadoop fs -put /etc/hadoop/conf/*.xml input
+# hadoop jar /usr/lib/hadoop/hadoop-*-examples.jar grep input output 'dfs[a-z.]+'
+# hadoop fs -cat output/part-00000 | head
+
+TERMINATING A CLUSTER
+=====================
+
+When you've finished with your cluster you can stop it with the following
+command.
+
+NOTE: ALL DATA WILL BE LOST UNLESS YOU ARE USING EBS!
+
+% hadoop-ec2 terminate-cluster my-hadoop-cluster
+
+You can then delete the EC2 security groups with:
+
+% hadoop-ec2 delete-cluster my-hadoop-cluster
+
+AUTOMATIC CLUSTER SHUTDOWN
+==========================
+
+You may use the --auto-shutdown option to automatically terminate a cluster a
+given number of minutes after launch. This is useful for short-lived
+clusters where the jobs complete in a known amount of time.
+
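+For example, to launch a cluster that terminates itself 50 minutes after boot
+(the timeout value here is only illustrative):
+
+% hadoop-ec2 launch-cluster --auto-shutdown 50 my-hadoop-cluster 10
+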
+If you want to cancel the automatic shutdown, then run
+
+% hadoop-ec2 exec my-hadoop-cluster shutdown -c
+% hadoop-ec2 update-slaves-file my-hadoop-cluster
+% hadoop-ec2 exec my-hadoop-cluster /usr/lib/hadoop/bin/slaves.sh shutdown -c
+
+CONFIGURATION NOTES
+===================
+
+It is possible to specify options on the command line: these take precedence
+over any specified in the configuration file. For example:
+
+% hadoop-ec2 launch-cluster --image-id ami-2359bf4a --instance-type c1.xlarge \
+  my-hadoop-cluster 10
+
+This command launches a 10-node cluster using the specified image and instance
+type, overriding the equivalent settings (if any) that are in the
+"my-hadoop-cluster" section of the configuration file. Note that words in
+options are separated by hyphens (--instance-type) while the corresponding
+configuration parameter is separated by underscores (instance_type).
+
+The scripts install Hadoop RPMs or Debian packages (depending on the OS) at
+instance boot time.
+
+By default, Apache Hadoop 0.20.1 is installed. You can also run other versions
+of Apache Hadoop. For example the following uses version 0.18.3:
+
+% hadoop-ec2 launch-cluster --env HADOOP_VERSION=0.18.3 \
+  my-hadoop-cluster 10
+
+CUSTOMIZATION
+=============
+
+You can specify a list of packages to install on every instance at boot time
+using the --user-packages command-line option (or the user_packages
+configuration parameter). Packages should be space-separated. Note that package
+names should reflect the package manager being used to install them (yum or
+apt-get depending on the OS).
+
+Here's an example that installs RPMs for R and git:
+
+% hadoop-ec2 launch-cluster --user-packages 'R git-core' my-hadoop-cluster 10
+
+You have full control over the script that is run when each instance boots. The
+default script, hadoop-ec2-init-remote.sh, may be used as a starting point to
+add extra configuration or customization of the instance. Make a copy of the
+script in your home directory, or somewhere similar, and set the
+--user-data-file command-line option (or the user_data_file configuration
+parameter) to point to the (modified) copy. hadoop-ec2 will replace "%ENV%" in
+your user data script with USER_PACKAGES, AUTO_SHUTDOWN, and EBS_MAPPINGS, as
+well as extra parameters supplied using the --env command-line flag.
+
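+For example, assuming you saved a modified copy as ~/my-init-script.sh and
+refer to it by a file:// URL:
+
+% hadoop-ec2 launch-cluster --user-data-file file://$HOME/my-init-script.sh \
+  my-hadoop-cluster 10
+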
+Another way of customizing the instance, which may be more appropriate for
+larger changes, is to create your own image.
+
+It's possible to use any image, as long as it i) runs (gzip compressed) user
+data on boot, and ii) has Java installed.
+
+OTHER SERVICES
+==============
+
+ZooKeeper
+=========
+
+You can run ZooKeeper by setting the "service" parameter to "zookeeper". For
+example:
+
+[my-zookeeper-cluster]
+service=zookeeper
+ami=ami-ed59bf84
+instance_type=m1.small
+key_name=tom
+availability_zone=us-east-1c
+public_key=PATH_TO_PUBLIC_KEY
+private_key=PATH_TO_PRIVATE_KEY
+
+Then to launch a three-node ZooKeeper ensemble, run:
+
+% ./hadoop-ec2 launch-cluster my-zookeeper-cluster 3 zk
+
+PROVIDER-SPECIFIC DETAILS
+=========================
+
+Rackspace
+=========
+
+Running on Rackspace is very similar to running on EC2, with a few minor
+differences noted here.
+
+Security Warning
+================
+
+Currently, Hadoop clusters on Rackspace are insecure since they don't run behind
+a firewall.
+
+Creating an image
+=================
+
+Rackspace doesn't support shared images, so you will need to build your own base
+image to get started. See "Instructions for creating an image" at the end of
+this document for details.
+
+Installation
+============
+
+To run on Rackspace, you need to install libcloud by checking out the latest
+source from Apache:
+
+git clone git://git.apache.org/libcloud.git
+cd libcloud; python setup.py install
+
+Set up your Rackspace credentials by exporting the following environment
+variables:
+
+    * RACKSPACE_KEY - Your Rackspace user name
+    * RACKSPACE_SECRET - Your Rackspace API key
+    
+Configuration
+=============
+
+The cloud_provider parameter must be set to specify Rackspace as the provider.
+Here is a typical configuration:
+
+[my-rackspace-cluster]
+cloud_provider=rackspace
+image_id=200152
+instance_type=4
+public_key=/path/to/public/key/file
+private_key=/path/to/private/key/file
+ssh_options=-i %(private_key)s -o StrictHostKeyChecking=no
+
+It's a good idea to create a dedicated key using a command similar to:
+
+ssh-keygen -f id_rsa_rackspace -P ''
+
+Launching a cluster
+===================
+
+Use the "hadoop-cloud" command instead of "hadoop-ec2".
+
+After launching a cluster you need to manually add a hostname mapping for the
+master node to your client's /etc/hosts to get it to work. This is because DNS
+isn't set up for the cluster nodes so your client won't resolve their addresses.
+You can do this with
+
+hadoop-cloud list my-rackspace-cluster | grep 'nn,snn,jt' \
+ | awk '{print $4 " " $3 }'  | sudo tee -a /etc/hosts
+
+Instructions for creating an image
+==================================
+
+First set your Rackspace credentials:
+
+export RACKSPACE_KEY=<Your Rackspace user name>
+export RACKSPACE_SECRET=<Your Rackspace API key>
+
+Now create an authentication token for the session, and retrieve the server
+management URL to perform operations against.
+
+# Final SED is to remove trailing ^M
+AUTH_TOKEN=`curl -D - -H X-Auth-User:$RACKSPACE_KEY \
+  -H X-Auth-Key:$RACKSPACE_SECRET https://auth.api.rackspacecloud.com/v1.0 \
+  | grep 'X-Auth-Token:' | awk '{print $2}' | sed 's/.$//'`
+SERVER_MANAGEMENT_URL=`curl -D - -H X-Auth-User:$RACKSPACE_KEY \
+  -H X-Auth-Key:$RACKSPACE_SECRET https://auth.api.rackspacecloud.com/v1.0 \
+  | grep 'X-Server-Management-Url:' | awk '{print $2}' | sed 's/.$//'`
+
+echo $AUTH_TOKEN
+echo $SERVER_MANAGEMENT_URL
+
+You can get a list of images with the following
+
+curl -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/images
+
+Here's the same query, but with pretty-printed XML output:
+
+curl -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/images.xml | xmllint --format -
+
+There are similar queries for flavors and running instances:
+
+curl -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/flavors.xml | xmllint --format -
+curl -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/servers.xml | xmllint --format -
+
+The following command will create a new server. In this case it will create a
+2GB Ubuntu 8.10 instance, as determined by the imageId and flavorId attributes.
+The name of the instance is set to something meaningful too.
+
+curl -v -X POST -H X-Auth-Token:$AUTH_TOKEN -H 'Content-type: text/xml' -d @- $SERVER_MANAGEMENT_URL/servers << EOF
+<server xmlns="http://docs.rackspacecloud.com/servers/api/v1.0" name="apache-hadoop-ubuntu-8.10-base" imageId="11" flavorId="4">
+  <metadata/>
+</server>
+EOF
+
+Make a note of the new server's ID, public IP address and admin password as you
+will need these later.
+
+You can check the status of the server with
+
+curl -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/servers/$SERVER_ID.xml | xmllint --format -
+
+When it has started (status "ACTIVE"), copy the setup script over:
+
+scp tools/rackspace/remote-setup.sh root@$SERVER:remote-setup.sh
+
+Log in to the server and run the setup script (you will need to manually
+accept the Sun Java license):
+
+sh remote-setup.sh
+
+Once the script has completed, log out and create an image of the running
+instance (giving it a memorable name):
+
+curl -v -X POST -H X-Auth-Token:$AUTH_TOKEN -H 'Content-type: text/xml' -d @- $SERVER_MANAGEMENT_URL/images << EOF
+<image xmlns="http://docs.rackspacecloud.com/servers/api/v1.0" name="Apache Hadoop Ubuntu 8.10" serverId="$SERVER_ID" />
+EOF
+
+Keep a note of the image ID as this is what you will use to launch fresh
+instances from.
+
+You can check the status of the image with
+
+curl -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/images/$IMAGE_ID.xml | xmllint --format -
+
+When it's "ACTIVE" is is ready for use. It's important to realize that you have
+to keep the server from which you generated the image running for as long as the
+image is in use.
+
+However, if you want to clean up an old instance run:
+
+curl -X DELETE -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/servers/$SERVER_ID
+
+Similarly, you can delete old images:
+
+curl -X DELETE -H X-Auth-Token:$AUTH_TOKEN $SERVER_MANAGEMENT_URL/images/$IMAGE_ID
+
+

+ 45 - 0
src/contrib/cloud/build.xml

@@ -0,0 +1,45 @@
+<?xml version="1.0"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<project name="hadoop-cloud" default="test-py">
+  <property name="lib.dir" value="${basedir}/lib"/>
+  <path id="java.classpath">
+    <fileset dir="${lib.dir}">
+      <include name="**/*.jar" />
+    </fileset>
+  </path>
+  <path id="test.py.path">
+    <pathelement location="${basedir}/src/py"/>
+    <pathelement location="${basedir}/src/test/py"/>
+  </path>
+  <target name="test-py" description="Run python unit tests">
+    <taskdef name="py-test" classname="org.pyant.tasks.PythonTestTask">
+      <classpath refid="java.classpath" />
+    </taskdef>
+    <py-test python="python" pythonpathref="test.py.path" >
+      <fileset dir="${basedir}/src/test/py">
+        <include name="*.py"/>
+      </fileset>
+    </py-test>
+  </target>
+  <target name="compile"/>
+  <target name="package"/>
+  <target name="test" depends="test-py"/>
+  <target name="clean"/>
+</project>

+ 202 - 0
src/contrib/cloud/lib/pyAntTasks-1.3-LICENSE.txt

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

BIN
src/contrib/cloud/lib/pyAntTasks-1.3.jar


+ 52 - 0
src/contrib/cloud/src/integration-test/create-ebs-snapshot.sh

@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script tests the "hadoop-ec2 create-formatted-snapshot" command.
+# The snapshot is deleted immediately afterwards.
+#
+# Example usage:
+# ./create-ebs-snapshot.sh
+#
+
+set -e
+set -x
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+WORKSPACE=${WORKSPACE:-`pwd`}
+CONFIG_DIR=${CONFIG_DIR:-$WORKSPACE/.hadoop-cloud}
+CLUSTER=${CLUSTER:-hadoop-cloud-$USER-test-cluster}
+AVAILABILITY_ZONE=${AVAILABILITY_ZONE:-us-east-1c}
+KEY_NAME=${KEY_NAME:-$USER}
+HADOOP_CLOUD_HOME=${HADOOP_CLOUD_HOME:-$bin/../py}
+HADOOP_CLOUD_PROVIDER=${HADOOP_CLOUD_PROVIDER:-ec2}
+SSH_OPTIONS=${SSH_OPTIONS:-"-i ~/.$HADOOP_CLOUD_PROVIDER/id_rsa-$KEY_NAME \
+  -o StrictHostKeyChecking=no"}
+
+HADOOP_CLOUD_SCRIPT=$HADOOP_CLOUD_HOME/hadoop-$HADOOP_CLOUD_PROVIDER
+
+$HADOOP_CLOUD_SCRIPT create-formatted-snapshot --config-dir=$CONFIG_DIR \
+  --key-name=$KEY_NAME --availability-zone=$AVAILABILITY_ZONE \
+  --ssh-options="$SSH_OPTIONS" \
+  $CLUSTER 1 > out.tmp
+
+snapshot_id=`grep 'Created snapshot' out.tmp | awk '{print $3}'`
+
+ec2-delete-snapshot $snapshot_id
+
+rm -f out.tmp

+ 30 - 0
src/contrib/cloud/src/integration-test/ebs-storage-spec.json

@@ -0,0 +1,30 @@
+{
+  "nn": [
+    {
+      "device": "/dev/sdj",
+      "mount_point": "/ebs1",
+      "size_gb": "7",
+      "snapshot_id": "snap-fe44bb97"
+    },
+    {
+      "device": "/dev/sdk",
+      "mount_point": "/ebs2",
+      "size_gb": "7",
+      "snapshot_id": "snap-fe44bb97"
+    }
+  ],
+  "dn": [
+    {
+      "device": "/dev/sdj",
+      "mount_point": "/ebs1",
+      "size_gb": "7",
+      "snapshot_id": "snap-fe44bb97"
+    },
+    {
+      "device": "/dev/sdk",
+      "mount_point": "/ebs2",
+      "size_gb": "7",
+      "snapshot_id": "snap-fe44bb97"
+    }
+  ]
+}

+ 122 - 0
src/contrib/cloud/src/integration-test/persistent-cluster.sh

@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script tests the Hadoop cloud scripts by running through a minimal
+# sequence of steps to start a persistent (EBS) cluster, run a job, then
+# shutdown the cluster.
+#
+# Example usage:
+# HADOOP_HOME=~/dev/hadoop-0.20.1/ ./persistent-cluster.sh
+#
+
+function wait_for_volume_detachment() {
+  set +e
+  set +x
+  while true; do
+    attached=`$HADOOP_CLOUD_SCRIPT list-storage --config-dir=$CONFIG_DIR \
+      $CLUSTER | awk '{print $6}' | grep 'attached'`
+    sleep 5
+    if [ -z "$attached" ]; then
+      break
+    fi
+  done
+  set -e
+  set -x
+}
+
+set -e
+set -x
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+WORKSPACE=${WORKSPACE:-`pwd`}
+CONFIG_DIR=${CONFIG_DIR:-$WORKSPACE/.hadoop-cloud}
+CLUSTER=${CLUSTER:-hadoop-cloud-ebs-$USER-test-cluster}
+IMAGE_ID=${IMAGE_ID:-ami-6159bf08} # default to Fedora 32-bit AMI
+AVAILABILITY_ZONE=${AVAILABILITY_ZONE:-us-east-1c}
+KEY_NAME=${KEY_NAME:-$USER}
+AUTO_SHUTDOWN=${AUTO_SHUTDOWN:-15}
+LOCAL_HADOOP_VERSION=${LOCAL_HADOOP_VERSION:-0.20.1}
+HADOOP_HOME=${HADOOP_HOME:-$WORKSPACE/hadoop-$LOCAL_HADOOP_VERSION}
+HADOOP_CLOUD_HOME=${HADOOP_CLOUD_HOME:-$bin/../py}
+HADOOP_CLOUD_PROVIDER=${HADOOP_CLOUD_PROVIDER:-ec2}
+SSH_OPTIONS=${SSH_OPTIONS:-"-i ~/.$HADOOP_CLOUD_PROVIDER/id_rsa-$KEY_NAME \
+  -o StrictHostKeyChecking=no"}
+
+HADOOP_CLOUD_SCRIPT=$HADOOP_CLOUD_HOME/hadoop-$HADOOP_CLOUD_PROVIDER
+export HADOOP_CONF_DIR=$CONFIG_DIR/$CLUSTER
+
+# Install Hadoop locally
+if [ ! -d $HADOOP_HOME ]; then
+  wget http://archive.apache.org/dist/hadoop/core/hadoop-\
+$LOCAL_HADOOP_VERSION/hadoop-$LOCAL_HADOOP_VERSION.tar.gz
+  tar zxf hadoop-$LOCAL_HADOOP_VERSION.tar.gz -C $WORKSPACE
+  rm hadoop-$LOCAL_HADOOP_VERSION.tar.gz
+fi
+
+# Create storage
+$HADOOP_CLOUD_SCRIPT create-storage --config-dir=$CONFIG_DIR \
+  --availability-zone=$AVAILABILITY_ZONE $CLUSTER nn 1 \
+  $bin/ebs-storage-spec.json
+$HADOOP_CLOUD_SCRIPT create-storage --config-dir=$CONFIG_DIR \
+  --availability-zone=$AVAILABILITY_ZONE $CLUSTER dn 1 \
+  $bin/ebs-storage-spec.json
+
+# Launch a cluster
+$HADOOP_CLOUD_SCRIPT launch-cluster --config-dir=$CONFIG_DIR \
+  --image-id=$IMAGE_ID --key-name=$KEY_NAME --auto-shutdown=$AUTO_SHUTDOWN \
+  --availability-zone=$AVAILABILITY_ZONE $CLIENT_CIDRS $ENVS $CLUSTER 1
+
+# Run a proxy and save its pid in HADOOP_CLOUD_PROXY_PID
+eval `$HADOOP_CLOUD_SCRIPT proxy --config-dir=$CONFIG_DIR \
+  --ssh-options="$SSH_OPTIONS" $CLUSTER`
+
+# Run a job and check it works
+$HADOOP_HOME/bin/hadoop fs -mkdir input
+$HADOOP_HOME/bin/hadoop fs -put $HADOOP_HOME/LICENSE.txt input
+$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-*-examples.jar grep \
+  input output Apache
+# following returns a non-zero exit code if no match
+$HADOOP_HOME/bin/hadoop fs -cat 'output/part-00000' | grep Apache
+
+# Shutdown the cluster
+kill $HADOOP_CLOUD_PROXY_PID
+$HADOOP_CLOUD_SCRIPT terminate-cluster --config-dir=$CONFIG_DIR --force $CLUSTER
+sleep 5 # wait for termination to take effect
+
+# Relaunch the cluster
+$HADOOP_CLOUD_SCRIPT launch-cluster --config-dir=$CONFIG_DIR \
+  --image-id=$IMAGE_ID --key-name=$KEY_NAME --auto-shutdown=$AUTO_SHUTDOWN \
+  --availability-zone=$AVAILABILITY_ZONE $CLIENT_CIDRS $ENVS $CLUSTER 1
+
+# Run a proxy and save its pid in HADOOP_CLOUD_PROXY_PID
+eval `$HADOOP_CLOUD_SCRIPT proxy --config-dir=$CONFIG_DIR \
+  --ssh-options="$SSH_OPTIONS" $CLUSTER`
+
+# Check output is still there
+$HADOOP_HOME/bin/hadoop fs -cat 'output/part-00000' | grep Apache
+
+# Shutdown the cluster
+kill $HADOOP_CLOUD_PROXY_PID
+$HADOOP_CLOUD_SCRIPT terminate-cluster --config-dir=$CONFIG_DIR --force $CLUSTER
+sleep 5 # wait for termination to take effect
+
+# Cleanup
+$HADOOP_CLOUD_SCRIPT delete-cluster --config-dir=$CONFIG_DIR $CLUSTER
+wait_for_volume_detachment
+$HADOOP_CLOUD_SCRIPT delete-storage --config-dir=$CONFIG_DIR --force $CLUSTER

+ 112 - 0
src/contrib/cloud/src/integration-test/transient-cluster.sh

@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This script tests the Hadoop cloud scripts by running through a minimal
+# sequence of steps to start a cluster, run a job, then shutdown the cluster.
+#
+# Example usage:
+# HADOOP_HOME=~/dev/hadoop-0.20.1/ ./transient-cluster.sh
+#
+
+set -e
+set -x
+
+bin=`dirname "$0"`
+bin=`cd "$bin"; pwd`
+
+WORKSPACE=${WORKSPACE:-`pwd`}
+CONFIG_DIR=${CONFIG_DIR:-$WORKSPACE/.hadoop-cloud}
+CLUSTER=${CLUSTER:-hadoop-cloud-$USER-test-cluster}
+IMAGE_ID=${IMAGE_ID:-ami-6159bf08} # default to Fedora 32-bit AMI
+INSTANCE_TYPE=${INSTANCE_TYPE:-m1.small}
+AVAILABILITY_ZONE=${AVAILABILITY_ZONE:-us-east-1c}
+KEY_NAME=${KEY_NAME:-$USER}
+AUTO_SHUTDOWN=${AUTO_SHUTDOWN:-15}
+LOCAL_HADOOP_VERSION=${LOCAL_HADOOP_VERSION:-0.20.1}
+HADOOP_HOME=${HADOOP_HOME:-$WORKSPACE/hadoop-$LOCAL_HADOOP_VERSION}
+HADOOP_CLOUD_HOME=${HADOOP_CLOUD_HOME:-$bin/../py}
+HADOOP_CLOUD_PROVIDER=${HADOOP_CLOUD_PROVIDER:-ec2}
+PUBLIC_KEY=${PUBLIC_KEY:-~/.$HADOOP_CLOUD_PROVIDER/id_rsa-$KEY_NAME.pub}
+PRIVATE_KEY=${PRIVATE_KEY:-~/.$HADOOP_CLOUD_PROVIDER/id_rsa-$KEY_NAME}
+SSH_OPTIONS=${SSH_OPTIONS:-"-i $PRIVATE_KEY -o StrictHostKeyChecking=no"}
+LAUNCH_ARGS=${LAUNCH_ARGS:-"1 nn,snn,jt 1 dn,tt"}
+
+HADOOP_CLOUD_SCRIPT=$HADOOP_CLOUD_HOME/hadoop-cloud
+export HADOOP_CONF_DIR=$CONFIG_DIR/$CLUSTER
+
+# Install Hadoop locally
+if [ ! -d $HADOOP_HOME ]; then
+  wget http://archive.apache.org/dist/hadoop/core/hadoop-\
+$LOCAL_HADOOP_VERSION/hadoop-$LOCAL_HADOOP_VERSION.tar.gz
+  tar zxf hadoop-$LOCAL_HADOOP_VERSION.tar.gz -C $WORKSPACE
+  rm hadoop-$LOCAL_HADOOP_VERSION.tar.gz
+fi
+
+# Launch a cluster
+if [ $HADOOP_CLOUD_PROVIDER == 'ec2' ]; then
+  $HADOOP_CLOUD_SCRIPT launch-cluster \
+    --config-dir=$CONFIG_DIR \
+    --image-id=$IMAGE_ID \
+    --instance-type=$INSTANCE_TYPE \
+    --key-name=$KEY_NAME \
+    --auto-shutdown=$AUTO_SHUTDOWN \
+    --availability-zone=$AVAILABILITY_ZONE \
+    $CLIENT_CIDRS $ENVS $CLUSTER $LAUNCH_ARGS
+else
+  $HADOOP_CLOUD_SCRIPT launch-cluster --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+    --config-dir=$CONFIG_DIR \
+    --image-id=$IMAGE_ID \
+    --instance-type=$INSTANCE_TYPE \
+    --public-key=$PUBLIC_KEY \
+    --private-key=$PRIVATE_KEY \
+    --auto-shutdown=$AUTO_SHUTDOWN \
+    $CLIENT_CIDRS $ENVS $CLUSTER $LAUNCH_ARGS
+fi
+  
+# List clusters
+$HADOOP_CLOUD_SCRIPT list --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+  --config-dir=$CONFIG_DIR
+$HADOOP_CLOUD_SCRIPT list --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+  --config-dir=$CONFIG_DIR $CLUSTER
+
+# Run a proxy and save its pid in HADOOP_CLOUD_PROXY_PID
+eval `$HADOOP_CLOUD_SCRIPT proxy --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+  --config-dir=$CONFIG_DIR \
+  --ssh-options="$SSH_OPTIONS" $CLUSTER`
+  
+if [ $HADOOP_CLOUD_PROVIDER == 'rackspace' ]; then
+  # Need to update /etc/hosts (interactively)
+  $HADOOP_CLOUD_SCRIPT list --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+    --config-dir=$CONFIG_DIR $CLUSTER | grep 'nn,snn,jt' \
+    | awk '{print $4 " " $3 }'  | sudo tee -a /etc/hosts
+fi
+
+# Run a job and check it works
+$HADOOP_HOME/bin/hadoop fs -mkdir input
+$HADOOP_HOME/bin/hadoop fs -put $HADOOP_HOME/LICENSE.txt input
+$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-*-examples.jar grep \
+  input output Apache
+# following returns a non-zero exit code if no match
+$HADOOP_HOME/bin/hadoop fs -cat 'output/part-00000' | grep Apache
+
+# Shutdown the cluster
+kill $HADOOP_CLOUD_PROXY_PID
+$HADOOP_CLOUD_SCRIPT terminate-cluster --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+  --config-dir=$CONFIG_DIR --force $CLUSTER
+sleep 5 # wait for termination to take effect
+$HADOOP_CLOUD_SCRIPT delete-cluster --cloud-provider=$HADOOP_CLOUD_PROVIDER \
+  --config-dir=$CONFIG_DIR $CLUSTER

+ 21 - 0
src/contrib/cloud/src/py/hadoop-cloud

@@ -0,0 +1,21 @@
+#!/usr/bin/env python2.5
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from hadoop.cloud.cli import main
+
+if __name__ == "__main__":
+  main()

+ 21 - 0
src/contrib/cloud/src/py/hadoop-ec2

@@ -0,0 +1,21 @@
+#!/usr/bin/env python2.5
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from hadoop.cloud.cli import main
+
+if __name__ == "__main__":
+  main()

+ 14 - 0
src/contrib/cloud/src/py/hadoop/__init__.py

@@ -0,0 +1,14 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 15 - 0
src/contrib/cloud/src/py/hadoop/cloud/__init__.py

@@ -0,0 +1,15 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+VERSION="0.22.0"

+ 438 - 0
src/contrib/cloud/src/py/hadoop/cloud/cli.py

@@ -0,0 +1,438 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import with_statement
+
+import ConfigParser
+from hadoop.cloud import VERSION
+from hadoop.cloud.cluster import get_cluster
+from hadoop.cloud.service import get_service
+from hadoop.cloud.service import InstanceTemplate
+from hadoop.cloud.service import NAMENODE
+from hadoop.cloud.service import SECONDARY_NAMENODE
+from hadoop.cloud.service import JOBTRACKER
+from hadoop.cloud.service import DATANODE
+from hadoop.cloud.service import TASKTRACKER
+from hadoop.cloud.util import merge_config_with_options
+from hadoop.cloud.util import xstr
+import logging
+from optparse import OptionParser
+from optparse import make_option
+import os
+import sys
+
+DEFAULT_SERVICE_NAME = 'hadoop'
+DEFAULT_CLOUD_PROVIDER = 'ec2'
+
+DEFAULT_CONFIG_DIR_NAME = '.hadoop-cloud'
+DEFAULT_CONFIG_DIR = os.path.join(os.environ['HOME'], DEFAULT_CONFIG_DIR_NAME)
+CONFIG_FILENAME = 'clusters.cfg'
+
+CONFIG_DIR_OPTION = \
+  make_option("--config-dir", metavar="CONFIG-DIR",
+    help="The configuration directory.")
+
+PROVIDER_OPTION = \
+  make_option("--cloud-provider", metavar="PROVIDER",
+    help="The cloud provider, e.g. 'ec2' for Amazon EC2.")
+
+BASIC_OPTIONS = [
+  CONFIG_DIR_OPTION,
+  PROVIDER_OPTION,
+]
+
+LAUNCH_OPTIONS = [
+  CONFIG_DIR_OPTION,
+  PROVIDER_OPTION,
+  make_option("-a", "--ami", metavar="AMI",
+    help="The AMI ID of the image to launch. (Amazon EC2 only. Deprecated, use \
+--image-id.)"),
+  make_option("-e", "--env", metavar="ENV", action="append",
+    help="An environment variable to pass to instances. \
+(May be specified multiple times.)"),
+  make_option("-f", "--user-data-file", metavar="URL",
+    help="The URL of the file containing user data to be made available to \
+instances."),
+  make_option("--image-id", metavar="ID",
+    help="The ID of the image to launch."),
+  make_option("-k", "--key-name", metavar="KEY-PAIR",
+    help="The key pair to use when launching instances. (Amazon EC2 only.)"),
+  make_option("-p", "--user-packages", metavar="PACKAGES",
+    help="A space-separated list of packages to install on instances on start \
+up."),
+  make_option("-t", "--instance-type", metavar="TYPE",
+    help="The type of instance to be launched. One of m1.small, m1.large, \
+m1.xlarge, c1.medium, or c1.xlarge."),
+  make_option("-z", "--availability-zone", metavar="ZONE",
+    help="The availability zone to run the instances in."),
+  make_option("--auto-shutdown", metavar="TIMEOUT_MINUTES",
+    help="The time in minutes after launch when an instance will be \
+automatically shut down."),
+  make_option("--client-cidr", metavar="CIDR", action="append",
+    help="The CIDR of the client, which is used to allow access through the \
+firewall to the master node. (May be specified multiple times.)"),
+  make_option("--security-group", metavar="SECURITY_GROUP", action="append",
+    default=[], help="Additional security groups within which the instances \
+should be run. (Amazon EC2 only.) (May be specified multiple times.)"),
+  make_option("--public-key", metavar="FILE",
+    help="The public key to authorize on launching instances. (Non-EC2 \
+providers only.)"),
+  make_option("--private-key", metavar="FILE",
+    help="The private key to use when connecting to instances. (Non-EC2 \
+providers only.)"),
+]
+
+SNAPSHOT_OPTIONS = [
+  CONFIG_DIR_OPTION,
+  PROVIDER_OPTION,
+  make_option("-k", "--key-name", metavar="KEY-PAIR",
+    help="The key pair to use when launching instances."),
+  make_option("-z", "--availability-zone", metavar="ZONE",
+    help="The availability zone to run the instances in."),
+  make_option("--ssh-options", metavar="SSH-OPTIONS",
+    help="SSH options to use."),
+]
+
+PLACEMENT_OPTIONS = [
+  CONFIG_DIR_OPTION,
+  PROVIDER_OPTION,
+  make_option("-z", "--availability-zone", metavar="ZONE",
+    help="The availability zone to run the instances in."),
+]
+
+FORCE_OPTIONS = [
+  CONFIG_DIR_OPTION,
+  PROVIDER_OPTION,
+  make_option("--force", action="store_true", default=False,
+  help="Do not ask for confirmation."),
+]
+
+SSH_OPTIONS = [
+  CONFIG_DIR_OPTION,
+  PROVIDER_OPTION,
+  make_option("--ssh-options", metavar="SSH-OPTIONS",
+    help="SSH options to use."),
+]
+
+def print_usage(script):
+  print """Usage: %(script)s COMMAND [OPTIONS]
+where COMMAND and [OPTIONS] may be one of:
+  list [CLUSTER]                      list all running Hadoop clusters
+                                        or instances in CLUSTER
+  launch-master CLUSTER               launch or find a master in CLUSTER
+  launch-slaves CLUSTER NUM_SLAVES    launch NUM_SLAVES slaves in CLUSTER
+  launch-cluster CLUSTER (NUM_SLAVES| launch a master and NUM_SLAVES slaves or
+    N ROLE [N ROLE ...])                N instances in ROLE in CLUSTER
+  create-formatted-snapshot CLUSTER   create an empty, formatted snapshot of
+    SIZE                                size SIZE GiB
+  list-storage CLUSTER                list storage volumes for CLUSTER
+  create-storage CLUSTER ROLE         create volumes for NUM_INSTANCES instances
+    NUM_INSTANCES SPEC_FILE             in ROLE for CLUSTER, using SPEC_FILE
+  attach-storage ROLE                 attach storage volumes for ROLE to CLUSTER
+  login CLUSTER                       log in to the master in CLUSTER over SSH
+  proxy CLUSTER                       start a SOCKS proxy on localhost into the
+                                        CLUSTER
+  push CLUSTER FILE                   scp FILE to the master in CLUSTER
+  exec CLUSTER CMD                    execute CMD on the master in CLUSTER
+  terminate-cluster CLUSTER           terminate all instances in CLUSTER
+  delete-cluster CLUSTER              delete the group information for CLUSTER
+  delete-storage CLUSTER              delete all storage volumes for CLUSTER
+  update-slaves-file CLUSTER          update the slaves file on the CLUSTER
+                                        master
+
+Use %(script)s COMMAND --help to see additional options for specific commands.
+""" % locals()
+
+def print_deprecation(script, replacement):
+  print "Deprecated. Use '%(script)s %(replacement)s'." % locals()
+
+def parse_options_and_config(command, option_list=[], extra_arguments=(),
+                             unbounded_args=False):
+  """
+  Parse the arguments to command using the given option list, and combine with
+  any configuration parameters.
+
+  If unbounded_args is true then there must be at least as many extra arguments
+  as specified by extra_arguments (the first argument is always CLUSTER).
+  Otherwise there must be exactly the same number of arguments as
+  extra_arguments.
+  """
+  expected_arguments = ["CLUSTER",]
+  expected_arguments.extend(extra_arguments)
+  (options_dict, args) = parse_options(command, option_list, expected_arguments,
+                                       unbounded_args)
+
+  config_dir = get_config_dir(options_dict)
+  config_files = [os.path.join(config_dir, CONFIG_FILENAME)]
+  if 'config_dir' not in options_dict:
+    # if config_dir not set, then also search in current directory
+    config_files.insert(0, CONFIG_FILENAME)
+
+  config = ConfigParser.ConfigParser()
+  read_files = config.read(config_files)
+  logging.debug("Read %d configuration files: %s", len(read_files),
+                ", ".join(read_files))
+  cluster_name = args[0]
+  opt = merge_config_with_options(cluster_name, config, options_dict)
+  logging.debug("Options: %s", str(opt))
+  service_name = get_service_name(opt)
+  cloud_provider = get_cloud_provider(opt)
+  cluster = get_cluster(cloud_provider)(cluster_name, config_dir)
+  service = get_service(service_name, cloud_provider)(cluster)
+  return (opt, args, service)
+
+def parse_options(command, option_list=[], expected_arguments=(),
+                  unbounded_args=False):
+  """
+  Parse the arguments to command using the given option list.
+
+  If unbounded_args is true then there must be at least as many arguments as
+  specified by expected_arguments. Otherwise there must be exactly the same
+  number of arguments as expected_arguments.
+  """
+
+  config_file_name = "%s/%s" % (DEFAULT_CONFIG_DIR_NAME, CONFIG_FILENAME)
+  usage = """%%prog %s [options] %s
+
+Options may also be specified in a configuration file called
+%s located in the user's home directory.
+Options specified on the command line take precedence over any in the
+configuration file.""" % (command, " ".join(expected_arguments),
+                          config_file_name)
+  parser = OptionParser(usage=usage, version="%%prog %s" % VERSION,
+                        option_list=option_list)
+  parser.disable_interspersed_args()
+  (options, args) = parser.parse_args(sys.argv[2:])
+  if unbounded_args:
+    if len(args) < len(expected_arguments):
+      parser.error("incorrect number of arguments")
+  elif len(args) != len(expected_arguments):
+    parser.error("incorrect number of arguments")
+  return (vars(options), args)
+
+def get_config_dir(options_dict):
+  config_dir = options_dict.get('config_dir')
+  if not config_dir:
+    config_dir = DEFAULT_CONFIG_DIR
+  return config_dir
+
+def get_service_name(options_dict):
+  service_name = options_dict.get("service", None)
+  if service_name is None:
+    service_name = DEFAULT_SERVICE_NAME
+  return service_name
+
+def get_cloud_provider(options_dict):
+  provider = options_dict.get("cloud_provider", None)
+  if provider is None:
+    provider = DEFAULT_CLOUD_PROVIDER
+  return provider
+
+def check_options_set(options, option_names):
+  for option_name in option_names:
+    if options.get(option_name) is None:
+      print "Option '%s' is missing. Aborting." % option_name
+      sys.exit(1)
+
+def check_launch_options_set(cluster, options):
+  if cluster.get_provider_code() == 'ec2':
+    if options.get('ami') is None and options.get('image_id') is None:
+      print "One of ami or image_id must be specified. Aborting."
+      sys.exit(1)
+    check_options_set(options, ['key_name'])
+  else:
+    check_options_set(options, ['image_id', 'public_key'])
+
+def get_image_id(cluster, options):
+  if cluster.get_provider_code() == 'ec2':
+    return options.get('image_id', options.get('ami'))
+  else:
+    return options.get('image_id')
+
+def main():
+  # Use HADOOP_CLOUD_LOGGING_LEVEL=DEBUG to enable debugging output.
+  logging.basicConfig(level=getattr(logging,
+                                    os.getenv("HADOOP_CLOUD_LOGGING_LEVEL",
+                                              "INFO")))
+
+  if len(sys.argv) < 2:
+    print_usage(sys.argv[0])
+    sys.exit(1)
+
+  command = sys.argv[1]
+
+  if command == 'list':
+    (opt, args) = parse_options(command, BASIC_OPTIONS, unbounded_args=True)
+    if len(args) == 0:
+      service_name = get_service_name(opt)
+      cloud_provider = get_cloud_provider(opt)
+      service = get_service(service_name, cloud_provider)(None)
+      service.list_all(cloud_provider)
+    else:
+      (opt, args, service) = parse_options_and_config(command, BASIC_OPTIONS)
+      service.list()
+
+  elif command == 'launch-master':
+    (opt, args, service) = parse_options_and_config(command, LAUNCH_OPTIONS)
+    check_launch_options_set(service.cluster, opt)
+    config_dir = get_config_dir(opt)
+    template = InstanceTemplate((NAMENODE, SECONDARY_NAMENODE, JOBTRACKER), 1,
+                         get_image_id(service.cluster, opt),
+                         opt.get('instance_type'), opt.get('key_name'),
+                         opt.get('public_key'), opt.get('private_key'),
+                         opt.get('user_data_file'),
+                         opt.get('availability_zone'), opt.get('user_packages'),
+                         opt.get('auto_shutdown'), opt.get('env'),
+                         opt.get('security_group'))
+    service.launch_master(template, config_dir, opt.get('client_cidr'))
+
+  elif command == 'launch-slaves':
+    (opt, args, service) = parse_options_and_config(command, LAUNCH_OPTIONS,
+                                                    ("NUM_SLAVES",))
+    number_of_slaves = int(args[1])
+    check_launch_options_set(service.cluster, opt)
+    template = InstanceTemplate((DATANODE, TASKTRACKER), number_of_slaves,
+                         get_image_id(service.cluster, opt),
+                         opt.get('instance_type'), opt.get('key_name'),
+                         opt.get('public_key'), opt.get('private_key'),
+                         opt.get('user_data_file'),
+                         opt.get('availability_zone'), opt.get('user_packages'),
+                         opt.get('auto_shutdown'), opt.get('env'),
+                         opt.get('security_group'))
+    service.launch_slaves(template)
+
+  elif command == 'launch-cluster':
+    (opt, args, service) = parse_options_and_config(command, LAUNCH_OPTIONS,
+                                                    ("NUM_SLAVES",),
+                                                    unbounded_args=True)
+    check_launch_options_set(service.cluster, opt)
+    config_dir = get_config_dir(opt)
+    instance_templates = []
+    if len(args) == 2:
+      number_of_slaves = int(args[1])
+      print_deprecation(sys.argv[0], 'launch-cluster %s 1 nn,snn,jt %s dn,tt' %
+                        (service.cluster.name, number_of_slaves))
+      instance_templates = [
+        InstanceTemplate((NAMENODE, SECONDARY_NAMENODE, JOBTRACKER), 1,
+                         get_image_id(service.cluster, opt),
+                         opt.get('instance_type'), opt.get('key_name'),
+                         opt.get('public_key'), opt.get('private_key'),
+                         opt.get('user_data_file'),
+                         opt.get('availability_zone'), opt.get('user_packages'),
+                         opt.get('auto_shutdown'), opt.get('env'),
+                         opt.get('security_group')),
+        InstanceTemplate((DATANODE, TASKTRACKER), number_of_slaves,
+                         get_image_id(service.cluster, opt),
+                         opt.get('instance_type'), opt.get('key_name'),
+                         opt.get('public_key'), opt.get('private_key'),
+                         opt.get('user_data_file'),
+                         opt.get('availability_zone'), opt.get('user_packages'),
+                         opt.get('auto_shutdown'), opt.get('env'),
+                         opt.get('security_group')),
+                         ]
+    elif len(args) > 2 and len(args) % 2 == 0:
+      print_usage(sys.argv[0])
+      sys.exit(1)
+    else:
+      for i in range(len(args) / 2):
+        number = int(args[2 * i + 1])
+        roles = args[2 * i + 2].split(",")
+        instance_templates.append(
+          InstanceTemplate(roles, number, get_image_id(service.cluster, opt),
+                           opt.get('instance_type'), opt.get('key_name'),
+                           opt.get('public_key'), opt.get('private_key'),
+                           opt.get('user_data_file'),
+                           opt.get('availability_zone'),
+                           opt.get('user_packages'),
+                           opt.get('auto_shutdown'), opt.get('env'),
+                           opt.get('security_group')))
+
+    service.launch_cluster(instance_templates, config_dir,
+                           opt.get('client_cidr'))
+
+  elif command == 'login':
+    (opt, args, service) = parse_options_and_config(command, SSH_OPTIONS)
+    service.login(opt.get('ssh_options'))
+
+  elif command == 'proxy':
+    (opt, args, service) = parse_options_and_config(command, SSH_OPTIONS)
+    service.proxy(opt.get('ssh_options'))
+
+  elif command == 'push':
+    (opt, args, service) = parse_options_and_config(command, SSH_OPTIONS,
+                                                    ("FILE",))
+    service.push(opt.get('ssh_options'), args[1])
+
+  elif command == 'exec':
+    (opt, args, service) = parse_options_and_config(command, SSH_OPTIONS,
+                                                    ("CMD",), True)
+    service.execute(opt.get('ssh_options'), args[1:])
+
+  elif command == 'terminate-cluster':
+    (opt, args, service) = parse_options_and_config(command, FORCE_OPTIONS)
+    service.terminate_cluster(opt["force"])
+
+  elif command == 'delete-cluster':
+    (opt, args, service) = parse_options_and_config(command, BASIC_OPTIONS)
+    service.delete_cluster()
+
+  elif command == 'create-formatted-snapshot':
+    (opt, args, service) = parse_options_and_config(command, SNAPSHOT_OPTIONS,
+                                                    ("SIZE",))
+    size = int(args[1])
+    check_options_set(opt, ['availability_zone', 'key_name'])
+    ami_ubuntu_intrepid_x86 = 'ami-ec48af85' # use a general AMI
+    service.create_formatted_snapshot(size,
+                                         opt.get('availability_zone'),
+                                         ami_ubuntu_intrepid_x86,
+                                         opt.get('key_name'),
+                                         xstr(opt.get('ssh_options')))
+
+  elif command == 'list-storage':
+    (opt, args, service) = parse_options_and_config(command, BASIC_OPTIONS)
+    service.list_storage()
+
+  elif command == 'create-storage':
+    (opt, args, service) = parse_options_and_config(command, PLACEMENT_OPTIONS,
+                                                    ("ROLE", "NUM_INSTANCES",
+                                                     "SPEC_FILE"))
+    role = args[1]
+    number_of_instances = int(args[2])
+    spec_file = args[3]
+    check_options_set(opt, ['availability_zone'])
+    service.create_storage(role, number_of_instances,
+                           opt.get('availability_zone'), spec_file)
+
+  elif command == 'attach-storage':
+    (opt, args, service) = parse_options_and_config(command, BASIC_OPTIONS,
+                                                    ("ROLE",))
+    service.attach_storage(args[1])
+
+  elif command == 'delete-storage':
+    (opt, args, service) = parse_options_and_config(command, FORCE_OPTIONS)
+    service.delete_storage(opt["force"])
+
+  elif command == 'update-slaves-file':
+    (opt, args, service) = parse_options_and_config(command, SSH_OPTIONS)
+    check_options_set(opt, ['private_key'])
+    ssh_options = xstr(opt.get('ssh_options'))
+    config_dir = get_config_dir(opt)
+    service.update_slaves_file(config_dir, ssh_options, opt.get('private_key'))
+
+  else:
+    print "Unrecognized command '%s'" % command
+    print_usage(sys.argv[0])
+    sys.exit(1)
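
For readers of the dispatch above: the launch-cluster command takes the cluster name followed by (NUMBER, ROLE-LIST) pairs, which is why an even argument count greater than two is rejected. A standalone sketch of that convention, with made-up values:

args = ["my-cluster", "1", "nn,snn,jt", "5", "dn,tt"]

cluster_name = args[0]
groups = []
for i in range(len(args) / 2):        # Python 2 integer division, as above
  number = int(args[2 * i + 1])       # how many instances in this group
  roles = args[2 * i + 2].split(",")  # role codes for this group
  groups.append((number, roles))

print cluster_name, groups
# my-cluster [(1, ['nn', 'snn', 'jt']), (5, ['dn', 'tt'])]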

+ 187 - 0
src/contrib/cloud/src/py/hadoop/cloud/cluster.py

@@ -0,0 +1,187 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Classes for controlling a cluster of cloud instances.
+"""
+
+from __future__ import with_statement
+
+import gzip
+import StringIO
+import urllib
+
+from hadoop.cloud.storage import Storage
+
+CLUSTER_PROVIDER_MAP = {
+  "dummy": ('hadoop.cloud.providers.dummy', 'DummyCluster'),
+  "ec2": ('hadoop.cloud.providers.ec2', 'Ec2Cluster'),
+  "rackspace": ('hadoop.cloud.providers.rackspace', 'RackspaceCluster'),
+}
+
+def get_cluster(provider):
+  """
+  Retrieve the Cluster class for a provider.
+  """
+  mod_name, driver_name = CLUSTER_PROVIDER_MAP[provider]
+  _mod = __import__(mod_name, globals(), locals(), [driver_name])
+  return getattr(_mod, driver_name)
+
+class Cluster(object):
+  """
+  A cluster of server instances, identified by a unique name. Instances are
+  launched into the cluster to run in one or more roles.
+  """
+
+  def __init__(self, name, config_dir):
+    self.name = name
+    self.config_dir = config_dir
+
+  def get_provider_code(self):
+    """
+    The code that uniquely identifies the cloud provider.
+    """
+    raise Exception("Unimplemented")
+
+  def authorize_role(self, role, from_port, to_port, cidr_ip):
+    """
+    Authorize access to machines in a given role from a given network.
+    """
+    pass
+
+  def get_instances_in_role(self, role, state_filter=None):
+    """
+    Get all the instances in a role, filtered by state.
+
+    @param role: the name of the role
+    @param state_filter: the state that the instance should be in
+    (e.g. "running"), or None for all states
+    """
+    raise Exception("Unimplemented")
+
+  def print_status(self, roles=None, state_filter="running"):
+    """
+    Print the status of instances in the given roles, filtered by state.
+    """
+    pass
+
+  def check_running(self, role, number):
+    """
+    Check that a certain number of instances in a role are running.
+    """
+    instances = self.get_instances_in_role(role, "running")
+    if len(instances) != number:
+      print "Expected %s instances in role %s, but was %s %s" % \
+        (number, role, len(instances), instances)
+      return False
+    else:
+      return instances
+
+  def launch_instances(self, roles, number, image_id, size_id,
+                       instance_user_data, **kwargs):
+    """
+    Launch instances (having the given roles) in the cluster.
+    Returns a list of IDs for the instances started.
+    """
+    pass
+
+  def wait_for_instances(self, instance_ids, timeout=600):
+    """
+    Wait for instances to start.
+    Raise TimeoutException if the timeout is exceeded.
+    """
+    pass
+
+  def terminate(self):
+    """
+    Terminate all instances in the cluster.
+    """
+    pass
+
+  def delete(self):
+    """
+    Delete the cluster permanently. This operation is only permitted if no
+    instances are running.
+    """
+    pass
+
+  def get_storage(self):
+    """
+    Return the external storage for the cluster.
+    """
+    return Storage(self)
+
+class InstanceUserData(object):
+  """
+  The user data passed to an instance at startup.
+  """
+
+  def __init__(self, filename, replacements={}):
+    self.filename = filename
+    self.replacements = replacements
+
+  def _read_file(self, filename):
+    """
+    Read the user data.
+    """
+    return urllib.urlopen(filename).read()
+
+  def read(self):
+    """
+    Read the user data, making replacements.
+    """
+    contents = self._read_file(self.filename)
+    for (match, replacement) in self.replacements.iteritems():
+      if replacement is None:
+        replacement = ''
+      contents = contents.replace(match, replacement)
+    return contents
+
+  def read_as_gzip_stream(self):
+    """
+    Read and compress the data.
+    """
+    output = StringIO.StringIO()
+    compressed = gzip.GzipFile(mode='wb', fileobj=output)
+    compressed.write(self.read())
+    compressed.close()
+    return output.getvalue()
+
+class Instance(object):
+  """
+  A server instance.
+  """
+  def __init__(self, id, public_ip, private_ip):
+    self.id = id
+    self.public_ip = public_ip
+    self.private_ip = private_ip
+
+class RoleSyntaxException(Exception):
+  """
+  Raised when a role name is invalid. Role names may consist of a sequence
+  of alphanumeric characters and underscores. Dashes are not permitted in role
+  names.
+  """
+  def __init__(self, message):
+    super(RoleSyntaxException, self).__init__()
+    self.message = message
+  def __str__(self):
+    return repr(self.message)
+
+class TimeoutException(Exception):
+  """
+  Raised when a timeout is exceeded.
+  """
+  pass
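
A minimal sketch of InstanceUserData in use, assuming a local boot script named init-remote.sh and a made-up %ENV% value (the placeholder name mirrors what the EC2 init script later in this patch exports; the exact replacement keys used by the service layer are an assumption). It substitutes the placeholder and gzip-compresses the result; the EC2 script notes the compressed payload should stay under 16K.

from hadoop.cloud.cluster import InstanceUserData

# "init-remote.sh" and the ROLES/AUTO_SHUTDOWN values are placeholders.
user_data = InstanceUserData("init-remote.sh",
                             {"%ENV%": "ROLES=nn,snn,jt AUTO_SHUTDOWN=60"})
payload = user_data.read_as_gzip_stream()   # gzip-compressed string
print "compressed user data: %d bytes" % len(payload)
assert len(payload) < 16 * 1024, "too large for EC2 user data"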

+ 459 - 0
src/contrib/cloud/src/py/hadoop/cloud/data/boot-rackspace.sh

@@ -0,0 +1,459 @@
+#!/bin/bash -x
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+# Script that is run on each instance on boot.
+################################################################################
+
+################################################################################
+# Initialize variables
+################################################################################
+SELF_HOST=`/sbin/ifconfig eth0 | grep 'inet addr:' | cut -d: -f2 | awk '{ print $1}'`
+HADOOP_VERSION=${HADOOP_VERSION:-0.20.1}
+HADOOP_HOME=/usr/local/hadoop-$HADOOP_VERSION
+HADOOP_CONF_DIR=$HADOOP_HOME/conf
+for role in $(echo "$ROLES" | tr "," "\n"); do
+  case $role in
+  nn)
+    NN_HOST=$SELF_HOST
+    ;;
+  jt)
+    JT_HOST=$SELF_HOST
+    ;;
+  esac
+done
+
+function register_auto_shutdown() {
+  if [ ! -z "$AUTO_SHUTDOWN" ]; then
+    shutdown -h +$AUTO_SHUTDOWN >/dev/null &
+  fi
+}
+
+function update_repo() {
+  if which dpkg &> /dev/null; then
+    sudo apt-get update
+  elif which rpm &> /dev/null; then
+    yum update -y yum
+  fi
+}
+
+# Install a list of packages on debian or redhat as appropriate
+function install_packages() {
+  if which dpkg &> /dev/null; then
+    apt-get update
+    apt-get -y install $@
+  elif which rpm &> /dev/null; then
+    yum install -y $@
+  else
+    echo "No package manager found."
+  fi
+}
+
+# Install any user packages specified in the USER_PACKAGES environment variable
+function install_user_packages() {
+  if [ ! -z "$USER_PACKAGES" ]; then
+    install_packages $USER_PACKAGES
+  fi
+}
+
+function install_hadoop() {
+  useradd hadoop
+
+  hadoop_tar_url=http://s3.amazonaws.com/hadoop-releases/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
+  hadoop_tar_file=`basename $hadoop_tar_url`
+  hadoop_tar_md5_file=`basename $hadoop_tar_url.md5`
+
+  curl="curl --retry 3 --silent --show-error --fail"
+  for i in `seq 1 3`;
+  do
+    $curl -O $hadoop_tar_url
+    $curl -O $hadoop_tar_url.md5
+    if md5sum -c $hadoop_tar_md5_file; then
+      break;
+    else
+      rm -f $hadoop_tar_file $hadoop_tar_md5_file
+    fi
+  done
+
+  if [ ! -e $hadoop_tar_file ]; then
+    echo "Failed to download $hadoop_tar_url. Aborting."
+    exit 1
+  fi
+
+  tar zxf $hadoop_tar_file -C /usr/local
+  rm -f $hadoop_tar_file $hadoop_tar_md5_file
+
+  echo "export HADOOP_HOME=$HADOOP_HOME" >> ~root/.bashrc
+  echo 'export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH' >> ~root/.bashrc
+}
+
+function prep_disk() {
+  mount=$1
+  device=$2
+  automount=${3:-false}
+
+  echo "warning: ERASING CONTENTS OF $device"
+  mkfs.xfs -f $device
+  if [ ! -e $mount ]; then
+    mkdir $mount
+  fi
+  mount -o defaults,noatime $device $mount
+  if $automount ; then
+    echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab
+  fi
+}
+
+function wait_for_mount {
+  mount=$1
+  device=$2
+
+  mkdir $mount
+
+  i=1
+  echo "Attempting to mount $device"
+  while true ; do
+    sleep 10
+    echo -n "$i "
+    i=$[$i+1]
+    mount -o defaults,noatime $device $mount || continue
+    echo " Mounted."
+    break;
+  done
+}
+
+function make_hadoop_dirs {
+  for mount in "$@"; do
+    if [ ! -e $mount/hadoop ]; then
+      mkdir -p $mount/hadoop
+      chown hadoop:hadoop $mount/hadoop
+    fi
+  done
+}
+
+# Configure Hadoop by setting up disks and site file
+function configure_hadoop() {
+
+  MOUNT=/data
+  FIRST_MOUNT=$MOUNT
+  DFS_NAME_DIR=$MOUNT/hadoop/hdfs/name
+  FS_CHECKPOINT_DIR=$MOUNT/hadoop/hdfs/secondary
+  DFS_DATA_DIR=$MOUNT/hadoop/hdfs/data
+  MAPRED_LOCAL_DIR=$MOUNT/hadoop/mapred/local
+  MAX_MAP_TASKS=2
+  MAX_REDUCE_TASKS=1
+  CHILD_OPTS=-Xmx550m
+  CHILD_ULIMIT=1126400
+  TMP_DIR=$MOUNT/tmp/hadoop-\${user.name}
+
+  mkdir -p $MOUNT/hadoop
+  chown hadoop:hadoop $MOUNT/hadoop
+  mkdir $MOUNT/tmp
+  chmod a+rwxt $MOUNT/tmp
+
+  mkdir /etc/hadoop
+  ln -s $HADOOP_CONF_DIR /etc/hadoop/conf
+
+  ##############################################################################
+  # Modify this section to customize your Hadoop cluster.
+  ##############################################################################
+  cat > $HADOOP_CONF_DIR/hadoop-site.xml <<EOF
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+<property>
+  <name>dfs.block.size</name>
+  <value>134217728</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.data.dir</name>
+  <value>$DFS_DATA_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.datanode.du.reserved</name>
+  <value>1073741824</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.datanode.handler.count</name>
+  <value>3</value>
+  <final>true</final>
+</property>
+<!--property>
+  <name>dfs.hosts</name>
+  <value>$HADOOP_CONF_DIR/dfs.hosts</value>
+  <final>true</final>
+</property-->
+<!--property>
+  <name>dfs.hosts.exclude</name>
+  <value>$HADOOP_CONF_DIR/dfs.hosts.exclude</value>
+  <final>true</final>
+</property-->
+<property>
+  <name>dfs.name.dir</name>
+  <value>$DFS_NAME_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.namenode.handler.count</name>
+  <value>5</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.permissions</name>
+  <value>true</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.replication</name>
+  <value>$DFS_REPLICATION</value>
+</property>
+<property>
+  <name>fs.checkpoint.dir</name>
+  <value>$FS_CHECKPOINT_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>fs.default.name</name>
+  <value>hdfs://$NN_HOST:8020/</value>
+</property>
+<property>
+  <name>fs.trash.interval</name>
+  <value>1440</value>
+  <final>true</final>
+</property>
+<property>
+  <name>hadoop.tmp.dir</name>
+  <value>/data/tmp/hadoop-\${user.name}</value>
+  <final>true</final>
+</property>
+<property>
+  <name>io.file.buffer.size</name>
+  <value>65536</value>
+</property>
+<property>
+  <name>mapred.child.java.opts</name>
+  <value>$CHILD_OPTS</value>
+</property>
+<property>
+  <name>mapred.child.ulimit</name>
+  <value>$CHILD_ULIMIT</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.job.tracker</name>
+  <value>$JT_HOST:8021</value>
+</property>
+<property>
+  <name>mapred.job.tracker.handler.count</name>
+  <value>5</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.local.dir</name>
+  <value>$MAPRED_LOCAL_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.map.tasks.speculative.execution</name>
+  <value>true</value>
+</property>
+<property>
+  <name>mapred.reduce.parallel.copies</name>
+  <value>10</value>
+</property>
+<property>
+  <name>mapred.reduce.tasks</name>
+  <value>10</value>
+</property>
+<property>
+  <name>mapred.reduce.tasks.speculative.execution</name>
+  <value>false</value>
+</property>
+<property>
+  <name>mapred.submit.replication</name>
+  <value>10</value>
+</property>
+<property>
+  <name>mapred.system.dir</name>
+  <value>/hadoop/system/mapred</value>
+</property>
+<property>
+  <name>mapred.tasktracker.map.tasks.maximum</name>
+  <value>$MAX_MAP_TASKS</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.tasktracker.reduce.tasks.maximum</name>
+  <value>$MAX_REDUCE_TASKS</value>
+  <final>true</final>
+</property>
+<property>
+  <name>tasktracker.http.threads</name>
+  <value>46</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.compress.map.output</name>
+  <value>true</value>
+</property>
+<property>
+  <name>mapred.output.compression.type</name>
+  <value>BLOCK</value>
+</property>
+<property>
+  <name>hadoop.rpc.socket.factory.class.default</name>
+  <value>org.apache.hadoop.net.StandardSocketFactory</value>
+  <final>true</final>
+</property>
+<property>
+  <name>hadoop.rpc.socket.factory.class.ClientProtocol</name>
+  <value></value>
+  <final>true</final>
+</property>
+<property>
+  <name>hadoop.rpc.socket.factory.class.JobSubmissionProtocol</name>
+  <value></value>
+  <final>true</final>
+</property>
+<property>
+  <name>io.compression.codecs</name>
+  <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec</value>
+</property>
+</configuration>
+EOF
+
+  # Keep PID files in a non-temporary directory
+  sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+  mkdir -p /var/run/hadoop
+  chown -R hadoop:hadoop /var/run/hadoop
+
+  # Set SSH options within the cluster
+  sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+    
+  # Disable IPv6
+  sed -i -e 's|# export HADOOP_OPTS=.*|export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true"|' \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+
+  # Hadoop logs should be on the /mnt partition
+  sed -i -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/var/log/hadoop/logs|' \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+  rm -rf /var/log/hadoop
+  mkdir /data/hadoop/logs
+  chown hadoop:hadoop /data/hadoop/logs
+  ln -s /data/hadoop/logs /var/log/hadoop
+  chown -R hadoop:hadoop /var/log/hadoop
+
+}
+
+# Sets up a small web page on the cluster with links to the Hadoop web UIs.
+function setup_web() {
+
+  if which dpkg &> /dev/null; then
+    apt-get -y install thttpd
+    WWW_BASE=/var/www
+  elif which rpm &> /dev/null; then
+    yum install -y thttpd
+    chkconfig --add thttpd
+    WWW_BASE=/var/www/thttpd/html
+  fi
+
+  cat > $WWW_BASE/index.html << END
+<html>
+<head>
+<title>Hadoop Cloud Cluster</title>
+</head>
+<body>
+<h1>Hadoop Cloud Cluster</h1>
+To browse the cluster you need to have a proxy configured.
+Start the proxy with <tt>hadoop-cloud proxy &lt;cluster_name&gt;</tt>,
+and point your browser to
+<a href="http://apache-hadoop-ec2.s3.amazonaws.com/proxy.pac">this Proxy
+Auto-Configuration (PAC)</a> file.  To manage multiple proxy configurations,
+you may wish to use
+<a href="https://addons.mozilla.org/en-US/firefox/addon/2464">FoxyProxy</a>.
+<ul>
+<li><a href="http://$NN_HOST:50070/">NameNode</a>
+<li><a href="http://$JT_HOST:50030/">JobTracker</a>
+</ul>
+</body>
+</html>
+END
+
+  service thttpd start
+
+}
+
+function start_namenode() {
+  if which dpkg &> /dev/null; then
+    AS_HADOOP="su -s /bin/bash - hadoop -c"
+  elif which rpm &> /dev/null; then
+    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
+  fi
+
+  # Format HDFS
+  [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP_HOME/bin/hadoop namenode -format"
+
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start namenode"
+
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop dfsadmin -safemode wait"
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -mkdir /user"
+  # The following is questionable, as it allows a user to delete another
+  # user's directory. It's needed to allow users to create their own user
+  # directories.
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -chmod +w /user"
+
+}
+
+function start_daemon() {
+  if which dpkg &> /dev/null; then
+    AS_HADOOP="su -s /bin/bash - hadoop -c"
+  elif which rpm &> /dev/null; then
+    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
+  fi
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start $1"
+}
+
+update_repo
+register_auto_shutdown
+install_user_packages
+install_hadoop
+configure_hadoop
+
+for role in $(echo "$ROLES" | tr "," "\n"); do
+  case $role in
+  nn)
+    setup_web
+    start_namenode
+    ;;
+  snn)
+    start_daemon secondarynamenode
+    ;;
+  jt)
+    start_daemon jobtracker
+    ;;
+  dn)
+    start_daemon datanode
+    ;;
+  tt)
+    start_daemon tasktracker
+    ;;
+  esac
+done
+
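
For reference, the role codes carried in ROLES (and used in the CLI's role lists) start the following daemons in the case statement above; a hypothetical Python rendering of that mapping:

ROLE_DAEMONS = {
  "nn":  "namenode (plus the status web page)",
  "snn": "secondarynamenode",
  "jt":  "jobtracker",
  "dn":  "datanode",
  "tt":  "tasktracker",
}
for code in "nn,snn,jt".split(","):
  print "%s -> %s" % (code, ROLE_DAEMONS[code])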

+ 548 - 0
src/contrib/cloud/src/py/hadoop/cloud/data/hadoop-ec2-init-remote.sh

@@ -0,0 +1,548 @@
+#!/bin/bash -x
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+# Script that is run on each EC2 instance on boot. It is passed in the EC2 user
+# data, so should not exceed 16K in size after gzip compression.
+#
+# This script is executed by /etc/init.d/ec2-run-user-data, and output is
+# logged to /var/log/messages.
+################################################################################
+
+################################################################################
+# Initialize variables
+################################################################################
+
+# Substitute environment variables passed by the client
+export %ENV%
+
+HADOOP_VERSION=${HADOOP_VERSION:-0.20.1}
+HADOOP_HOME=/usr/local/hadoop-$HADOOP_VERSION
+HADOOP_CONF_DIR=$HADOOP_HOME/conf
+SELF_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname`
+for role in $(echo "$ROLES" | tr "," "\n"); do
+  case $role in
+  nn)
+    NN_HOST=$SELF_HOST
+    ;;
+  jt)
+    JT_HOST=$SELF_HOST
+    ;;
+  esac
+done
+
+function register_auto_shutdown() {
+  if [ ! -z "$AUTO_SHUTDOWN" ]; then
+    shutdown -h +$AUTO_SHUTDOWN >/dev/null &
+  fi
+}
+
+# Install a list of packages on debian or redhat as appropriate
+function install_packages() {
+  if which dpkg &> /dev/null; then
+    apt-get update
+    apt-get -y install $@
+  elif which rpm &> /dev/null; then
+    yum install -y $@
+  else
+    echo "No package manager found."
+  fi
+}
+
+# Install any user packages specified in the USER_PACKAGES environment variable
+function install_user_packages() {
+  if [ ! -z "$USER_PACKAGES" ]; then
+    install_packages $USER_PACKAGES
+  fi
+}
+
+function install_hadoop() {
+  useradd hadoop
+
+  hadoop_tar_url=http://s3.amazonaws.com/hadoop-releases/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
+  hadoop_tar_file=`basename $hadoop_tar_url`
+  hadoop_tar_md5_file=`basename $hadoop_tar_url.md5`
+
+  curl="curl --retry 3 --silent --show-error --fail"
+  for i in `seq 1 3`;
+  do
+    $curl -O $hadoop_tar_url
+    $curl -O $hadoop_tar_url.md5
+    if md5sum -c $hadoop_tar_md5_file; then
+      break;
+    else
+      rm -f $hadoop_tar_file $hadoop_tar_md5_file
+    fi
+  done
+
+  if [ ! -e $hadoop_tar_file ]; then
+    echo "Failed to download $hadoop_tar_url. Aborting."
+    exit 1
+  fi
+
+  tar zxf $hadoop_tar_file -C /usr/local
+  rm -f $hadoop_tar_file $hadoop_tar_md5_file
+
+  echo "export HADOOP_HOME=$HADOOP_HOME" >> ~root/.bashrc
+  echo 'export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH' >> ~root/.bashrc
+}
+
+function prep_disk() {
+  mount=$1
+  device=$2
+  automount=${3:-false}
+
+  echo "warning: ERASING CONTENTS OF $device"
+  mkfs.xfs -f $device
+  if [ ! -e $mount ]; then
+    mkdir $mount
+  fi
+  mount -o defaults,noatime $device $mount
+  if $automount ; then
+    echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab
+  fi
+}
+
+function wait_for_mount {
+  mount=$1
+  device=$2
+
+  mkdir $mount
+
+  i=1
+  echo "Attempting to mount $device"
+  while true ; do
+    sleep 10
+    echo -n "$i "
+    i=$[$i+1]
+    mount -o defaults,noatime $device $mount || continue
+    echo " Mounted."
+    break;
+  done
+}
+
+function make_hadoop_dirs {
+  for mount in "$@"; do
+    if [ ! -e $mount/hadoop ]; then
+      mkdir -p $mount/hadoop
+      chown hadoop:hadoop $mount/hadoop
+    fi
+  done
+}
+
+# Configure Hadoop by setting up disks and site file
+function configure_hadoop() {
+
+  install_packages xfsprogs # needed for XFS
+
+  INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type`
+
+  if [ -n "$EBS_MAPPINGS" ]; then
+    # EBS_MAPPINGS is like "/ebs1,/dev/sdj;/ebs2,/dev/sdk"
+    DFS_NAME_DIR=''
+    FS_CHECKPOINT_DIR=''
+    DFS_DATA_DIR=''
+    for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do
+      # Split on the comma (see "Parameter Expansion" in the bash man page)
+      mount=${mapping%,*}
+      device=${mapping#*,}
+      wait_for_mount $mount $device
+      DFS_NAME_DIR=${DFS_NAME_DIR},"$mount/hadoop/hdfs/name"
+      FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR},"$mount/hadoop/hdfs/secondary"
+      DFS_DATA_DIR=${DFS_DATA_DIR},"$mount/hadoop/hdfs/data"
+      FIRST_MOUNT=${FIRST_MOUNT-$mount}
+      make_hadoop_dirs $mount
+    done
+    # Remove leading commas
+    DFS_NAME_DIR=${DFS_NAME_DIR#?}
+    FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR#?}
+    DFS_DATA_DIR=${DFS_DATA_DIR#?}
+
+    DFS_REPLICATION=3 # EBS is internally replicated, but we also use HDFS replication for safety
+  else
+    case $INSTANCE_TYPE in
+    m1.xlarge|c1.xlarge)
+      DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name
+      FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary
+      DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data,/mnt3/hadoop/hdfs/data,/mnt4/hadoop/hdfs/data
+      ;;
+    m1.large)
+      DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name
+      FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary
+      DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data
+      ;;
+    *)
+      # "m1.small" or "c1.medium"
+      DFS_NAME_DIR=/mnt/hadoop/hdfs/name
+      FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary
+      DFS_DATA_DIR=/mnt/hadoop/hdfs/data
+      ;;
+    esac
+    FIRST_MOUNT=/mnt
+    DFS_REPLICATION=3
+  fi
+
+  case $INSTANCE_TYPE in
+  m1.xlarge|c1.xlarge)
+    prep_disk /mnt2 /dev/sdc true &
+    disk2_pid=$!
+    prep_disk /mnt3 /dev/sdd true &
+    disk3_pid=$!
+    prep_disk /mnt4 /dev/sde true &
+    disk4_pid=$!
+    wait $disk2_pid $disk3_pid $disk4_pid
+    MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local,/mnt3/hadoop/mapred/local,/mnt4/hadoop/mapred/local
+    MAX_MAP_TASKS=8
+    MAX_REDUCE_TASKS=4
+    CHILD_OPTS=-Xmx680m
+    CHILD_ULIMIT=1392640
+    ;;
+  m1.large)
+    prep_disk /mnt2 /dev/sdc true
+    MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local
+    MAX_MAP_TASKS=4
+    MAX_REDUCE_TASKS=2
+    CHILD_OPTS=-Xmx1024m
+    CHILD_ULIMIT=2097152
+    ;;
+  c1.medium)
+    MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local
+    MAX_MAP_TASKS=4
+    MAX_REDUCE_TASKS=2
+    CHILD_OPTS=-Xmx550m
+    CHILD_ULIMIT=1126400
+    ;;
+  *)
+    # "m1.small"
+    MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local
+    MAX_MAP_TASKS=2
+    MAX_REDUCE_TASKS=1
+    CHILD_OPTS=-Xmx550m
+    CHILD_ULIMIT=1126400
+    ;;
+  esac
+
+  make_hadoop_dirs `ls -d /mnt*`
+
+  # Create tmp directory
+  mkdir /mnt/tmp
+  chmod a+rwxt /mnt/tmp
+  
+  mkdir /etc/hadoop
+  ln -s $HADOOP_CONF_DIR /etc/hadoop/conf
+
+  ##############################################################################
+  # Modify this section to customize your Hadoop cluster.
+  ##############################################################################
+  cat > $HADOOP_CONF_DIR/hadoop-site.xml <<EOF
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+<property>
+  <name>dfs.block.size</name>
+  <value>134217728</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.data.dir</name>
+  <value>$DFS_DATA_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.datanode.du.reserved</name>
+  <value>1073741824</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.datanode.handler.count</name>
+  <value>3</value>
+  <final>true</final>
+</property>
+<!--property>
+  <name>dfs.hosts</name>
+  <value>$HADOOP_CONF_DIR/dfs.hosts</value>
+  <final>true</final>
+</property-->
+<!--property>
+  <name>dfs.hosts.exclude</name>
+  <value>$HADOOP_CONF_DIR/dfs.hosts.exclude</value>
+  <final>true</final>
+</property-->
+<property>
+  <name>dfs.name.dir</name>
+  <value>$DFS_NAME_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.namenode.handler.count</name>
+  <value>5</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.permissions</name>
+  <value>true</value>
+  <final>true</final>
+</property>
+<property>
+  <name>dfs.replication</name>
+  <value>$DFS_REPLICATION</value>
+</property>
+<property>
+  <name>fs.checkpoint.dir</name>
+  <value>$FS_CHECKPOINT_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>fs.default.name</name>
+  <value>hdfs://$NN_HOST:8020/</value>
+</property>
+<property>
+  <name>fs.trash.interval</name>
+  <value>1440</value>
+  <final>true</final>
+</property>
+<property>
+  <name>hadoop.tmp.dir</name>
+  <value>/mnt/tmp/hadoop-\${user.name}</value>
+  <final>true</final>
+</property>
+<property>
+  <name>io.file.buffer.size</name>
+  <value>65536</value>
+</property>
+<property>
+  <name>mapred.child.java.opts</name>
+  <value>$CHILD_OPTS</value>
+</property>
+<property>
+  <name>mapred.child.ulimit</name>
+  <value>$CHILD_ULIMIT</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.job.tracker</name>
+  <value>$JT_HOST:8021</value>
+</property>
+<property>
+  <name>mapred.job.tracker.handler.count</name>
+  <value>5</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.local.dir</name>
+  <value>$MAPRED_LOCAL_DIR</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.map.tasks.speculative.execution</name>
+  <value>true</value>
+</property>
+<property>
+  <name>mapred.reduce.parallel.copies</name>
+  <value>10</value>
+</property>
+<property>
+  <name>mapred.reduce.tasks</name>
+  <value>10</value>
+</property>
+<property>
+  <name>mapred.reduce.tasks.speculative.execution</name>
+  <value>false</value>
+</property>
+<property>
+  <name>mapred.submit.replication</name>
+  <value>10</value>
+</property>
+<property>
+  <name>mapred.system.dir</name>
+  <value>/hadoop/system/mapred</value>
+</property>
+<property>
+  <name>mapred.tasktracker.map.tasks.maximum</name>
+  <value>$MAX_MAP_TASKS</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.tasktracker.reduce.tasks.maximum</name>
+  <value>$MAX_REDUCE_TASKS</value>
+  <final>true</final>
+</property>
+<property>
+  <name>tasktracker.http.threads</name>
+  <value>46</value>
+  <final>true</final>
+</property>
+<property>
+  <name>mapred.compress.map.output</name>
+  <value>true</value>
+</property>
+<property>
+  <name>mapred.output.compression.type</name>
+  <value>BLOCK</value>
+</property>
+<property>
+  <name>hadoop.rpc.socket.factory.class.default</name>
+  <value>org.apache.hadoop.net.StandardSocketFactory</value>
+  <final>true</final>
+</property>
+<property>
+  <name>hadoop.rpc.socket.factory.class.ClientProtocol</name>
+  <value></value>
+  <final>true</final>
+</property>
+<property>
+  <name>hadoop.rpc.socket.factory.class.JobSubmissionProtocol</name>
+  <value></value>
+  <final>true</final>
+</property>
+<property>
+  <name>io.compression.codecs</name>
+  <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec</value>
+</property>
+<property>
+  <name>fs.s3.awsAccessKeyId</name>
+  <value>$AWS_ACCESS_KEY_ID</value>
+</property>
+<property>
+  <name>fs.s3.awsSecretAccessKey</name>
+  <value>$AWS_SECRET_ACCESS_KEY</value>
+</property>
+<property>
+  <name>fs.s3n.awsAccessKeyId</name>
+  <value>$AWS_ACCESS_KEY_ID</value>
+</property>
+<property>
+  <name>fs.s3n.awsSecretAccessKey</name>
+  <value>$AWS_SECRET_ACCESS_KEY</value>
+</property>
+</configuration>
+EOF
+
+  # Keep PID files in a non-temporary directory
+  sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+  mkdir -p /var/run/hadoop
+  chown -R hadoop:hadoop /var/run/hadoop
+
+  # Set SSH options within the cluster
+  sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+
+  # Hadoop logs should be on the /mnt partition
+  sed -i -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/var/log/hadoop/logs|' \
+    $HADOOP_CONF_DIR/hadoop-env.sh
+  rm -rf /var/log/hadoop
+  mkdir /mnt/hadoop/logs
+  chown hadoop:hadoop /mnt/hadoop/logs
+  ln -s /mnt/hadoop/logs /var/log/hadoop
+  chown -R hadoop:hadoop /var/log/hadoop
+
+}
+
+# Sets up a small web page on the cluster with links to the Hadoop web UIs.
+function setup_web() {
+
+  if which dpkg &> /dev/null; then
+    apt-get -y install thttpd
+    WWW_BASE=/var/www
+  elif which rpm &> /dev/null; then
+    yum install -y thttpd
+    chkconfig --add thttpd
+    WWW_BASE=/var/www/thttpd/html
+  fi
+
+  cat > $WWW_BASE/index.html << END
+<html>
+<head>
+<title>Hadoop EC2 Cluster</title>
+</head>
+<body>
+<h1>Hadoop EC2 Cluster</h1>
+To browse the cluster you need to have a proxy configured.
+Start the proxy with <tt>hadoop-ec2 proxy &lt;cluster_name&gt;</tt>,
+and point your browser to
+<a href="http://apache-hadoop-ec2.s3.amazonaws.com/proxy.pac">this Proxy
+Auto-Configuration (PAC)</a> file.  To manage multiple proxy configurations,
+you may wish to use
+<a href="https://addons.mozilla.org/en-US/firefox/addon/2464">FoxyProxy</a>.
+<ul>
+<li><a href="http://$NN_HOST:50070/">NameNode</a>
+<li><a href="http://$JT_HOST:50030/">JobTracker</a>
+</ul>
+</body>
+</html>
+END
+
+  service thttpd start
+
+}
+
+function start_namenode() {
+  if which dpkg &> /dev/null; then
+    AS_HADOOP="su -s /bin/bash - hadoop -c"
+  elif which rpm &> /dev/null; then
+    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
+  fi
+
+  # Format HDFS
+  [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP_HOME/bin/hadoop namenode -format"
+
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start namenode"
+
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop dfsadmin -safemode wait"
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -mkdir /user"
+  # The following is questionable, as it allows a user to delete another
+  # user's directory. It's needed to allow users to create their own user
+  # directories.
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -chmod +w /user"
+
+}
+
+function start_daemon() {
+  if which dpkg &> /dev/null; then
+    AS_HADOOP="su -s /bin/bash - hadoop -c"
+  elif which rpm &> /dev/null; then
+    AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c"
+  fi
+  $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start $1"
+}
+
+register_auto_shutdown
+install_user_packages
+install_hadoop
+configure_hadoop
+
+for role in $(echo "$ROLES" | tr "," "\n"); do
+  case $role in
+  nn)
+    setup_web
+    start_namenode
+    ;;
+  snn)
+    start_daemon secondarynamenode
+    ;;
+  jt)
+    start_daemon jobtracker
+    ;;
+  dn)
+    start_daemon datanode
+    ;;
+  tt)
+    start_daemon tasktracker
+    ;;
+  esac
+done
+
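
The EBS_MAPPINGS value consumed above is a semicolon-separated list of mount,device pairs (e.g. "/ebs1,/dev/sdj;/ebs2,/dev/sdk"); the bash expansions ${mapping%,*} and ${mapping#*,} split each pair. A small Python sketch of the same parsing, using a made-up mapping string:

ebs_mappings = "/ebs1,/dev/sdj;/ebs2,/dev/sdk"
for mapping in ebs_mappings.split(";"):
  mount, device = mapping.split(",", 1)
  print "mount %s at %s" % (device, mount)
# mount /dev/sdj at /ebs1
# mount /dev/sdk at /ebs2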

+ 22 - 0
src/contrib/cloud/src/py/hadoop/cloud/data/hadoop-rackspace-init-remote.sh

@@ -0,0 +1,22 @@
+#!/bin/bash -ex
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Run a script downloaded at boot time to work around Rackspace's 10K size
+# limit on the injected boot script.
+
+wget -qO/usr/bin/runurl run.alestic.com/runurl
+chmod 755 /usr/bin/runurl
+%ENV% runurl http://hadoop-dev-test.s3.amazonaws.com/boot-rackspace.sh

+ 112 - 0
src/contrib/cloud/src/py/hadoop/cloud/data/zookeeper-ec2-init-remote.sh

@@ -0,0 +1,112 @@
+#!/bin/bash -x
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+# Script that is run on each EC2 instance on boot. It is passed in the EC2 user
+# data, so should not exceed 16K in size after gzip compression.
+#
+# This script is executed by /etc/init.d/ec2-run-user-data, and output is
+# logged to /var/log/messages.
+################################################################################
+
+################################################################################
+# Initialize variables
+################################################################################
+
+# Substitute environment variables passed by the client
+export %ENV%
+
+ZK_VERSION=${ZK_VERSION:-3.2.2}
+ZOOKEEPER_HOME=/usr/local/zookeeper-$ZK_VERSION
+ZK_CONF_DIR=/etc/zookeeper/conf
+
+function register_auto_shutdown() {
+  if [ ! -z "$AUTO_SHUTDOWN" ]; then
+    shutdown -h +$AUTO_SHUTDOWN >/dev/null &
+  fi
+}
+
+# Install a list of packages on debian or redhat as appropriate
+function install_packages() {
+  if which dpkg &> /dev/null; then
+    apt-get update
+    apt-get -y install $@
+  elif which rpm &> /dev/null; then
+    yum install -y $@
+  else
+    echo "No package manager found."
+  fi
+}
+
+# Install any user packages specified in the USER_PACKAGES environment variable
+function install_user_packages() {
+  if [ ! -z "$USER_PACKAGES" ]; then
+    install_packages $USER_PACKAGES
+  fi
+}
+
+function install_zookeeper() {
+  zk_tar_url=http://www.apache.org/dist/hadoop/zookeeper/zookeeper-$ZK_VERSION/zookeeper-$ZK_VERSION.tar.gz
+  zk_tar_file=`basename $zk_tar_url`
+  zk_tar_md5_file=`basename $zk_tar_url.md5`
+
+  curl="curl --retry 3 --silent --show-error --fail"
+  for i in `seq 1 3`;
+  do
+    $curl -O $zk_tar_url
+    $curl -O $zk_tar_url.md5
+    if md5sum -c $zk_tar_md5_file; then
+      break;
+    else
+      rm -f $zk_tar_file $zk_tar_md5_file
+    fi
+  done
+
+  if [ ! -e $zk_tar_file ]; then
+    echo "Failed to download $zk_tar_url. Aborting."
+    exit 1
+  fi
+
+  tar zxf $zk_tar_file -C /usr/local
+  rm -f $zk_tar_file $zk_tar_md5_file
+
+  echo "export ZOOKEEPER_HOME=$ZOOKEEPER_HOME" >> ~root/.bashrc
+  echo 'export PATH=$JAVA_HOME/bin:$ZOOKEEPER_HOME/bin:$PATH' >> ~root/.bashrc
+}
+
+function configure_zookeeper() {
+  mkdir -p /mnt/zookeeper/logs
+  ln -s /mnt/zookeeper/logs /var/log/zookeeper
+  mkdir -p /var/log/zookeeper/txlog
+  mkdir -p $ZK_CONF_DIR
+  cp $ZOOKEEPER_HOME/conf/log4j.properties $ZK_CONF_DIR
+
+  sed -i -e "s|log4j.rootLogger=INFO, CONSOLE|log4j.rootLogger=INFO, ROLLINGFILE|" \
+         -e "s|log4j.appender.ROLLINGFILE.File=zookeeper.log|log4j.appender.ROLLINGFILE.File=/var/log/zookeeper/zookeeper.log|" \
+      $ZK_CONF_DIR/log4j.properties
+      
+  # Ensure ZooKeeper starts on boot
+  cat > /etc/rc.local <<EOF
+ZOOCFGDIR=$ZK_CONF_DIR $ZOOKEEPER_HOME/bin/zkServer.sh start > /dev/null 2>&1 &
+EOF
+
+}
+
+register_auto_shutdown
+install_user_packages
+install_zookeeper
+configure_zookeeper

+ 14 - 0
src/contrib/cloud/src/py/hadoop/cloud/providers/__init__.py

@@ -0,0 +1,14 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 61 - 0
src/contrib/cloud/src/py/hadoop/cloud/providers/dummy.py

@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+
+from hadoop.cloud.cluster import Cluster
+from hadoop.cloud.cluster import Instance
+
+logger = logging.getLogger(__name__)
+
+class DummyCluster(Cluster):
+
+  @staticmethod
+  def get_clusters_with_role(role, state="running"):
+    logger.info("get_clusters_with_role(%s, %s)", role, state)
+    return ["dummy-cluster"]
+
+  def __init__(self, name, config_dir):
+    super(DummyCluster, self).__init__(name, config_dir)
+    logger.info("__init__(%s, %s)", name, config_dir)
+
+  def get_provider_code(self):
+    return "dummy"
+
+  def authorize_role(self, role, from_port, to_port, cidr_ip):
+    logger.info("authorize_role(%s, %s, %s, %s)", role, from_port, to_port,
+                cidr_ip)
+
+  def get_instances_in_role(self, role, state_filter=None):
+    logger.info("get_instances_in_role(%s, %s)", role, state_filter)
+    return [Instance(1, '127.0.0.1', '127.0.0.1')]
+
+  def print_status(self, roles, state_filter="running"):
+    logger.info("print_status(%s, %s)", roles, state_filter)
+
+  def launch_instances(self, role, number, image_id, size_id,
+                       instance_user_data, **kwargs):
+    logger.info("launch_instances(%s, %s, %s, %s, %s, %s)", role, number,
+                image_id, size_id, instance_user_data, str(kwargs))
+    return [1]
+
+  def wait_for_instances(self, instance_ids, timeout=600):
+    logger.info("wait_for_instances(%s, %s)", instance_ids, timeout)
+
+  def terminate(self):
+    logger.info("terminate")
+
+  def delete(self):
+    logger.info("delete")

+ 479 - 0
src/contrib/cloud/src/py/hadoop/cloud/providers/ec2.py

@@ -0,0 +1,479 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from boto.ec2.connection import EC2Connection
+from boto.exception import EC2ResponseError
+import logging
+from hadoop.cloud.cluster import Cluster
+from hadoop.cloud.cluster import Instance
+from hadoop.cloud.cluster import RoleSyntaxException
+from hadoop.cloud.cluster import TimeoutException
+from hadoop.cloud.storage import JsonVolumeManager
+from hadoop.cloud.storage import JsonVolumeSpecManager
+from hadoop.cloud.storage import MountableVolume
+from hadoop.cloud.storage import Storage
+from hadoop.cloud.util import xstr
+import os
+import re
+import subprocess
+import sys
+import time
+
+logger = logging.getLogger(__name__)
+
+def _run_command_on_instance(instance, ssh_options, command):
+  print "Running ssh %s root@%s '%s'" % \
+    (ssh_options, instance.public_dns_name, command)
+  retcode = subprocess.call("ssh %s root@%s '%s'" %
+                           (ssh_options, instance.public_dns_name, command),
+                           shell=True)
+  print "Command running on %s returned with value %s" % \
+    (instance.public_dns_name, retcode)
+
+def _wait_for_volume(ec2_connection, volume_id):
+  """
+  Waits until a volume becomes available.
+  """
+  while True:
+    volumes = ec2_connection.get_all_volumes([volume_id,])
+    if volumes[0].status == 'available':
+      break
+    sys.stdout.write(".")
+    sys.stdout.flush()
+    time.sleep(1)
+
+class Ec2Cluster(Cluster):
+  """
+  A cluster of EC2 instances. A cluster has a unique name.
+
+  Instances running in the cluster belong to a security group with the
+  cluster's name, and to a group named after the instance's role,
+  e.g. <cluster-name>-foo for an instance in the "foo" role.
+  """
+
+  @staticmethod
+  def get_clusters_with_role(role, state="running"):
+    all_instances = EC2Connection().get_all_instances()
+    clusters = []
+    for res in all_instances:
+      instance = res.instances[0]
+      for group in res.groups:
+        if group.id.endswith("-" + role) and instance.state == state:
+          clusters.append(re.sub("-%s$" % re.escape(role), "", group.id))
+    return clusters
+
+  def __init__(self, name, config_dir):
+    super(Ec2Cluster, self).__init__(name, config_dir)
+    self.ec2Connection = EC2Connection()
+
+  def get_provider_code(self):
+    return "ec2"
+
+  def _get_cluster_group_name(self):
+    return self.name
+
+  def _check_role_name(self, role):
+    if not re.match("^[a-zA-Z0-9_+]+$", role):
+      raise RoleSyntaxException("Invalid role name '%s'" % role)
+
+  def _group_name_for_role(self, role):
+    """
+    Return the security group name for an instance in a given role.
+    """
+    self._check_role_name(role)
+    return "%s-%s" % (self.name, role)
+
+  def _get_group_names(self, roles):
+    group_names = [self._get_cluster_group_name()]
+    for role in roles:
+      group_names.append(self._group_name_for_role(role))
+    return group_names
+
+  def _get_all_group_names(self):
+    security_groups = self.ec2Connection.get_all_security_groups()
+    security_group_names = \
+      [security_group.name for security_group in security_groups]
+    return security_group_names
+
+  def _get_all_group_names_for_cluster(self):
+    all_group_names = self._get_all_group_names()
+    r = []
+    if self.name not in all_group_names:
+      return r
+    for group in all_group_names:
+      if re.match("^%s(-[a-zA-Z0-9_+]+)?$" % self.name, group):
+        r.append(group)
+    return r
+
+  def _create_groups(self, role):
+    """
+    Create the security groups for a given role, including a group for the
+    cluster if it doesn't exist.
+    """
+    self._check_role_name(role)
+    security_group_names = self._get_all_group_names()
+
+    cluster_group_name = self._get_cluster_group_name()
+    if not cluster_group_name in security_group_names:
+      self.ec2Connection.create_security_group(cluster_group_name,
+                                               "Cluster (%s)" % (self.name))
+      self.ec2Connection.authorize_security_group(cluster_group_name,
+                                                  cluster_group_name)
+      # Allow SSH from anywhere
+      self.ec2Connection.authorize_security_group(cluster_group_name,
+                                                  ip_protocol="tcp",
+                                                  from_port=22, to_port=22,
+                                                  cidr_ip="0.0.0.0/0")
+
+    role_group_name = self._group_name_for_role(role)
+    if not role_group_name in security_group_names:
+      self.ec2Connection.create_security_group(role_group_name,
+        "Role %s (%s)" % (role, self.name))
+
+  def authorize_role(self, role, from_port, to_port, cidr_ip):
+    """
+    Authorize access to machines in a given role from a given network.
+    """
+    self._check_role_name(role)
+    role_group_name = self._group_name_for_role(role)
+    # Revoke first to avoid InvalidPermission.Duplicate error
+    self.ec2Connection.revoke_security_group(role_group_name,
+                                             ip_protocol="tcp",
+                                             from_port=from_port,
+                                             to_port=to_port, cidr_ip=cidr_ip)
+    self.ec2Connection.authorize_security_group(role_group_name,
+                                                ip_protocol="tcp",
+                                                from_port=from_port,
+                                                to_port=to_port,
+                                                cidr_ip=cidr_ip)
+
+  def _get_instances(self, group_name, state_filter=None):
+    """
+    Get all the instances in a group, filtered by state.
+
+    @param group_name: the name of the group
+    @param state_filter: the state that the instance should be in
+      (e.g. "running"), or None for all states
+    """
+    all_instances = self.ec2Connection.get_all_instances()
+    instances = []
+    for res in all_instances:
+      for group in res.groups:
+        if group.id == group_name:
+          for instance in res.instances:
+            if state_filter == None or instance.state == state_filter:
+              instances.append(instance)
+    return instances
+
+  def get_instances_in_role(self, role, state_filter=None):
+    """
+    Get all the instances in a role, filtered by state.
+
+    @param role: the name of the role
+    @param state_filter: the state that the instance should be in
+      (e.g. "running"), or None for all states
+    """
+    self._check_role_name(role)
+    instances = []
+    for instance in self._get_instances(self._group_name_for_role(role),
+                                        state_filter):
+      instances.append(Instance(instance.id, instance.dns_name,
+                                instance.private_dns_name))
+    return instances
+
+  def _print_instance(self, role, instance):
+    print "\t".join((role, instance.id,
+      instance.image_id,
+      instance.dns_name, instance.private_dns_name,
+      instance.state, xstr(instance.key_name), instance.instance_type,
+      str(instance.launch_time), instance.placement))
+
+  def print_status(self, roles=None, state_filter="running"):
+    """
+    Print the status of instances in the given roles, filtered by state.
+    """
+    if not roles:
+      for instance in self._get_instances(self._get_cluster_group_name(),
+                                          state_filter):
+        self._print_instance("", instance)
+    else:
+      for role in roles:
+        for instance in self._get_instances(self._group_name_for_role(role),
+                                            state_filter):
+          self._print_instance(role, instance)
+
+  def launch_instances(self, roles, number, image_id, size_id,
+                       instance_user_data, **kwargs):
+    for role in roles:
+      self._check_role_name(role)  
+      self._create_groups(role)
+      
+    user_data = instance_user_data.read_as_gzip_stream()
+    security_groups = self._get_group_names(roles) + kwargs.get('security_groups', [])
+
+    reservation = self.ec2Connection.run_instances(image_id, min_count=number,
+      max_count=number, key_name=kwargs.get('key_name', None),
+      security_groups=security_groups, user_data=user_data,
+      instance_type=size_id, placement=kwargs.get('placement', None))
+    return [instance.id for instance in reservation.instances]
+
+  def wait_for_instances(self, instance_ids, timeout=600):
+    start_time = time.time()
+    while True:
+      if (time.time() - start_time >= timeout):
+        raise TimeoutException()
+      try:
+        if self._all_started(self.ec2Connection.get_all_instances(instance_ids)):
+          break
+      # don't timeout for race condition where instance is not yet registered
+      except EC2ResponseError:
+        pass
+      sys.stdout.write(".")
+      sys.stdout.flush()
+      time.sleep(1)
+
+  def _all_started(self, reservations):
+    for res in reservations:
+      for instance in res.instances:
+        if instance.state != "running":
+          return False
+    return True
+
+  def terminate(self):
+    instances = self._get_instances(self._get_cluster_group_name(), "running")
+    if instances:
+      self.ec2Connection.terminate_instances([i.id for i in instances])
+
+  def delete(self):
+    """
+    Delete the security groups for each role in the cluster, and the group for
+    the cluster.
+    """
+    group_names = self._get_all_group_names_for_cluster()
+    for group in group_names:
+      self.ec2Connection.delete_security_group(group)
+
+  def get_storage(self):
+    """
+    Return the external storage for the cluster.
+    """
+    return Ec2Storage(self)
+
+
+class Ec2Storage(Storage):
+  """
+  Storage volumes for an EC2 cluster. The storage is associated with a named
+  cluster. Metadata for the storage volumes is kept in a JSON file on the client
+  machine (in a file called "ec2-storage-<cluster-name>.json" in the
+  configuration directory).
+  """
+
+  @staticmethod
+  def create_formatted_snapshot(cluster, size, availability_zone, image_id,
+                                key_name, ssh_options):
+    """
+    Creates a formatted snapshot of a given size. This saves having to format
+    volumes when they are first attached.
+    """
+    conn = cluster.ec2Connection
+    print "Starting instance"
+    reservation = conn.run_instances(image_id, key_name=key_name,
+                                     placement=availability_zone)
+    instance = reservation.instances[0]
+    try:
+      cluster.wait_for_instances([instance.id,])
+      print "Started instance %s" % instance.id
+    except TimeoutException:
+      print "Timeout"
+      return
+    print
+    print "Waiting 60 seconds before attaching storage"
+    time.sleep(60)
+    # Re-populate instance object since it has more details filled in
+    instance.update()
+
+    print "Creating volume of size %s in %s" % (size, availability_zone)
+    volume = conn.create_volume(size, availability_zone)
+    print "Created volume %s" % volume
+    print "Attaching volume to %s" % instance.id
+    volume.attach(instance.id, '/dev/sdj')
+
+    _run_command_on_instance(instance, ssh_options, """
+      while true ; do
+        echo 'Waiting for /dev/sdj...';
+        if [ -e /dev/sdj ]; then break; fi;
+        sleep 1;
+      done;
+      mkfs.ext3 -F -m 0.5 /dev/sdj
+    """)
+
+    print "Detaching volume"
+    conn.detach_volume(volume.id, instance.id)
+    print "Creating snapshot"
+    snapshot = volume.create_snapshot()
+    print "Created snapshot %s" % snapshot.id
+    _wait_for_volume(conn, volume.id)
+    print
+    print "Deleting volume"
+    volume.delete()
+    print "Deleted volume"
+    print "Stopping instance"
+    terminated = conn.terminate_instances([instance.id,])
+    print "Stopped instance %s" % terminated
+
+  def __init__(self, cluster):
+    super(Ec2Storage, self).__init__(cluster)
+    self.config_dir = cluster.config_dir
+
+  def _get_storage_filename(self):
+    return os.path.join(self.config_dir,
+                        "ec2-storage-%s.json" % (self.cluster.name))
+
+  def create(self, role, number_of_instances, availability_zone, spec_filename):
+    spec_file = open(spec_filename, 'r')
+    volume_spec_manager = JsonVolumeSpecManager(spec_file)
+    volume_manager = JsonVolumeManager(self._get_storage_filename())
+    for dummy in range(number_of_instances):
+      mountable_volumes = []
+      volume_specs = volume_spec_manager.volume_specs_for_role(role)
+      for spec in volume_specs:
+        logger.info("Creating volume of size %s in %s from snapshot %s" % \
+                    (spec.size, availability_zone, spec.snapshot_id))
+        volume = self.cluster.ec2Connection.create_volume(spec.size,
+                                                          availability_zone,
+                                                          spec.snapshot_id)
+        mountable_volumes.append(MountableVolume(volume.id, spec.mount_point,
+                                                 spec.device))
+      volume_manager.add_instance_storage_for_role(role, mountable_volumes)
+
+  def _get_mountable_volumes(self, role):
+    storage_filename = self._get_storage_filename()
+    volume_manager = JsonVolumeManager(storage_filename)
+    return volume_manager.get_instance_storage_for_role(role)
+
+  def get_mappings_string_for_role(self, role):
+    mappings = {}
+    mountable_volumes_list = self._get_mountable_volumes(role)
+    for mountable_volumes in mountable_volumes_list:
+      for mountable_volume in mountable_volumes:
+        mappings[mountable_volume.mount_point] = mountable_volume.device
+    return ";".join(["%s,%s" % (mount_point, device) for (mount_point, device)
+                     in mappings.items()])
+
+  def _has_storage(self, role):
+    return self._get_mountable_volumes(role)
+
+  def has_any_storage(self, roles):
+    for role in roles:
+      if self._has_storage(role):
+        return True
+    return False
+
+  def get_roles(self):
+    storage_filename = self._get_storage_filename()
+    volume_manager = JsonVolumeManager(storage_filename)
+    return volume_manager.get_roles()
+  
+  def _get_ec2_volumes_dict(self, mountable_volumes):
+    volume_ids = [mv.volume_id for mv in sum(mountable_volumes, [])]
+    volumes = self.cluster.ec2Connection.get_all_volumes(volume_ids)
+    volumes_dict = {}
+    for volume in volumes:
+      volumes_dict[volume.id] = volume
+    return volumes_dict
+
+  def _print_volume(self, role, volume):
+    print "\t".join((role, volume.id, str(volume.size),
+                     volume.snapshot_id, volume.availabilityZone,
+                     volume.status, str(volume.create_time),
+                     str(volume.attach_time)))
+
+  def print_status(self, roles=None):
+    if roles == None:
+      storage_filename = self._get_storage_filename()
+      volume_manager = JsonVolumeManager(storage_filename)
+      roles = volume_manager.get_roles()
+    for role in roles:
+      mountable_volumes_list = self._get_mountable_volumes(role)
+      ec2_volumes = self._get_ec2_volumes_dict(mountable_volumes_list)
+      for mountable_volumes in mountable_volumes_list:
+        for mountable_volume in mountable_volumes:
+          self._print_volume(role, ec2_volumes[mountable_volume.volume_id])
+
+  def _replace(self, string, replacements):
+    for (match, replacement) in replacements.iteritems():
+      string = string.replace(match, replacement)
+    return string
+
+  def attach(self, role, instances):
+    mountable_volumes_list = self._get_mountable_volumes(role)
+    if not mountable_volumes_list:
+      return
+    ec2_volumes = self._get_ec2_volumes_dict(mountable_volumes_list)
+
+    available_mountable_volumes_list = []
+
+    available_instances_dict = {}
+    for instance in instances:
+      available_instances_dict[instance.id] = instance
+
+    # Iterate over mountable_volumes and retain those that are not attached
+    # Also maintain a list of instances that have no attached storage
+    # Note that we do not fill in "holes" (instances that only have some of
+    # their storage attached)
+    for mountable_volumes in mountable_volumes_list:
+      available = True
+      for mountable_volume in mountable_volumes:
+        if ec2_volumes[mountable_volume.volume_id].status != 'available':
+          available = False
+          attach_data = ec2_volumes[mountable_volume.volume_id].attach_data
+          instance_id = attach_data.instance_id
+          if available_instances_dict.has_key(instance_id):
+            del available_instances_dict[instance_id]
+      if available:
+        available_mountable_volumes_list.append(mountable_volumes)
+
+    if len(available_instances_dict) != len(available_mountable_volumes_list):
+      logger.warning("Number of available instances (%s) and volumes (%s) \
+        do not match." \
+        % (len(available_instances_dict),
+           len(available_mountable_volumes_list)))
+
+    for (instance, mountable_volumes) in zip(available_instances_dict.values(),
+                                             available_mountable_volumes_list):
+      print "Attaching storage to %s" % instance.id
+      for mountable_volume in mountable_volumes:
+        volume = ec2_volumes[mountable_volume.volume_id]
+        print "Attaching %s to %s" % (volume.id, instance.id)
+        volume.attach(instance.id, mountable_volume.device)
+
+  def delete(self, roles=[]):
+    storage_filename = self._get_storage_filename()
+    volume_manager = JsonVolumeManager(storage_filename)
+    for role in roles:
+      mountable_volumes_list = volume_manager.get_instance_storage_for_role(role)
+      ec2_volumes = self._get_ec2_volumes_dict(mountable_volumes_list)
+      all_available = True
+      for volume in ec2_volumes.itervalues():
+        if volume.status != 'available':
+          all_available = False
+          logger.warning("Volume %s is not available.", volume)
+      if not all_available:
+        logger.warning("Some volumes are still in use for role %s.\
+          Aborting delete.", role)
+        return
+      for volume in ec2_volumes.itervalues():
+        volume.delete()
+      volume_manager.remove_instance_storage_for_role(role)
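A rough usage sketch of the EC2 provider above; the cluster name and config directory are placeholder values, and boto must be installed with AWS credentials available in the environment (EC2Connection() picks them up from there):

    from hadoop.cloud.providers.ec2 import Ec2Cluster

    # "demo-cluster" and the config directory are hypothetical.
    cluster = Ec2Cluster("demo-cluster", "/tmp/hadoop-cloud-conf")
    # Prints one tab-separated line per running instance in the "nn" role.
    cluster.print_status(roles=["nn"])
    for instance in cluster.get_instances_in_role("dn", "running"):
      print instance.id, instance.public_ip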

+ 239 - 0
src/contrib/cloud/src/py/hadoop/cloud/providers/rackspace.py

@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import with_statement
+
+import base64
+import os
+import subprocess
+import sys
+import time
+import uuid
+
+from hadoop.cloud.cluster import Cluster
+from hadoop.cloud.cluster import Instance
+from hadoop.cloud.cluster import TimeoutException
+from hadoop.cloud.service import HadoopService
+from hadoop.cloud.service import TASKTRACKER
+from libcloud.drivers.rackspace import RackspaceNodeDriver
+from libcloud.base import Node
+from libcloud.base import NodeImage
+
+RACKSPACE_KEY = os.environ['RACKSPACE_KEY']
+RACKSPACE_SECRET = os.environ['RACKSPACE_SECRET']
+
+STATE_MAP = { 'running': 'ACTIVE' }
+STATE_MAP_REVERSED = dict((v, k) for k, v in STATE_MAP.iteritems())
+
+USER_DATA_FILENAME = "/etc/init.d/rackspace-init.sh"
+
+class RackspaceCluster(Cluster):
+  """
+  A cluster of instances running on Rackspace Cloud Servers. A cluster has a
+  unique name, which is stored under the "cluster" metadata key of each server.
+
+  Every instance in the cluster has one or more roles, stored as a
+  comma-separated string under the "roles" metadata key. For example, an instance
+  with roles "foo" and "bar" has a "foo,bar" "roles" key.
+  
+  At boot time two files are injected into an instance's filesystem: the user
+  data file (which is used as a boot script), and the user's public key.
+  """
+  @staticmethod
+  def get_clusters_with_role(role, state="running", driver=None):
+    driver = driver or RackspaceNodeDriver(RACKSPACE_KEY, RACKSPACE_SECRET)
+    all_nodes = RackspaceCluster._list_nodes(driver)
+    clusters = set()
+    for node in all_nodes:
+      try:
+        if node.extra['metadata'].has_key('cluster') and \
+            role in node.extra['metadata']['roles'].split(','):
+          if node.state == STATE_MAP[state]:
+            clusters.add(node.extra['metadata']['cluster'])
+      except KeyError:
+        pass
+    return clusters
+  
+  @staticmethod
+  def _list_nodes(driver, retries=5):
+    attempts = 0
+    while True:
+      try:
+        return driver.list_nodes()
+      except IOError:
+        attempts = attempts + 1
+        if attempts > retries:
+          raise
+        time.sleep(5)
+
+  def __init__(self, name, config_dir, driver=None):
+    super(RackspaceCluster, self).__init__(name, config_dir)
+    self.driver = driver or RackspaceNodeDriver(RACKSPACE_KEY, RACKSPACE_SECRET)
+
+  def get_provider_code(self):
+    return "rackspace"
+  
+  def _get_nodes(self, state_filter=None):
+    all_nodes = RackspaceCluster._list_nodes(self.driver)
+    nodes = []
+    for node in all_nodes:
+      try:
+        if node.extra['metadata']['cluster'] == self.name:
+          if state_filter == None or node.state == STATE_MAP[state_filter]:
+            nodes.append(node)
+      except KeyError:
+        pass
+    return nodes
+
+  def _to_instance(self, node):
+    return Instance(node.id, node.public_ip[0], node.private_ip[0])
+  
+  def _get_nodes_in_role(self, role, state_filter=None):
+    all_nodes = RackspaceCluster._list_nodes(self.driver)
+    nodes = []
+    for node in all_nodes:
+      try:
+        if node.extra['metadata']['cluster'] == self.name and \
+          role in node.extra['metadata']['roles'].split(','):
+          if state_filter == None or node.state == STATE_MAP[state_filter]:
+            nodes.append(node)
+      except KeyError:
+        pass
+    return nodes
+  
+  def get_instances_in_role(self, role, state_filter=None):
+    """
+    Get all the instances in a role, filtered by state.
+
+    @param role: the name of the role
+    @param state_filter: the state that the instance should be in
+      (e.g. "running"), or None for all states
+    """
+    return [self._to_instance(node) for node in \
+            self._get_nodes_in_role(role, state_filter)]
+
+  def _print_node(self, node, out):
+    out.write("\t".join((node.extra['metadata']['roles'], node.id,
+      node.name,
+      self._ip_list_to_string(node.public_ip),
+      self._ip_list_to_string(node.private_ip),
+      STATE_MAP_REVERSED[node.state])))
+    out.write("\n")
+    
+  def _ip_list_to_string(self, ips):
+    if ips is None:
+      return ""
+    return ",".join(ips)
+
+  def print_status(self, roles=None, state_filter="running", out=sys.stdout):
+    if not roles:
+      for node in self._get_nodes(state_filter):
+        self._print_node(node, out)
+    else:
+      for role in roles:
+        for node in self._get_nodes_in_role(role, state_filter):
+          self._print_node(node, out)
+
+  def launch_instances(self, roles, number, image_id, size_id,
+                       instance_user_data, **kwargs):
+    metadata = {"cluster": self.name, "roles": ",".join(roles)}
+    node_ids = []
+    files = { USER_DATA_FILENAME: instance_user_data.read() }
+    if "public_key" in kwargs:
+      files["/root/.ssh/authorized_keys"] = open(kwargs["public_key"]).read()
+    for dummy in range(number):
+      node = self._launch_instance(roles, image_id, size_id, metadata, files)
+      node_ids.append(node.id)
+    return node_ids
+
+  def _launch_instance(self, roles, image_id, size_id, metadata, files):
+    instance_name = "%s-%s" % (self.name, uuid.uuid4().hex[-8:])
+    node = self.driver.create_node(instance_name, self._find_image(image_id),
+                                   self._find_size(size_id), metadata=metadata,
+                                   files=files)
+    return node
+
+  def _find_image(self, image_id):
+    return NodeImage(id=image_id, name=None, driver=None)
+
+  def _find_size(self, size_id):
+    matches = [i for i in self.driver.list_sizes() if i.id == str(size_id)]
+    if len(matches) != 1:
+      return None
+    return matches[0]
+
+  def wait_for_instances(self, instance_ids, timeout=600):
+    start_time = time.time()
+    while True:
+      if (time.time() - start_time >= timeout):
+        raise TimeoutException()
+      try:
+        if self._all_started(instance_ids):
+          break
+      except Exception:
+        pass
+      sys.stdout.write(".")
+      sys.stdout.flush()
+      time.sleep(1)
+
+  def _all_started(self, node_ids):
+    all_nodes = RackspaceCluster._list_nodes(self.driver)
+    node_id_to_node = {}
+    for node in all_nodes:
+      node_id_to_node[node.id] = node
+    for node_id in node_ids:
+      try:
+        if node_id_to_node[node_id].state != STATE_MAP["running"]:
+          return False
+      except KeyError:
+        return False
+    return True
+
+  def terminate(self):
+    nodes = self._get_nodes("running")
+    print nodes
+    for node in nodes:
+      self.driver.destroy_node(node)
+
+class RackspaceHadoopService(HadoopService):
+    
+  def _update_cluster_membership(self, public_key, private_key):
+    """
+    Creates a cluster-wide hosts file and copies it across the cluster.
+    This is a stopgap until DNS is configured on the cluster.
+    """
+    ssh_options = '-o StrictHostKeyChecking=no'
+
+    time.sleep(30) # wait for SSH daemon to start
+    nodes = self.cluster._get_nodes('running')
+    # create hosts file
+    hosts_file = 'hosts'
+    with open(hosts_file, 'w') as f:
+      f.write("127.0.0.1 localhost localhost.localdomain\n")
+      for node in nodes:
+        f.write(node.public_ip[0] + "\t" + node.name + "\n")
+    # copy to each node in the cluster
+    for node in nodes:
+      self._call('scp -i %s %s %s root@%s:/etc/hosts' \
+                 % (private_key, ssh_options, hosts_file, node.public_ip[0]))
+    os.remove(hosts_file)
+
+  def _call(self, command):
+    print command
+    try:
+      subprocess.call(command, shell=True)
+    except Exception, e:
+      print e
+  
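The role lookups above rely only on the node metadata convention described in the RackspaceCluster docstring. A self-contained sketch of that convention, with hypothetical names:

    # A node carries its cluster name and a comma-separated role list.
    metadata = {"cluster": "demo-cluster", "roles": "nn,jt"}

    def node_has_role(node_metadata, role):
      # Mirrors the membership test RackspaceCluster applies to each node.
      return role in node_metadata.get("roles", "").split(",")

    print node_has_role(metadata, "jt")  # True
    print node_has_role(metadata, "dn")  # False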

+ 640 - 0
src/contrib/cloud/src/py/hadoop/cloud/service.py

@@ -0,0 +1,640 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Classes for running services on a cluster.
+"""
+
+from __future__ import with_statement
+
+from hadoop.cloud.cluster import get_cluster
+from hadoop.cloud.cluster import InstanceUserData
+from hadoop.cloud.cluster import TimeoutException
+from hadoop.cloud.providers.ec2 import Ec2Storage
+from hadoop.cloud.util import build_env_string
+from hadoop.cloud.util import url_get
+from hadoop.cloud.util import xstr
+import logging
+import os
+import re
+import socket
+import subprocess
+import sys
+import time
+
+logger = logging.getLogger(__name__)
+
+MASTER = "master"  # Deprecated.
+
+NAMENODE = "nn"
+SECONDARY_NAMENODE = "snn"
+JOBTRACKER = "jt"
+DATANODE = "dn"
+TASKTRACKER = "tt"
+
+class InstanceTemplate(object):
+  """
+  A template for creating server instances in a cluster.
+  """
+  def __init__(self, roles, number, image_id, size_id,
+                     key_name, public_key, private_key,
+                     user_data_file_template=None, placement=None,
+                     user_packages=None, auto_shutdown=None, env_strings=[],
+                     security_groups=[]):
+    self.roles = roles
+    self.number = number
+    self.image_id = image_id
+    self.size_id = size_id
+    self.key_name = key_name
+    self.public_key = public_key
+    self.private_key = private_key
+    self.user_data_file_template = user_data_file_template
+    self.placement = placement
+    self.user_packages = user_packages
+    self.auto_shutdown = auto_shutdown
+    self.env_strings = env_strings
+    self.security_groups = security_groups
+
+  def add_env_strings(self, env_strings):
+    new_env_strings = list(self.env_strings or [])
+    new_env_strings.extend(env_strings)
+    self.env_strings = new_env_strings
+
+
+class Service(object):
+  """
+  A general service that runs on a cluster.
+  """
+  
+  def __init__(self, cluster):
+    self.cluster = cluster
+    
+  def get_service_code(self):
+    """
+    The code that uniquely identifies the service.
+    """
+    raise Exception("Unimplemented")
+    
+  def list_all(self, provider):
+    """
+    Find and print all clusters running this type of service.
+    """
+    raise Exception("Unimplemented")
+
+  def list(self):
+    """
+    Find and print all the instances running in this cluster.
+    """
+    raise Exception("Unimplemented")
+  
+  def launch_master(self, instance_template, config_dir, client_cidr):
+    """
+    Launch a "master" instance.
+    """
+    raise Exception("Unimplemented")
+  
+  def launch_slaves(self, instance_template):
+    """
+    Launch "slave" instances.
+    """
+    raise Exception("Unimplemented")
+  
+  def launch_cluster(self, instance_templates, config_dir, client_cidr):
+    """
+    Launch a cluster of instances.
+    """
+    raise Exception("Unimplemented")
+  
+  def terminate_cluster(self,  force=False):
+    self.cluster.print_status()
+    if not force and not self._prompt("Terminate all instances?"):
+      print "Not terminating cluster."
+    else:
+      print "Terminating cluster"
+      self.cluster.terminate()
+      
+  def delete_cluster(self):
+    self.cluster.delete()
+    
+  def create_formatted_snapshot(self, size, availability_zone,
+                                image_id, key_name, ssh_options):
+    Ec2Storage.create_formatted_snapshot(self.cluster, size,
+                                         availability_zone,
+                                         image_id,
+                                         key_name,
+                                         ssh_options)
+
+  def list_storage(self):
+    storage = self.cluster.get_storage()
+    storage.print_status()
+
+  def create_storage(self, role, number_of_instances,
+                     availability_zone, spec_file):
+    storage = self.cluster.get_storage()
+    storage.create(role, number_of_instances, availability_zone, spec_file)
+    storage.print_status()
+    
+  def attach_storage(self, role):
+    storage = self.cluster.get_storage()
+    storage.attach(role, self.cluster.get_instances_in_role(role, 'running'))
+    storage.print_status()
+    
+  def delete_storage(self, force=False):
+    storage = self.cluster.get_storage()
+    storage.print_status()
+    if not force and not self._prompt("Delete all storage volumes? THIS WILL \
+      PERMANENTLY DELETE ALL DATA"):
+      print "Not deleting storage volumes."
+    else:
+      print "Deleting storage"
+      for role in storage.get_roles():
+        storage.delete(role)
+  
+  def login(self, ssh_options):
+    raise Exception("Unimplemented")
+    
+  def proxy(self, ssh_options):
+    raise Exception("Unimplemented")
+    
+  def push(self, ssh_options, file):
+    raise Exception("Unimplemented")
+    
+  def execute(self, ssh_options, args):
+    raise Exception("Unimplemented")
+  
+  def update_slaves_file(self, config_dir, ssh_options, private_key):
+    raise Exception("Unimplemented")
+  
+  def _prompt(self, prompt):
+    """ Returns true if user responds "yes" to prompt. """
+    return raw_input("%s [yes or no]: " % prompt).lower() == "yes"
+
+  def _call(self, command):
+    print command
+    try:
+      subprocess.call(command, shell=True)
+    except Exception, e:
+      print e
+
+  def _get_default_user_data_file_template(self):
+    data_path = os.path.join(os.path.dirname(__file__), 'data')
+    return os.path.join(data_path, '%s-%s-init-remote.sh' %
+                 (self.get_service_code(), self.cluster.get_provider_code()))
+
+  def _launch_instances(self, instance_template):
+    it = instance_template
+    user_data_file_template = it.user_data_file_template
+    if it.user_data_file_template == None:
+      user_data_file_template = self._get_default_user_data_file_template()
+    ebs_mappings = ''
+    storage = self.cluster.get_storage()
+    for role in it.roles:
+      if storage.has_any_storage((role,)):
+        ebs_mappings = storage.get_mappings_string_for_role(role)
+    replacements = { "%ENV%": build_env_string(it.env_strings, {
+      "ROLES": ",".join(it.roles),
+      "USER_PACKAGES": it.user_packages,
+      "AUTO_SHUTDOWN": it.auto_shutdown,
+      "EBS_MAPPINGS": ebs_mappings,
+    }) }
+    instance_user_data = InstanceUserData(user_data_file_template, replacements)
+    instance_ids = self.cluster.launch_instances(it.roles, it.number, it.image_id,
+                                            it.size_id,
+                                            instance_user_data,
+                                            key_name=it.key_name,
+                                            public_key=it.public_key,
+                                            placement=it.placement)
+    print "Waiting for %s instances in role %s to start" % \
+      (it.number, ",".join(it.roles))
+    try:
+      self.cluster.wait_for_instances(instance_ids)
+      print "%s instances started" % ",".join(it.roles)
+    except TimeoutException:
+      print "Timeout while waiting for %s instance to start." % ",".join(it.roles)
+      return
+    print
+    self.cluster.print_status((it.roles[0],))
+    return self.cluster.get_instances_in_role(it.roles[0], "running")
+
+  
+class HadoopService(Service):
+  """
+  A HDFS and MapReduce service.
+  """
+  
+  def __init__(self, cluster):
+    super(HadoopService, self).__init__(cluster)
+    
+  def get_service_code(self):
+    return "hadoop"
+    
+  def list_all(self, provider):
+    """
+    Find and print clusters that have a running namenode instance.
+    """
+    legacy_clusters = get_cluster(provider).get_clusters_with_role(MASTER)
+    clusters = list(get_cluster(provider).get_clusters_with_role(NAMENODE))
+    clusters.extend(legacy_clusters)
+    if not clusters:
+      print "No running clusters"
+    else:
+      for cluster in clusters:
+        print cluster
+    
+  def list(self):
+    self.cluster.print_status()
+
+  def launch_master(self, instance_template, config_dir, client_cidr):
+    if self.cluster.check_running(NAMENODE, 0) == False:
+      return  # don't proceed if another master is running
+    self.launch_cluster((instance_template,), config_dir, client_cidr)
+  
+  def launch_slaves(self, instance_template):
+    instances = self.cluster.check_running(NAMENODE, 1)
+    if not instances:
+      return
+    master = instances[0]
+    for role in (NAMENODE, SECONDARY_NAMENODE, JOBTRACKER): 
+      singleton_host_env = "%s_HOST=%s" % \
+              (self._sanitize_role_name(role), master.public_ip)
+      instance_template.add_env_strings((singleton_host_env,))
+    self._launch_instances(instance_template)              
+    self._attach_storage(instance_template.roles)
+    self._print_master_url()
+      
+  def launch_cluster(self, instance_templates, config_dir, client_cidr):
+    number_of_tasktrackers = 0
+    roles = []
+    for it in instance_templates:
+      roles.extend(it.roles)
+      if TASKTRACKER in it.roles:
+        number_of_tasktrackers += it.number
+    self._launch_cluster_instances(instance_templates)
+    self._create_client_hadoop_site_file(config_dir)
+    self._authorize_client_ports(client_cidr)
+    self._attach_storage(roles)
+    self._update_cluster_membership(instance_templates[0].public_key,
+                                    instance_templates[0].private_key)
+    try:
+      self._wait_for_hadoop(number_of_tasktrackers)
+    except TimeoutException:
+      print "Timeout while waiting for Hadoop to start. Please check logs on" +\
+        " cluster."
+    self._print_master_url()
+    
+  def login(self, ssh_options):
+    master = self._get_master()
+    if not master:
+      sys.exit(1)
+    subprocess.call('ssh %s root@%s' % \
+                    (xstr(ssh_options), master.public_ip),
+                    shell=True)
+    
+  def proxy(self, ssh_options):
+    master = self._get_master()
+    if not master:
+      sys.exit(1)
+    options = '-o "ConnectTimeout 10" -o "ServerAliveInterval 60" ' \
+              '-N -D 6666'
+    process = subprocess.Popen('ssh %s %s root@%s' %
+      (xstr(ssh_options), options, master.public_ip),
+      stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+      shell=True)
+    print """export HADOOP_CLOUD_PROXY_PID=%s;
+echo Proxy pid %s;""" % (process.pid, process.pid)
+    
+  def push(self, ssh_options, file):
+    master = self._get_master()
+    if not master:
+      sys.exit(1)
+    subprocess.call('scp %s -r %s root@%s:' % (xstr(ssh_options),
+                                               file, master.public_ip),
+                                               shell=True)
+    
+  def execute(self, ssh_options, args):
+    master = self._get_master()
+    if not master:
+      sys.exit(1)
+    subprocess.call("ssh %s root@%s '%s'" % (xstr(ssh_options),
+                                             master.public_ip,
+                                             " ".join(args)), shell=True)
+  
+  def update_slaves_file(self, config_dir, ssh_options, private_key):
+    instances = self.cluster.check_running(NAMENODE, 1)
+    if not instances:
+      sys.exit(1)
+    master = instances[0]
+    slaves = self.cluster.get_instances_in_role(DATANODE, "running")
+    cluster_dir = os.path.join(config_dir, self.cluster.name)
+    slaves_file = os.path.join(cluster_dir, 'slaves')
+    with open(slaves_file, 'w') as f:
+      for slave in slaves:
+        f.write(slave.public_ip + "\n")
+    subprocess.call('scp %s -r %s root@%s:/etc/hadoop/conf' % \
+                    (ssh_options, slaves_file, master.public_ip), shell=True)
+    # Copy private key
+    subprocess.call('scp %s -r %s root@%s:/root/.ssh/id_rsa' % \
+                    (ssh_options, private_key, master.public_ip), shell=True)
+    for slave in slaves:
+      subprocess.call('scp %s -r %s root@%s:/root/.ssh/id_rsa' % \
+                      (ssh_options, private_key, slave.public_ip), shell=True)
+        
+  def _get_master(self):
+    # For split namenode/jobtracker, designate the namenode as the master
+    return self._get_namenode()
+
+  def _get_namenode(self):
+    instances = self.cluster.get_instances_in_role(NAMENODE, "running")
+    if not instances:
+      return None
+    return instances[0]
+
+  def _get_jobtracker(self):
+    instances = self.cluster.get_instances_in_role(JOBTRACKER, "running")
+    if not instances:
+      return None
+    return instances[0]
+
+  def _launch_cluster_instances(self, instance_templates):
+    singleton_hosts = []
+    for instance_template in instance_templates:
+      instance_template.add_env_strings(singleton_hosts)
+      instances = self._launch_instances(instance_template)
+      if instance_template.number == 1:
+        if len(instances) != 1:
+          logger.error("Expected a single '%s' instance, but found %s.",
+                       "".join(instance_template.roles), len(instances))
+          return
+        else:
+          for role in instance_template.roles:
+            singleton_host_env = "%s_HOST=%s" % \
+              (self._sanitize_role_name(role),
+               instances[0].public_ip)
+            singleton_hosts.append(singleton_host_env)
+
+  def _sanitize_role_name(self, role):
+    """Replace characters in role name with ones allowed in bash variable names"""
+    return role.replace('+', '_').upper()
+
+  def _authorize_client_ports(self, client_cidrs=[]):
+    if not client_cidrs:
+      logger.debug("No client CIDRs specified, using local address.")
+      client_ip = url_get('http://checkip.amazonaws.com/').strip()
+      client_cidrs = ("%s/32" % client_ip,)
+    logger.debug("Client CIDRs: %s", client_cidrs)
+    namenode = self._get_namenode()
+    jobtracker = self._get_jobtracker()
+    for client_cidr in client_cidrs:
+      # Allow access to port 80 on namenode from client
+      self.cluster.authorize_role(NAMENODE, 80, 80, client_cidr)
+      # Allow access to jobtracker UI on master from client
+      # (so we can see when the cluster is ready)
+      self.cluster.authorize_role(JOBTRACKER, 50030, 50030, client_cidr)
+    # Allow access to namenode and jobtracker via public address from each other
+    namenode_ip = socket.gethostbyname(namenode.public_ip)
+    jobtracker_ip = socket.gethostbyname(jobtracker.public_ip)
+    self.cluster.authorize_role(NAMENODE, 8020, 8020, "%s/32" % namenode_ip)
+    self.cluster.authorize_role(NAMENODE, 8020, 8020, "%s/32" % jobtracker_ip)
+    self.cluster.authorize_role(JOBTRACKER, 8021, 8021, "%s/32" % namenode_ip)
+    self.cluster.authorize_role(JOBTRACKER, 8021, 8021,
+                                "%s/32" % jobtracker_ip)
+  
+  def _create_client_hadoop_site_file(self, config_dir):
+    namenode = self._get_namenode()
+    jobtracker = self._get_jobtracker()
+    cluster_dir = os.path.join(config_dir, self.cluster.name)
+    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID') or ''
+    aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY') or ''
+    if not os.path.exists(cluster_dir):
+      os.makedirs(cluster_dir)
+    with open(os.path.join(cluster_dir, 'hadoop-site.xml'), 'w') as f:
+      f.write("""<?xml version="1.0"?>
+  <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+  <!-- Put site-specific property overrides in this file. -->
+  <configuration>
+  <property>
+    <name>hadoop.job.ugi</name>
+    <value>root,root</value>
+  </property>
+  <property>
+    <name>fs.default.name</name>
+    <value>hdfs://%(namenode)s:8020/</value>
+  </property>
+  <property>
+    <name>mapred.job.tracker</name>
+    <value>%(jobtracker)s:8021</value>
+  </property>
+  <property>
+    <name>hadoop.socks.server</name>
+    <value>localhost:6666</value>
+  </property>
+  <property>
+    <name>hadoop.rpc.socket.factory.class.default</name>
+    <value>org.apache.hadoop.net.SocksSocketFactory</value>
+  </property>
+  <property>
+    <name>fs.s3.awsAccessKeyId</name>
+    <value>%(aws_access_key_id)s</value>
+  </property>
+  <property>
+    <name>fs.s3.awsSecretAccessKey</name>
+    <value>%(aws_secret_access_key)s</value>
+  </property>
+  <property>
+    <name>fs.s3n.awsAccessKeyId</name>
+    <value>%(aws_access_key_id)s</value>
+  </property>
+  <property>
+    <name>fs.s3n.awsSecretAccessKey</name>
+    <value>%(aws_secret_access_key)s</value>
+  </property>
+  </configuration>
+  """ % {'namenode': namenode.public_ip,
+    'jobtracker': jobtracker.public_ip,
+    'aws_access_key_id': aws_access_key_id,
+    'aws_secret_access_key': aws_secret_access_key})        
+
+  def _wait_for_hadoop(self, number, timeout=600):
+    start_time = time.time()
+    jobtracker = self._get_jobtracker()
+    if not jobtracker:
+      return
+    print "Waiting for jobtracker to start"
+    previous_running = 0
+    while True:
+      if (time.time() - start_time >= timeout):
+        raise TimeoutException()
+      try:
+        actual_running = self._number_of_tasktrackers(jobtracker.public_ip, 1)
+        break
+      except IOError:
+        pass
+      sys.stdout.write(".")
+      sys.stdout.flush()
+      time.sleep(1)
+    print
+    if number > 0:
+      print "Waiting for %d tasktrackers to start" % number
+      while actual_running < number:
+        if (time.time() - start_time >= timeout):
+          raise TimeoutException()
+        try:
+          actual_running = self._number_of_tasktrackers(jobtracker.public_ip, 5, 2)
+          if actual_running != previous_running:
+            sys.stdout.write("%d" % actual_running)
+          sys.stdout.write(".")
+          sys.stdout.flush()
+          time.sleep(1)
+          previous_running = actual_running
+        except IOError:
+          pass
+      print
+
+  # The optional ?type=active is a difference between Hadoop 0.18 and 0.20
+  _NUMBER_OF_TASK_TRACKERS = re.compile(
+    r'<a href="machines.jsp(?:\?type=active)?">(\d+)</a>')
+  
+  def _number_of_tasktrackers(self, jt_hostname, timeout, retries=0):
+    jt_page = url_get("http://%s:50030/jobtracker.jsp" % jt_hostname, timeout,
+                      retries)
+    m = self._NUMBER_OF_TASK_TRACKERS.search(jt_page)
+    if m:
+      return int(m.group(1))
+    return 0
+
+  def _print_master_url(self):
+    webserver = self._get_jobtracker()
+    if not webserver:
+      return
+    print "Browse the cluster at http://%s/" % webserver.public_ip
+
+  def _attach_storage(self, roles):
+    storage = self.cluster.get_storage()
+    if storage.has_any_storage(roles):
+      print "Waiting 10 seconds before attaching storage"
+      time.sleep(10)
+      for role in roles:
+        storage.attach(role, self.cluster.get_instances_in_role(role, 'running'))
+      storage.print_status(roles)
+      
+  def _update_cluster_membership(self, public_key, private_key):
+    pass
+
+
+class ZooKeeperService(Service):
+  """
+  A ZooKeeper service.
+  """
+
+  ZOOKEEPER_ROLE = "zk"
+
+  def __init__(self, cluster):
+    super(ZooKeeperService, self).__init__(cluster)
+    
+  def get_service_code(self):
+    return "zookeeper"
+
+  def launch_cluster(self, instance_templates, config_dir, client_cidr):
+    self._launch_cluster_instances(instance_templates)
+    self._authorize_client_ports(client_cidr)
+    self._update_cluster_membership(instance_templates[0].public_key)
+    
+  def _launch_cluster_instances(self, instance_templates):
+    for instance_template in instance_templates:
+      instances = self._launch_instances(instance_template)
+
+  def _authorize_client_ports(self, client_cidrs=[]):
+    if not client_cidrs:
+      logger.debug("No client CIDRs specified, using local address.")
+      client_ip = url_get('http://checkip.amazonaws.com/').strip()
+      client_cidrs = ("%s/32" % client_ip,)
+    logger.debug("Client CIDRs: %s", client_cidrs)
+    for client_cidr in client_cidrs:
+      self.cluster.authorize_role(self.ZOOKEEPER_ROLE, 2181, 2181, client_cidr)
+  
+  def _update_cluster_membership(self, public_key):
+    time.sleep(30) # wait for SSH daemon to start
+    
+    ssh_options = '-o StrictHostKeyChecking=no'
+    private_key = public_key[:-4] # TODO: pass in private key explicitly
+
+    instances = self.cluster.get_instances_in_role(self.ZOOKEEPER_ROLE,
+                                                   'running')
+    config_file = 'zoo.cfg'
+    with open(config_file, 'w') as f:
+      f.write("""# The number of milliseconds of each tick
+tickTime=2000
+# The number of ticks that the initial
+# synchronization phase can take
+initLimit=10
+# The number of ticks that can pass between
+# sending a request and getting an acknowledgement
+syncLimit=5
+# The directory where the snapshot is stored.
+dataDir=/var/log/zookeeper/txlog
+# The port at which the clients will connect
+clientPort=2181
+# The servers in the ensemble
+""")
+      counter = 1
+      for i in instances:
+        f.write("server.%s=%s:2888:3888\n" % (counter, i.private_ip))
+        counter += 1
+    # copy to each node in the cluster
+    myid_file = 'myid'
+    counter = 1
+    for i in instances:
+      self._call('scp -i %s %s %s root@%s:/etc/zookeeper/conf/zoo.cfg' \
+                 % (private_key, ssh_options, config_file, i.public_ip))
+      with open(myid_file, 'w') as f:
+        f.write(str(counter) + "\n")
+      self._call('scp -i %s %s %s root@%s:/var/log/zookeeper/txlog/myid' \
+                 % (private_key, ssh_options, myid_file, i.public_ip))
+      counter += 1
+    os.remove(config_file)
+    os.remove(myid_file)
+
+    # start the zookeeper servers
+    for i in instances:
+      self._call('ssh -i %s %s root@%s nohup /etc/rc.local &' \
+                 % (private_key, ssh_options, i.public_ip))
+      
+    hosts_string = ",".join(["%s:2181" % i.public_ip for i in instances]) 
+    print "ZooKeeper cluster: %s" % hosts_string
+
+SERVICE_PROVIDER_MAP = {
+  "hadoop": {
+     "rackspace": ('hadoop.cloud.providers.rackspace', 'RackspaceHadoopService')
+  },
+  "zookeeper": {
+    # "provider_code": ('hadoop.cloud.providers.provider_code', 'ProviderZooKeeperService')
+  },
+}
+
+DEFAULT_SERVICE_PROVIDER_MAP = {
+  "hadoop": HadoopService,
+  "zookeeper": ZooKeeperService
+}
+
+def get_service(service, provider):
+  """
+  Retrieve the Service class for a service and provider.
+  """
+  try:
+    mod_name, service_classname = SERVICE_PROVIDER_MAP[service][provider]
+    _mod = __import__(mod_name, globals(), locals(), [service_classname])
+    return getattr(_mod, service_classname)
+  except KeyError:
+    return DEFAULT_SERVICE_PROVIDER_MAP[service]

+ 173 - 0
src/contrib/cloud/src/py/hadoop/cloud/storage.py

@@ -0,0 +1,173 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Classes for controlling external cluster storage.
+"""
+
+import logging
+import simplejson as json
+
+logger = logging.getLogger(__name__)
+
+class VolumeSpec(object):
+  """
+  The specification for a storage volume, encapsulating all the information
+  needed to create a volume and ultimately mount it on an instance.
+  """
+  def __init__(self, size, mount_point, device, snapshot_id):
+    self.size = size
+    self.mount_point = mount_point
+    self.device = device
+    self.snapshot_id = snapshot_id
+
+
+class JsonVolumeSpecManager(object):
+  """
+  A container for VolumeSpecs. This object can read VolumeSpecs specified in
+  JSON.
+  """
+  def __init__(self, spec_file):
+    self.spec = json.load(spec_file)
+
+  def volume_specs_for_role(self, role):
+    return [VolumeSpec(d["size_gb"], d["mount_point"], d["device"],
+                       d["snapshot_id"]) for d in self.spec[role]]
+
+  def get_mappings_string_for_role(self, role):
+    """
+    Returns a short string of the form
+    "mount_point1,device1;mount_point2,device2;..."
+    which is useful for passing as an environment variable.
+    """
+    return ";".join(["%s,%s" % (d["mount_point"], d["device"])
+                     for d in self.spec[role]])
+
+
+class MountableVolume(object):
+  """
+  A storage volume that has been created. It may or may not have been attached
+  or mounted to an instance.
+  """
+  def __init__(self, volume_id, mount_point, device):
+    self.volume_id = volume_id
+    self.mount_point = mount_point
+    self.device = device
+
+
+class JsonVolumeManager(object):
+
+  def __init__(self, filename):
+    self.filename = filename
+
+  def _load(self):
+    try:
+      return json.load(open(self.filename, "r"))
+    except IOError:
+      logger.debug("File %s does not exist.", self.filename)
+      return {}
+
+  def _store(self, obj):
+    return json.dump(obj, open(self.filename, "w"), sort_keys=True, indent=2)
+  
+  def get_roles(self):
+    json_dict = self._load()
+    return json_dict.keys()
+
+  def add_instance_storage_for_role(self, role, mountable_volumes):
+    json_dict = self._load()
+    mv_dicts = [mv.__dict__ for mv in mountable_volumes]
+    json_dict.setdefault(role, []).append(mv_dicts)
+    self._store(json_dict)
+
+  def remove_instance_storage_for_role(self, role):
+    json_dict = self._load()
+    del json_dict[role]
+    self._store(json_dict)
+
+  def get_instance_storage_for_role(self, role):
+    """
+    Returns a list of lists of MountableVolume objects. Each nested list is
+    the storage for one instance.
+    """
+    try:
+      json_dict = self._load()
+      instance_storage = []
+      for instance in json_dict[role]:
+        vols = []
+        for vol in instance:
+          vols.append(MountableVolume(vol["volume_id"], vol["mount_point"],
+                                      vol["device"]))
+        instance_storage.append(vols)
+      return instance_storage
+    except KeyError:
+      return []
+
+class Storage(object):
+  """
+  Storage volumes for a cluster. The storage is associated with a named
+  cluster. Many clusters just have local storage, in which case this is
+  not used.
+  """
+
+  def __init__(self, cluster):
+    self.cluster = cluster
+
+  def create(self, role, number_of_instances, availability_zone, spec_filename):
+    """
+    Create new storage volumes for instances with the given role, according to
+    the mapping defined in the spec file.
+    """
+    pass
+
+  def get_mappings_string_for_role(self, role):
+    """
+    Returns a short string of the form
+    "mount_point1,device1;mount_point2,device2;..."
+    which is useful for passing as an environment variable.
+    """
+    raise Exception("Unimplemented")
+
+  def has_any_storage(self, roles):
+    """
+    Return True if any of the given roles has associated storage
+    """
+    return False
+
+  def get_roles(self):
+    """
+    Return a list of roles that have storage defined.
+    """
+    return []
+
+  def print_status(self, roles=None):
+    """
+    Print the status of storage volumes for the given roles.
+    """
+    pass
+
+  def attach(self, role, instances):
+    """
+    Attach volumes for a role to instances. Some volumes may already be
+    attached, in which case they are ignored, and we take care not to attach
+    multiple volumes to an instance.
+    """
+    pass
+
+  def delete(self, roles=[]):
+    """
+    Permanently delete all the storage for the given roles.
+    """
+    pass

+ 84 - 0
src/contrib/cloud/src/py/hadoop/cloud/util.py

@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Utility functions.
+"""
+
+import ConfigParser
+import socket
+import urllib2
+
+def bash_quote(text):
+  """Quotes a string for bash, by using single quotes."""
+  if text == None:
+    return ""
+  return "'%s'" % text.replace("'", "'\\''")
+
+def bash_quote_env(env):
+  """Quotes the value in an environment variable assignment."""
+  if env.find("=") == -1:
+    return env
+  (var, value) = env.split("=", 1)
+  return "%s=%s" % (var, bash_quote(value))
+
+def build_env_string(env_strings=[], pairs={}):
+  """Build a bash environment variable assignment"""
+  env = ''
+  if env_strings:
+    for env_string in env_strings:
+      env += "%s " % bash_quote_env(env_string)
+  if pairs:
+    for key, val in pairs.items():
+      env += "%s=%s " % (key, bash_quote(val))
+  return env[:-1]
+
+def merge_config_with_options(section_name, config, options):
+  """
+  Merge configuration options with a dictionary of options.
+  Keys in the options dictionary take precedence.
+  """
+  res = {}
+  try:
+    for (key, value) in config.items(section_name):
+      if value.find("\n") != -1:
+        res[key] = value.split("\n")
+      else:
+        res[key] = value
+  except ConfigParser.NoSectionError:
+    pass
+  for key in options:
+    if options[key] != None:
+      res[key] = options[key]
+  return res
+
+def url_get(url, timeout=10, retries=0):
+  """
+  Retrieve content from the given URL.
+  """
+  # in Python 2.6 we can pass timeout to urllib2.urlopen
+  socket.setdefaulttimeout(timeout)
+  attempts = 0
+  while True:
+    try:
+      return urllib2.urlopen(url).read()
+    except urllib2.URLError:
+      attempts = attempts + 1
+      if attempts > retries:
+        raise
+
+def xstr(string):
+  """Sane string conversion: return an empty string if string is None."""
+  return '' if string is None else str(string)

+ 30 - 0
src/contrib/cloud/src/py/setup.py

@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from distutils.core import setup
+
+version = __import__('hadoop.cloud').cloud.VERSION
+
+setup(name='hadoop-cloud',
+      version=version,
+      description='Scripts for running Hadoop on cloud providers',
+      license = 'Apache License (2.0)',
+      url = 'http://hadoop.apache.org/common/',
+      packages=['hadoop', 'hadoop.cloud','hadoop.cloud.providers'],
+      package_data={'hadoop.cloud': ['data/*.sh']},
+      scripts=['hadoop-ec2'],
+      author = 'Apache Hadoop Contributors',
+      author_email = 'common-dev@hadoop.apache.org',
+)

+ 37 - 0
src/contrib/cloud/src/test/py/testcluster.py

@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from hadoop.cloud.cluster import RoleSyntaxException
+from hadoop.cloud.providers.ec2 import Ec2Cluster
+
+class TestCluster(unittest.TestCase):
+
+  def test_group_name_for_role(self):
+    cluster = Ec2Cluster("test-cluster", None)
+    self.assertEqual("test-cluster-foo", cluster._group_name_for_role("foo"))
+
+  def test_check_role_name_valid(self):
+    cluster = Ec2Cluster("test-cluster", None)
+    cluster._check_role_name(
+      "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+")
+
+  def test_check_role_name_dash_is_invalid(self):
+    cluster = Ec2Cluster("test-cluster", None)
+    self.assertRaises(RoleSyntaxException, cluster._check_role_name, "a-b")
+
+if __name__ == '__main__':
+  unittest.main()

+ 74 - 0
src/contrib/cloud/src/test/py/testrackspace.py

@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import StringIO
+import unittest
+
+from hadoop.cloud.providers.rackspace import RackspaceCluster
+
+class TestCluster(unittest.TestCase):
+
+  class DriverStub(object):
+    def list_nodes(self):
+      class NodeStub(object):
+        def __init__(self, name, metadata):
+          self.id = name
+          self.name = name
+          self.state = 'ACTIVE'
+          self.public_ip = ['100.0.0.1']
+          self.private_ip = ['10.0.0.1']
+          self.extra = { 'metadata': metadata }
+      return [NodeStub('random_instance', {}),
+              NodeStub('cluster1-nj-000', {'cluster': 'cluster1', 'roles': 'nn,jt'}),
+              NodeStub('cluster1-dt-000', {'cluster': 'cluster1', 'roles': 'dn,tt'}),
+              NodeStub('cluster1-dt-001', {'cluster': 'cluster1', 'roles': 'dn,tt'}),
+              NodeStub('cluster2-dt-000', {'cluster': 'cluster2', 'roles': 'dn,tt'}),
+              NodeStub('cluster3-nj-000', {'cluster': 'cluster3', 'roles': 'nn,jt'})]
+
+  def test_get_clusters_with_role(self):
+    self.assertEqual(set(['cluster1', 'cluster2']),
+      RackspaceCluster.get_clusters_with_role('dn', 'running',
+                                           TestCluster.DriverStub()))
+    
+  def test_get_instances_in_role(self):
+    cluster = RackspaceCluster('cluster1', None, TestCluster.DriverStub())
+    
+    instances = cluster.get_instances_in_role('nn')
+    self.assertEquals(1, len(instances))
+    self.assertEquals('cluster1-nj-000', instances[0].id)
+
+    instances = cluster.get_instances_in_role('tt')
+    self.assertEquals(2, len(instances))
+    self.assertEquals(set(['cluster1-dt-000', 'cluster1-dt-001']),
+                      set([i.id for i in instances]))
+    
+  def test_print_status(self):
+    cluster = RackspaceCluster('cluster1', None, TestCluster.DriverStub())
+    
+    out = StringIO.StringIO()
+    cluster.print_status(None, "running", out)
+    self.assertEquals("""nn,jt cluster1-nj-000 cluster1-nj-000 100.0.0.1 10.0.0.1 running
+dn,tt cluster1-dt-000 cluster1-dt-000 100.0.0.1 10.0.0.1 running
+dn,tt cluster1-dt-001 cluster1-dt-001 100.0.0.1 10.0.0.1 running
+""", out.getvalue().replace("\t", " "))
+
+    out = StringIO.StringIO()
+    cluster.print_status(["dn"], "running", out)
+    self.assertEquals("""dn,tt cluster1-dt-000 cluster1-dt-000 100.0.0.1 10.0.0.1 running
+dn,tt cluster1-dt-001 cluster1-dt-001 100.0.0.1 10.0.0.1 running
+""", out.getvalue().replace("\t", " "))
+
+if __name__ == '__main__':
+  unittest.main()

+ 143 - 0
src/contrib/cloud/src/test/py/teststorage.py

@@ -0,0 +1,143 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import simplejson as json
+from StringIO import StringIO
+
+from hadoop.cloud.storage import MountableVolume
+from hadoop.cloud.storage import JsonVolumeManager
+from hadoop.cloud.storage import JsonVolumeSpecManager
+
+spec = {
+ "master": ({"size_gb":"8", "mount_point":"/", "device":"/dev/sdj",
+             "snapshot_id": "snap_1"},
+            ),
+ "slave": ({"size_gb":"8", "mount_point":"/", "device":"/dev/sdj",
+            "snapshot_id": "snap_2"},
+           {"size_gb":"10", "mount_point":"/data1", "device":"/dev/sdk",
+            "snapshot_id": "snap_3"},
+           )
+ }
+
+class TestJsonVolumeSpecManager(unittest.TestCase):
+
+  def test_volume_specs_for_role(self):
+
+    input = StringIO(json.dumps(spec))
+
+    volume_spec_manager = JsonVolumeSpecManager(input)
+
+    master_specs = volume_spec_manager.volume_specs_for_role("master")
+    self.assertEqual(1, len(master_specs))
+    self.assertEqual("/", master_specs[0].mount_point)
+    self.assertEqual("8", master_specs[0].size)
+    self.assertEqual("/dev/sdj", master_specs[0].device)
+    self.assertEqual("snap_1", master_specs[0].snapshot_id)
+
+    slave_specs = volume_spec_manager.volume_specs_for_role("slave")
+    self.assertEqual(2, len(slave_specs))
+    self.assertEqual("snap_2", slave_specs[0].snapshot_id)
+    self.assertEqual("snap_3", slave_specs[1].snapshot_id)
+
+    self.assertRaises(KeyError, volume_spec_manager.volume_specs_for_role,
+                      "no-such-role")
+
+  def test_get_mappings_string_for_role(self):
+
+    input = StringIO(json.dumps(spec))
+
+    volume_spec_manager = JsonVolumeSpecManager(input)
+
+    master_mappings = volume_spec_manager.get_mappings_string_for_role("master")
+    self.assertEqual("/,/dev/sdj", master_mappings)
+
+    slave_mappings = volume_spec_manager.get_mappings_string_for_role("slave")
+    self.assertEqual("/,/dev/sdj;/data1,/dev/sdk", slave_mappings)
+
+    self.assertRaises(KeyError,
+                      volume_spec_manager.get_mappings_string_for_role,
+                      "no-such-role")
+
+class TestJsonVolumeManager(unittest.TestCase):
+
+  def tearDown(self):
+    try:
+      os.remove("volumemanagertest.json")
+    except OSError:
+      pass
+    
+  def test_add_instance_storage_for_role(self):
+    volume_manager = JsonVolumeManager("volumemanagertest.json")
+    self.assertEqual(0,
+      len(volume_manager.get_instance_storage_for_role("master")))
+    self.assertEqual(0, len(volume_manager.get_roles()))
+
+    volume_manager.add_instance_storage_for_role("master",
+                                                 [MountableVolume("vol_1", "/",
+                                                                  "/dev/sdj")])
+    master_storage = volume_manager.get_instance_storage_for_role("master")
+    self.assertEqual(1, len(master_storage))
+    master_storage_instance0 = master_storage[0]
+    self.assertEqual(1, len(master_storage_instance0))
+    master_storage_instance0_vol0 = master_storage_instance0[0]
+    self.assertEqual("vol_1", master_storage_instance0_vol0.volume_id)
+    self.assertEqual("/", master_storage_instance0_vol0.mount_point)
+    self.assertEqual("/dev/sdj", master_storage_instance0_vol0.device)
+
+    volume_manager.add_instance_storage_for_role("slave",
+                                                 [MountableVolume("vol_2", "/",
+                                                                  "/dev/sdj")])
+    self.assertEqual(1,
+      len(volume_manager.get_instance_storage_for_role("master")))
+    slave_storage = volume_manager.get_instance_storage_for_role("slave")
+    self.assertEqual(1, len(slave_storage))
+    slave_storage_instance0 = slave_storage[0]
+    self.assertEqual(1, len(slave_storage_instance0))
+    slave_storage_instance0_vol0 = slave_storage_instance0[0]
+    self.assertEqual("vol_2", slave_storage_instance0_vol0.volume_id)
+    self.assertEqual("/", slave_storage_instance0_vol0.mount_point)
+    self.assertEqual("/dev/sdj", slave_storage_instance0_vol0.device)
+
+    volume_manager.add_instance_storage_for_role("slave",
+      [MountableVolume("vol_3", "/", "/dev/sdj"),
+       MountableVolume("vol_4", "/data1", "/dev/sdk")])
+    self.assertEqual(1,
+      len(volume_manager.get_instance_storage_for_role("master")))
+    slave_storage = volume_manager.get_instance_storage_for_role("slave")
+    self.assertEqual(2, len(slave_storage))
+    slave_storage_instance0 = slave_storage[0]
+    slave_storage_instance1 = slave_storage[1]
+    self.assertEqual(1, len(slave_storage_instance0))
+    self.assertEqual(2, len(slave_storage_instance1))
+    slave_storage_instance1_vol0 = slave_storage_instance1[0]
+    slave_storage_instance1_vol1 = slave_storage_instance1[1]
+    self.assertEqual("vol_3", slave_storage_instance1_vol0.volume_id)
+    self.assertEqual("/", slave_storage_instance1_vol0.mount_point)
+    self.assertEqual("/dev/sdj", slave_storage_instance1_vol0.device)
+    self.assertEqual("vol_4", slave_storage_instance1_vol1.volume_id)
+    self.assertEqual("/data1", slave_storage_instance1_vol1.mount_point)
+    self.assertEqual("/dev/sdk", slave_storage_instance1_vol1.device)
+    
+    roles = volume_manager.get_roles()
+    self.assertEqual(2, len(roles))
+    self.assertTrue("slave" in roles)
+    self.assertTrue("master" in roles)
+
+
+if __name__ == '__main__':
+  unittest.main()
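The spec dictionary at the top of this test mirrors the JSON storage specification a user supplies for EBS-style volumes: a map from role name to a list of per-instance volumes, each described by size_gb, mount_point, device and snapshot_id. A hedged sketch of writing such a file for JsonVolumeSpecManager to read (the file name and values are illustrative):

import simplejson as json

storage_spec = {
  "master": [{"size_gb": "8", "mount_point": "/", "device": "/dev/sdj",
              "snapshot_id": "snap_1"}],
  "slave": [{"size_gb": "8", "mount_point": "/", "device": "/dev/sdj",
             "snapshot_id": "snap_2"},
            {"size_gb": "10", "mount_point": "/data1", "device": "/dev/sdk",
             "snapshot_id": "snap_3"}],
}

# Written to disk, this is the kind of file the spec manager parses.
f = open("my-storage-spec.json", "w")
f.write(json.dumps(storage_spec, indent=2))
f.close()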

+ 44 - 0
src/contrib/cloud/src/test/py/testuserdata.py

@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import tempfile
+import unittest
+
+from hadoop.cloud.cluster import InstanceUserData
+
+class TestInstanceUserData(unittest.TestCase):
+
+  def test_replacement(self):
+    file = tempfile.NamedTemporaryFile()
+    file.write("Contents go here")
+    file.flush()
+    self.assertEqual("Contents go here",
+                     InstanceUserData(file.name, {}).read())
+    self.assertEqual("Contents were here",
+                     InstanceUserData(file.name, { "go": "were"}).read())
+    self.assertEqual("Contents  here",
+                     InstanceUserData(file.name, { "go": None}).read())
+    file.close()
+
+  def test_read_file_url(self):
+    file = tempfile.NamedTemporaryFile()
+    file.write("Contents go here")
+    file.flush()
+    self.assertEqual("Contents go here",
+                     InstanceUserData("file://%s" % file.name, {}).read())
+    file.close()
+
+if __name__ == '__main__':
+  unittest.main()
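What these tests exercise is the templating used for instance boot scripts: InstanceUserData reads a local path or file:// URL and performs plain string substitution on its contents, with a None value replaced by the empty string. A small self-contained sketch (the path and placeholder names are made up for illustration):

from hadoop.cloud.cluster import InstanceUserData

f = open("/tmp/boot.sh", "w")
f.write("export ENV='%ENV%' %EXTRA%\n")
f.close()

user_data = InstanceUserData("file:///tmp/boot.sh", {
    "%ENV%": "CLUSTER=alpha",
    "%EXTRA%": None,  # None substitutes an empty string
})
print user_data.read()  # -> "export ENV='CLUSTER=alpha' \n"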

+ 81 - 0
src/contrib/cloud/src/test/py/testutil.py

@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ConfigParser
+import StringIO
+import unittest
+
+from hadoop.cloud.util import bash_quote
+from hadoop.cloud.util import bash_quote_env
+from hadoop.cloud.util import build_env_string
+from hadoop.cloud.util import merge_config_with_options
+from hadoop.cloud.util import xstr
+
+class TestUtilFunctions(unittest.TestCase):
+
+  def test_bash_quote(self):
+    self.assertEqual("", bash_quote(None))
+    self.assertEqual("''", bash_quote(""))
+    self.assertEqual("'a'", bash_quote("a"))
+    self.assertEqual("'a b'", bash_quote("a b"))
+    self.assertEqual("'a\b'", bash_quote("a\b"))
+    self.assertEqual("'a '\\'' b'", bash_quote("a ' b"))
+
+  def test_bash_quote_env(self):
+    self.assertEqual("", bash_quote_env(""))
+    self.assertEqual("a", bash_quote_env("a"))
+    self.assertEqual("a='b'", bash_quote_env("a=b"))
+    self.assertEqual("a='b c'", bash_quote_env("a=b c"))
+    self.assertEqual("a='b\c'", bash_quote_env("a=b\c"))
+    self.assertEqual("a='b '\\'' c'", bash_quote_env("a=b ' c"))
+
+  def test_build_env_string(self):
+    self.assertEqual("", build_env_string())
+    self.assertEqual("a='b' c='d'",
+                     build_env_string(env_strings=["a=b", "c=d"]))
+    self.assertEqual("a='b' c='d'",
+                     build_env_string(pairs={"a": "b", "c": "d"}))
+
+  def test_merge_config_with_options(self):
+    options = { "a": "b" }
+    config = ConfigParser.ConfigParser()
+    self.assertEqual({ "a": "b" },
+                     merge_config_with_options("section", config, options))
+    config.add_section("section")
+    self.assertEqual({ "a": "b" },
+                     merge_config_with_options("section", config, options))
+    config.set("section", "a", "z")
+    config.set("section", "c", "d")
+    self.assertEqual({ "a": "z", "c": "d" },
+                     merge_config_with_options("section", config, {}))
+    self.assertEqual({ "a": "b", "c": "d" },
+                     merge_config_with_options("section", config, options))
+
+  def test_merge_config_with_options_list(self):
+    config = ConfigParser.ConfigParser()
+    config.readfp(StringIO.StringIO("""[section]
+env1=a=b
+ c=d
+env2=e=f
+ g=h"""))
+    self.assertEqual({ "env1": ["a=b", "c=d"], "env2": ["e=f", "g=h"] },
+                     merge_config_with_options("section", config, {}))
+
+  def test_xstr(self):
+    self.assertEqual("", xstr(None))
+    self.assertEqual("a", xstr("a"))
+
+if __name__ == '__main__':
+  unittest.main()
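Besides the config merging covered by the earlier tests, the bash_quote helpers show how values are made safe for the remote init scripts: build_env_string single-quotes each value so spaces and embedded quotes survive being pasted into a shell command line. A minimal sketch (the key and value are invented):

from hadoop.cloud.util import build_env_string

env = build_env_string(pairs={"AWS_ACCESS_KEY_ID": "my key id"})
print env  # -> AWS_ACCESS_KEY_ID='my key id'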

+ 46 - 0
src/contrib/cloud/tools/rackspace/remote-setup.sh

@@ -0,0 +1,46 @@
+#!/bin/bash -x
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# Given an Ubuntu base system install, install the base packages we need.
+#
+
+# We require multiverse to be enabled.
+cat >> /etc/apt/sources.list << EOF
+deb http://us.archive.ubuntu.com/ubuntu/ intrepid multiverse
+deb-src http://us.archive.ubuntu.com/ubuntu/ intrepid multiverse
+deb http://us.archive.ubuntu.com/ubuntu/ intrepid-updates multiverse
+deb-src http://us.archive.ubuntu.com/ubuntu/ intrepid-updates multiverse
+EOF
+
+apt-get update
+
+# Install Java
+apt-get -y install sun-java6-jdk
+echo "export JAVA_HOME=/usr/lib/jvm/java-6-sun" >> /etc/profile
+export JAVA_HOME=/usr/lib/jvm/java-6-sun
+java -version
+
+# Install general packages
+apt-get -y install vim curl screen ssh rsync unzip openssh-server
+apt-get -y install policykit # http://www.bergek.com/2008/11/24/ubuntu-810-libpolkit-error/
+
+# Create root's .ssh directory if it doesn't exist
+mkdir -p /root/.ssh
+
+# Run any rackspace init script injected at boot time
+echo '[ -f /etc/init.d/rackspace-init.sh ] && /bin/sh /etc/init.d/rackspace-init.sh; exit 0' > /etc/rc.local

+ 1 - 1
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/Anonymizer.java

@@ -136,7 +136,7 @@ public class Anonymizer {
   }
 
   private static String convertToHex(byte[] data) {
-    StringBuffer buf = new StringBuffer();
+    StringBuilder buf = new StringBuilder();
     for (int i = 0; i < data.length; i++) {
       int halfbyte = (data[i] >>> 4) & 0x0F;
       int two_halfs = 0;

+ 1 - 1
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/CPUParser.java

@@ -46,7 +46,7 @@ public class CPUParser extends ShellParser {
    * @return the EventRecord created
    */
   public EventRecord query(String s) throws Exception {
-    StringBuffer sb = Environment.runCommand("cat /proc/cpuinfo");
+    CharSequence sb = Environment.runCommandGeneric("cat /proc/cpuinfo");
     EventRecord retval = new EventRecord(InetAddress.getLocalHost()
         .getCanonicalHostName(), InetAddress.getAllByName(InetAddress.getLocalHost()
         .getHostName()), Calendar.getInstance(), "CPU", "Unknown", "CPU", "-");

+ 39 - 11
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/Environment.java

@@ -247,7 +247,7 @@ public class Environment {
         
         if (!file_present) 
           if (superuser) {
-              StringBuffer sb = runCommand("sudo smartctl -i " + devices[i]);
+              CharSequence sb = runCommandGeneric("sudo smartctl -i " + devices[i]);
               String patternStr = "[(failed)(device not supported)]";
               Pattern pattern = Pattern.compile(patternStr);
               Matcher matcher = pattern.matcher(sb.toString());
@@ -263,7 +263,7 @@ public class Environment {
       } 
       
      // now remove disks that don't exist
-      StringBuffer resetSB = new StringBuffer();
+      StringBuilder resetSB = new StringBuilder();
       for (int j = 0; j < devices.length; j++) {
         resetSB.append(devices[j] == null ? "" : devices[j] + ", ");
 	if (devices[j] != null)
@@ -323,7 +323,7 @@ public class Environment {
   *  @return true, if the command is available, false otherwise
    */
   public static boolean checkExistence(String cmd) {
-    StringBuffer sb = runCommand("which " + cmd);
+    CharSequence sb = runCommandGeneric("which " + cmd);
     if (sb.length() > 1)
       return true;
 
@@ -331,15 +331,30 @@ public class Environment {
   }
 
   /**
-   * Runs a shell command in the system and provides a StringBuffer
+   * Runs a shell command in the system and provides a StringBuilder
    * with the output of the command.
+   * <p>This method is deprecated. See the related method that returns a CharSequence as opposed to a StringBuffer.
    * 
    *  @param cmd an array of string that form the command to run 
    *  
-   *  @return a StringBuffer that contains the output of the command 
+   *  @return a text that contains the output of the command 
+   *  @see #runCommandGeneric(String[])
+   *  @deprecated
    */
   public static StringBuffer runCommand(String[] cmd) {
-    StringBuffer retval = new StringBuffer(MAX_OUTPUT_LENGTH);
+    return new StringBuffer(runCommandGeneric(cmd));
+  }
+
+  /**
+   * Runs a shell command in the system and provides a StringBuilder
+   * with the output of the command.
+   * 
+   *  @param cmd an array of string that form the command to run 
+   *  
+   *  @return a text that contains the output of the command 
+   */
+  public static CharSequence runCommandGeneric(String[] cmd) {
+    StringBuilder retval = new StringBuilder(MAX_OUTPUT_LENGTH);
     Process p;
     try {
       p = Runtime.getRuntime().exec(cmd);
@@ -356,19 +371,32 @@ public class Environment {
 
     return retval;
   }
-
+  
   /**
-   * Runs a shell command in the system and provides a StringBuffer
+   * Runs a shell command in the system and provides a StringBuilder
    * with the output of the command.
-   * 
+   * <p>This method is deprecated in favor of the variant that returns a CharSequence as opposed to a StringBuffer.
    *  @param cmd the command to run 
    *  
-   *  @return a StringBuffer that contains the output of the command 
+   *  @return a text that contains the output of the command 
+   *  @see #runCommandGeneric(String)
+   *  @deprecated
    */
   public static StringBuffer runCommand(String cmd) {
-    return runCommand(cmd.split("\\s+"));
+    return new StringBuffer(runCommandGeneric(cmd));
   }
 
+  /**
+   * Runs a shell command in the system and provides a StringBuilder
+   * with the output of the command.
+   * 
+   *  @param cmd the command to run 
+   *  
+   *  @return a text that contains the output of the command 
+   */
+  public static CharSequence runCommandGeneric(String cmd) {
+    return runCommandGeneric(cmd.split("\\s+"));
+  }  
   /**
    * Determines the greatest common divisor (GCD) of two integers.
    * 

+ 1 - 1
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/HadoopLogParser.java

@@ -113,7 +113,7 @@ public class HadoopLogParser extends LogParser {
    * 
    */
   private void findHostname() {
-    String startupInfo = Environment.runCommand(
+    String startupInfo = Environment.runCommandGeneric(
         "grep --max-count=1 STARTUP_MSG:\\s*host " + file.getName()).toString();
     Pattern pattern = Pattern.compile("\\s+(\\w+/.+)\\s+");
     Matcher matcher = pattern.matcher(startupInfo);

+ 17 - 3
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/LocalStore.java

@@ -144,11 +144,25 @@ public class LocalStore {
 
   /**
    * Pack a SerializedRecord into an array of bytes
-   * 
+   * <p>
+   * This method is deprecated. 
    * @param sr the SerializedRecord to be packed
+   * @return Packed representation of the SerializedRecord
+   * @see #packConcurrent(SerializedRecord)
+   * @deprecated
    */
   public static StringBuffer pack(SerializedRecord sr) {
-    StringBuffer sb = new StringBuffer();
+    return new StringBuffer(packConcurrent(sr));
+  }
+
+  /**
+   * Pack a SerializedRecord into an array of bytes
+   * 
+   * @param sr the SerializedRecord to be packed
+   * @return Packed representation of the SerializedRecord
+   */
+  public static CharSequence packConcurrent(SerializedRecord sr) {
+    StringBuilder sb = new StringBuilder();
 
     ArrayList<String> keys = new ArrayList<String>(sr.fields.keySet());
 
@@ -162,7 +176,7 @@ public class LocalStore {
     }
     return sb;
   }
-
+  
   /**
   * Upload the local file store into HDFS, after 
   * compressing it. Then a new local file is created 

+ 1 - 1
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/NICParser.java

@@ -54,7 +54,7 @@ public class NICParser extends ShellParser {
    * @return the EventRecord created
    */
   public EventRecord query(String device) throws UnknownHostException {
-    StringBuffer sb = Environment.runCommand("/sbin/ifconfig " + device);
+    CharSequence sb = Environment.runCommandGeneric("/sbin/ifconfig " + device);
     EventRecord retval = new EventRecord(InetAddress.getLocalHost()
         .getCanonicalHostName(), InetAddress.getAllByName(InetAddress.getLocalHost()
         .getHostName()), Calendar.getInstance(), "NIC", "Unknown", device, "-");

+ 5 - 5
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/SMARTParser.java

@@ -66,12 +66,12 @@ public class SMARTParser extends ShellParser {
    */
   public EventRecord query(String device) throws Exception {
     String conf = Environment.getProperty("disks." + device + ".source");
-    StringBuffer sb;
+    CharSequence sb;
 
     if (conf == null)
-      sb = Environment.runCommand("sudo smartctl --all " + device);
+      sb = Environment.runCommandGeneric("sudo smartctl --all " + device);
     else
-      sb = Environment.runCommand("cat " + conf);
+      sb = Environment.runCommandGeneric("cat " + conf);
 
     EventRecord retval = new EventRecord(InetAddress.getLocalHost()
         .getCanonicalHostName(), InetAddress.getAllByName(InetAddress.getLocalHost()
@@ -146,11 +146,11 @@ public class SMARTParser extends ShellParser {
    * This format is mostly found in IDE and SATA disks.
    * 
    * @param er the EventRecord in which to store attributes found
-   * @param sb the StringBuffer with the text to parse
+   * @param sb the text to parse
    * 
    * @return the EventRecord in which new attributes are stored.
    */
-  private EventRecord readColumns(EventRecord er, StringBuffer sb) {
+  private EventRecord readColumns(EventRecord er, CharSequence sb) {
 
     Pattern pattern = Pattern.compile("^\\s{0,2}(\\d{1,3}\\s+.*)$",
         Pattern.MULTILINE);

+ 4 - 4
src/contrib/failmon/src/java/org/apache/hadoop/contrib/failmon/SensorsParser.java

@@ -42,10 +42,10 @@ public class SensorsParser extends ShellParser {
    * @return the EventRecord created
    */
   public EventRecord query(String s) throws Exception {
-    StringBuffer sb;
+    CharSequence sb;
 
-    //sb = Environment.runCommand("sensors -A");
-     sb = Environment.runCommand("cat sensors.out");
+    //sb = Environment.runCommandGeneric("sensors -A");
+    sb = Environment.runCommandGeneric("cat sensors.out");
 
     EventRecord retval = new EventRecord(InetAddress.getLocalHost()
         .getCanonicalHostName(), InetAddress.getAllByName(InetAddress.getLocalHost()
@@ -70,7 +70,7 @@ public class SensorsParser extends ShellParser {
    * 
    * @return the EventRecord created
    */
-  private EventRecord readGroup(EventRecord er, StringBuffer sb, String prefix) {
+  private EventRecord readGroup(EventRecord er, CharSequence sb, String prefix) {
 
     Pattern pattern = Pattern.compile(".*(" + prefix
         + "\\s*\\d*)\\s*:\\s*(\\+?\\d+)", Pattern.MULTILINE);

+ 2 - 2
src/contrib/test/core-site.xml

@@ -1,3 +1,5 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
@@ -14,8 +16,6 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
 <!-- Values used when running unit tests.  This is mostly empty, to -->
 <!-- use of the default values, overriding the potentially -->

+ 2 - 2
src/contrib/test/hadoop-site.xml

@@ -1,3 +1,5 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
@@ -14,8 +16,6 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
 
 <!-- DO NOT PUT ANY PROPERTY IN THIS FILE. INSTEAD USE -->
 <!-- core-site.xml, mapred-site.xml OR hdfs-site.xml -->

+ 2 - 2
src/contrib/test/hdfs-site.xml

@@ -1,3 +1,5 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
@@ -14,8 +16,6 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
 <!-- Put site-specific property overrides in this file. -->
 

+ 2 - 2
src/contrib/test/mapred-site.xml

@@ -1,3 +1,5 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 <!--
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
@@ -14,8 +16,6 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 
 <!-- Put site-specific property overrides in this file. -->
 

+ 0 - 192
src/docs/src/documentation/content/xdocs/hdfs_permissions_guide.xml

@@ -1,192 +0,0 @@
-<?xml version="1.0"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
-          "http://forrest.apache.org/dtd/document-v20.dtd">
-
-
-<document>
-
-  <header>
-    <title>
-      HDFS Permissions Guide
-    </title>
-  </header>
-
-  <body>
-    <section> <title>Overview</title>
-      <p>
-		The Hadoop Distributed File System (HDFS) implements a permissions model for files and directories that shares much of the POSIX model. Each file and directory is associated with an <em>owner</em> and a <em>group</em>. The file or directory has separate permissions for the user that is the owner, for other users that are members of the group, and for all other users. For files, the <em>r</em> permission is required to read the file, and the <em>w</em> permission is required to write or append to the file. For directories, the <em>r</em> permission is required to list the contents of the directory, the <em>w</em> permission is required to create or delete files or directories, and the <em>x</em> permission is required to access a child of the directory. In contrast to the POSIX model, there are no <em>setuid</em> or <em>setgid</em> bits for files as there is no notion of executable files. For directories, there are no <em>setuid</em> or <em>setgid</em> bits directory as a simplification. The <em>Sticky bit</em> can be set on directories, preventing anyone except the superuser, directory owner or file owner from deleting or moving the files within the directory. Setting the sticky bit for a file has no effect. Collectively, the permissions of a file or directory are its <em>mode</em>. In general, Unix customs for representing and displaying modes will be used, including the use of octal numbers in this description. When a file or directory is created, its owner is the user identity of the client process, and its group is the group of the parent directory (the BSD rule).
-	</p>
-	<p>
-		Each client process that accesses HDFS has a two-part identity composed of the <em>user name</em>, and <em>groups list</em>. Whenever HDFS must do a permissions check for a file or directory <code>foo</code> accessed by a client process,
-	</p>
-	<ul>
-		<li>
-		   If the user name matches the owner of <code>foo</code>, then the owner permissions are tested;
-		</li>
-		<li>
-		   Else if the group of <code>foo</code> matches any of member of the groups list, then the group permissions are tested;
-		</li>
-		<li>
-		   Otherwise the other permissions of <code>foo</code> are tested.
-		</li>
-	</ul>
-
-<p>
-		If a permissions check fails, the client operation fails.	
-</p>
-     </section>
-
-<section><title>User Identity</title>
-<p>
-In this release of Hadoop the identity of a client process is just whatever the host operating system says it is. For Unix-like systems,
-</p>
-<ul>
-<li>
-   The user name is the equivalent of <code>`whoami`</code>;
-</li>
-<li>
-   The group list is the equivalent of <code>`bash -c groups`</code>.
-</li>
-</ul>
-
-<p>
-In the future there will be other ways of establishing user identity (think Kerberos, LDAP, and others). There is no expectation that this first method is secure in protecting one user from impersonating another. This user identity mechanism combined with the permissions model allows a cooperative community to share file system resources in an organized fashion.
-</p>
-<p>
-In any case, the user identity mechanism is extrinsic to HDFS itself. There is no provision within HDFS for creating user identities, establishing groups, or processing user credentials.
-</p>
-</section>
-
-<section> <title>Understanding the Implementation</title>
-<p>
-Each file or directory operation passes the full path name to the name node, and the permissions checks are applied along the path for each operation. The client framework will implicitly associate the user identity with the connection to the name node, reducing the need for changes to the existing client API. It has always been the case that when one operation on a file succeeds, the operation might fail when repeated because the file, or some directory on the path, no longer exists. For instance, when the client first begins reading a file, it makes a first request to the name node to discover the location of the first blocks of the file. A second request made to find additional blocks may fail. On the other hand, deleting a file does not revoke access by a client that already knows the blocks of the file. With the addition of permissions, a client's access to a file may be withdrawn between requests. Again, changing permissions does not revoke the access of a client that already knows the file's blocks.
-</p>
-<p>
-The map-reduce framework delegates the user identity by passing strings without special concern for confidentiality. The owner and group of a file or directory are stored as strings; there is no conversion from user and group identity numbers as is conventional in Unix.
-</p>
-<p>
-The permissions features of this release did not require any changes to the behavior of data nodes. Blocks on the data nodes do not have any of the <em>Hadoop</em> ownership or permissions attributes associated with them.
-</p>
-</section>
-     
-<section> <title>Changes to the File System API</title>
-<p>
-	All methods that use a path parameter will throw <code>AccessControlException</code> if permission checking fails.
-</p>
-<p>New methods:</p>
-<ul>
-	<li>
-		<code>public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize, short replication, long blockSize, Progressable progress) throws IOException;</code>
-	</li>
-	<li>
-		<code>public boolean mkdirs(Path f, FsPermission permission) throws IOException;</code>
-	</li>
-	<li>
-		<code>public void setPermission(Path p, FsPermission permission) throws IOException;</code>
-	</li>
-	<li>
-		<code>public void setOwner(Path p, String username, String groupname) throws IOException;</code>
-	</li>
-	<li>
-		<code>public FileStatus getFileStatus(Path f) throws IOException;</code> will additionally return the user, group and mode associated with the path.
-	</li>
-
-</ul>
-<p>
-The mode of a new file or directory is restricted my the <code>umask</code> set as a configuration parameter. When the existing <code>create(path, &hellip;)</code> method (<em>without</em> the permission parameter) is used, the mode of the new file is <code>666&thinsp;&amp;&thinsp;^umask</code>. When the new <code>create(path, </code><em>permission</em><code>, &hellip;)</code> method (<em>with</em> the permission parameter <em>P</em>) is used, the mode of the new file is <code>P&thinsp;&amp;&thinsp;^umask&thinsp;&amp;&thinsp;666</code>. When a new directory is created with the existing <code>mkdirs(path)</code> method (<em>without</em> the permission parameter), the mode of the new directory is <code>777&thinsp;&amp;&thinsp;^umask</code>. When the new <code>mkdirs(path, </code><em>permission</em> <code>)</code> method (<em>with</em> the permission parameter <em>P</em>) is used, the mode of new directory is <code>P&thinsp;&amp;&thinsp;^umask&thinsp;&amp;&thinsp;777</code>. 
-</p>
-</section>
-
-     
-<section> <title>Changes to the Application Shell</title>
-<p>New operations:</p>
-<dl>
-	<dt><code>chmod [-R]</code> <em>mode file &hellip;</em></dt>
-	<dd>
-		Only the owner of a file or the super-user is permitted to change the mode of a file.
-	</dd>
-	<dt><code>chgrp [-R]</code> <em>group file &hellip;</em></dt>
-	<dd>
-		The user invoking <code>chgrp</code> must belong to the specified group and be the owner of the file, or be the super-user.
-	</dd>
-	<dt><code>chown [-R]</code> <em>[owner][:[group]] file &hellip;</em></dt>
-	<dd>
-		The owner of a file may only be altered by a super-user.
-	</dd>
-	<dt><code>ls </code> <em>file &hellip;</em></dt><dd></dd>
-	<dt><code>lsr </code> <em>file &hellip;</em></dt>
-	<dd>
-		The output is reformatted to display the owner, group and mode.
-	</dd>
-</dl></section>
-
-     
-<section> <title>The Super-User</title>
-<p>
-	The super-user is the user with the same identity as name node process itself. Loosely, if you started the name node, then you are the super-user. The super-user can do anything in that permissions checks never fail for the super-user. There is no persistent notion of who <em>was</em> the super-user; when the name node is started the process identity determines who is the super-user <em>for now</em>. The HDFS super-user does not have to be the super-user of the name node host, nor is it necessary that all clusters have the same super-user. Also, an experimenter running HDFS on a personal workstation, conveniently becomes that installation's super-user without any configuration.
-	</p>
-	<p>
-	In addition, the administrator my identify a distinguished group using a configuration parameter. If set, members of this group are also super-users.
-</p>
-</section>
-
-<section> <title>The Web Server</title>
-<p>
-The identity of the web server is a configuration parameter. That is, the name node has no notion of the identity of the <em>real</em> user, but the web server behaves as if it has the identity (user and groups) of a user chosen by the administrator. Unless the chosen identity matches the super-user, parts of the name space may be invisible to the web server.</p>
-</section>
-
-<section> <title>On-line Upgrade</title>
-<p>
-If a cluster starts with a version 0.15 data set (<code>fsimage</code>), all files and directories will have owner <em>O</em>, group <em>G</em>, and mode <em>M</em>, where <em>O</em> and <em>G</em> are the user and group identity of the super-user, and <em>M</em> is a configuration parameter. </p>
-</section>
-
-<section> <title>Configuration Parameters</title>
-<dl>
-	<dt><code>dfs.permissions = true </code></dt>
-	<dd>
-		If <code>yes</code> use the permissions system as described here. If <code>no</code>, permission <em>checking</em> is turned off, but all other behavior is unchanged. Switching from one parameter value to the other does not change the mode, owner or group of files or directories.
-		<p>
-		</p>
-		Regardless of whether permissions are on or off, <code>chmod</code>, <code>chgrp</code> and <code>chown</code> <em>always</em> check permissions. These functions are only useful in the permissions context, and so there is no backwards compatibility issue. Furthermore, this allows administrators to reliably set owners and permissions in advance of turning on regular permissions checking.
-	</dd>
-	<dt><code>dfs.web.ugi = webuser,webgroup</code></dt>
-	<dd>
-		The user name to be used by the web server. Setting this to the name of the super-user allows any web client to see everything. Changing this to an otherwise unused identity allows web clients to see only those things visible using "other" permissions. Additional groups may be added to the comma-separated list.
-	</dd>
-	<dt><code>dfs.permissions.supergroup = supergroup</code></dt>
-	<dd>
-		The name of the group of super-users.
-	</dd>
-	<dt><code>dfs.upgrade.permission = 0777</code></dt>
-	<dd>
-		The choice of initial mode during upgrade. The <em>x</em> permission is <em>never</em> set for files. For configuration files, the decimal value <em>511<sub>10</sub></em> may be used.
-	</dd>
-	<dt><code>dfs.umaskmode = 022</code></dt>
-	<dd>
-		The <code>umask</code> used when creating files and directories. May be specified either via three octal digits or symbolic values, with the same constraints as the dfs chmod command.
-	</dd>
-</dl>
-</section>
-
-     
-  </body>
-</document>
- 	
-

+ 0 - 97
src/docs/src/documentation/content/xdocs/libhdfs.xml

@@ -1,97 +0,0 @@
-<?xml version="1.0"?>
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
-          "http://forrest.apache.org/dtd/document-v20.dtd">
-
-
-<document>
-<header>
-<title>C API to HDFS: libhdfs</title>
-<meta name="http-equiv">Content-Type</meta>
-<meta name="content">text/html;</meta>
-<meta name="charset">utf-8</meta>
-</header>
-<body>
-<section>
-<title>C API to HDFS: libhdfs</title>
-
-<p>
-libhdfs is a JNI based C api for Hadoop's DFS. It provides C apis to a subset of the HDFS APIs to manipulate DFS files and the filesystem. libhdfs is part of the hadoop distribution and comes pre-compiled in ${HADOOP_HOME}/libhdfs/libhdfs.so .
-</p>
-
-</section>
-<section>
-<title>The APIs</title>
-
-<p>
-The libhdfs APIs are a subset of: <a href="api/org/apache/hadoop/fs/FileSystem.html" >hadoop fs APIs</a>.  
-</p>
-<p>
-The header file for libhdfs describes each API in detail and is available in ${HADOOP_HOME}/src/c++/libhdfs/hdfs.h
-</p>
-</section>
-<section>
-<title>A sample program</title>
-
-<source>
-#include "hdfs.h" 
-
-int main(int argc, char **argv) {
-
-    hdfsFS fs = hdfsConnect("default", 0);
-    const char* writePath = "/tmp/testfile.txt";
-    hdfsFile writeFile = hdfsOpenFile(fs, writePath, O_WRONLY|O_CREAT, 0, 0, 0);
-    if(!writeFile) {
-          fprintf(stderr, "Failed to open %s for writing!\n", writePath);
-          exit(-1);
-    }
-    char* buffer = "Hello, World!";
-    tSize num_written_bytes = hdfsWrite(fs, writeFile, (void*)buffer, strlen(buffer)+1);
-    if (hdfsFlush(fs, writeFile)) {
-           fprintf(stderr, "Failed to 'flush' %s\n", writePath); 
-          exit(-1);
-    }
-   hdfsCloseFile(fs, writeFile);
-}
-
-</source>
-</section>
-
-<section>
-<title>How to link with the library</title>
-<p>
-See the Makefile for hdfs_test.c in the libhdfs source directory (${HADOOP_HOME}/src/c++/libhdfs/Makefile) or something like:
-gcc above_sample.c -I${HADOOP_HOME}/src/c++/libhdfs -L${HADOOP_HOME}/libhdfs -lhdfs -o above_sample
-</p>
-</section>
-<section>
-<title>Common problems</title>
-<p>
-The most common problem is the CLASSPATH is not set properly when calling a program that uses libhdfs. Make sure you set it to all the hadoop jars needed to run Hadoop itself. Currently, there is no way to programmatically generate the classpath, but a good bet is to include all the jar files in ${HADOOP_HOME} and ${HADOOP_HOME}/lib as well as the right configuration directory containing hdfs-site.xml
-</p>
-</section>
-<section>
-<title>libhdfs is thread safe</title>
-<p>Concurrency and Hadoop FS "handles" - the hadoop FS implementation includes a FS handle cache which caches based on the URI of the namenode along with the user connecting. So, all calls to hdfsConnect will return the same handle but calls to hdfsConnectAsUser with different users will return different handles.  But, since HDFS client handles are completely thread safe, this has no bearing on concurrency. 
-</p>
-<p>Concurrency and libhdfs/JNI - the libhdfs calls to JNI should always be creating thread local storage, so (in theory), libhdfs should be as thread safe as the underlying calls to the Hadoop FS.
-</p>
-</section>
-</body>
-</document>

+ 0 - 670
src/docs/src/documentation/content/xdocs/streaming.xml

@@ -1,670 +0,0 @@
-<?xml version="1.0"?>
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN"
-          "http://forrest.apache.org/dtd/document-v20.dtd">
-
-
-<document>
-<header>
-<title>Hadoop Streaming</title>
-<meta name="http-equiv">Content-Type</meta>
-<meta name="content">text/html;</meta>
-<meta name="charset">utf-8</meta>
-</header>
-<body>
-<section>
-<title>Hadoop Streaming</title>
-
-<p>
-Hadoop streaming is a utility that comes with the Hadoop distribution. The utility allows you to create and run Map/Reduce jobs with any executable or script as the mapper and/or the reducer. For example:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper /bin/cat \
-    -reducer /bin/wc
-</source>
-</section>
-
-<section>
-<title>How Does Streaming Work </title>
-<p>
-In the above example, both the mapper and the reducer are executables that read the input from stdin (line by line) and emit the output to stdout. The utility will create a Map/Reduce job, submit the job to an appropriate cluster, and monitor the progress of the job until it completes.
-</p><p>
-  When an executable is specified for mappers, each mapper task will launch the executable as a separate process when the mapper is initialized. As the mapper task runs, it converts its inputs into lines and feed the lines to the stdin of the process. In the meantime, the mapper collects the line oriented outputs from the stdout of the process and converts each line into a key/value pair, which is collected as the output of the mapper. By default, the 
-  <em>prefix of a line up to the first tab character</em> is the <strong>key</strong> and the rest of the line (excluding the tab character) will be the <strong>value</strong>. 
-  If there is no tab character in the line, then entire line is considered as key and the value is null. However, this can be customized, as discussed later.
-</p>
-<p>
-When an executable is specified for reducers, each reducer task will launch the executable as a separate process then the reducer is initialized. As the reducer task runs, it converts its input key/values pairs into lines and feeds the lines to the stdin of the process. In the meantime, the reducer collects the line oriented outputs from the stdout of the process, converts each line into a key/value pair, which is collected as the output of the reducer. By default, the prefix of a line up to the first tab character is the key and the rest of the line (excluding the tab character) is the value. However, this can be customized, as discussed later.
-</p><p>
-This is the basis for the communication protocol between the Map/Reduce framework and the streaming mapper/reducer.
-</p><p>
-You can supply a Java class as the mapper and/or the reducer. The above example is equivalent to:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-    -reducer /bin/wc
-</source>
-<p>User can specify <code>stream.non.zero.exit.is.failure</code> as 
-<code>true</code> or <code>false</code> to make a streaming task that exits 
-with a non-zero status to be <code>Failure</code> 
-or <code>Success</code> respectively. By default, streaming tasks exiting 
-with non-zero status are considered to be failed tasks.</p>
-
-</section>
-
-<section>
-<title>Package Files With Job Submissions</title>
-<p>
-You can specify any executable as the mapper and/or the reducer. The executables do not need to pre-exist on the machines in the cluster; however, if they don't, you will need to use "-file" option to tell the framework to pack your executable files as a part of job submission. For example:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper myPythonScript.py \
-    -reducer /bin/wc \
-    -file myPythonScript.py 
-</source>
-<p>
-The above example specifies a user defined Python executable as the mapper. The option "-file myPythonScript.py" causes the python executable shipped to the cluster machines as a part of job submission.
-</p>
-<p>
-In addition to executable files, you can also package other auxiliary files (such as dictionaries, configuration files, etc) that may be used by the mapper and/or the reducer. For example:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper myPythonScript.py \
-    -reducer /bin/wc \
-    -file myPythonScript.py \
-    -file myDictionary.txt
-</source>
-</section>
-
-<section>
-<title>Streaming Options and Usage </title>
-
-<section>
-<title>Mapper-Only Jobs </title>
-<p>
-Often, you may want to process input data using a map function only. To do this, simply set mapred.reduce.tasks to zero. The Map/Reduce framework will not create any reducer tasks. Rather, the outputs of the mapper tasks will be the final output of the job.
-</p><p>
-To be backward compatible, Hadoop Streaming also supports the "-reduce NONE" option, which is equivalent to "-D mapred.reduce.tasks=0".
-</p>
-</section>
-
-<section>
-<title>Specifying Other Plugins for Jobs </title>
-<p>
-Just as with a normal Map/Reduce job, you can specify other plugins for a streaming job:
-</p>
-<source>
-   -inputformat JavaClassName
-   -outputformat JavaClassName
-   -partitioner JavaClassName
-   -combiner streamingCommand or JavaClassName
-</source>
-<p>
-The class you supply for the input format should return key/value pairs of Text class. If you do not specify an input format class, the TextInputFormat is used as the default. Since the TextInputFormat returns keys of LongWritable class, which are actually not part of the input data, the keys will be discarded; only the values will be piped to the streaming mapper.
-</p><p>
-The class you supply for the output format is expected to take key/value pairs of Text class. If you do not specify an output format class, the TextOutputFormat is used as the default.
-</p>
-</section>
-
-<section>
-<title>Large files and archives in Hadoop Streaming </title>
-
-<p>
-The -files and -archives options allow you to make files and archives available to the tasks. The argument is a URI to the file or archive that you have already uploaded to HDFS. These files and archives are cached across jobs. You can retrieve the host and fs_port values from the fs.default.name config variable.
-</p>
-<p>
-Here are examples of the -files option:
-</p> 
-<source>
--files hdfs://host:fs_port/user/testfile.txt#testlink
-</source>
-<p>
-In the above example, the part of the url after # is used as the symlink name that is created in the current working directory of tasks. So the tasks will have a symlink called testlink in the cwd that points to a local copy of testfile.txt. Multiple entries can be specified as: 
-</p>
-<source>
--files hdfs://host:fs_port/user/testfile1.txt#testlink1 -files hdfs://host:fs_port/user/testfile2.txt#testlink2
-</source>
-<p>
-The -archives option allows you to copy jars locally to the cwd of tasks and automatically unjar the files. For example:
-</p>
-<source>
--archives hdfs://host:fs_port/user/testfile.jar#testlink3
-</source>
-<p>
-In the example above, a symlink testlink3 is created in the current working directory of tasks. This symlink points to the directory that stores the unjarred contents of the uploaded jar file.
-</p>
-<p>
-Here's another example of the -archives option. Here, the input.txt file has two lines specifying the names of the two files: testlink/cache.txt and testlink/cache2.txt. "testlink" is a symlink to the archived directory, which has the files "cache.txt" and "cache2.txt".
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-                  -input "/user/me/samples/cachefile/input.txt"  \
-                  -mapper "xargs cat"  \
-                  -reducer "cat"  \
-                  -output "/user/me/samples/cachefile/out" \  
-                  -archives 'hdfs://hadoop-nn1.example.com/user/me/samples/cachefile/cachedir.jar#testlink' \  
-                  -D mapred.map.tasks=1 \
-                  -D mapred.reduce.tasks=1 \ 
-                  -D mapred.job.name="Experiment"
-
-$ ls test_jar/
-cache.txt  cache2.txt
-
-$ jar cvf cachedir.jar -C test_jar/ .
-added manifest
-adding: cache.txt(in = 30) (out= 29)(deflated 3%)
-adding: cache2.txt(in = 37) (out= 35)(deflated 5%)
-
-$ hadoop dfs -put cachedir.jar samples/cachefile
-
-$ hadoop dfs -cat /user/me/samples/cachefile/input.txt
-testlink/cache.txt
-testlink/cache2.txt
-
-$ cat test_jar/cache.txt 
-This is just the cache string
-
-$ cat test_jar/cache2.txt 
-This is just the second cache string
-
-$ hadoop dfs -ls /user/me/samples/cachefile/out      
-Found 1 items
-/user/me/samples/cachefile/out/part-00000  &lt;r 3&gt;   69
-
-$ hadoop dfs -cat /user/me/samples/cachefile/out/part-00000
-This is just the cache string   
-This is just the second cache string
-
-</source>
-</section>
-
-<section>
-<title>Specifying Additional Configuration Variables for Jobs </title>
-<p>
-You can specify additional configuration variables by using "-D  &lt;n&gt;=&lt;v&gt;". For example: 
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.IdentityMapper\
-    -reducer /bin/wc \
-    -D mapred.reduce.tasks=2
-</source>
-<p>
-The -D mapred.reduce.tasks=2 in the above example specifies to use two reducers for the job.
-</p>
-<p>
-For more details on the jobconf parameters see:
-<a href="ext:mapred-default">mapred-default.html</a></p>
-</section>
-
-<section>
-<title>Other Supported Options </title>
-<p>
-Other options you may specify for a streaming job are described here:
-</p>
-<table>
-<tr><th>Parameter</th><th>Optional/Required </th><th>Description </th></tr>
-
-<tr><td> -cmdenv   name=value </td><td> Optional </td><td> Pass env var to streaming commands </td></tr>
-
-<tr><td> -inputreader JavaClassName </td><td> Optional </td><td> For backwards-compatibility: specifies a record reader class (instead of an input format class) </td></tr>
-<tr><td> -verbose </td><td> Optional </td><td> Verbose output </td></tr>
-<tr><td> -lazyOutput </td><td> Optional </td><td> Create output lazily. For example, if the output format is based on FileOutputFormat, the output file is created only on the first call to output.collect (or Context.write)</td></tr>
-</table>
-<p>
-Streaming supports the Hadoop generic command line options, which are listed in 
-the table below.
-The general command line syntax is:
-<br/>    bin/hadoop command [genericOptions] [commandOptions]
-</p>
-
-<table>
-<tr><th>Parameter</th><th>Optional/Required </th><th>Description </th></tr>
-
-<tr><td> -conf  configuration_file </td><td> Optional </td><td> specify an application configuration file </td></tr>
-<tr><td> -D  property=value </td><td> Optional </td><td> use value for given property </td></tr>
-<tr><td> -fs host:port or local </td><td> Optional </td><td> specify a namenode </td></tr>
-<tr><td> -jt host:port or local </td><td> Optional </td><td> specify a job tracker </td></tr>
-<tr><td> -files </td><td> Optional </td><td> specify comma separated files to be copied to the map reduce cluster </td></tr>
-<tr><td> -archives </td><td> Optional </td><td> specify comma separated archives to be unarchived on the compute machines </td></tr>
-</table>
-
-<p>
-To change the local temp directory use:
-</p>
-<source>
-  -D hadoop.tmp.dir=/tmp
-</source>
-<p>
-To specify additional local temp directories use:
-</p>
-<source>
-   -D mapred.local.dir=/tmp/local
-   -D mapred.system.dir=/tmp/system
-   -D mapred.temp.dir=/tmp/temp
-</source>
-<p>
-For more details on jobconf parameters see:
-<a href="ext:mapred-default">mapred-default.html</a></p>
-<p>
-To set an environment variable in a streaming command use:
-</p>
-<source>
--cmdenv EXAMPLE_DIR=/home/example/dictionaries/
-</source>
-</section>
-</section>
-
-<section>
-<title>More usage examples </title>
-
-<section>
-<title>Customizing the Way to Split Lines into Key/Value Pairs </title>
-<p>
-As noted earlier, when the Map/Reduce framework reads a line from the stdout of the mapper, it splits the line into a key/value pair. By default, the prefix of the line up to the first tab character is the key and the rest of the line (excluding the tab character) is the value.
-</p>
-<p>
-However, you can customize this default. You can specify a field separator other than the tab character (the default), and you can specify that the key ends at the nth (n >= 1) occurrence of the separator in a line, rather than at the first occurrence (the default). For example:
-</p>
-
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
-    -D stream.map.output.field.separator=. \
-    -D stream.num.map.output.key.fields=4 
-</source>
-<p>
-In the above example, "-D stream.map.output.field.separator=." specifies "." as the field separator for the map outputs, and the prefix up to the fourth "." in a line will be the key and the rest of the line (excluding the fourth ".") will be the value. If a line has fewer than four "."s, then the whole line will be the key and the value will be an empty Text object (like the one created by new Text("")).
-</p><p>
-Similarly, you can use "-D stream.reduce.output.field.separator=SEP" and "-D stream.num.reduce.output.fields=NUM" to specify the nth field separator in a line of the reduce outputs as the separator between the key and the value.
-</p>
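-<p>
-As an illustrative sketch only (it reuses the placeholder input/output paths from
-the examples above), a job that treats the second "." in each reduce output line
-as the boundary between key and value could be invoked like this:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
-    -D stream.reduce.output.field.separator=. \
-    -D stream.num.reduce.output.fields=2
-</source>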
-<p> Similarly, you can specify "stream.map.input.field.separator" and 
-"stream.reduce.input.field.separator" as the input separator for map/reduce 
-inputs. By default the separator is the tab character.</p>
-</section>
-
-
-<section>
-<title>A Useful Partitioner Class (secondary sort, the -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner option) </title>
-<p>
-Hadoop has a library class, 
-<a href="ext:api/org/apache/hadoop/mapred/lib/keyfieldbasedpartitioner">KeyFieldBasedPartitioner</a>, 
-that is useful for many applications. This class allows the Map/Reduce 
-framework to partition the map outputs based on certain key fields, not
-the whole keys. For example:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
-    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
-    -D stream.map.output.field.separator=. \
-    -D stream.num.map.output.key.fields=4 \
-    -D map.output.key.field.separator=. \
-    -D mapred.text.key.partitioner.options=-k1,2\
-    -D mapred.reduce.tasks=12
-</source>
-<p>
-Here, <em>-D stream.map.output.field.separator=.</em> and <em>-D stream.num.map.output.key.fields=4</em> are as explained in the previous example. Streaming uses these two variables to identify the key/value pairs emitted by the mapper. 
-</p><p>
-The map output keys of the above Map/Reduce job normally have four fields
-separated by ".". However, the Map/Reduce framework will partition the map
-outputs by the first two fields of the keys using the 
-<em>-D mapred.text.key.partitioner.options=-k1,2</em> option. 
-Here, <em>-D map.output.key.field.separator=.</em> specifies the separator 
-for the partition. This guarantees that all the key/value pairs with the 
-same first two fields in the keys will be partitioned into the same reducer.
-</p><p>
-<em>This is effectively equivalent to specifying the first two fields as the primary key and the next two fields as the secondary. The primary key is used for partitioning, and the combination of the primary and secondary keys is used for sorting.</em> A simple illustration is shown here:
-</p>
-<p>
-Output of map (the keys)</p><source>
-11.12.1.2
-11.14.2.3
-11.11.4.1
-11.12.1.1
-11.14.2.2
-
-</source>
-<p>
-Partition into 3 reducers (the first 2 fields are used as keys for partition)</p><source>
-11.11.4.1
------------
-11.12.1.2
-11.12.1.1
------------
-11.14.2.3
-11.14.2.2
-</source>
-<p>
-Sorting within each partition for the reducer (all 4 fields used for sorting)</p><source>
-11.11.4.1
------------
-11.12.1.1
-11.12.1.2
------------
-11.14.2.2
-11.14.2.3
-</source>
-</section>
-<section>
-<title>A Useful Comparator Class</title>
-<p>
-Hadoop has a library class, 
-<a href="ext:api/org/apache/hadoop/mapred/lib/keyfieldbasedcomparator">KeyFieldBasedComparator</a>, 
-that is useful for many applications. This class provides a subset of features
-provided by the Unix/GNU Sort. For example:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.IdentityMapper \
-    -reducer org.apache.hadoop.mapred.lib.IdentityReducer \
-    -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
-    -D stream.map.output.field.separator=. \
-    -D stream.num.map.output.key.fields=4 \
-    -D map.output.key.field.separator=. \
-    -D mapred.text.key.comparator.options=-k2,2nr\
-    -D mapred.reduce.tasks=12
-</source>
-<p>
-The map output keys of the above Map/Reduce job normally have four fields
-separated by ".". However, the Map/Reduce framework will sort the 
-outputs by the second field of the keys using the 
-<em>-D mapred.text.key.comparator.options=-k2,2nr</em> option. 
-Here, <em>-n</em> specifies numerical sorting and 
-<em>-r</em> specifies that the sort order should be reversed. A simple illustration
-is shown below:
-</p>
-<p>
-Output of map (the keys)</p>
-<source>
-11.12.1.2
-11.14.2.3
-11.11.4.1
-11.12.1.1
-11.14.2.2
-</source>
-<p>
-Sorting output for the reducer (where the second field is used for sorting)</p>
-<source>
-11.14.2.3
-11.14.2.2
-11.12.1.2
-11.12.1.1
-11.11.4.1
-</source>
-</section>
-
-<section>
-<title>Working with the Hadoop Aggregate Package (the -reduce aggregate option) </title>
-<p>
-Hadoop has a library package called 
-<a href="ext:api/org/apache/hadoop/mapred/lib/aggregate/package-summary">Aggregate</a>.
-Aggregate provides a special reducer class and a special combiner class, and
-a list of simple aggregators that perform aggregations such as "sum", "max",
-"min" and so on  over a sequence of values. Aggregate allows you to define a
-mapper plugin class that is expected to generate "aggregatable items" for each
-input key/value pair of the mappers. The combiner/reducer will aggregate those
-aggregatable items by invoking the appropriate aggregators.
-</p><p>
-To use Aggregate, simply specify "-reducer aggregate":
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper myAggregatorForKeyCount.py \
-    -reducer aggregate \
-    -file myAggregatorForKeyCount.py \
-    -D mapred.reduce.tasks=12
-</source>
-<p>
-The python program myAggregatorForKeyCount.py looks like:
-</p>
-<source>
-#!/usr/bin/python
-
-import sys
-
-def generateLongCountToken(id):
-    return "LongValueSum:" + id + "\t" + "1"
-
-def main(argv):
-    # Emit one aggregatable "LongValueSum" token per input line,
-    # stripping the trailing newline first.
-    for line in sys.stdin:
-        line = line&#91;:-1]
-        fields = line.split("\t")
-        print generateLongCountToken(fields&#91;0])
-
-if __name__ == "__main__":
-    main(sys.argv)
-</source>
-</section>
-
-<section>
-<title>Field Selection (similar to the Unix 'cut' command)</title>
-<p>
-Hadoop has a library class, org.apache.hadoop.mapred.lib.FieldSelectionMapReduce, that effectively allows you to process text data like the unix "cut" utility. The map function defined in the class treats each input key/value pair as a list of fields. You can specify the field separator (the default is the tab character). You can select an arbitrary list of fields as the map output key, and an arbitrary list of fields as the map output value. Similarly, the reduce function defined in the class treats each input key/value pair as a list of fields. You can select an arbitrary list of fields as the reduce output key, and an arbitrary list of fields as the reduce output value. For example:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper org.apache.hadoop.mapred.lib.FieldSelectionMapReduce\
-    -reducer org.apache.hadoop.mapred.lib.FieldSelectionMapReduce\
-    -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
-    -D map.output.key.field.separator=. \
-    -D mapred.text.key.partitioner.options=-k1,2 \
-    -D mapred.data.field.separator=. \
-    -D map.output.key.value.fields.spec=6,5,1-3:0- \
-    -D reduce.output.key.value.fields.spec=0-2:5- \
-    -D mapred.reduce.tasks=12
-</source>
-<p>
-The option "-D map.output.key.value.fields.spec=6,5,1-3:0-" specifies key/value selection for the map outputs. Key selection spec and value selection spec are separated by ":". In this case, the map output key will consist of fields 6, 5, 1, 2, and 3. The map output value will consist of all fields (0- means field 0 and all 
-the subsequent fields). 
-</p><p>
-The option "-D reduce.output.key.value.fields.spec=0-2:5-" specifies 
-key/value selection for the reduce outputs. In this case, the reduce 
-output key will consist of fields 0, 1, 2 (corresponding to the original 
-fields 6, 5, 1). The reduce output value will consist of all fields starting
-from field 5 (corresponding to all the original fields).  
-</p>
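-<p>
-As a purely hypothetical illustration (the record below is made up), consider an
-input record whose fields, separated by ".", are a.b.c.d.e.f.g.h (fields 0
-through 7). With the specs above, the map output key and value would be:
-</p>
-<source>
-key:   g.f.b.c.d        (fields 6, 5, 1, 2, 3)
-value: a.b.c.d.e.f.g.h  (field 0 and all subsequent fields)
-</source>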
-</section>
-</section>
-
-<section>
-<title>Frequently Asked Questions </title>
-
-<section>
-<title>How do I use Hadoop Streaming to run an arbitrary set of (semi-)independent tasks? </title>
-<p>
-Often you do not need the full power of Map Reduce, but only need to run multiple instances of the same program - either on different parts of the data, or on the same data, but with different parameters. You can use Hadoop Streaming to do this.
-</p>
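-<p>
-For instance, a map-only job (no reducers) runs one instance of your program per
-input split. The sketch below is illustrative only; myProgram.sh and the
-input/output paths are placeholders:
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper myProgram.sh \
-    -file myProgram.sh \
-    -D mapred.reduce.tasks=0
-</source>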
-
-</section>
-
-<section>
-<title>How do I process files, one per map? </title>
-<p>
-As an example, consider the problem of zipping (compressing) a set of files across the hadoop cluster. You can achieve this using either of these methods:
-</p><ol>
-<li> Hadoop Streaming and custom mapper script:<ul>
-  <li> Generate a file containing the full HDFS paths of the input files. Each map task would get one file name as input.</li>
-  <li> Create a mapper script which, given a filename, fetches the file to local disk, gzips the file and puts it back in the desired output directory (a sketch is shown after this list)</li>
-</ul></li>
-<li>The existing Hadoop Framework:<ul>
-   <li>Add these commands to your main function:
-<source>
-       FileOutputFormat.setCompressOutput(conf, true);
-       FileOutputFormat.setOutputCompressorClass(conf, org.apache.hadoop.io.compress.GzipCodec.class);
-       conf.setInputFormat(NonSplitableTextInputFormat.class);
-       conf.setNumReduceTasks(0);
-</source></li>
-   <li>Write your map function:
-<source>
-
-       public void map(WritableComparable key, Writable value, 
-                               OutputCollector output, 
-                               Reporter reporter) throws IOException {
-            output.collect((Text)value, null);
-       }
-</source></li>
-  <li>Note that the output filename will not be the same as the original filename</li>
-</ul></li>
-</ol>
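-<p>
-A minimal sketch of such a mapper script, assuming an output directory
-/user/me/zipped already exists (the path and the loop details are hypothetical):
-</p>
-<source>
-#!/bin/bash
-# Read one HDFS file path per line from stdin, copy it locally,
-# gzip it, and put the compressed copy back into HDFS.
-while read filename; do
-  base=`basename "$filename"`
-  hadoop dfs -get "$filename" "$base"
-  gzip "$base"
-  hadoop dfs -put "$base.gz" /user/me/zipped/
-done
-</source>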
-</section>
-
-<section>
-<title>How many reducers should I use? </title>
-<p>
-For details, see the <a href="mapred_tutorial.html#Reducer">Reducer</a> section of the Map/Reduce Tutorial.
-</p>
-</section>
-
-<section>
-<title>If I set up an alias in my shell script, will that work after -mapper, i.e. say I do: alias c1='cut -f1'. Will -mapper "c1" work? </title>
-<p>
-Using an alias will not work, but variable substitution is allowed as shown in this example:
-</p>
-<source>
-$ hadoop dfs -cat samples/student_marks
-alice   50
-bruce   70
-charlie 80
-dan     75
-
-$ c2='cut -f2'; $HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input /user/me/samples/student_marks \
-    -mapper \"$c2\" -reducer 'cat' \
-    -output /user/me/samples/student_out \
-    -D mapred.job.name='Experiment'
-
-$ hadoop dfs -ls samples/student_out
-Found 1 items
-/user/me/samples/student_out/part-00000    &lt;r 3&gt;   16
-
-$ hadoop dfs -cat samples/student_out/part-00000
-50
-70
-75
-80
-</source>
-</section>
-
-<section>
-<title>Can I use UNIX pipes? For example, will -mapper "cut -f1 | sed s/foo/bar/g" work?</title>
-<p>
-Currently this does not work and gives a "java.io.IOException: Broken pipe" error. This is probably a bug that needs to be investigated.
-</p>
-</section>
-
-<section>
-<title>When I run a streaming job by <strong>distributing large executables</strong> (for example, 3.6G) through the -file option, I get a "No space left on device" error. What do I do? </title>
-<p>
-The jar packaging happens in a directory pointed to by the configuration variable stream.tmpdir. The default value of stream.tmpdir is /tmp. Set the value to a directory with more space:
-</p>
-<source>
--D stream.tmpdir=/export/bigspace/...
-</source>
-</section>
-
-<section>
-<title>How do I specify multiple input directories? </title>
-<p>
-You can specify multiple input directories with multiple '-input' options:
-</p><source>
- hadoop jar hadoop-streaming.jar -input '/user/foo/dir1' -input '/user/foo/dir2' 
-</source>
-</section>
-
-<section>
-<title>How do I generate output files with gzip format? </title>
-<p>
-Instead of plain text files, you can generate gzip-compressed files as your output. To do so, pass '-D mapred.output.compress=true -D  mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec' as options to your streaming job.
-</p>
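-<p>
-A minimal sketch (the input/output paths are the same placeholders used in the
-examples above):
-</p>
-<source>
-$HADOOP_HOME/bin/hadoop  jar $HADOOP_HOME/hadoop-streaming.jar \
-    -input myInputDirs \
-    -output myOutputDir \
-    -mapper /bin/cat \
-    -reducer /bin/cat \
-    -D mapred.output.compress=true \
-    -D mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec
-</source>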
-</section>
-
-<section>
-<title>How do I provide my own input/output format with streaming? </title>
-<p>
-At least as late as version 0.14, Hadoop did not support multiple jar files. So, when specifying your own custom classes you will have to pack them along with the streaming jar and use the custom jar instead of the default hadoop streaming jar. 
-</p>
-</section>
-
-<section>
-<title>How do I parse XML documents using streaming? </title>
-<p>
-You can use the record reader StreamXmlRecordReader to process XML documents. 
-</p>
-<source>
-hadoop jar hadoop-streaming.jar -inputreader "StreamXmlRecordReader,begin=BEGIN_STRING,end=END_STRING" ..... (rest of the command)
-</source>
-<p>
-Anything found between BEGIN_STRING and END_STRING would be treated as one record for map tasks.
-</p>
-</section>
-
-<section>
-<title>How do I update counters in streaming applications? </title>
-<p>
-A streaming process can use stderr to emit counter information. To update a
-counter, a line of the form
-<code>reporter:counter:&lt;group&gt;,&lt;counter&gt;,&lt;amount&gt;</code> 
-should be sent to stderr.
-</p>
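-<p>
-For example, a mapper written as a shell script might bump a hypothetical
-counter named WRONG_LENGTH in group MyCounters like this:
-</p>
-<source>
-echo "reporter:counter:MyCounters,WRONG_LENGTH,1" &gt;&amp;2
-</source>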
-</section>
-
-<section>
-<title>How do I update status in streaming applications? </title>
-<p>
-A streaming process can use stderr to emit status information.
-To set a status, <code>reporter:status:&lt;message&gt;</code> should be sent 
-to stderr.
-</p>
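-<p>
-For example, a shell-script mapper might report progress like this (the message
-text is arbitrary):
-</p>
-<source>
-echo "reporter:status:processed another 1000 records" &gt;&amp;2
-</source>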
-</section>
-
-</section>
-</body>
-</document>

+ 31 - 1
src/java/core-default.xml

@@ -53,12 +53,29 @@
   ordering of the filters.</description>
 </property>
 
+<property>
+  <name>hadoop.cluster.administrators</name>
+  <description>Users and/or groups who are designated as the administrators of a
+  hadoop cluster. To specify a list of users and groups the format to use
+  is "user1,user2 group1,group2". If set to '*', it allows all users/groups to
+  perform administration operations on the cluster. If set to '', it allows none.
+  </description>
+  <value>${user.name}</value>
+</property>
+
 <property>
   <name>hadoop.security.authorization</name>
   <value>false</value>
   <description>Is service-level authorization enabled?</description>
 </property>
 
+<property>
+  <name>hadoop.security.authentication</name>
+  <value>simple</value>
+  <description>Possible values are simple (no authentication) and kerberos.
+  </description>
+</property>
+
 <!--- logging properties -->
 
 <property>
@@ -107,7 +124,7 @@
 
 <property>
   <name>io.serializations</name>
-  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,org.apache.hadoop.io.serializer.avro.AvroGenericSerialization</value>
+  <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,org.apache.hadoop.io.serializer.avro.AvroReflectSerialization</value>
   <description>A list of serialization classes that can be used for
   obtaining serializers and deserializers.</description>
 </property>
@@ -162,6 +179,19 @@
   <description>The FileSystem for hdfs: uris.</description>
 </property>
 
+<property>
+  <name>fs.AbstractFileSystem.file.impl</name>
+  <value>org.apache.hadoop.fs.local.LocalFs</value>
+  <description>The AbstractFileSystem for file: uris.</description>
+</property>
+
+
+<property>
+  <name>fs.AbstractFileSystem.hdfs.impl</name>
+  <value>org.apache.hadoop.fs.Hdfs</value>
+  <description>The AbstractFileSystem for hdfs: uris.</description>
+</property>
+
 <property>
   <name>fs.s3.impl</name>
   <value>org.apache.hadoop.fs.s3.S3FileSystem</value>

+ 35 - 0
src/java/org/apache/hadoop/HadoopIllegalArgumentException.java

@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop;
+
+/**
+ * Indicates that a method has been passed illegal or invalid argument. This
+ * exception is thrown instead of IllegalArgumentException to differentiate the
+ * exception thrown in Hadoop implementation from the one thrown in JDK.
+ */
+public class HadoopIllegalArgumentException extends IllegalArgumentException {
+  private static final long serialVersionUID = 1L;
+  
+  /**
+   * Constructs exception with the specified detail message. 
+   * @param message detailed message.
+   */
+  public HadoopIllegalArgumentException(final String message) {
+    super(message);
+  }
+}

+ 3 - 5
src/java/org/apache/hadoop/classification/InterfaceAudience.java

@@ -29,13 +29,11 @@ public class InterfaceAudience {
   @Documented public @interface Public {};
   
   /**
-   * Intended only for the project(s) specified in the annotation
+   * Intended only for the project(s) specified in the annotation.
+   * For example, "Common", "HDFS", "MapReduce", "ZooKeeper", "HBase".
    */
   @Documented public @interface LimitedPrivate {
-    public enum Project {COMMON, AVRO, CHUKWA, HBASE, HDFS, 
-                         HIVE, MAPREDUCE, PIG, ZOOKEEPER};
-    
-    Project[] value();
+    String[] value();
   };
   
   /**

+ 59 - 0
src/java/org/apache/hadoop/classification/tools/ExcludePrivateAnnotationsJDiffDoclet.java

@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.classification.tools;
+
+import com.sun.javadoc.DocErrorReporter;
+import com.sun.javadoc.LanguageVersion;
+import com.sun.javadoc.RootDoc;
+
+import jdiff.JDiff;
+
+/**
+ * A <a href="http://java.sun.com/javase/6/docs/jdk/api/javadoc/doclet/">Doclet</a>
+ * for excluding elements that are annotated with
+ * {@link org.apache.hadoop.classification.InterfaceAudience.Private} or
+ * {@link org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate}.
+ * It delegates to the JDiff Doclet, and takes the same options.
+ */
+public class ExcludePrivateAnnotationsJDiffDoclet {
+  
+  public static LanguageVersion languageVersion() {
+    return LanguageVersion.JAVA_1_5;
+  }
+  
+  public static boolean start(RootDoc root) {
+    System.out.println(
+	ExcludePrivateAnnotationsJDiffDoclet.class.getSimpleName());
+    return JDiff.start(RootDocProcessor.process(root));
+  }
+  
+  public static int optionLength(String option) {
+    Integer length = StabilityOptions.optionLength(option);
+    if (length != null) {
+      return length;
+    }
+    return JDiff.optionLength(option);
+  }
+  
+  public static boolean validOptions(String[][] options,
+      DocErrorReporter reporter) {
+    StabilityOptions.validOptions(options, reporter);
+    String[][] filteredOptions = StabilityOptions.filterOptions(options);
+    return JDiff.validOptions(filteredOptions, reporter);
+  }
+}

+ 58 - 0
src/java/org/apache/hadoop/classification/tools/ExcludePrivateAnnotationsStandardDoclet.java

@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.classification.tools;
+
+import com.sun.javadoc.DocErrorReporter;
+import com.sun.javadoc.LanguageVersion;
+import com.sun.javadoc.RootDoc;
+import com.sun.tools.doclets.standard.Standard;
+
+/**
+ * A <a href="http://java.sun.com/javase/6/docs/jdk/api/javadoc/doclet/">Doclet</a>
+ * for excluding elements that are annotated with
+ * {@link org.apache.hadoop.classification.InterfaceAudience.Private} or
+ * {@link org.apache.hadoop.classification.InterfaceAudience.LimitedPrivate}.
+ * It delegates to the Standard Doclet, and takes the same options.
+ */
+public class ExcludePrivateAnnotationsStandardDoclet {
+  
+  public static LanguageVersion languageVersion() {
+    return LanguageVersion.JAVA_1_5;
+  }
+  
+  public static boolean start(RootDoc root) {
+    System.out.println(
+	ExcludePrivateAnnotationsStandardDoclet.class.getSimpleName());
+    return Standard.start(RootDocProcessor.process(root));
+  }
+  
+  public static int optionLength(String option) {
+    Integer length = StabilityOptions.optionLength(option);
+    if (length != null) {
+      return length;
+    }
+    return Standard.optionLength(option);
+  }
+  
+  public static boolean validOptions(String[][] options,
+      DocErrorReporter reporter) {
+    StabilityOptions.validOptions(options, reporter);
+    String[][] filteredOptions = StabilityOptions.filterOptions(options);
+    return Standard.validOptions(filteredOptions, reporter);
+  }
+}

+ 234 - 0
src/java/org/apache/hadoop/classification/tools/RootDocProcessor.java

@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.classification.tools;
+
+import com.sun.javadoc.AnnotationDesc;
+import com.sun.javadoc.AnnotationTypeDoc;
+import com.sun.javadoc.ClassDoc;
+import com.sun.javadoc.ConstructorDoc;
+import com.sun.javadoc.Doc;
+import com.sun.javadoc.FieldDoc;
+import com.sun.javadoc.MethodDoc;
+import com.sun.javadoc.PackageDoc;
+import com.sun.javadoc.ProgramElementDoc;
+import com.sun.javadoc.RootDoc;
+
+import java.lang.reflect.Array;
+import java.lang.reflect.InvocationHandler;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.lang.reflect.Proxy;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.WeakHashMap;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+
+/**
+ * Process the {@link RootDoc} by substituting with (nested) proxy objects that
+ * exclude elements with Private or LimitedPrivate annotations.
+ * <p>
+ * Based on code from http://www.sixlegs.com/blog/java/exclude-javadoc-tag.html.
+ */
+class RootDocProcessor {
+  
+  static String stability = StabilityOptions.UNSTABLE_OPTION;
+  
+  public static RootDoc process(RootDoc root) {
+    return (RootDoc) process(root, RootDoc.class);
+  }
+  
+  private static Object process(Object obj, Class<?> type) { 
+    if (obj == null) { 
+      return null; 
+    } 
+    Class<?> cls = obj.getClass(); 
+    if (cls.getName().startsWith("com.sun.")) { 
+      return getProxy(obj); 
+    } else if (obj instanceof Object[]) { 
+      Class<?> componentType = type.isArray() ? type.getComponentType() 
+	  : cls.getComponentType();
+      Object[] array = (Object[]) obj;
+      Object[] newArray = (Object[]) Array.newInstance(componentType,
+	  array.length); 
+      for (int i = 0; i < array.length; ++i) {
+        newArray[i] = process(array[i], componentType);
+      }
+      return newArray;
+    } 
+    return obj; 
+  }
+  
+  private static Map<Object, Object> proxies =
+    new WeakHashMap<Object, Object>(); 
+  
+  private static Object getProxy(Object obj) { 
+    Object proxy = proxies.get(obj); 
+    if (proxy == null) { 
+      proxy = Proxy.newProxyInstance(obj.getClass().getClassLoader(), 
+        obj.getClass().getInterfaces(), new ExcludeHandler(obj)); 
+      proxies.put(obj, proxy); 
+    } 
+    return proxy; 
+  } 
+
+  private static class ExcludeHandler implements InvocationHandler {
+    private Object target;
+
+    public ExcludeHandler(Object target) {
+      this.target = target;
+    }
+    
+    public Object invoke(Object proxy, Method method, Object[] args)
+	throws Throwable {
+      String methodName = method.getName();
+      if (target instanceof Doc) {
+	if (methodName.equals("isIncluded")) {
+	  Doc doc = (Doc) target;
+	  return !exclude(doc) && doc.isIncluded();
+	}
+	if (target instanceof RootDoc) {
+	  if (methodName.equals("classes")) {
+	    return filter(((RootDoc) target).classes(), ClassDoc.class);
+	  } else if (methodName.equals("specifiedClasses")) {
+	    return filter(((RootDoc) target).specifiedClasses(), ClassDoc.class);
+	  } else if (methodName.equals("specifiedPackages")) {
+	    return filter(((RootDoc) target).specifiedPackages(), PackageDoc.class);
+	  }
+	} else if (target instanceof ClassDoc) {
+	  if (isFiltered(args)) {
+	    if (methodName.equals("methods")) {
+	      return filter(((ClassDoc) target).methods(true), MethodDoc.class);
+	    } else if (methodName.equals("fields")) {
+	      return filter(((ClassDoc) target).fields(true), FieldDoc.class);
+	    } else if (methodName.equals("innerClasses")) {
+	      return filter(((ClassDoc) target).innerClasses(true),
+		  ClassDoc.class);
+	    } else if (methodName.equals("constructors")) {
+	      return filter(((ClassDoc) target).constructors(true),
+		  ConstructorDoc.class);
+	    }
+	  }
+	} else if (target instanceof PackageDoc) {
+	  if (methodName.equals("allClasses")) {
+	    if (isFiltered(args)) {
+	      return filter(((PackageDoc) target).allClasses(true),
+		ClassDoc.class);
+	    } else {
+	      return filter(((PackageDoc) target).allClasses(), ClassDoc.class);  
+	    }
+	  } else if (methodName.equals("annotationTypes")) {
+	    return filter(((PackageDoc) target).annotationTypes(),
+		AnnotationTypeDoc.class);
+	  } else if (methodName.equals("enums")) {
+	    return filter(((PackageDoc) target).enums(),
+		ClassDoc.class);
+	  } else if (methodName.equals("errors")) {
+	    return filter(((PackageDoc) target).errors(),
+		ClassDoc.class);
+	  } else if (methodName.equals("exceptions")) {
+	    return filter(((PackageDoc) target).exceptions(),
+		ClassDoc.class);
+	  } else if (methodName.equals("interfaces")) {
+	    return filter(((PackageDoc) target).interfaces(),
+		ClassDoc.class);
+	  } else if (methodName.equals("ordinaryClasses")) {
+	    return filter(((PackageDoc) target).ordinaryClasses(),
+		ClassDoc.class);
+	  }
+	}
+      }
+
+      if (args != null) {
+	if (methodName.equals("compareTo") || methodName.equals("equals")
+	    || methodName.equals("overrides")
+	    || methodName.equals("subclassOf")) {
+	  args[0] = unwrap(args[0]);
+	}
+      }
+      try {
+	return process(method.invoke(target, args), method.getReturnType());
+      } catch (InvocationTargetException e) {
+	throw e.getTargetException();
+      }
+    }
+      
+    private static boolean exclude(Doc doc) {
+      AnnotationDesc[] annotations = null;
+      if (doc instanceof ProgramElementDoc) {
+	annotations = ((ProgramElementDoc) doc).annotations();
+      } else if (doc instanceof PackageDoc) {
+	annotations = ((PackageDoc) doc).annotations();
+      }
+      if (annotations != null) {
+	for (AnnotationDesc annotation : annotations) {
+	  String qualifiedTypeName = annotation.annotationType().qualifiedTypeName();
+	  if (qualifiedTypeName.equals(
+	        InterfaceAudience.Private.class.getCanonicalName())
+	    || qualifiedTypeName.equals(
+                InterfaceAudience.LimitedPrivate.class.getCanonicalName())) {
+	    return true;
+	  }
+	  if (stability.equals(StabilityOptions.EVOLVING_OPTION)) {
+	    if (qualifiedTypeName.equals(
+		InterfaceStability.Unstable.class.getCanonicalName())) {
+	      return true;
+	    }
+	  }
+	  if (stability.equals(StabilityOptions.STABLE_OPTION)) {
+	    if (qualifiedTypeName.equals(
+		InterfaceStability.Unstable.class.getCanonicalName())
+              || qualifiedTypeName.equals(
+  		InterfaceStability.Evolving.class.getCanonicalName())) {
+	      return true;
+	    }
+	  }
+	}
+      }
+      return false;
+    }
+      
+    private static Object[] filter(Doc[] array, Class<?> componentType) {
+      if (array == null || array.length == 0) {
+	return array;
+      }
+      List<Object> list = new ArrayList<Object>(array.length);
+      for (Doc entry : array) {
+	if (!exclude(entry)) {
+	  list.add(process(entry, componentType));
+	}
+      }
+      return list.toArray((Object[]) Array.newInstance(componentType, list
+	  .size()));
+    }
+
+    private Object unwrap(Object proxy) {
+      if (proxy instanceof Proxy)
+	return ((ExcludeHandler) Proxy.getInvocationHandler(proxy)).target;
+      return proxy;
+    }
+      
+    private boolean isFiltered(Object[] args) {
+      return args != null && Boolean.TRUE.equals(args[0]);
+    }
+
+  }
+
+}

+ 69 - 0
src/java/org/apache/hadoop/classification/tools/StabilityOptions.java

@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.classification.tools;
+
+import com.sun.javadoc.DocErrorReporter;
+
+import java.util.ArrayList;
+import java.util.List;
+
+class StabilityOptions {
+  public static final String STABLE_OPTION = "-stable";
+  public static final String EVOLVING_OPTION = "-evolving";
+  public static final String UNSTABLE_OPTION = "-unstable";
+
+  public static Integer optionLength(String option) {
+    String opt = option.toLowerCase();
+    if (opt.equals(UNSTABLE_OPTION)) return 1;
+    if (opt.equals(EVOLVING_OPTION)) return 1;
+    if (opt.equals(STABLE_OPTION)) return 1;
+    return null;
+  }
+
+  public static void validOptions(String[][] options,
+      DocErrorReporter reporter) {
+    for (int i = 0; i < options.length; i++) {
+      String opt = options[i][0].toLowerCase();
+      if (opt.equals(UNSTABLE_OPTION)) {
+	RootDocProcessor.stability = UNSTABLE_OPTION;
+      } else if (opt.equals(EVOLVING_OPTION)) {
+	RootDocProcessor.stability = EVOLVING_OPTION;
+      } else if (opt.equals(STABLE_OPTION)) {
+	RootDocProcessor.stability = STABLE_OPTION;	
+      }
+    }
+  }
+  
+  public static String[][] filterOptions(String[][] options) {
+    List<String[]> optionsList = new ArrayList<String[]>();
+    for (int i = 0; i < options.length; i++) {
+      if (!options[i][0].equalsIgnoreCase(UNSTABLE_OPTION)
+	  && !options[i][0].equalsIgnoreCase(EVOLVING_OPTION)
+	  && !options[i][0].equalsIgnoreCase(STABLE_OPTION)) {
+	optionsList.add(options[i]);
+      }
+    }
+    String[][] filteredOptions = new String[optionsList.size()][];
+    int i = 0;
+    for (String[] option : optionsList) {
+      filteredOptions[i++] = option;
+    }
+    return filteredOptions;
+  }
+
+}

+ 22 - 0
src/java/org/apache/hadoop/classification/tools/package-info.java

@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+@InterfaceAudience.LimitedPrivate({"Common", "Avro", "Chukwa", "HBase", "HDFS",
+  "Hive", "MapReduce", "Pig", "ZooKeeper"})
+package org.apache.hadoop.classification.tools;
+
+import org.apache.hadoop.classification.InterfaceAudience;

+ 104 - 0
src/java/org/apache/hadoop/conf/ConfServlet.java

@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.conf;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.apache.hadoop.http.HttpServer;
+
+/**
+ * A servlet to print out the running configuration data.
+ */
+public class ConfServlet extends HttpServlet {
+  private static final long serialVersionUID = 1L;
+
+  private static final String FORMAT_JSON = "json";
+  private static final String FORMAT_XML = "xml";
+  private static final String FORMAT_PARAM = "format";
+
+  /**
+   * Return the Configuration of the daemon hosting this servlet.
+   * This is populated when the HttpServer starts.
+   */
+  private Configuration getConfFromContext() {
+    Configuration conf = (Configuration)getServletContext().getAttribute(
+        HttpServer.CONF_CONTEXT_ATTRIBUTE);
+    assert conf != null;
+    return conf;
+  }
+
+  @Override
+  public void doGet(HttpServletRequest request, HttpServletResponse response)
+      throws ServletException, IOException {
+
+    // Do the authorization
+    if (!HttpServer.hasAdministratorAccess(getServletContext(), request,
+        response)) {
+      return;
+    }
+
+    String format = request.getParameter(FORMAT_PARAM);
+    if (null == format) {
+      format = FORMAT_XML;
+    }
+
+    if (FORMAT_XML.equals(format)) {
+      response.setContentType("text/xml");
+    } else if (FORMAT_JSON.equals(format)) {
+      response.setContentType("text/javascript");
+    }
+
+    OutputStreamWriter out = new OutputStreamWriter(response.getOutputStream());
+    try {
+      writeResponse(getConfFromContext(), out, format);
+    } catch (BadFormatException bfe) {
+      response.sendError(HttpServletResponse.SC_BAD_REQUEST, bfe.getMessage());
+    }
+    out.close();
+  }
+
+  /**
+   * Guts of the servlet - extracted for easy testing.
+   */
+  static void writeResponse(Configuration conf, Writer out, String format)
+    throws IOException, BadFormatException {
+    if (FORMAT_JSON.equals(format)) {
+      Configuration.dumpConfiguration(conf, out);
+    } else if (FORMAT_XML.equals(format)) {
+      conf.writeXml(out);
+    } else {
+      throw new BadFormatException("Bad format: " + format);
+    }
+  }
+
+  public static class BadFormatException extends Exception {
+    private static final long serialVersionUID = 1L;
+
+    public BadFormatException(String msg) {
+      super(msg);
+    }
+  }
+
+}

+ 120 - 151
src/java/org/apache/hadoop/conf/Configuration.java

@@ -27,6 +27,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.OutputStream;
+import java.io.OutputStreamWriter;
 import java.io.Reader;
 import java.io.Writer;
 import java.net.URL;
@@ -47,11 +48,13 @@ import java.util.WeakHashMap;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
@@ -67,6 +70,7 @@ import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.codehaus.jackson.JsonFactory;
 import org.codehaus.jackson.JsonGenerator;
+import org.w3c.dom.Comment;
 import org.w3c.dom.DOMException;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -152,6 +156,12 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
    */
   private ArrayList<Object> resources = new ArrayList<Object>();
 
+  /**
+   * The value reported as the setting resource when a key is set
+   * by code rather than a file resource.
+   */
+  static final String UNKNOWN_RESOURCE = "Unknown";
+
   /**
    * List of configuration parameters marked <b>final</b>. 
    */
@@ -174,13 +184,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
 
   private static final Map<ClassLoader, Map<String, Class<?>>>
     CACHE_CLASSES = new WeakHashMap<ClassLoader, Map<String, Class<?>>>();
-  
-  /**
-   * Flag to indicate if the storage of resource which updates a key needs 
-   * to be stored for each key
-   */
-  private boolean storeResource;
-  
+
   /**
    * Stores the mapping of key to the resource which modifies or loads 
    * the key most recently
@@ -205,9 +209,6 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
       this.customMessage = customMessage;
       accessed = false;
     }
-    DeprecatedKeyInfo(String[] newKeys) {
-      this(newKeys, null);
-    }
 
     /**
      * Method to provide the warning message. It gives the custom message if
@@ -262,12 +263,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     }
     if (!isDeprecated(key)) {
       DeprecatedKeyInfo newKeyInfo;
-      if (customMessage == null) {
-        newKeyInfo = new DeprecatedKeyInfo(newKeys);
-      }
-      else {
-        newKeyInfo = new DeprecatedKeyInfo(newKeys, customMessage);
-      }
+      newKeyInfo = new DeprecatedKeyInfo(newKeys, customMessage);
       deprecatedKeyMap.put(key, newKeyInfo);
     }
   }
@@ -298,20 +294,6 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     return deprecatedKeyMap.containsKey(key);
   }
  
-  /**
-   * Check whether or not the deprecated key has been specified in the
-   * configuration file rather than the new key
-   * 
-   * Returns false if the specified key is not included in the deprecated
-   * key mapping.
-   * 
-   * @param oldKey Old configuration key 
-   * @return If the old configuration key was specified rather than the new one
-   */
-  public boolean deprecatedKeyWasSet(String oldKey) {
-    return isDeprecated(oldKey) && deprecatedKeyMap.get(oldKey).accessed;
-  }
-  
   /**
    * Checks for the presence of the property <code>name</code> in the
    * deprecation map. Returns the first of the list of new keys if present
@@ -384,29 +366,10 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
    */
   public Configuration(boolean loadDefaults) {
     this.loadDefaults = loadDefaults;
-    if (LOG.isDebugEnabled()) {
-      LOG.debug(StringUtils.stringifyException(new IOException("config()")));
-    }
+    updatingResource = new HashMap<String, String>();
     synchronized(Configuration.class) {
       REGISTRY.put(this, null);
     }
-    this.storeResource = false;
-  }
-  
-  /**
-   * A new configuration with the same settings and additional facility for
-   * storage of resource to each key which loads or updates 
-   * the key most recently
-   * @param other the configuration from which to clone settings
-   * @param storeResource flag to indicate if the storage of resource to 
-   * each key is to be stored
-   */
-  private Configuration(Configuration other, boolean storeResource) {
-    this(other);
-    this.storeResource = storeResource;
-    if (storeResource) {
-      updatingResource = new HashMap<String, String>();
-    }
   }
   
   /** 
@@ -416,11 +379,6 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
    */
   @SuppressWarnings("unchecked")
   public Configuration(Configuration other) {
-    if (LOG.isDebugEnabled()) {
-      LOG.debug(StringUtils.stringifyException
-                (new IOException("config(config)")));
-    }
-   
    this.resources = (ArrayList)other.resources.clone();
    synchronized(other) {
      if (other.properties != null) {
@@ -430,6 +388,8 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
      if (other.overlay!=null) {
        this.overlay = (Properties)other.overlay.clone();
      }
+
+     this.updatingResource = new HashMap<String, String>(other.updatingResource);
    }
    
     this.finalParameters = new HashSet<String>(other.finalParameters);
@@ -611,6 +571,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     if (!isDeprecated(name)) {
       getOverlay().setProperty(name, value);
       getProps().setProperty(name, value);
+      updatingResource.put(name, UNKNOWN_RESOURCE);
     }
     else {
       DeprecatedKeyInfo keyInfo = deprecatedKeyMap.get(name);
@@ -839,6 +800,45 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
       : Enum.valueOf(defaultValue.getDeclaringClass(), val);
   }
 
+  /**
+   * Get the value of the <code>name</code> property as a <code>Pattern</code>.
+   * If no such property is specified, or if the specified value is not a valid
+   * <code>Pattern</code>, then <code>defaultValue</code> is returned.
+   *
+   * @param name property name
+   * @param defaultValue default value
+   * @return property value as a compiled Pattern, or defaultValue
+   */
+  public Pattern getPattern(String name, Pattern defaultValue) {
+    String valString = get(name);
+    if (null == valString || "".equals(valString)) {
+      return defaultValue;
+    }
+    try {
+      return Pattern.compile(valString);
+    } catch (PatternSyntaxException pse) {
+      LOG.warn("Regular expression '" + valString + "' for property '" +
+               name + "' not valid. Using default", pse);
+      return defaultValue;
+    }
+  }
+
+  /**
+   * Set the given property to <code>Pattern</code>.
+   * If the pattern is passed as null, sets the empty pattern which results in
+   * further calls to getPattern(...) returning the default value.
+   *
+   * @param name property name
+   * @param pattern new value
+   */
+  public void setPattern(String name, Pattern pattern) {
+    if (null == pattern) {
+      set(name, null);
+    } else {
+      set(name, pattern.pattern());
+    }
+  }
+
   /**
    * A class that represents a set of positive integer ranges. It parses 
    * strings of the form: "2-3,5,7-" where ranges are separated by comma and 
@@ -911,7 +911,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     
     @Override
     public String toString() {
-      StringBuffer result = new StringBuffer();
+      StringBuilder result = new StringBuilder();
       boolean first = true;
       for(Range r: ranges) {
         if (first) {
@@ -1045,7 +1045,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
   public void setStrings(String name, String... values) {
     set(name, StringUtils.arrayToString(values));
   }
- 
+
   /**
    * Load a class by name.
    * 
@@ -1320,10 +1320,8 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
       loadResources(properties, resources, quietmode);
       if (overlay!= null) {
         properties.putAll(overlay);
-        if (storeResource) {
-          for (Map.Entry<Object,Object> item: overlay.entrySet()) {
-            updatingResource.put((String) item.getKey(), "Unknown");
-          }
+        for (Map.Entry<Object,Object> item: overlay.entrySet()) {
+          updatingResource.put((String) item.getKey(), UNKNOWN_RESOURCE);
         }
       }
     }
@@ -1385,60 +1383,6 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     for (Object resource : resources) {
       loadResource(properties, resource, quiet);
     }
-    // process for deprecation.
-    processDeprecatedKeys();
-  }
-  /**
-   * Updates the keys that are replacing the deprecated keys and removes the 
-   * deprecated keys from memory.
-   */
-  private void processDeprecatedKeys() {
-    for (Map.Entry<String, DeprecatedKeyInfo> item : 
-      deprecatedKeyMap.entrySet()) {
-      if (!properties.containsKey(item.getKey())) {
-        continue;
-      }
-      String oldKey = item.getKey();
-      deprecatedKeyMap.get(oldKey).accessed = false;
-      setDeprecatedValue(oldKey, properties.getProperty(oldKey),
-          finalParameters.contains(oldKey));
-      properties.remove(oldKey);
-      if (finalParameters.contains(oldKey)) {
-        finalParameters.remove(oldKey);
-      }
-      if (storeResource) {
-        updatingResource.remove(oldKey);
-      }
-    }
-  }
-  
-  /**
-   * Sets the deprecated key's value to the associated mapped keys
-   * @param attr the deprecated key
-   * @param value the value corresponding to the deprecated key
-   * @param finalParameter flag to indicate if <code>attr</code> is
-   *        marked as final
-   */
-  private void setDeprecatedValue(String attr,
-      String value, boolean finalParameter) {
-    DeprecatedKeyInfo keyInfo = deprecatedKeyMap.get(attr);
-    for (String key:keyInfo.newKeys) {
-      // update replacing keys with deprecated key's value in all cases,
-      // except when the replacing key is already set to final
-      // and finalParameter is false
-      if (finalParameters.contains(key) && !finalParameter) {
-        LOG.warn("An attempt to override final parameter: "+key
-            +";  Ignoring.");
-        continue;
-      }
-      properties.setProperty(key, value);
-      if (storeResource) {
-        updatingResource.put(key, updatingResource.get(attr));
-      }
-      if (finalParameter) {
-        finalParameters.add(key);
-      }
-    }
   }
   
   private void loadResource(Properties properties, Object name, boolean quiet) {
@@ -1546,19 +1490,16 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
         
         // Ignore this parameter if it has already been marked as 'final'
         if (attr != null) {
-          if (value != null) {
-            if (!finalParameters.contains(attr)) {
-              properties.setProperty(attr, value);
-              if (storeResource) {
-                updatingResource.put(attr, name.toString());
-              }
-            } else {
-              LOG.warn(name+":a attempt to override final parameter: "+attr
-                     +";  Ignoring.");
+          if (deprecatedKeyMap.containsKey(attr)) {
+            DeprecatedKeyInfo keyInfo = deprecatedKeyMap.get(attr);
+            keyInfo.accessed = false;
+            for (String key:keyInfo.newKeys) {
+              // update new keys with deprecated key's value 
+              loadProperty(properties, name, key, value, finalParameter);
             }
           }
-          if (finalParameter) {
-            finalParameters.add(attr);
+          else {
+            loadProperty(properties, name, attr, value, finalParameter);
           }
         }
       }
@@ -1578,13 +1519,39 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     }
   }
 
+  private void loadProperty(Properties properties, Object name, String attr,
+      String value, boolean finalParameter) {
+    if (value != null) {
+      if (!finalParameters.contains(attr)) {
+        properties.setProperty(attr, value);
+        updatingResource.put(attr, name.toString());
+      } else {
+        LOG.warn(name+":an attempt to override final parameter: "+attr
+            +";  Ignoring.");
+      }
+    }
+    if (finalParameter) {
+      finalParameters.add(attr);
+    }
+  }
+
   /** 
-   * Write out the non-default properties in this configuration to the give
+   * Write out the non-default properties in this configuration to the given
    * {@link OutputStream}.
    * 
    * @param out the output stream to write to.
    */
   public void writeXml(OutputStream out) throws IOException {
+    writeXml(new OutputStreamWriter(out));
+  }
+
+  /** 
+   * Write out the non-default properties in this configuration to the given
+   * {@link Writer}.
+   * 
+   * @param out the writer to write to.
+   */
+  public synchronized void writeXml(Writer out) throws IOException {
     Properties properties = getProps();
     try {
       Document doc =
@@ -1603,7 +1570,12 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
         }
         Element propNode = doc.createElement("property");
         conf.appendChild(propNode);
-      
+
+        if (updatingResource != null) {
+          Comment commentNode = doc.createComment(
+            "Loaded from " + updatingResource.get(name));
+          propNode.appendChild(commentNode);
+        }
         Element nameNode = doc.createElement("name");
         nameNode.appendChild(doc.createTextNode(name));
         propNode.appendChild(nameNode);
@@ -1620,8 +1592,10 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
       TransformerFactory transFactory = TransformerFactory.newInstance();
       Transformer transformer = transFactory.newTransformer();
       transformer.transform(source, result);
-    } catch (Exception e) {
-      throw new RuntimeException(e);
+    } catch (TransformerException te) {
+      throw new IOException(te);
+    } catch (ParserConfigurationException pe) {
+      throw new IOException(pe);
     }
   }
 
@@ -1636,26 +1610,26 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
    * @param out the Writer to write to
    * @throws IOException
    */
-  public static void dumpConfiguration(Configuration conf, 
+  public static void dumpConfiguration(Configuration config,
       Writer out) throws IOException {
-    Configuration config = new Configuration(conf,true);
-    config.reloadConfiguration();
     JsonFactory dumpFactory = new JsonFactory();
     JsonGenerator dumpGenerator = dumpFactory.createJsonGenerator(out);
     dumpGenerator.writeStartObject();
     dumpGenerator.writeFieldName("properties");
     dumpGenerator.writeStartArray();
     dumpGenerator.flush();
-    for (Map.Entry<Object,Object> item: config.getProps().entrySet()) {
-      dumpGenerator.writeStartObject();
-      dumpGenerator.writeStringField("key", (String) item.getKey());
-      dumpGenerator.writeStringField("value", 
-          config.get((String) item.getKey()));
-      dumpGenerator.writeBooleanField("isFinal",
-          config.finalParameters.contains(item.getKey()));
-      dumpGenerator.writeStringField("resource",
-          config.updatingResource.get(item.getKey()));
-      dumpGenerator.writeEndObject();
+    synchronized (config) {
+      for (Map.Entry<Object,Object> item: config.getProps().entrySet()) {
+        dumpGenerator.writeStartObject();
+        dumpGenerator.writeStringField("key", (String) item.getKey());
+        dumpGenerator.writeStringField("value", 
+                                       config.get((String) item.getKey()));
+        dumpGenerator.writeBooleanField("isFinal",
+                                        config.finalParameters.contains(item.getKey()));
+        dumpGenerator.writeStringField("resource",
+                                       config.updatingResource.get(item.getKey()));
+        dumpGenerator.writeEndObject();
+      }
     }
     dumpGenerator.writeEndArray();
     dumpGenerator.writeEndObject();
@@ -1682,7 +1656,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
   
   @Override
   public String toString() {
-    StringBuffer sb = new StringBuffer();
+    StringBuilder sb = new StringBuilder();
     sb.append("Configuration: ");
     if(loadDefaults) {
       toString(defaultResources, sb);
@@ -1694,8 +1668,8 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
     return sb.toString();
   }
 
-  private void toString(List resources, StringBuffer sb) {
-    ListIterator i = resources.listIterator();
+  private <T> void toString(List<T> resources, StringBuilder sb) {
+    ListIterator<T> i = resources.listIterator();
     while (i.hasNext()) {
       if (i.nextIndex() != 0) {
         sb.append(", ");
@@ -1755,11 +1729,6 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
                new String[]{CommonConfigurationKeys.NET_TOPOLOGY_CONFIGURED_NODE_MAPPING_KEY});
     Configuration.addDeprecation("topology.node.switch.mapping.impl", 
                new String[]{CommonConfigurationKeys.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY});
-    Configuration.addDeprecation("dfs.umask", 
-               new String[]{CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY},
-               "dfs.umask is deprecated, use " + 
-               CommonConfigurationKeys.FS_PERMISSIONS_UMASK_KEY + 
-               " with octal or symbolic specifications.");
     Configuration.addDeprecation("dfs.df.interval", 
                new String[]{CommonConfigurationKeys.FS_DF_INTERVAL_KEY});
     Configuration.addDeprecation("dfs.client.buffer.dir", 

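The Configuration.java hunks above add a loadProperty() helper that routes deprecated keys to their replacements, a writeXml(Writer) overload, and a JSON-producing dumpConfiguration(). A minimal sketch of the new Writer overload follows; it is not part of the patch, the property name "example.key" is hypothetical, and only the writeXml(Writer) call itself comes from the code above.

import java.io.IOException;
import java.io.StringWriter;
import org.apache.hadoop.conf.Configuration;

// Minimal sketch: write a Configuration's non-default properties
// to an in-memory Writer via the new writeXml(Writer) overload.
public class WriteXmlDemo {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration(false); // skip default resources
    conf.set("example.key", "example.value");      // hypothetical property
    StringWriter sw = new StringWriter();
    conf.writeXml(sw);                             // overload added above
    System.out.println(sw.toString());
  }
}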
+ 809 - 0
src/java/org/apache/hadoop/fs/AbstractFileSystem.java

@@ -0,0 +1,809 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.EnumSet;
+import java.util.IdentityHashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.StringTokenizer;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.HadoopIllegalArgumentException;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem.Statistics;
+import org.apache.hadoop.fs.Options.CreateOpts;
+import org.apache.hadoop.fs.Options.Rename;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.fs.InvalidPathException;
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * This class provides an interface for implementors of a Hadoop file system
+ * (analogous to the VFS of Unix). Applications do not access this class;
+ * instead they access files across all file systems using {@link FileContext}.
+ * 
+ * Pathnames passed to AbstractFileSystem can be fully qualified URIs that
+ * match the "this" file system (i.e. same scheme and authority)
+ * or slash-relative names that are assumed to be relative
+ * to the root of the "this" file system.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving /*Evolving for a release,to be changed to Stable */
+public abstract class AbstractFileSystem {
+  static final Log LOG = LogFactory.getLog(AbstractFileSystem.class);
+
+  /** Recording statistics per file system class. */
+  private static final Map<Class<? extends AbstractFileSystem>, Statistics> 
+  STATISTICS_TABLE =
+      new IdentityHashMap<Class<? extends AbstractFileSystem>, Statistics>();
+  
+  /** Cache of constructors for each file system class. */
+  private static final Map<Class<?>, Constructor<?>> CONSTRUCTOR_CACHE = 
+    new ConcurrentHashMap<Class<?>, Constructor<?>>();
+  
+  private static final Class<?>[] URI_CONFIG_ARGS = 
+    new Class[]{URI.class, Configuration.class};
+  
+  /** The statistics for this file system. */
+  protected Statistics statistics;
+  
+  private final URI myUri;
+  
+  protected Statistics getStatistics() {
+    return statistics;
+  }
+  
+  /**
+   * Returns true only if no path component of src is "." or ".."
+   * or contains a ":".
+   */
+  private static boolean isValidName(String src) {
+    // Check for ".." "." ":" "/"
+    StringTokenizer tokens = new StringTokenizer(src, Path.SEPARATOR);
+    while(tokens.hasMoreTokens()) {
+      String element = tokens.nextToken();
+      if (element.equals("..") || 
+          element.equals(".")  ||
+          (element.indexOf(":") >= 0)) {
+        return false;
+      }
+    }
+    return true;
+  }
+  
+  /** 
+   * Create an object for the given class and initialize it from conf.
+   * @param theClass class of which an object is created
+   * @param conf Configuration
+   * @return a new object
+   */
+  @SuppressWarnings("unchecked")
+  static <T> T newInstance(Class<T> theClass,
+    URI uri, Configuration conf) {
+    T result;
+    try {
+      Constructor<T> meth = (Constructor<T>) CONSTRUCTOR_CACHE.get(theClass);
+      if (meth == null) {
+        meth = theClass.getDeclaredConstructor(URI_CONFIG_ARGS);
+        meth.setAccessible(true);
+        CONSTRUCTOR_CACHE.put(theClass, meth);
+      }
+      result = meth.newInstance(uri, conf);
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+    return result;
+  }
+  
+  /**
+   * Create a file system instance for the specified uri using the conf. The
+   * conf is used to find the class name that implements the file system. The
+   * conf is also passed to the file system for its configuration.
+   *
+   * @param uri URI of the file system
+   * @param conf Configuration for the file system
+   * 
+   * @return Returns the file system for the given URI
+   *
+   * @throws UnsupportedFileSystemException file system for <code>uri</code> is
+   *           not found
+   */
+  private static AbstractFileSystem createFileSystem(URI uri, Configuration conf)
+      throws UnsupportedFileSystemException {
+    Class<?> clazz = conf.getClass("fs.AbstractFileSystem." + 
+                                uri.getScheme() + ".impl", null);
+    if (clazz == null) {
+      throw new UnsupportedFileSystemException(
+          "No AbstractFileSystem for scheme: " + uri.getScheme());
+    }
+    return (AbstractFileSystem) newInstance(clazz, uri, conf);
+  }
+  
+  
+  /**
+   * Get the statistics for a particular file system.
+   * @param cls the class to lookup
+   * @return a statistics object
+   */
+  protected static synchronized Statistics getStatistics(String scheme,
+      Class<? extends AbstractFileSystem> cls) {
+    Statistics result = STATISTICS_TABLE.get(cls);
+    if (result == null) {
+      result = new Statistics(scheme);
+      STATISTICS_TABLE.put(cls, result);
+    }
+    return result;
+  }
+  
+  protected static synchronized void clearStatistics() {
+    for(Statistics stat: STATISTICS_TABLE.values()) {
+      stat.reset();
+    }
+  }
+
+  protected static synchronized void printStatistics() {
+    for (Map.Entry<Class<? extends AbstractFileSystem>, Statistics> pair: 
+            STATISTICS_TABLE.entrySet()) {
+      System.out.println("  FileSystem " + pair.getKey().getName() + 
+                         ": " + pair.getValue());
+    }
+  }
+
+  /**
+   * The main factory method for creating a file system. Get a file system for
+   * the URI's scheme and authority. The scheme of the <code>uri</code>
+   * determines a configuration property name,
+   * <tt>fs.AbstractFileSystem.<i>scheme</i>.impl</tt> whose value names the
+   * AbstractFileSystem class.
+   * 
+   * The entire URI and conf is passed to the AbstractFileSystem factory method.
+   * 
+   * @param uri for the file system to be created.
+   * @param conf which is passed to the file system impl.
+   * 
+   * @return file system for the given URI.
+   * 
+   * @throws UnsupportedFileSystemException if the file system for
+   *           <code>uri</code> is not supported.
+   */
+  static AbstractFileSystem get(final URI uri, final Configuration conf)
+      throws UnsupportedFileSystemException {
+    return createFileSystem(uri, conf);
+  }
+
+  /**
+   * Constructor to be called by subclasses.
+   * 
+   * @param uri for this file system.
+   * @param supportedScheme the scheme supported by the implementor
+   * @param authorityNeeded if true then the URI must have authority, if false
+   *          then the URI must have null authority.
+   *
+   * @throws URISyntaxException <code>uri</code> has syntax error
+   */
+  protected AbstractFileSystem(final URI uri, final String supportedScheme,
+      final boolean authorityNeeded, final int defaultPort)
+      throws URISyntaxException {
+    myUri = getUri(uri, supportedScheme, authorityNeeded, defaultPort);
+    statistics = getStatistics(supportedScheme, getClass()); 
+  }
+  
+  protected void checkScheme(URI uri, String supportedScheme) {
+    String scheme = uri.getScheme();
+    if (scheme == null) {
+      throw new HadoopIllegalArgumentException("Uri without scheme: " + uri);
+    }
+    if (!scheme.equals(supportedScheme)) {
+      throw new HadoopIllegalArgumentException("Uri scheme " + uri
+          + " does not match the scheme " + supportedScheme);
+    }
+  }
+
+  /**
+   * Get the URI for the file system based on the given URI. The path, query
+   * part of the given URI is stripped out and default file system port is used
+   * to form the URI.
+   * 
+   * @param uri FileSystem URI.
+   * @param authorityNeeded if true authority cannot be null in the URI. If
+   *          false authority must be null.
+   * @param defaultPort default port to use if port is not specified in the URI.
+   * 
+   * @return URI of the file system
+   * 
+   * @throws URISyntaxException <code>uri</code> has syntax error
+   */
+  private URI getUri(URI uri, String supportedScheme,
+      boolean authorityNeeded, int defaultPort) throws URISyntaxException {
+    checkScheme(uri, supportedScheme);
+    // A file system implementation that requires authority must always
+    // specify default port
+    if (defaultPort < 0 && authorityNeeded) {
+      throw new HadoopIllegalArgumentException(
+          "FileSystem implementation error -  default port " + defaultPort
+              + " is not valid");
+    }
+    String authority = uri.getAuthority();
+    if (!authorityNeeded) {
+      if (authority != null) {
+        throw new HadoopIllegalArgumentException("Scheme with non-null authority: "
+            + uri);
+      }
+      return new URI(supportedScheme + ":///");
+    }
+    if (authority == null) {
+      throw new HadoopIllegalArgumentException("Uri without authority: " + uri);
+    }
+    int port = uri.getPort();
+    port = port == -1 ? defaultPort : port;
+    return new URI(supportedScheme + "://" + uri.getHost() + ":" + port);
+  }
+  
+  /**
+   * The default port of this file system.
+   * 
+   * @return the default port of this file system's URI scheme;
+   *         a URI with a port of -1 maps to the default port
+   */
+  protected abstract int getUriDefaultPort();
+
+  /**
+   * Returns a URI whose scheme and authority identify this FileSystem.
+   * 
+   * @return the uri of this file system.
+   */
+  protected URI getUri() {
+    return myUri;
+  }
+  
+  /**
+   * Check that a Path belongs to this FileSystem.
+   * 
+   * If the path is a fully qualified URI, then its scheme and authority
+   * must match those of this file system. Otherwise the path must be
+   * a slash-relative name.
+   * 
+   * @throws InvalidPathException if the path is invalid
+   */
+  protected void checkPath(Path path) {
+    URI uri = path.toUri();
+    String thatScheme = uri.getScheme();
+    String thatAuthority = uri.getAuthority();
+    if (thatScheme == null) {
+      if (thatAuthority == null) {
+        if (path.isUriPathAbsolute()) {
+          return;
+        }
+        throw new InvalidPathException("relative paths not allowed:" + 
+            path);
+      } else {
+        throw new InvalidPathException(
+            "Path without scheme with non-null autorhrity:" + path);
+      }
+    }
+    String thisScheme = this.getUri().getScheme();
+    String thisAuthority = this.getUri().getAuthority();
+    
+    // Schemes and authorities must match.
+    // Allow for null Authority for file:///
+    if (!thisScheme.equalsIgnoreCase(thatScheme) ||
+       (thisAuthority != null && 
+            !thisAuthority.equalsIgnoreCase(thatAuthority)) ||
+       (thisAuthority == null && thatAuthority != null)) {
+      throw new InvalidPathException("Wrong FS: " + path + ", expected: "
+          + this.getUri());
+    }
+    
+    int thisPort = this.getUri().getPort();
+    int thatPort = path.toUri().getPort();
+    if (thatPort == -1) { // -1 => defaultPort of Uri scheme
+      thatPort = this.getUriDefaultPort();
+    }
+    if (thisPort != thatPort) {
+      throw new InvalidPathException("Wrong FS: " + path + ", expected: "
+          + this.getUri());
+    }
+  }
+  
+  /**
+   * Get the path-part of a pathname. Checks that URI matches this file system
+   * and that the path-part is a valid name.
+   * 
+   * @param p path
+   * 
+   * @return path-part of the Path p
+   */
+  protected String getUriPath(final Path p) {
+    checkPath(p);
+    String s = p.toUri().getPath();
+    if (!isValidName(s)) {
+      throw new InvalidPathException("Path part " + s + " from URI" + p
+          + " is not a valid filename.");
+    }
+    return s;
+  }
+  
+  /**
+   * Some file systems like LocalFileSystem have an initial workingDir
+   * that is used as the starting workingDir. For other file systems
+   * like HDFS there is no built-in notion of an initial workingDir.
+   * 
+   * @return the initial workingDir if the file system has such a notion,
+   *         otherwise null.
+   */
+  protected Path getInitialWorkingDirectory() {
+    return null;
+  }
+  
+  /** 
+   * Return the current user's home directory in this file system.
+   * The default implementation returns "/user/$USER/".
+   * 
+   * @return current user's home directory.
+   */
+  protected Path getHomeDirectory() {
+    return new Path("/user/"+System.getProperty("user.name")).makeQualified(
+                                                                getUri(), null);
+  }
+  
+  /**
+   * Return a set of server default configuration values.
+   * 
+   * @return server default configuration values
+   * 
+   * @throws IOException an I/O error occurred
+   */
+  protected abstract FsServerDefaults getServerDefaults() throws IOException; 
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#create(Path, EnumSet, Options.CreateOpts...)} except
+   * that the Path f must be fully qualified and the permission is absolute
+   * (i.e. umask has been applied).
+   */
+  protected final FSDataOutputStream create(final Path f,
+      final EnumSet<CreateFlag> createFlag, Options.CreateOpts... opts)
+      throws AccessControlException, FileAlreadyExistsException,
+      FileNotFoundException, ParentNotDirectoryException,
+      UnsupportedFileSystemException, UnresolvedLinkException, IOException {
+    checkPath(f);
+    int bufferSize = -1;
+    short replication = -1;
+    long blockSize = -1;
+    int bytesPerChecksum = -1;
+    FsPermission permission = null;
+    Progressable progress = null;
+    Boolean createParent = null;
+ 
+    for (CreateOpts iOpt : opts) {
+      if (CreateOpts.BlockSize.class.isInstance(iOpt)) {
+        if (blockSize != -1) {
+          throw new HadoopIllegalArgumentException(
+              "BlockSize option is set multiple times");
+        }
+        blockSize = ((CreateOpts.BlockSize) iOpt).getValue();
+      } else if (CreateOpts.BufferSize.class.isInstance(iOpt)) {
+        if (bufferSize != -1) {
+          throw new HadoopIllegalArgumentException(
+              "BufferSize option is set multiple times");
+        }
+        bufferSize = ((CreateOpts.BufferSize) iOpt).getValue();
+      } else if (CreateOpts.ReplicationFactor.class.isInstance(iOpt)) {
+        if (replication != -1) {
+          throw new HadoopIllegalArgumentException(
+              "ReplicationFactor option is set multiple times");
+        }
+        replication = ((CreateOpts.ReplicationFactor) iOpt).getValue();
+      } else if (CreateOpts.BytesPerChecksum.class.isInstance(iOpt)) {
+        if (bytesPerChecksum != -1) {
+          throw new HadoopIllegalArgumentException(
+              "BytesPerChecksum option is set multiple times");
+        }
+        bytesPerChecksum = ((CreateOpts.BytesPerChecksum) iOpt).getValue();
+      } else if (CreateOpts.Perms.class.isInstance(iOpt)) {
+        if (permission != null) {
+          throw new HadoopIllegalArgumentException(
+              "Perms option is set multiple times");
+        }
+        permission = ((CreateOpts.Perms) iOpt).getValue();
+      } else if (CreateOpts.Progress.class.isInstance(iOpt)) {
+        if (progress != null) {
+          throw new HadoopIllegalArgumentException(
+              "Progress option is set multiple times");
+        }
+        progress = ((CreateOpts.Progress) iOpt).getValue();
+      } else if (CreateOpts.CreateParent.class.isInstance(iOpt)) {
+        if (createParent != null) {
+          throw new HadoopIllegalArgumentException(
+              "CreateParent option is set multiple times");
+        }
+        createParent = ((CreateOpts.CreateParent) iOpt).getValue();
+      } else {
+        throw new HadoopIllegalArgumentException("Unkown CreateOpts of type " +
+            iOpt.getClass().getName());
+      }
+    }
+    if (permission == null) {
+      throw new HadoopIllegalArgumentException("no permission supplied");
+    }
+
+
+    FsServerDefaults ssDef = getServerDefaults();
+    if (ssDef.getBlockSize() % ssDef.getBytesPerChecksum() != 0) {
+      throw new IOException("Internal error: default blockSize is" + 
+          " not a multiple of default bytesPerChecksum ");
+    }
+    
+    if (blockSize == -1) {
+      blockSize = ssDef.getBlockSize();
+    }
+    if (bytesPerChecksum == -1) {
+      bytesPerChecksum = ssDef.getBytesPerChecksum();
+    }
+    if (bufferSize == -1) {
+      bufferSize = ssDef.getFileBufferSize();
+    }
+    if (replication == -1) {
+      replication = ssDef.getReplication();
+    }
+    if (createParent == null) {
+      createParent = false;
+    }
+
+    if (blockSize % bytesPerChecksum != 0) {
+      throw new HadoopIllegalArgumentException(
+             "blockSize should be a multiple of checksumsize");
+    }
+
+    return this.createInternal(f, createFlag, permission, bufferSize,
+      replication, blockSize, progress, bytesPerChecksum, createParent);
+  }
+
+  /**
+   * The specification of this method matches that of
+   * {@link #create(Path, EnumSet, Options.CreateOpts...)} except that the opts
+   * have been declared explicitly.
+   */
+  protected abstract FSDataOutputStream createInternal(Path f,
+      EnumSet<CreateFlag> flag, FsPermission absolutePermission,
+      int bufferSize, short replication, long blockSize, Progressable progress,
+      int bytesPerChecksum, boolean createParent)
+      throws AccessControlException, FileAlreadyExistsException,
+      FileNotFoundException, ParentNotDirectoryException,
+      UnsupportedFileSystemException, UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#mkdir(Path, FsPermission, boolean)} except that the Path
+   * f must be fully qualified and the permission is absolute (i.e. 
+   * umask has been applied).
+   */
+  protected abstract void mkdir(final Path dir, final FsPermission permission,
+      final boolean createParent) throws AccessControlException,
+      FileAlreadyExistsException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#delete(Path, boolean)} except that Path f must be for
+   * this file system.
+   */
+  protected abstract boolean delete(final Path f, final boolean recursive)
+      throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#open(Path)} except that Path f must be for this
+   * file system.
+   */
+  protected FSDataInputStream open(final Path f) throws AccessControlException,
+      FileNotFoundException, UnresolvedLinkException, IOException {
+    return open(f, getServerDefaults().getFileBufferSize());
+  }
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#open(Path, int)} except that Path f must be for this
+   * file system.
+   */
+  protected abstract FSDataInputStream open(final Path f, int bufferSize)
+      throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#setReplication(Path, short)} except that Path f must be
+   * for this file system.
+   */
+  protected abstract boolean setReplication(final Path f,
+      final short replication) throws AccessControlException,
+      FileNotFoundException, UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#rename(Path, Path, Options.Rename...)} except that Path
+   * f must be for this file system.
+   */
+  protected final void rename(final Path src, final Path dst,
+      final Options.Rename... options) throws AccessControlException,
+      FileAlreadyExistsException, FileNotFoundException,
+      ParentNotDirectoryException, UnresolvedLinkException, IOException {
+    boolean overwrite = false;
+    if (null != options) {
+      for (Rename option : options) {
+        if (option == Rename.OVERWRITE) {
+          overwrite = true;
+        }
+      }
+    }
+    renameInternal(src, dst, overwrite);
+  }
+  
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#rename(Path, Path, Options.Rename...)} except that Path
+   * f must be for this file system and NO OVERWRITE is performed.
+   * 
+   * File systems that do not have a built-in overwrite need to implement only this
+   * method and can take advantage of the default implementation of the other
+   * {@link #renameInternal(Path, Path, boolean)}.
+   */
+  protected abstract void renameInternal(final Path src, final Path dst)
+      throws AccessControlException, FileAlreadyExistsException,
+      FileNotFoundException, ParentNotDirectoryException,
+      UnresolvedLinkException, IOException;
+  
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#rename(Path, Path, Options.Rename...)} except that Path
+   * f must be for this file system.
+   */
+  protected void renameInternal(final Path src, final Path dst,
+      boolean overwrite) throws AccessControlException,
+      FileAlreadyExistsException, FileNotFoundException,
+      ParentNotDirectoryException, UnresolvedLinkException, IOException {
+    // Default implementation deals with overwrite in a non-atomic way
+    final FileStatus srcStatus = getFileLinkStatus(src);
+    if (srcStatus == null) {
+      throw new FileNotFoundException("rename source " + src + " not found.");
+    }
+
+    FileStatus dstStatus;
+    try {
+      dstStatus = getFileLinkStatus(dst);
+    } catch (IOException e) {
+      dstStatus = null;
+    }
+    if (dstStatus != null) {
+      if (dst.equals(src)) {
+        throw new FileAlreadyExistsException(
+            "The source "+src+" and destination "+dst+" are the same");
+      }
+      if (srcStatus.isSymlink() && dst.equals(srcStatus.getSymlink())) {
+        throw new FileAlreadyExistsException(
+            "Cannot rename symlink "+src+" to its target "+dst);
+      }
+      if (srcStatus.isDir() != dstStatus.isDir()) {
+        throw new IOException("Source " + src + " Destination " + dst
+            + " both should be either file or directory");
+      }
+      if (!overwrite) {
+        throw new FileAlreadyExistsException("rename destination " + dst
+            + " already exists.");
+      }
+      // Delete the destination that is a file or an empty directory
+      if (dstStatus.isDir()) {
+        Iterator<FileStatus> list = listStatusIterator(dst);
+        if (list != null && list.hasNext()) {
+          throw new IOException(
+              "rename cannot overwrite non empty destination directory " + dst);
+        }
+      }
+      delete(dst, false);
+    } else {
+      final Path parent = dst.getParent();
+      final FileStatus parentStatus = getFileLinkStatus(parent);
+      if (parentStatus == null) {
+        throw new FileNotFoundException("rename destination parent " + parent
+            + " not found.");
+      }
+      if (!parentStatus.isDir() && !parentStatus.isSymlink()) {
+        throw new ParentNotDirectoryException("rename destination parent "
+            + parent + " is a file.");
+      }
+    }
+    renameInternal(src, dst);
+  }
+  
+  /**
+   * Returns true if the file system supports symlinks, false otherwise.
+   */
+  protected boolean supportsSymlinks() {
+    return false;
+  }
+  
+  /**
+   * The specification of this method matches that of  
+   * {@link FileContext#createSymlink(Path, Path, boolean)};
+   */
+  protected void createSymlink(final Path target, final Path link,
+      final boolean createParent) throws IOException, UnresolvedLinkException {
+    throw new IOException("File system does not support symlinks");    
+  }
+
+  /**
+   * The specification of this method matches that of  
+   * {@link FileContext#getLinkTarget(Path)};
+   */
+  protected Path getLinkTarget(final Path f) throws IOException {
+    /* We should never get here. Any file system that threw an
+     * UnresolvedLinkException, causing this function to be called,
+     * needs to override this method.
+     */
+    throw new AssertionError();
+  }
+    
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#setPermission(Path, FsPermission)} except that Path f
+   * must be for this file system.
+   */
+  protected abstract void setPermission(final Path f,
+      final FsPermission permission) throws AccessControlException,
+      FileNotFoundException, UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#setOwner(Path, String, String)} except that Path f must
+   * be for this file system.
+   */
+  protected abstract void setOwner(final Path f, final String username,
+      final String groupname) throws AccessControlException,
+      FileNotFoundException, UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#setTimes(Path, long, long)} except that Path f must be
+   * for this file system.
+   */
+  protected abstract void setTimes(final Path f, final long mtime,
+    final long atime) throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#getFileChecksum(Path)} except that Path f must be for
+   * this file system.
+   */
+  protected abstract FileChecksum getFileChecksum(final Path f)
+      throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+  
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#getFileStatus(Path)} 
+   * except that an UnresolvedLinkException may be thrown if a symlink is 
+   * encountered in the path.
+   */
+  protected abstract FileStatus getFileStatus(final Path f)
+      throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#getFileLinkStatus(Path)}
+   * except that an UnresolvedLinkException may be thrown if a symlink is  
+   * encountered in the path leading up to the final path component.
+   * If the file system does not support symlinks then the behavior is
+   * equivalent to {@link AbstractFileSystem#getFileStatus(Path)}.
+   */
+  protected FileStatus getFileLinkStatus(final Path f)
+      throws AccessControlException, FileNotFoundException,
+      UnsupportedFileSystemException, IOException {
+    return getFileStatus(f);
+  }
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#getFileBlockLocations(Path, long, long)} except that
+   * Path f must be for this file system.
+   */
+  protected abstract BlockLocation[] getFileBlockLocations(final Path f,
+      final long start, final long len) throws AccessControlException,
+      FileNotFoundException, UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#getFsStatus(Path)} except that Path f must be for this
+   * file system.
+   */
+  protected FsStatus getFsStatus(final Path f) throws AccessControlException,
+      FileNotFoundException, UnresolvedLinkException, IOException {
+    // default impl gets FsStatus of root
+    return getFsStatus();
+  }
+  
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#getFsStatus(Path)}.
+   */
+  protected abstract FsStatus getFsStatus() throws AccessControlException,
+      FileNotFoundException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#listStatus(Path)} except that Path f must be for this
+   * file system.
+   */
+  protected Iterator<FileStatus> listStatusIterator(final Path f)
+      throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException {
+    return new Iterator<FileStatus>() {
+      private int i = 0;
+      private FileStatus[] statusList = listStatus(f);
+      
+      @Override
+      public boolean hasNext() {
+        return i < statusList.length;
+      }
+      
+      @Override
+      public FileStatus next() {
+        if (!hasNext()) {
+          throw new NoSuchElementException();
+        }
+        return statusList[i++];
+      }
+      
+      @Override
+      public void remove() {
+        throw new UnsupportedOperationException("Remove is not supported");
+      }
+    };
+  }
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext.Util#listStatus(Path)} except that Path f must be 
+   * for this file system.
+   */
+  protected abstract FileStatus[] listStatus(final Path f)
+      throws AccessControlException, FileNotFoundException,
+      UnresolvedLinkException, IOException;
+
+  /**
+   * The specification of this method matches that of
+   * {@link FileContext#setVerifyChecksum(boolean, Path)} except that Path f
+   * must be for this file system.
+   */
+  protected abstract void setVerifyChecksum(final boolean verifyChecksum)
+      throws AccessControlException, IOException;
+}

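As the javadoc on the factory method above explains, the URI scheme is mapped to an implementation class through the property fs.AbstractFileSystem.&lt;scheme&gt;.impl. Below is a hedged sketch (not code from the patch) of that lookup performed from outside the factory; "file" is used only as an example scheme, and the binding may be null unless a loaded default resource provides it.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;

// Illustrative lookup mirroring createFileSystem() above.
public class AbstractFsLookupDemo {
  public static void main(String[] args) {
    URI uri = URI.create("file:///tmp");          // example URI
    Configuration conf = new Configuration();
    String key = "fs.AbstractFileSystem." + uri.getScheme() + ".impl";
    Class<?> impl = conf.getClass(key, null);     // null when no binding is configured
    System.out.println(key + " -> " + impl);
  }
}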
+ 63 - 0
src/java/org/apache/hadoop/fs/AvroFSInput.java

@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.avro.file.SeekableInput;
+
+/** Adapts an {@link FSDataInputStream} to Avro's SeekableInput interface. */
+public class AvroFSInput implements Closeable, SeekableInput {
+  private final FSDataInputStream stream;
+  private final long len;
+
+  /** Construct given an {@link FSDataInputStream} and its length. */
+  public AvroFSInput(final FSDataInputStream in, final long len) {
+    this.stream = in;
+    this.len = len;
+  }
+
+  /** Construct given a {@link FileContext} and a {@link Path}. */
+  public AvroFSInput(final FileContext fc, final Path p) throws IOException {
+    FileStatus status = fc.getFileStatus(p);
+    this.len = status.getLen();
+    this.stream = fc.open(p);
+  }
+
+  public long length() {
+    return len;
+  }
+
+  public int read(byte[] b, int off, int len) throws IOException {
+    return stream.read(b, off, len);
+  }
+
+  public void seek(long p) throws IOException {
+    stream.seek(p);
+  }
+
+  public long tell() throws IOException {
+    return stream.getPos();
+  }
+
+  public void close() throws IOException {
+    stream.close();
+  }
+}

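AvroFSInput adapts FileContext streams to Avro's SeekableInput, so an Avro container file stored on any Hadoop file system can be handed to Avro's DataFileReader. A hedged usage sketch follows, assuming an Avro version of this era whose DataFileReader constructor accepts a SeekableInput; the path is hypothetical and error handling is omitted for brevity.

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;

// Sketch: read every record of a (hypothetical) Avro container file.
public class AvroFSInputDemo {
  public static void main(String[] args) throws Exception {
    FileContext fc = FileContext.getFileContext();
    AvroFSInput input = new AvroFSInput(fc, new Path("/tmp/records.avro"));
    DataFileReader<GenericRecord> reader =
        new DataFileReader<GenericRecord>(input, new GenericDatumReader<GenericRecord>());
    while (reader.hasNext()) {
      System.out.println(reader.next());
    }
    reader.close();
  }
}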
+ 27 - 10
src/java/org/apache/hadoop/fs/ChecksumFileSystem.java

@@ -205,24 +205,41 @@ public abstract class ChecksumFileSystem extends FilterFileSystem {
     @Override
     protected int readChunk(long pos, byte[] buf, int offset, int len,
         byte[] checksum) throws IOException {
+
       boolean eof = false;
-      if(needChecksum()) {
-        try {
-          long checksumPos = getChecksumFilePos(pos); 
-          if(checksumPos != sums.getPos()) {
-            sums.seek(checksumPos);
-          }
-          sums.readFully(checksum);
-        } catch (EOFException e) {
+      if (needChecksum()) {
+        assert checksum != null; // we have a checksum buffer
+        assert checksum.length % CHECKSUM_SIZE == 0; // it is sane length
+        assert len >= bytesPerSum; // we must read at least one chunk
+
+        final int checksumsToRead = Math.min(
+          len/bytesPerSum, // number of checksums based on len to read
+          checksum.length / CHECKSUM_SIZE); // size of checksum buffer
+        long checksumPos = getChecksumFilePos(pos); 
+        if(checksumPos != sums.getPos()) {
+          sums.seek(checksumPos);
+        }
+
+        int sumLenRead = sums.read(checksum, 0, CHECKSUM_SIZE * checksumsToRead);
+        if (sumLenRead >= 0 && sumLenRead % CHECKSUM_SIZE != 0) {
+          throw new ChecksumException(
+            "Checksum file not a length multiple of checksum size " +
+            "in " + file + " at " + pos + " checksumpos: " + checksumPos +
+            " sumLenread: " + sumLenRead,
+            pos);
+        }
+        if (sumLenRead <= 0) { // we're at the end of the file
           eof = true;
+        } else {
+          // Adjust amount of data to read based on how many checksum chunks we read
+          len = Math.min(len, bytesPerSum * (sumLenRead / CHECKSUM_SIZE));
         }
-        len = bytesPerSum;
       }
       if(pos != datas.getPos()) {
         datas.seek(pos);
       }
       int nread = readFully(datas, buf, offset, len);
-      if( eof && nread > 0) {
+      if (eof && nread > 0) {
         throw new ChecksumException("Checksum error: "+file+" at "+pos, pos);
       }
       return nread;

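The rewritten readChunk() above now reads as many checksum chunks as fit in the caller's buffer and trims len to match, instead of always reading exactly one chunk. A worked sketch of that arithmetic follows; the 512-byte chunk size and 4-byte CRC width are assumptions chosen for illustration, not values taken from the patch.

// Illustrative arithmetic only; mirrors the Math.min calls in readChunk() above.
public class ReadChunkMathDemo {
  public static void main(String[] args) {
    int bytesPerSum = 512;         // assumed chunk size
    int checksumSize = 4;          // assumed CRC width in bytes
    int len = 4096;                // data bytes requested
    byte[] checksum = new byte[32];

    int checksumsToRead = Math.min(len / bytesPerSum,               // 8
                                   checksum.length / checksumSize); // 8
    System.out.println("checksums to read: " + checksumsToRead);    // 8

    int sumLenRead = 20;           // suppose only 5 CRCs remained in the checksum file
    len = Math.min(len, bytesPerSum * (sumLenRead / checksumSize));
    System.out.println("data bytes to read: " + len);               // 2560
  }
}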
+ 481 - 0
src/java/org/apache/hadoop/fs/ChecksumFs.java

@@ -0,0 +1,481 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs;
+
+import java.io.*;
+import java.net.URISyntaxException;
+import java.util.Arrays;
+import java.util.EnumSet;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.util.Progressable;
+import org.apache.hadoop.util.PureJavaCrc32;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Abstract checksummed Fs.
+ * It provides a basic implementation of a checksummed Fs,
+ * which creates a checksum file for each raw file.
+ * It generates and verifies checksums at the client side.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving /* Evolving for a release, to be changed to Stable */
+public abstract class ChecksumFs extends FilterFs {
+  private static final byte[] CHECKSUM_VERSION = new byte[] {'c', 'r', 'c', 0};
+  private int defaultBytesPerChecksum = 512;
+  private boolean verifyChecksum = true;
+
+  public static double getApproxChkSumLength(long size) {
+    return ChecksumFSOutputSummer.CHKSUM_AS_FRACTION * size;
+  }
+  
+  public ChecksumFs(AbstractFileSystem theFs)
+    throws IOException, URISyntaxException {
+    super(theFs);
+    defaultBytesPerChecksum = 
+      getMyFs().getServerDefaults().getBytesPerChecksum();
+  }
+  
+  /**
+   * Set whether to verify checksum.
+   */
+  public void setVerifyChecksum(boolean inVerifyChecksum) {
+    this.verifyChecksum = inVerifyChecksum;
+  }
+
+  /** get the raw file system. */
+  public AbstractFileSystem getRawFs() {
+    return getMyFs();
+  }
+
+  /** Return the name of the checksum file associated with a file.*/
+  public Path getChecksumFile(Path file) {
+    return new Path(file.getParent(), "." + file.getName() + ".crc");
+  }
+
+  /** Return true iff file is a checksum file name.*/
+  public static boolean isChecksumFile(Path file) {
+    String name = file.getName();
+    return name.startsWith(".") && name.endsWith(".crc");
+  }
+
+  /** Return the length of the checksum file given the size of the 
+   * actual file.
+   **/
+  public long getChecksumFileLength(Path file, long fileSize) {
+    return getChecksumLength(fileSize, getBytesPerSum());
+  }
+
+  /** Return the bytes Per Checksum. */
+  public int getBytesPerSum() {
+    return defaultBytesPerChecksum;
+  }
+
+  private int getSumBufferSize(int bytesPerSum, int bufferSize)
+    throws IOException {
+    int defaultBufferSize =  getMyFs().getServerDefaults().getFileBufferSize();
+    int proportionalBufferSize = bufferSize / bytesPerSum;
+    return Math.max(bytesPerSum,
+                    Math.max(proportionalBufferSize, defaultBufferSize));
+  }
+
+  /*******************************************************
+   * For open()'s FSInputStream
+   * It verifies that data matches checksums.
+   *******************************************************/
+  private static class ChecksumFSInputChecker extends FSInputChecker {
+    public static final Log LOG 
+      = LogFactory.getLog(FSInputChecker.class);
+    private static final int HEADER_LENGTH = 8;
+    
+    private ChecksumFs fs;
+    private FSDataInputStream datas;
+    private FSDataInputStream sums;
+    private int bytesPerSum = 1;
+    private long fileLen = -1L;
+    
+    public ChecksumFSInputChecker(ChecksumFs fs, Path file)
+      throws IOException, UnresolvedLinkException {
+      this(fs, file, fs.getServerDefaults().getFileBufferSize());
+    }
+    
+    public ChecksumFSInputChecker(ChecksumFs fs, Path file, int bufferSize)
+      throws IOException, UnresolvedLinkException {
+      super(file, fs.getFileStatus(file).getReplication());
+      this.datas = fs.getRawFs().open(file, bufferSize);
+      this.fs = fs;
+      Path sumFile = fs.getChecksumFile(file);
+      try {
+        int sumBufferSize = fs.getSumBufferSize(fs.getBytesPerSum(),
+                                                bufferSize);
+        sums = fs.getRawFs().open(sumFile, sumBufferSize);
+
+        byte[] version = new byte[CHECKSUM_VERSION.length];
+        sums.readFully(version);
+        if (!Arrays.equals(version, CHECKSUM_VERSION)) {
+          throw new IOException("Not a checksum file: "+sumFile);
+        }
+        this.bytesPerSum = sums.readInt();
+        set(fs.verifyChecksum, new PureJavaCrc32(), bytesPerSum, 4);
+      } catch (FileNotFoundException e) {         // quietly ignore
+        set(fs.verifyChecksum, null, 1, 0);
+      } catch (IOException e) {                   // loudly ignore
+        LOG.warn("Problem opening checksum file: "+ file + 
+                 ".  Ignoring exception: " + 
+                 StringUtils.stringifyException(e));
+        set(fs.verifyChecksum, null, 1, 0);
+      }
+    }
+    
+    private long getChecksumFilePos(long dataPos) {
+      return HEADER_LENGTH + 4*(dataPos/bytesPerSum);
+    }
+    
+    protected long getChunkPosition(long dataPos) {
+      return dataPos/bytesPerSum*bytesPerSum;
+    }
+    
+    public int available() throws IOException {
+      return datas.available() + super.available();
+    }
+    
+    public int read(long position, byte[] b, int off, int len)
+      throws IOException, UnresolvedLinkException {
+      // parameter check
+      if ((off | len | (off + len) | (b.length - (off + len))) < 0) {
+        throw new IndexOutOfBoundsException();
+      } else if (len == 0) {
+        return 0;
+      }
+      if (position<0) {
+        throw new IllegalArgumentException(
+            "Parameter position can not to be negative");
+      }
+
+      ChecksumFSInputChecker checker = new ChecksumFSInputChecker(fs, file);
+      checker.seek(position);
+      int nread = checker.read(b, off, len);
+      checker.close();
+      return nread;
+    }
+    
+    public void close() throws IOException {
+      datas.close();
+      if (sums != null) {
+        sums.close();
+      }
+      set(fs.verifyChecksum, null, 1, 0);
+    }
+    
+    @Override
+    public boolean seekToNewSource(long targetPos) throws IOException {
+      final long sumsPos = getChecksumFilePos(targetPos);
+      fs.reportChecksumFailure(file, datas, targetPos, sums, sumsPos);
+      final boolean newDataSource = datas.seekToNewSource(targetPos);
+      return sums.seekToNewSource(sumsPos) || newDataSource;
+    }
+
+    @Override
+    protected int readChunk(long pos, byte[] buf, int offset, int len,
+        byte[] checksum) throws IOException {
+      boolean eof = false;
+      if (needChecksum()) {
+        assert checksum != null; // we have a checksum buffer
+        assert checksum.length % CHECKSUM_SIZE == 0; // it is sane length
+        assert len >= bytesPerSum; // we must read at least one chunk
+
+        final int checksumsToRead = Math.min(
+          len/bytesPerSum, // number of checksums based on len to read
+          checksum.length / CHECKSUM_SIZE); // size of checksum buffer
+        long checksumPos = getChecksumFilePos(pos); 
+        if(checksumPos != sums.getPos()) {
+          sums.seek(checksumPos);
+        }
+
+        int sumLenRead = sums.read(checksum, 0, CHECKSUM_SIZE * checksumsToRead);
+        if (sumLenRead >= 0 && sumLenRead % CHECKSUM_SIZE != 0) {
+          throw new EOFException("Checksum file not a length multiple of checksum size " +
+                                 "in " + file + " at " + pos + " checksumpos: " + checksumPos +
+                                 " sumLenread: " + sumLenRead );
+        }
+        if (sumLenRead <= 0) { // we're at the end of the file
+          eof = true;
+        } else {
+          // Adjust amount of data to read based on how many checksum chunks we read
+          len = Math.min(len, bytesPerSum * (sumLenRead / CHECKSUM_SIZE));
+        }
+      }
+      if (pos != datas.getPos()) {
+        datas.seek(pos);
+      }
+      int nread = readFully(datas, buf, offset, len);
+      if (eof && nread > 0) {
+        throw new ChecksumException("Checksum error: "+file+" at "+pos, pos);
+      }
+      return nread;
+    }
+    
+    /* Return the file length */
+    private long getFileLength() throws IOException, UnresolvedLinkException {
+      if (fileLen==-1L) {
+        fileLen = fs.getFileStatus(file).getLen();
+      }
+      return fileLen;
+    }
+    
+    /**
+     * Skips over and discards <code>n</code> bytes of data from the
+     * input stream.
+     *
+     * The <code>skip</code> method may skip over a smaller number of bytes
+     * if end of file is reached before <code>n</code> bytes have been skipped.
+     * The actual number of bytes skipped is returned.  If <code>n</code> is
+     * negative, no bytes are skipped.
+     *
+     * @param      n   the number of bytes to be skipped.
+     * @return     the actual number of bytes skipped.
+     * @exception  IOException  if an I/O error occurs.
+     *             ChecksumException if the chunk to skip to is corrupted
+     */
+    public synchronized long skip(long n) throws IOException { 
+      final long curPos = getPos();
+      final long fileLength = getFileLength();
+      if (n+curPos > fileLength) {
+        n = fileLength - curPos;
+      }
+      return super.skip(n);
+    }
+    
+    /**
+     * Seek to the given position in the stream.
+     * The next read() will be from that position.
+     * 
+     * <p>This method does not allow seeking past the end of the file.
+     * Doing so produces an IOException.
+     *
+     * @param      pos   the position to seek to.
+     * @exception  IOException  if an I/O error occurs or seeks after EOF
+     *             ChecksumException if the chunk to seek to is corrupted
+     */
+
+    public synchronized void seek(long pos) throws IOException { 
+      if (pos>getFileLength()) {
+        throw new IOException("Cannot seek after EOF");
+      }
+      super.seek(pos);
+    }
+
+  }
+
+  /**
+   * Opens an FSDataInputStream at the indicated Path.
+   * @param f the file name to open
+   * @param bufferSize the size of the buffer to be used.
+   */
+  @Override
+  public FSDataInputStream open(Path f, int bufferSize) 
+    throws IOException, UnresolvedLinkException {
+    return new FSDataInputStream(
+        new ChecksumFSInputChecker(this, f, bufferSize));
+  }
+
+  /**
+   * Calculates the length of the checksum file in bytes.
+   * @param size the length of the data file in bytes
+   * @param bytesPerSum the number of bytes in a checksum block
+   * @return the number of bytes in the checksum file
+   */
+  public static long getChecksumLength(long size, int bytesPerSum) {
+    //the checksum length is the number of chunks (size rounded up to a
+    //multiple of bytesPerSum) times the checksum size, plus the bytes
+    //written at the beginning of the checksum file.
+    return ((size + bytesPerSum - 1) / bytesPerSum) * 4 +
+             CHECKSUM_VERSION.length + 4;  
+  }
+
+  /** This class provides an output stream for a checksummed file.
+   * It generates checksums for data. */
+  private static class ChecksumFSOutputSummer extends FSOutputSummer {
+    private FSDataOutputStream datas;    
+    private FSDataOutputStream sums;
+    private static final float CHKSUM_AS_FRACTION = 0.01f;
+    
+    
+    public ChecksumFSOutputSummer(final ChecksumFs fs, final Path file, 
+      final EnumSet<CreateFlag> createFlag,
+      final FsPermission absolutePermission, final int bufferSize,
+      final short replication, final long blockSize, 
+      final Progressable progress, final int bytesPerChecksum,
+      final boolean createParent) throws IOException {
+      super(new PureJavaCrc32(), fs.getBytesPerSum(), 4);
+
+      this.datas = fs.getRawFs().createInternal(file, createFlag,
+          absolutePermission, bufferSize, replication, blockSize, progress,
+           bytesPerChecksum,  createParent);
+      
+      // Now create the checksum file; adjust the buffer size
+      int bytesPerSum = fs.getBytesPerSum();
+      int sumBufferSize = fs.getSumBufferSize(bytesPerSum, bufferSize);
+      this.sums = fs.getRawFs().createInternal(fs.getChecksumFile(file),
+          EnumSet.of(CreateFlag.OVERWRITE), absolutePermission, sumBufferSize,
+          replication,  blockSize,  progress, bytesPerChecksum,  createParent);
+      sums.write(CHECKSUM_VERSION, 0, CHECKSUM_VERSION.length);
+      sums.writeInt(bytesPerSum);
+    }
+    
+    public void close() throws IOException {
+      flushBuffer();
+      sums.close();
+      datas.close();
+    }
+    
+    @Override
+    protected void writeChunk(byte[] b, int offset, int len, byte[] checksum)
+      throws IOException {
+      datas.write(b, offset, len);
+      sums.write(checksum);
+    }
+  }
+
+  @Override
+  protected FSDataOutputStream createInternal(Path f,
+      EnumSet<CreateFlag> createFlag, FsPermission absolutePermission,
+      int bufferSize, short replication, long blockSize, Progressable progress,
+      int bytesPerChecksum, boolean createParent) throws IOException {
+
+    final FSDataOutputStream out = new FSDataOutputStream(
+        new ChecksumFSOutputSummer(this, f, createFlag, absolutePermission,
+            bufferSize, replication, blockSize, progress,
+            bytesPerChecksum,  createParent), null);
+    return out;
+  }
+
+  /** Check if exists.
+   * @param f source file
+   */
+  private boolean exists(Path f) 
+    throws IOException, UnresolvedLinkException {
+    try {
+      return getMyFs().getFileStatus(f) != null;
+    } catch (FileNotFoundException e) {
+      return false;
+    }
+  }
+  
+  /** True iff the named path is a directory.
+   * Note: Avoid using this method. Instead reuse the FileStatus 
+   * returned by getFileStatus() or listStatus() methods.
+   */
+  private boolean isDirectory(Path f) 
+    throws IOException, UnresolvedLinkException {
+    try {
+      return getMyFs().getFileStatus(f).isDir();
+    } catch (FileNotFoundException e) {
+      return false;               // f does not exist
+    }
+  }
+  /**
+   * Set replication for an existing file.
+   * Implement the abstract <tt>setReplication</tt> of <tt>FileSystem</tt>
+   * @param src file name
+   * @param replication new replication
+   * @throws IOException
+   * @return true if successful;
+   *         false if file does not exist or is a directory
+   */
+  @Override
+  public boolean setReplication(Path src, short replication)
+    throws IOException, UnresolvedLinkException {
+    boolean value = getMyFs().setReplication(src, replication);
+    if (!value) {
+      return false;
+    }
+    Path checkFile = getChecksumFile(src);
+    if (exists(checkFile)) {
+      getMyFs().setReplication(checkFile, replication);
+    }
+    return true;
+  }
+
+  /**
+   * Rename files/dirs.
+   */
+  @Override
+  public void renameInternal(Path src, Path dst) 
+    throws IOException, UnresolvedLinkException {
+    if (isDirectory(src)) {
+      getMyFs().rename(src, dst);
+    } else {
+      getMyFs().rename(src, dst);
+
+      Path checkFile = getChecksumFile(src);
+      if (exists(checkFile)) { //try to rename checksum
+        if (isDirectory(dst)) {
+          getMyFs().rename(checkFile, dst);
+        } else {
+          getMyFs().rename(checkFile, getChecksumFile(dst));
+        }
+      }
+    }
+  }
+
+  /**
+   * Implement the delete(Path, boolean) in checksum
+   * file system.
+   */
+  public boolean delete(Path f, boolean recursive) 
+    throws IOException, UnresolvedLinkException {
+    FileStatus fstatus = null;
+    try {
+      fstatus = getMyFs().getFileStatus(f);
+    } catch(FileNotFoundException e) {
+      return false;
+    }
+    if (fstatus.isDir()) {
+      //this works since the crcs are in the same
+      //directories as the files, so we just delete
+      //everything in the underlying filesystem
+      return getMyFs().delete(f, recursive);
+    } else {
+      Path checkFile = getChecksumFile(f);
+      if (exists(checkFile)) {
+        getMyFs().delete(checkFile, true);
+      }
+      return getMyFs().delete(f, true);
+    }
+  }
+
+  /**
+   * Report a checksum error to the file system.
+   * @param f the file name containing the error
+   * @param in the stream open on the file
+   * @param inPos the position of the beginning of the bad data in the file
+   * @param sums the stream open on the checksum file
+   * @param sumsPos the position of the beginning of the bad data in the
+   *         checksum file
+   * @return if retry is necessary
+   */
+  public boolean reportChecksumFailure(Path f, FSDataInputStream in,
+    long inPos, FSDataInputStream sums, long sumsPos) {
+    return false;
+  }
+}

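getChecksumLength() above encodes the layout of a .crc file: an 8-byte header ("crc\0" plus the bytesPerSum int) followed by one 4-byte CRC per data chunk. A worked example follows; the 512-byte chunk size is assumed for illustration, matching the class's default.

import org.apache.hadoop.fs.ChecksumFs;

// Worked example of the formula in getChecksumLength(); numbers are illustrative.
public class ChecksumLengthDemo {
  public static void main(String[] args) {
    long size = 1000000L;   // data file of 1,000,000 bytes
    int bytesPerSum = 512;  // assumed default chunk size
    // ceil(1000000 / 512) = 1954 chunks * 4 bytes + 8 header bytes = 7824
    System.out.println(ChecksumFs.getChecksumLength(size, bytesPerSum)); // 7824
  }
}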
+ 17 - 1
src/java/org/apache/hadoop/fs/CommonConfigurationKeys.java

@@ -119,6 +119,13 @@ public class CommonConfigurationKeys {
   public static final int     IPC_CLIENT_IDLETHRESHOLD_DEFAULT = 4000;
   public static final String  IPC_SERVER_TCPNODELAY_KEY = "ipc.server.tcpnodelay";
   public static final boolean IPC_SERVER_TCPNODELAY_DEFAULT = false;
+  public static final String  IPC_SERVER_RPC_MAX_RESPONSE_SIZE_KEY = 
+                                       "ipc.server.max.response.size";
+  public static final int     IPC_SERVER_RPC_MAX_RESPONSE_SIZE_DEFAULT = 
+                                        1024*1024;
+  public static final String IPC_SERVER_RPC_READ_THREADS_KEY =
+                                        "ipc.server.read.threadpool.size";
+  public static final int IPC_SERVER_RPC_READ_THREADS_DEFAULT = 1;
   /**
    * How many calls per handler are allowed in the queue.
    */
@@ -128,7 +135,6 @@ public class CommonConfigurationKeys {
    * The default number of calls per handler in the queue.
    */
   public static final int IPC_SERVER_HANDLER_QUEUE_SIZE_DEFAULT = 100;
-  
 
   public static final String  HADOOP_RPC_SOCKET_FACTORY_CLASS_DEFAULT_KEY = 
                                        "hadoop.rpc.socket.factory.class.default";
@@ -136,5 +142,15 @@ public class CommonConfigurationKeys {
   public static final String  HADOOP_JOB_UGI_KEY = "hadoop.job.ugi";
   public static final String  HADOOP_UTIL_HASH_TYPE_KEY = "hadoop.util.hash.type";
   public static final String  HADOOP_UTIL_HASH_TYPE_DEFAULT = "murmur";
+  public static final String  HADOOP_SECURITY_GROUP_MAPPING = "hadoop.security.group.mapping";
+  public static final String  HADOOP_SECURITY_GROUPS_CACHE_SECS = "hadoop.security.groups.cache.secs";
+  public static final String  HADOOP_SECURITY_AUTHENTICATION = "hadoop.security.authentication";
+  public static final String HADOOP_SECURITY_AUTHORIZATION =
+      "hadoop.security.authorization";
+  /**
+   * ACL denoting the administrator ACLs for a hadoop cluster.
+   */
+  public final static String HADOOP_CLUSTER_ADMINISTRATORS_PROPERTY =
+      "hadoop.cluster.administrators";
 }
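
As a usage sketch (not part of this patch), the new IPC keys are read through a plain Configuration lookup, falling back to the defaults declared above:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.CommonConfigurationKeys;

    public class IpcConfigExample {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Falls back to the defaults declared in CommonConfigurationKeys.
        int readThreads = conf.getInt(
            CommonConfigurationKeys.IPC_SERVER_RPC_READ_THREADS_KEY,
            CommonConfigurationKeys.IPC_SERVER_RPC_READ_THREADS_DEFAULT);
        int maxResponse = conf.getInt(
            CommonConfigurationKeys.IPC_SERVER_RPC_MAX_RESPONSE_SIZE_KEY,
            CommonConfigurationKeys.IPC_SERVER_RPC_MAX_RESPONSE_SIZE_DEFAULT);
        System.out.println(readThreads + " reader thread(s), "
            + maxResponse + " byte max response");
      }
    }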
 

+ 46 - 41
src/java/org/apache/hadoop/fs/DF.java

@@ -28,17 +28,17 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.CommonConfigurationKeys;
 import org.apache.hadoop.util.Shell;
 
-/** Filesystem disk space usage statistics.  Uses the unix 'df' program.
- * Tested on Linux, FreeBSD, Cygwin. */
+/** Filesystem disk space usage statistics.
+ * Uses the unix 'df' program to get mount points, and java.io.File for
+ * space utilization. Tested on Linux, FreeBSD, Cygwin. */
 public class DF extends Shell {
-  public static final long DF_INTERVAL_DEFAULT = 3 * 1000; // default DF refresh interval 
-  
-  private String dirPath;
+
+  /** Default DF refresh interval. */
+  public static final long DF_INTERVAL_DEFAULT = 3 * 1000;
+
+  private final String dirPath;
+  private final File dirFile;
   private String filesystem;
-  private long capacity;
-  private long used;
-  private long available;
-  private int percentUsed;
   private String mount;
 
   enum OSType {
@@ -79,6 +79,7 @@ public class DF extends Shell {
   public DF(File path, long dfInterval) throws IOException {
     super(dfInterval);
     this.dirPath = path.getCanonicalPath();
+    this.dirFile = new File(this.dirPath);
   }
 
   protected OSType getOSType() {
@@ -87,35 +88,40 @@ public class DF extends Shell {
   
   /// ACCESSORS
 
+  /** @return the canonical path to the volume we're checking. */
   public String getDirPath() {
     return dirPath;
   }
-  
-  public String getFilesystem() throws IOException { 
-    run(); 
-    return filesystem; 
+
+  /** @return a string indicating which filesystem volume we're checking. */
+  public String getFilesystem() throws IOException {
+    run();
+    return filesystem;
   }
-  
-  public long getCapacity() throws IOException { 
-    run(); 
-    return capacity; 
+
+  /** @return the capacity of the measured filesystem in bytes. */
+  public long getCapacity() {
+    return dirFile.getTotalSpace();
   }
-  
-  public long getUsed() throws IOException { 
-    run(); 
-    return used;
+
+  /** @return the total used space on the filesystem in bytes. */
+  public long getUsed() {
+    return dirFile.getTotalSpace() - dirFile.getFreeSpace();
   }
-  
-  public long getAvailable() throws IOException { 
-    run(); 
-    return available;
+
+  /** @return the usable space remaining on the filesystem in bytes. */
+  public long getAvailable() {
+    return dirFile.getUsableSpace();
   }
-  
-  public int getPercentUsed() throws IOException {
-    run();
-    return percentUsed;
+
+  /** @return the amount of the volume full, as a percent. */
+  public int getPercentUsed() {
+    double cap = (double) getCapacity();
+    double used = (cap - (double) getAvailable());
+    return (int) (used * 100.0 / cap);
   }
 
+  /** @return the filesystem mount point for the indicated volume */
   public String getMount() throws IOException {
     run();
     return mount;
@@ -125,10 +131,10 @@ public class DF extends Shell {
     return
       "df -k " + mount +"\n" +
       filesystem + "\t" +
-      capacity / 1024 + "\t" +
-      used / 1024 + "\t" +
-      available / 1024 + "\t" +
-      percentUsed + "%\t" +
+      getCapacity() / 1024 + "\t" +
+      getUsed() / 1024 + "\t" +
+      getAvailable() / 1024 + "\t" +
+      getPercentUsed() + "%\t" +
       mount;
   }
 
@@ -161,13 +167,12 @@ public class DF extends Shell {
 
     switch(getOSType()) {
       case OS_TYPE_AIX:
-        this.capacity = Long.parseLong(tokens.nextToken()) * 1024;
-        this.available = Long.parseLong(tokens.nextToken()) * 1024;
-        this.percentUsed = Integer.parseInt(tokens.nextToken());
+        Long.parseLong(tokens.nextToken()); // capacity
+        Long.parseLong(tokens.nextToken()); // available
+        Integer.parseInt(tokens.nextToken()); // pct used
         tokens.nextToken();
         tokens.nextToken();
         this.mount = tokens.nextToken();
-        this.used = this.capacity - this.available;
         break;
 
       case OS_TYPE_WIN:
@@ -175,10 +180,10 @@ public class DF extends Shell {
       case OS_TYPE_MAC:
       case OS_TYPE_UNIX:
       default:
-        this.capacity = Long.parseLong(tokens.nextToken()) * 1024;
-        this.used = Long.parseLong(tokens.nextToken()) * 1024;
-        this.available = Long.parseLong(tokens.nextToken()) * 1024;
-        this.percentUsed = Integer.parseInt(tokens.nextToken());
+        Long.parseLong(tokens.nextToken()); // capacity
+        Long.parseLong(tokens.nextToken()); // used
+        Long.parseLong(tokens.nextToken()); // available
+        Integer.parseInt(tokens.nextToken()); // pct used
         this.mount = tokens.nextToken();
         break;
    }
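
A minimal usage sketch (assuming a Unix-like host with a /tmp directory): after this change the capacity/used/available/percent accessors are served by java.io.File and no longer throw IOException, while getMount() still shells out to 'df':

    import java.io.File;
    import java.io.IOException;
    import org.apache.hadoop.fs.DF;

    public class DfExample {
      public static void main(String[] args) throws IOException {
        DF df = new DF(new File("/tmp"), DF.DF_INTERVAL_DEFAULT);
        System.out.println("capacity  : " + df.getCapacity());   // bytes, via java.io.File
        System.out.println("used      : " + df.getUsed());
        System.out.println("available : " + df.getAvailable());
        System.out.println("pct used  : " + df.getPercentUsed() + "%");
        System.out.println("mount     : " + df.getMount());      // still runs 'df'
      }
    }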

+ 210 - 0
src/java/org/apache/hadoop/fs/DelegateToFileSystem.java

@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.fs;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.EnumSet;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * Implementation of AbstractFileSystem based on the existing implementation of 
+ * {@link FileSystem}.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+public abstract class DelegateToFileSystem extends AbstractFileSystem {
+  protected final FileSystem fsImpl;
+  
+  protected DelegateToFileSystem(URI theUri, FileSystem theFsImpl,
+      Configuration conf, String supportedScheme, boolean authorityRequired)
+      throws IOException, URISyntaxException {
+    super(theUri, supportedScheme, authorityRequired, 
+        FileSystem.getDefaultUri(conf).getPort());
+    fsImpl = theFsImpl;
+    fsImpl.initialize(theUri, conf);
+    fsImpl.statistics = getStatistics();
+  }
+
+  @Override
+  protected Path getInitialWorkingDirectory() {
+    return fsImpl.getInitialWorkingDirectory();
+  }
+  
+  @Override
+  @SuppressWarnings("deprecation") // call to primitiveCreate
+  protected FSDataOutputStream createInternal (Path f,
+      EnumSet<CreateFlag> flag, FsPermission absolutePermission, int bufferSize,
+      short replication, long blockSize, Progressable progress,
+      int bytesPerChecksum, boolean createParent) throws IOException {
+    checkPath(f);
+    
+    // Default impl assumes that permissions do not matter
+    // calling the regular create is good enough.
+    // FSs that implement permissions should override this.
+
+    if (!createParent) { // parent must exist.
+      // since this.create makes parent dirs automatically
+      // we must throw exception if parent does not exist.
+      final FileStatus stat = getFileStatus(f.getParent());
+      if (stat == null) {
+        throw new FileNotFoundException("Missing parent:" + f);
+      }
+      if (!stat.isDir()) {
+          throw new ParentNotDirectoryException("parent is not a dir:" + f);
+      }
+      // parent does exist - go ahead with create of file.
+    }
+    return fsImpl.primitiveCreate(f, absolutePermission, flag, 
+        bufferSize, replication, blockSize, progress, bytesPerChecksum);
+  }
+
+  @Override
+  protected boolean delete(Path f, boolean recursive) throws IOException {
+    checkPath(f);
+    return fsImpl.delete(f, recursive);
+  }
+
+  @Override
+  protected BlockLocation[] getFileBlockLocations(Path f, long start, long len)
+      throws IOException {
+    checkPath(f);
+    return fsImpl.getFileBlockLocations(f, start, len);
+  }
+
+  @Override
+  protected FileChecksum getFileChecksum(Path f) throws IOException {
+    checkPath(f);
+    return fsImpl.getFileChecksum(f);
+  }
+
+  @Override
+  protected FileStatus getFileStatus(Path f) throws IOException {
+    checkPath(f);
+    return fsImpl.getFileStatus(f);
+  }
+
+  @Override
+  protected FileStatus getFileLinkStatus(final Path f) throws IOException {
+    return getFileStatus(f);
+  }
+
+  @Override
+  protected FsStatus getFsStatus() throws IOException {
+    return fsImpl.getStatus();
+  }
+
+  @Override
+  protected FsServerDefaults getServerDefaults() throws IOException {
+    return fsImpl.getServerDefaults();
+  }
+
+  @Override
+  protected int getUriDefaultPort() {
+    return 0;
+  }
+
+  @Override
+  protected FileStatus[] listStatus(Path f) throws IOException {
+    checkPath(f);
+    return fsImpl.listStatus(f);
+  }
+
+  @Override
+  @SuppressWarnings("deprecation") // call to primitiveMkdir
+  protected void mkdir(Path dir, FsPermission permission, boolean createParent)
+      throws IOException {
+    checkPath(dir);
+    fsImpl.primitiveMkdir(dir, permission, createParent);
+    
+  }
+
+  @Override
+  protected FSDataInputStream open(Path f, int bufferSize) throws IOException {
+    checkPath(f);
+    return fsImpl.open(f, bufferSize);
+  }
+
+  @Override
+  @SuppressWarnings("deprecation") // call to rename
+  protected void renameInternal(Path src, Path dst) throws IOException {
+    checkPath(src);
+    checkPath(dst);
+    fsImpl.rename(src, dst, Options.Rename.NONE);
+  }
+
+  @Override
+  protected void setOwner(Path f, String username, String groupname)
+      throws IOException {
+    checkPath(f);
+    fsImpl.setOwner(f, username, groupname);
+  }
+
+  @Override
+  protected void setPermission(Path f, FsPermission permission)
+      throws IOException {
+    checkPath(f);
+    fsImpl.setPermission(f, permission);
+  }
+
+  @Override
+  protected boolean setReplication(Path f, short replication)
+      throws IOException {
+    checkPath(f);
+    return fsImpl.setReplication(f, replication);
+  }
+
+  @Override
+  protected void setTimes(Path f, long mtime, long atime) throws IOException {
+    checkPath(f);
+    fsImpl.setTimes(f, mtime, atime);
+  }
+
+  @Override
+  protected void setVerifyChecksum(boolean verifyChecksum) throws IOException {
+    fsImpl.setVerifyChecksum(verifyChecksum);
+  }
+
+  @Override
+  protected boolean supportsSymlinks() {
+    return false;
+  }  
+  
+  @Override
+  protected void createSymlink(Path target, Path link, boolean createParent) 
+      throws IOException { 
+    throw new IOException("File system does not support symlinks");
+  } 
+  
+  @Override
+  protected Path getLinkTarget(final Path f) throws IOException {
+    /* We should never get here. Any file system that threw an 
+     * UnresolvedLinkException, causing this function to be called,
+     * should override getLinkTarget. 
+     */
+    throw new AssertionError();
+  }
+}
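
To illustrate the intended use (a hypothetical adapter, not part of this patch), a concrete AbstractFileSystem can be built by handing an existing FileSystem implementation to this class; assuming DelegateToFileSystem covers all remaining abstract methods, as the overrides above suggest, the subclass only needs a constructor:

    import java.io.IOException;
    import java.net.URI;
    import java.net.URISyntaxException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.DelegateToFileSystem;
    import org.apache.hadoop.fs.RawLocalFileSystem;

    /** Hypothetical adapter exposing RawLocalFileSystem through the new API. */
    class LocalDelegateFs extends DelegateToFileSystem {
      LocalDelegateFs(URI uri, Configuration conf)
          throws IOException, URISyntaxException {
        // "file" scheme, no authority required.
        super(uri, new RawLocalFileSystem(), conf, "file", false);
      }
    }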

+ 90 - 39
src/java/org/apache/hadoop/fs/FSInputChecker.java

@@ -24,6 +24,8 @@ import java.util.zip.Checksum;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.util.StringUtils;
+import java.nio.ByteBuffer;
+import java.nio.IntBuffer;
 
 /**
  * This is a generic input stream for verifying checksums for
@@ -38,16 +40,26 @@ abstract public class FSInputChecker extends FSInputStream {
   protected Path file;
   private Checksum sum;
   private boolean verifyChecksum = true;
-  private byte[] buf;
+  private int maxChunkSize; // data bytes for checksum (eg 512)
+  private byte[] buf; // buffer for non-chunk-aligned reading
   private byte[] checksum;
-  private int pos;
-  private int count;
+  private IntBuffer checksumInts; // wrapper on checksum buffer
+  private int pos; // the position of the reader inside buf
+  private int count; // the number of bytes currently in buf
   
   private int numOfRetries;
   
   // cached file position
+  // this should always be a multiple of maxChunkSize
   private long chunkPos = 0;
-  
+
+  // Number of checksum chunks that can be read at once into a user
+  // buffer. Chosen by benchmarks - higher values do not reduce
+  // CPU usage. The size of the data reads made to the underlying stream
+  // will be CHUNKS_PER_READ * maxChunkSize.
+  private static final int CHUNKS_PER_READ = 32;
+  protected static final int CHECKSUM_SIZE = 4; // 32-bit checksum
+
   /** Constructor
    * 
    * @param file The name of the file to be read
@@ -72,14 +84,34 @@ abstract public class FSInputChecker extends FSInputStream {
     set(verifyChecksum, sum, chunkSize, checksumSize);
   }
   
-  /** Reads in next checksum chunk data into <code>buf</code> at <code>offset</code>
+  /**
+   * Reads in checksum chunks into <code>buf</code> at <code>offset</code>
    * and checksum into <code>checksum</code>.
+   * Since checksums can be disabled, there are two cases implementors need
+   * to worry about:
+   *
+   *  (a) needChecksum() will return false:
+   *     - len can be any positive value
+   *     - checksum will be null
+   *     Implementors should simply pass through to the underlying data stream.
+   * or
+   *  (b) needChecksum() will return true:
+   *    - len >= maxChunkSize
+   *    - checksum.length is a multiple of CHECKSUM_SIZE
+   *    Implementors should read an integer number of data chunks into
+   *    buf. The amount read should be bounded by len or by 
+   *    checksum.length / CHECKSUM_SIZE * maxChunkSize. Note that len may
+   *    be a value that is not a multiple of maxChunkSize, in which case
+   *    the implementation may return less than len.
+   *
    * The method is used for implementing read, therefore, it should be optimized
-   * for sequential reading
+   * for sequential reading.
+   *
    * @param pos chunkPos
+   * @param buf destination buffer
    * @param offset offset in buf at which to store data
-   * @param len maximun number of bytes to read
+   * @param len maximum number of bytes to read
+   * @param checksum the data buffer into which to write checksums
    * @return number of bytes read
    */
   abstract protected int readChunk(long pos, byte[] buf, int offset, int len,
@@ -96,7 +128,7 @@ abstract public class FSInputChecker extends FSInputStream {
   protected synchronized boolean needChecksum() {
     return verifyChecksum && sum != null;
   }
-  
+
   /**
    * Read one checksum-verified byte
    * 
@@ -173,7 +205,7 @@ abstract public class FSInputChecker extends FSInputStream {
   private void fill(  ) throws IOException {
     assert(pos>=count);
     // fill internal buffer
-    count = readChecksumChunk(buf, 0, buf.length);
+    count = readChecksumChunk(buf, 0, maxChunkSize);
     if (count < 0) count = 0;
   }
   
@@ -185,13 +217,13 @@ abstract public class FSInputChecker extends FSInputStream {
   throws IOException {
     int avail = count-pos;
     if( avail <= 0 ) {
-      if(len>=buf.length) {
+      if(len >= maxChunkSize) {
         // read a chunk to user buffer directly; avoid one copy
         int nread = readChecksumChunk(b, off, len);
         return nread;
       } else {
         // read a chunk into the local buffer
-        fill();
+         fill();
         if( count <= 0 ) {
           return -1;
         } else {
@@ -207,10 +239,10 @@ abstract public class FSInputChecker extends FSInputStream {
     return cnt;    
   }
   
-  /* Read up one checksum chunk to array <i>b</i> at pos <i>off</i>
-   * It requires a checksum chunk boundary
+  /* Read one or more checksum chunks into array <i>b</i> at pos <i>off</i>
+   * It requires at least one checksum chunk boundary
    * in between <cur_pos, cur_pos+len> 
-   * and it stops reading at the boundary or at the end of the stream;
+   * and it stops reading at the last boundary or at the end of the stream;
    * Otherwise an IllegalArgumentException is thrown.
    * This makes sure that all data read are checksum verified.
    * 
@@ -223,7 +255,7 @@ abstract public class FSInputChecker extends FSInputStream {
    *            the stream has been reached.
    * @throws IOException if an I/O error occurs.
    */ 
-  private int readChecksumChunk(byte b[], int off, int len)
+  private int readChecksumChunk(byte b[], final int off, final int len)
   throws IOException {
     // invalidate buffer
     count = pos = 0;
@@ -236,13 +268,12 @@ abstract public class FSInputChecker extends FSInputStream {
 
       try {
         read = readChunk(chunkPos, b, off, len, checksum);
-        if( read > 0 ) {
+        if( read > 0) {
           if( needChecksum() ) {
-            sum.update(b, off, read);
-            verifySum(chunkPos);
+            verifySums(b, off, read);
           }
           chunkPos += read;
-        } 
+        }
         retry = false;
       } catch (ChecksumException ce) {
           LOG.info("Found checksum error: b[" + off + ", " + (off+read) + "]="
@@ -266,26 +297,38 @@ abstract public class FSInputChecker extends FSInputStream {
     } while (retry);
     return read;
   }
-  
-  /* verify checksum for the chunk.
-   * @throws ChecksumException if there is a mismatch
-   */
-  private void verifySum(long errPos) throws ChecksumException {
-    long crc = getChecksum();
-    long sumValue = sum.getValue();
-    sum.reset();
-    if (crc != sumValue) {
-      throw new ChecksumException(
-          "Checksum error: "+file+" at "+errPos, errPos);
+
+  private void verifySums(final byte b[], final int off, int read)
+    throws ChecksumException
+  {
+    int leftToVerify = read;
+    int verifyOff = 0;
+    checksumInts.rewind();
+    checksumInts.limit((read - 1)/maxChunkSize + 1);
+
+    while (leftToVerify > 0) {
+      sum.update(b, off + verifyOff, Math.min(leftToVerify, maxChunkSize));
+      int expected = checksumInts.get();
+      int calculated = (int)sum.getValue();
+      sum.reset();
+
+      if (expected != calculated) {
+        long errPos = chunkPos + verifyOff;
+        throw new ChecksumException(
+          "Checksum error: "+file+" at "+ errPos +
+          " exp: " + expected + " got: " + calculated, errPos);
+      }
+      leftToVerify -= maxChunkSize;
+      verifyOff += maxChunkSize;
     }
   }
-  
-  /* calculate checksum value */
-  private long getChecksum() {
-    return checksum2long(checksum);
-  }
 
-  /** Convert a checksum byte array to a long */
+  /**
+   * Convert a checksum byte array to a long.
+   * Deprecated as of 0.22 because it is no longer used
+   * by this class.
+   */
+  @Deprecated
   static public long checksum2long(byte[] checksum) {
     long crc = 0L;
     for(int i=0; i<checksum.length; i++) {
@@ -293,7 +336,7 @@ abstract public class FSInputChecker extends FSInputStream {
     }
     return crc;
   }
-  
+
   @Override
   public synchronized long getPos() throws IOException {
     return chunkPos-Math.max(0L, count - pos);
@@ -399,11 +442,19 @@ abstract public class FSInputChecker extends FSInputStream {
    * @param checksumSize checksum size
    */
   final protected synchronized void set(boolean verifyChecksum,
-      Checksum sum, int maxChunkSize, int checksumSize ) {
+      Checksum sum, int maxChunkSize, int checksumSize) {
+
+    // The code makes assumptions that checksums are always 32-bit.
+    assert !verifyChecksum || sum == null || checksumSize == CHECKSUM_SIZE;
+
+    this.maxChunkSize = maxChunkSize;
     this.verifyChecksum = verifyChecksum;
     this.sum = sum;
     this.buf = new byte[maxChunkSize];
-    this.checksum = new byte[checksumSize];
+    // The size of the checksum array here determines how much we can
+    // read in a single call to readChunk
+    this.checksum = new byte[CHUNKS_PER_READ * checksumSize];
+    this.checksumInts = ByteBuffer.wrap(checksum).asIntBuffer();
     this.count = 0;
     this.pos = 0;
   }
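
For intuition about the verifySums() loop above, a standalone sketch using only the JDK (not Hadoop code): 32-bit CRCs are packed back-to-back in a byte array, viewed through an IntBuffer, and each one is compared against a CRC computed over the corresponding maxChunkSize slice of the data, with the final partial chunk handled the same way:

    import java.nio.ByteBuffer;
    import java.nio.IntBuffer;
    import java.util.zip.CRC32;

    public class ChunkVerifyExample {
      public static void main(String[] args) {
        final int maxChunkSize = 512;
        byte[] data = new byte[3 * maxChunkSize + 100];  // deliberately not chunk-aligned
        for (int i = 0; i < data.length; i++) {
          data[i] = (byte) i;
        }

        // Writer side: one 32-bit CRC per data chunk, packed back-to-back.
        int chunks = (data.length - 1) / maxChunkSize + 1;
        ByteBuffer sums = ByteBuffer.allocate(4 * chunks);
        CRC32 crc = new CRC32();
        for (int off = 0; off < data.length; off += maxChunkSize) {
          crc.reset();
          crc.update(data, off, Math.min(maxChunkSize, data.length - off));
          sums.putInt((int) crc.getValue());
        }

        // Reader side: the same walk verifySums() performs over its IntBuffer view.
        IntBuffer checksumInts = ByteBuffer.wrap(sums.array()).asIntBuffer();
        for (int off = 0; off < data.length; off += maxChunkSize) {
          crc.reset();
          crc.update(data, off, Math.min(maxChunkSize, data.length - off));
          int expected = checksumInts.get();
          if (expected != (int) crc.getValue()) {
            throw new IllegalStateException("checksum error at offset " + off);
          }
        }
        System.out.println("verified " + chunks + " chunks");
      }
    }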

The diff for this file is not shown because the file is too large.
+ 498 - 261
src/java/org/apache/hadoop/fs/FileContext.java


+ 53 - 2
src/java/org/apache/hadoop/fs/FileStatus.java

@@ -39,6 +39,7 @@ public class FileStatus implements Writable, Comparable {
   private FsPermission permission;
   private String owner;
   private String group;
+  private Path symlink;
   
   public FileStatus() { this(0, false, 0, 0, 0, 0, null, null, null, null); }
   
@@ -49,11 +50,25 @@ public class FileStatus implements Writable, Comparable {
     this(length, isdir, block_replication, blocksize, modification_time,
          0, null, null, null, path);
   }
-  
-  public FileStatus(long length, boolean isdir, int block_replication,
+
+  /**
+   * Constructor for file systems on which symbolic links are not supported
+   */
+  public FileStatus(long length, boolean isdir,
+                    int block_replication,
                     long blocksize, long modification_time, long access_time,
                     FsPermission permission, String owner, String group, 
                     Path path) {
+    this(length, isdir, block_replication, blocksize, modification_time,
+         access_time, permission, owner, group, null, path);
+  }
+
+  public FileStatus(long length, boolean isdir,
+                    int block_replication,
+                    long blocksize, long modification_time, long access_time,
+                    FsPermission permission, String owner, String group, 
+                    Path symlink,
+                    Path path) {
     this.length = length;
     this.isdir = isdir;
     this.block_replication = (short)block_replication;
@@ -64,6 +79,7 @@ public class FileStatus implements Writable, Comparable {
                       FsPermission.getDefault() : permission;
     this.owner = (owner == null) ? "" : owner;
     this.group = (group == null) ? "" : group;
+    this.symlink = symlink;
     this.path = path;
   }
 
@@ -148,6 +164,10 @@ public class FileStatus implements Writable, Comparable {
   public Path getPath() {
     return path;
   }
+  
+  public void setPath(final Path p) {
+    path = p;
+  }
 
   /* These are provided so that these values could be loaded lazily 
    * by a filesystem (e.g. local file system).
@@ -178,6 +198,28 @@ public class FileStatus implements Writable, Comparable {
     this.group = (group == null) ? "" :  group;
   }
 
+  /**
+   * Is this a symbolic link?
+   * @return true if this is a symbolic link
+   */
+  public boolean isSymlink() {
+    return symlink != null;
+  }
+
+  /**
+   * @return The contents of the symbolic link.
+   */
+  public Path getSymlink() throws IOException {
+    if (!isSymlink()) {
+      throw new IOException("Path " + path + " is not a symbolic link");
+    }
+    return symlink;
+  }
+
+  public void setSymlink(final Path p) {
+    symlink = p;
+  }
+  
   //////////////////////////////////////////////////
   // Writable
   //////////////////////////////////////////////////
@@ -192,6 +234,10 @@ public class FileStatus implements Writable, Comparable {
     permission.write(out);
     Text.writeString(out, owner);
     Text.writeString(out, group);
+    out.writeBoolean(isSymlink());
+    if (isSymlink()) {
+      Text.writeString(out, symlink.toString());
+    }
   }
 
   public void readFields(DataInput in) throws IOException {
@@ -206,6 +252,11 @@ public class FileStatus implements Writable, Comparable {
     permission.readFields(in);
     owner = Text.readString(in);
     group = Text.readString(in);
+    if (in.readBoolean()) {
+      this.symlink = new Path(Text.readString(in));
+    } else {
+      this.symlink = null;
+    }
   }
 
   /**
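
A small sketch (hypothetical paths, not part of this patch) of how callers consume the new symlink fields; note that getSymlink() throws if the status does not describe a link:

    import java.io.IOException;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.Path;

    public class SymlinkStatusExample {
      static void describe(FileStatus st) throws IOException {
        if (st.isSymlink()) {
          System.out.println(st.getPath() + " -> " + st.getSymlink());
        } else {
          System.out.println(st.getPath() + " (" + st.getLen() + " bytes)");
        }
      }

      public static void main(String[] args) throws IOException {
        // Uses the new constructor that carries a symlink target.
        FileStatus link = new FileStatus(0, false, 1, 64 * 1024 * 1024, 0L, 0L,
            null, null, null, new Path("/target"), new Path("/link"));
        describe(link);   // prints: /link -> /target
      }
    }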

+ 77 - 25
src/java/org/apache/hadoop/fs/FileSystem.java

@@ -35,8 +35,6 @@ import java.util.TreeSet;
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.regex.Pattern;
 
-import javax.security.auth.login.LoginException;
-
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -139,6 +137,17 @@ public abstract class FileSystem extends Configured implements Closeable {
   /** Returns a URI whose scheme and authority identify this FileSystem.*/
   public abstract URI getUri();
   
+  /** @deprecated call #getUri() instead.*/
+  @Deprecated
+  public String getName() { return getUri().toString(); }
+
+  /** @deprecated call #get(URI,Configuration) instead. */
+  @Deprecated
+  public static FileSystem getNamed(String name, Configuration conf)
+    throws IOException {
+    return get(URI.create(fixName(name)), conf);
+  }
+  
   /** Update old-format filesystem names, for back-compatibility.  This should
    * eventually be replaced with a checkName() method that throws an exception
    * for old-format names. */ 
@@ -815,6 +824,19 @@ public abstract class FileSystem extends Configured implements Closeable {
   public abstract FSDataOutputStream append(Path f, int bufferSize,
       Progressable progress) throws IOException;
 
+  /**
+   * Get replication.
+   * 
+   * @deprecated Use getFileStatus() instead
+   * @param src file name
+   * @return file replication
+   * @throws IOException
+   */ 
+  @Deprecated
+  public short getReplication(Path src) throws IOException {
+    return getFileStatus(src).getReplication();
+  }
+
   /**
    * Set replication for an existing file.
    * 
@@ -924,6 +946,15 @@ public abstract class FileSystem extends Configured implements Closeable {
     }
   }
   
+  /**
+   * Delete a file 
+   * @deprecated Use {@link #delete(Path, boolean)} instead.
+   */
+  @Deprecated
+  public boolean delete(Path f) throws IOException {
+    return delete(f, true);
+  }
+  
   /** Delete a file.
    *
    * @param f the path to delete.
@@ -1011,6 +1042,13 @@ public abstract class FileSystem extends Configured implements Closeable {
       return false;               // f does not exist
     }
   }
+  
+  /** The number of bytes in a file.
+   * @deprecated Use getFileStatus() instead */
+  @Deprecated
+  public long getLength(Path f) throws IOException {
+    return getFileStatus(f).getLen();
+  }
     
   /** Return the {@link ContentSummary} of a given {@link Path}. */
   public ContentSummary getContentSummary(Path f) throws IOException {
@@ -1318,9 +1356,6 @@ public abstract class FileSystem extends Configured implements Closeable {
     /** Default pattern character: Character set close. */
     private static final char  PAT_SET_CLOSE = ']';
       
-    GlobFilter() {
-    }
-      
     GlobFilter(String filePattern) throws IOException {
       setRegex(filePattern);
     }
@@ -1613,6 +1648,17 @@ public abstract class FileSystem extends Configured implements Closeable {
     }
     return used;
   }
+  
+  /**
+   * Get the block size for a particular file.
+   * @param f the filename
+   * @return the number of bytes in a block
+   */
+  /** @deprecated Use getFileStatus() instead */
+  @Deprecated
+  public long getBlockSize(Path f) throws IOException {
+    return getFileStatus(f).getBlockSize();
+  }
 
   /** Return the number of bytes that large input files should be optimally
    * be split into to minimize i/o time. */
@@ -1763,32 +1809,45 @@ public abstract class FileSystem extends Configured implements Closeable {
     /** A variable that makes all objects in the cache unique */
     private static AtomicLong unique = new AtomicLong(1);
 
-    synchronized FileSystem get(URI uri, Configuration conf) throws IOException{
+    FileSystem get(URI uri, Configuration conf) throws IOException{
       Key key = new Key(uri, conf);
       return getInternal(uri, conf, key);
     }
 
     /** The objects inserted into the cache using this method are all unique */
-    synchronized FileSystem getUnique(URI uri, Configuration conf) throws IOException{
+    FileSystem getUnique(URI uri, Configuration conf) throws IOException{
       Key key = new Key(uri, conf, unique.getAndIncrement());
       return getInternal(uri, conf, key);
     }
 
     private FileSystem getInternal(URI uri, Configuration conf, Key key) throws IOException{
-      FileSystem fs = map.get(key);
-      if (fs == null) {
-        fs = createFileSystem(uri, conf);
+      FileSystem fs;
+      synchronized (this) {
+        fs = map.get(key);
+      }
+      if (fs != null) {
+        return fs;
+      }
+
+      fs = createFileSystem(uri, conf);
+      synchronized (this) { // refetch the lock again
+        FileSystem oldfs = map.get(key);
+        if (oldfs != null) { // a file system is created while lock is releasing
+          fs.close(); // close the new file system
+          return oldfs;  // return the old file system
+        }
+        
+        // now insert the new file system into the map
         if (map.isEmpty() && !clientFinalizer.isAlive()) {
           Runtime.getRuntime().addShutdownHook(clientFinalizer);
         }
         fs.key = key;
         map.put(key, fs);
-
         if (conf.getBoolean("fs.automatic.close", true)) {
           toAutoClose.add(key);
         }
+        return fs;
       }
-      return fs;
     }
 
     synchronized void remove(Key key, FileSystem fs) {
@@ -1859,7 +1918,7 @@ public abstract class FileSystem extends Configured implements Closeable {
     static class Key {
       final String scheme;
       final String authority;
-      final String username;
+      final UserGroupInformation ugi;
       final long unique;   // an artificial way to make a key unique
 
       Key(URI uri, Configuration conf) throws IOException {
@@ -1870,20 +1929,13 @@ public abstract class FileSystem extends Configured implements Closeable {
         scheme = uri.getScheme()==null?"":uri.getScheme().toLowerCase();
         authority = uri.getAuthority()==null?"":uri.getAuthority().toLowerCase();
         this.unique = unique;
-        UserGroupInformation ugi = UserGroupInformation.readFrom(conf);
-        if (ugi == null) {
-          try {
-            ugi = UserGroupInformation.login(conf);
-          } catch(LoginException e) {
-            LOG.warn("uri=" + uri, e);
-          }
-        }
-        username = ugi == null? null: ugi.getUserName();
+        
+        this.ugi = UserGroupInformation.getCurrentUser();
       }
 
       /** {@inheritDoc} */
       public int hashCode() {
-        return (scheme + authority + username).hashCode() + (int)unique;
+        return (scheme + authority).hashCode() + ugi.hashCode() + (int)unique;
       }
 
       static boolean isEqual(Object a, Object b) {
@@ -1899,7 +1951,7 @@ public abstract class FileSystem extends Configured implements Closeable {
           Key that = (Key)obj;
           return isEqual(this.scheme, that.scheme)
                  && isEqual(this.authority, that.authority)
-                 && isEqual(this.username, that.username)
+                 && isEqual(this.ugi, that.ugi)
                  && (this.unique == that.unique);
         }
         return false;        
@@ -1907,7 +1959,7 @@ public abstract class FileSystem extends Configured implements Closeable {
 
       /** {@inheritDoc} */
       public String toString() {
-        return username + "@" + scheme + "://" + authority;        
+        return "("+ugi.toString() + ")@" + scheme + "://" + authority;        
       }
     }
   }
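
As the deprecation notes above suggest, callers should move to getFileStatus(); a minimal sketch against the local file system (the path is hypothetical and assumed to exist) showing the replacements for getLength(), getBlockSize() and getReplication():

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class FileStatusInsteadOfDeprecated {
      public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path p = new Path("/tmp/example.txt");       // hypothetical, must exist
        FileStatus st = fs.getFileStatus(p);         // one stat call instead of three
        long length       = st.getLen();             // instead of fs.getLength(p)
        long blockSize    = st.getBlockSize();       // instead of fs.getBlockSize(p)
        short replication = st.getReplication();     // instead of fs.getReplication(p)
        System.out.println(length + " " + blockSize + " " + replication);
      }
    }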

+ 43 - 20
src/java/org/apache/hadoop/fs/FileUtil.java

@@ -27,12 +27,17 @@ import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.util.Shell.ShellCommandExecutor;
-import org.mortbay.log.Log;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 
 /**
  * A collection of file-processing util methods
  */
 public class FileUtil {
+
+  private static final Log LOG = LogFactory.getLog(FileUtil.class);
+
   /**
    * convert an array of FileStatus to an array of Path
    * 
@@ -71,6 +76,17 @@ public class FileUtil {
    * we return false, the directory may be partially-deleted.
    */
   public static boolean fullyDelete(File dir) throws IOException {
+    if (!fullyDeleteContents(dir)) {
+      return false;
+    }
+    return dir.delete();
+  }
+
+  /**
+   * Delete the contents of a directory, not the directory itself.  If
+   * we return false, the directory may be partially-deleted.
+   */
+  public static boolean fullyDeleteContents(File dir) throws IOException {
     File contents[] = dir.listFiles();
     if (contents != null) {
       for (int i = 0; i < contents.length; i++) {
@@ -95,7 +111,7 @@ public class FileUtil {
         }
       }
     }
-    return dir.delete();
+    return true;
   }
 
   /**
@@ -150,7 +166,7 @@ public class FileUtil {
                              throws IOException {
     boolean gotException = false;
     boolean returnVal = true;
-    StringBuffer exceptions = new StringBuffer();
+    StringBuilder exceptions = new StringBuilder();
 
     if (srcs.length == 1)
       return copy(srcFS, srcs[0], dstFS, dst, deleteSource, overwrite, conf);
@@ -513,7 +529,7 @@ public class FileUtil {
       }
     }
 
-    StringBuffer untarCommand = new StringBuffer();
+    StringBuilder untarCommand = new StringBuilder();
     boolean gzipped = inFile.toString().endsWith("gz");
     if (gzipped) {
       untarCommand.append(" gzip -dc '");
@@ -628,14 +644,18 @@ public class FileUtil {
      * Retrieves the number of links to the specified file.
      */
     public static int getLinkCount(File fileName) throws IOException {
+      if (!fileName.exists()) {
+        throw new FileNotFoundException(fileName + " not found.");
+      }
+
       int len = getLinkCountCommand.length;
       String[] cmd = new String[len + 1];
       for (int i = 0; i < len; i++) {
         cmd[i] = getLinkCountCommand[i];
       }
       cmd[len] = fileName.toString();
-      String inpMsg = "";
-      String errMsg = "";
+      String inpMsg = null;
+      String errMsg = null;
       int exitValue = -1;
       BufferedReader in = null;
       BufferedReader err = null;
@@ -647,14 +667,11 @@ public class FileUtil {
         in = new BufferedReader(new InputStreamReader(
                                     process.getInputStream()));
         inpMsg = in.readLine();
-        if (inpMsg == null)  inpMsg = "";
-        
         err = new BufferedReader(new InputStreamReader(
                                      process.getErrorStream()));
         errMsg = err.readLine();
-        if (errMsg == null)  errMsg = "";
-        if (exitValue != 0) {
-          throw new IOException(inpMsg + errMsg);
+        if (inpMsg == null || exitValue != 0) {
+          throw createIOException(fileName, inpMsg, errMsg, exitValue, null);
         }
         if (getOSType() == OSType.OS_TYPE_SOLARIS) {
           String[] result = inpMsg.split("\\s+");
@@ -663,13 +680,9 @@ public class FileUtil {
           return Integer.parseInt(inpMsg);
         }
       } catch (NumberFormatException e) {
-        throw new IOException(StringUtils.stringifyException(e) + 
-                              inpMsg + errMsg +
-                              " on file:" + fileName);
+        throw createIOException(fileName, inpMsg, errMsg, exitValue, e);
       } catch (InterruptedException e) {
-        throw new IOException(StringUtils.stringifyException(e) + 
-                              inpMsg + errMsg +
-                              " on file:" + fileName);
+        throw createIOException(fileName, inpMsg, errMsg, exitValue, e);
       } finally {
         process.destroy();
         if (in != null) in.close();
@@ -678,6 +691,16 @@ public class FileUtil {
     }
   }
 
+  /** Create an IOException for failing to get link count. */
+  static private IOException createIOException(File f, String message,
+      String error, int exitvalue, Exception cause) {
+    final String s = "Failed to get link count on file " + f
+        + ": message=" + message
+        + "; error=" + error
+        + "; exit value=" + exitvalue;
+    return cause == null? new IOException(s): new IOException(s, cause);
+  }
+
   /**
    * Create a soft link between a src and destination
    * only on a local disk. HDFS does not support this
@@ -722,7 +745,7 @@ public class FileUtil {
    */
   public static int chmod(String filename, String perm, boolean recursive)
                             throws IOException, InterruptedException {
-    StringBuffer cmdBuf = new StringBuffer();
+    StringBuilder cmdBuf = new StringBuilder();
     cmdBuf.append("chmod ");
     if (recursive) {
       cmdBuf.append("-R ");
@@ -734,8 +757,8 @@ public class FileUtil {
     try {
       shExec.execute();
     }catch(Exception e) {
-      if(Log.isDebugEnabled()) {
-        Log.debug("Error while changing permission : " + filename 
+      if(LOG.isDebugEnabled()) {
+        LOG.debug("Error while changing permission : " + filename 
             +" Exception: " + StringUtils.stringifyException(e));
       }
     }
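
A quick sketch (using the JVM temp directory) of the difference between the existing fullyDelete() and the new fullyDeleteContents(): the latter empties the directory but leaves the directory itself in place:

    import java.io.File;
    import java.io.IOException;
    import org.apache.hadoop.fs.FileUtil;

    public class FullyDeleteContentsExample {
      public static void main(String[] args) throws IOException {
        File dir = new File(System.getProperty("java.io.tmpdir"), "fdc-example");
        new File(dir, "child").mkdirs();
        new File(dir, "data.txt").createNewFile();

        FileUtil.fullyDeleteContents(dir);   // children removed, dir remains
        System.out.println(dir.exists() + " " + (dir.listFiles().length == 0));

        FileUtil.fullyDelete(dir);           // now the directory itself is removed
        System.out.println(dir.exists());
      }
    }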

+ 51 - 0
src/java/org/apache/hadoop/fs/FilterFileSystem.java

@@ -139,6 +139,23 @@ public class FilterFileSystem extends FileSystem {
     return fs.delete(f, recursive);
   }
   
+  /**
+   * Mark a path to be deleted when FileSystem is closed.
+   * When the JVM shuts down,
+   * all FileSystem objects will be closed automatically.
+   * Then,
+   * the marked path will be deleted as a result of closing the FileSystem.
+   *
+   * The path has to exist in the file system.
+   * 
+   * @param f the path to delete.
+   * @return  true if deleteOnExit is successful, otherwise false.
+   * @throws IOException
+   */
+  public boolean deleteOnExit(Path f) throws IOException {
+    return fs.deleteOnExit(f);
+  }    
+
   /** List files in a directory. */
   public FileStatus[] listStatus(Path f) throws IOException {
     return fs.listStatus(f);
@@ -194,6 +211,28 @@ public class FilterFileSystem extends FileSystem {
     fs.copyFromLocalFile(delSrc, src, dst);
   }
   
+  /**
+   * The src files are on the local disk.  Add it to FS at
+   * the given dst name.
+   * delSrc indicates if the source should be removed
+   */
+  public void copyFromLocalFile(boolean delSrc, boolean overwrite, 
+                                Path[] srcs, Path dst)
+    throws IOException {
+    fs.copyFromLocalFile(delSrc, overwrite, srcs, dst);
+  }
+  
+  /**
+   * The src file is on the local disk.  Add it to FS at
+   * the given dst name.
+   * delSrc indicates if the source should be removed
+   */
+  public void copyFromLocalFile(boolean delSrc, boolean overwrite, 
+                                Path src, Path dst)
+    throws IOException {
+    fs.copyFromLocalFile(delSrc, overwrite, src, dst);
+  }
+
   /**
    * The src file is under FS, and the dst is on the local disk.
    * Copy it from FS control to the local dst name.
@@ -226,6 +265,11 @@ public class FilterFileSystem extends FileSystem {
     fs.completeLocalOutput(fsOutputFile, tmpLocalFile);
   }
 
+  /** Return the total size of all files in the filesystem.*/
+  public long getUsed() throws IOException{
+    return fs.getUsed();
+  }
+  
   /** Return the number of bytes that large input files should be optimally
    * be split into to minimize i/o time. */
   public long getDefaultBlockSize() {
@@ -274,6 +318,13 @@ public class FilterFileSystem extends FileSystem {
     fs.setOwner(p, username, groupname);
   }
 
+  /** {@inheritDoc} */
+  @Override
+  public void setTimes(Path p, long mtime, long atime
+      ) throws IOException {
+    fs.setTimes(p, mtime, atime);
+  }
+
   /** {@inheritDoc} */
   @Override
   public void setPermission(Path p, FsPermission permission
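
These delegating overrides exist so that wrappers keep forwarding to the contained file system instead of falling back to FileSystem's generic defaults; a hypothetical subclass (not part of this patch) only overrides what it wants to change:

    import java.io.IOException;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.FilterFileSystem;
    import org.apache.hadoop.fs.Path;

    /** Hypothetical wrapper that logs deletes and forwards everything else. */
    class LoggingFileSystem extends FilterFileSystem {
      LoggingFileSystem(FileSystem fs) {
        super(fs);
      }

      @Override
      public boolean delete(Path f, boolean recursive) throws IOException {
        System.err.println("deleting " + f + " (recursive=" + recursive + ")");
        return super.delete(f, recursive);   // FilterFileSystem forwards to the wrapped fs
      }
    }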

+ 250 - 0
src/java/org/apache/hadoop/fs/FilterFs.java

@@ -0,0 +1,250 @@
+package org.apache.hadoop.fs;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.EnumSet;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.FileSystem.Statistics;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * A <code>FilterFs</code> contains some other file system, which it uses as its
+ * basic file system, possibly transforming the data along the way or providing
+ * additional functionality. The class <code>FilterFs</code> itself simply
+ * overrides all methods of <code>AbstractFileSystem</code> with versions that
+ * pass all requests to the contained file system. Subclasses of
+ * <code>FilterFs</code> may further override some of these methods and may also
+ * provide additional methods and fields.
+ * 
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Evolving /* Evolving for a release, to be changed to Stable */
+public abstract class FilterFs extends AbstractFileSystem {
+  private final AbstractFileSystem myFs;
+  
+  protected AbstractFileSystem getMyFs() {
+    return myFs;
+  }
+  
+  protected FilterFs(AbstractFileSystem fs) throws IOException,
+      URISyntaxException {
+    super(fs.getUri(), fs.getUri().getScheme(),
+        fs.getUri().getAuthority() != null, fs.getUriDefaultPort());
+    myFs = fs;
+  }
+
+  @Override
+  protected Statistics getStatistics() {
+    return myFs.getStatistics();
+  }
+
+  @Override
+  protected Path getInitialWorkingDirectory() {
+    return myFs.getInitialWorkingDirectory();
+  }
+  
+  @Override
+  protected Path getHomeDirectory() {
+    return myFs.getHomeDirectory();
+  }
+  
+  @Override
+  protected FSDataOutputStream createInternal(Path f,
+    EnumSet<CreateFlag> flag, FsPermission absolutePermission, int bufferSize,
+    short replication, long blockSize, Progressable progress,
+    int bytesPerChecksum, boolean createParent) 
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.createInternal(f, flag, absolutePermission, bufferSize,
+        replication, blockSize, progress, bytesPerChecksum, createParent);
+  }
+
+  @Override
+  protected boolean delete(Path f, boolean recursive) 
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.delete(f, recursive);
+  }
+
+  @Override
+  protected BlockLocation[] getFileBlockLocations(Path f, long start, long len)
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.getFileBlockLocations(f, start, len);
+  }
+
+  @Override
+  protected FileChecksum getFileChecksum(Path f) 
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.getFileChecksum(f);
+  }
+
+  @Override
+  protected FileStatus getFileStatus(Path f) 
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.getFileStatus(f);
+  }
+
+  @Override
+  protected FileStatus getFileLinkStatus(final Path f) 
+    throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.getFileLinkStatus(f);
+  }
+  
+  @Override
+  protected FsStatus getFsStatus(final Path f) throws AccessControlException,
+    FileNotFoundException, UnresolvedLinkException, IOException {
+    return myFs.getFsStatus(f);
+  }
+
+  @Override
+  protected FsStatus getFsStatus() throws IOException {
+    return myFs.getFsStatus();
+  }
+
+  @Override
+  protected FsServerDefaults getServerDefaults() throws IOException {
+    return myFs.getServerDefaults();
+  }
+
+  @Override
+  protected int getUriDefaultPort() {
+    return myFs.getUriDefaultPort();
+  }
+
+  @Override
+  protected URI getUri() {
+    return myFs.getUri();
+  }
+  
+  @Override
+  protected void checkPath(Path path) {
+    myFs.checkPath(path);
+  }
+  
+  @Override
+  protected String getUriPath(final Path p) {
+    return myFs.getUriPath(p);
+  }
+  
+  @Override
+  protected FileStatus[] listStatus(Path f) 
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.listStatus(f);
+  }
+
+  @Override
+  protected void mkdir(Path dir, FsPermission permission, boolean createParent)
+    throws IOException, UnresolvedLinkException {
+    checkPath(dir);
+    myFs.mkdir(dir, permission, createParent);
+    
+  }
+
+  @Override
+  protected FSDataInputStream open(final Path f) throws AccessControlException,
+    FileNotFoundException, UnresolvedLinkException, IOException {
+    checkPath(f);
+    return myFs.open(f);
+  }
+
+  @Override
+  protected FSDataInputStream open(Path f, int bufferSize) 
+    throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.open(f, bufferSize);
+  }
+
+  @Override
+  protected void renameInternal(Path src, Path dst) 
+    throws IOException, UnresolvedLinkException {
+    checkPath(src);
+    checkPath(dst);
+    myFs.rename(src, dst, Options.Rename.NONE);
+  }
+
+  @Override
+  protected void renameInternal(final Path src, final Path dst,
+      boolean overwrite) throws AccessControlException,
+      FileAlreadyExistsException, FileNotFoundException,
+      ParentNotDirectoryException, UnresolvedLinkException, IOException {
+    myFs.renameInternal(src, dst, overwrite);
+  }
+  
+  @Override
+  protected void setOwner(Path f, String username, String groupname)
+    throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    myFs.setOwner(f, username, groupname);
+    
+  }
+
+  @Override
+  protected void setPermission(Path f, FsPermission permission)
+    throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    myFs.setPermission(f, permission);
+  }
+
+  @Override
+  protected boolean setReplication(Path f, short replication)
+    throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    return myFs.setReplication(f, replication);
+  }
+
+  @Override
+  protected void setTimes(Path f, long mtime, long atime) 
+      throws IOException, UnresolvedLinkException {
+    checkPath(f);
+    myFs.setTimes(f, mtime, atime);
+  }
+
+  @Override
+  protected void setVerifyChecksum(boolean verifyChecksum) 
+      throws IOException, UnresolvedLinkException {
+    myFs.setVerifyChecksum(verifyChecksum);
+  }
+
+  @Override
+  protected boolean supportsSymlinks() {
+    return myFs.supportsSymlinks();
+  }
+
+  @Override
+  protected void createSymlink(Path target, Path link, boolean createParent) 
+    throws IOException, UnresolvedLinkException {
+    myFs.createSymlink(target, link, createParent);
+  }
+
+  @Override
+  protected Path getLinkTarget(final Path f) throws IOException {
+    return myFs.getLinkTarget(f);
+  }
+}

Some files were not shown because too many files changed in this diff.