Forráskód Böngészése

HADOOP-425. Add a python MapReduce example, using Jython. Contributed by Owen.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@428862 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 19 éve
szülő
commit
41be68f884
3 módosított fájl, 92 hozzáadás és 0 törlés
  1. 3 0
      CHANGES.txt
  2. 68 0
      src/examples/python/WordCount.py
  3. 21 0
      src/examples/python/compile

+ 3 - 0
CHANGES.txt

@@ -142,6 +142,9 @@ Trunk (unreleased changes)
 40. HADOOP-226.  Fix fsck command to properly consider replication
     counts, now that these can vary per file.  (Bryan Pendleton via cutting)
 
+41. HADOOP-425.  Add a Python MapReduce example, using Jython.
+    (omalley via cutting)
+
 
 
 Release 0.4.0 - 2006-06-28
 

+ 68 - 0
src/examples/python/WordCount.py

@@ -0,0 +1,68 @@
+#
+# Copyright 2006 The Apache Software Foundation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from org.apache.hadoop.fs import Path
+from org.apache.hadoop.io import *
+from org.apache.hadoop.mapred import *
+
+import sys
+import getopt
+
+class WordCountMap(Mapper, MapReduceBase):
+    one = IntWritable(1)
+    def map(self, key, value, output, reporter):
+        for w in value.toString().split():
+            output.collect(Text(w), self.one)
+
+class Summer(Reducer, MapReduceBase):
+    def reduce(self, key, values, output, reporter):
+        sum = 0
+        while values.hasNext():
+            sum += values.next().get()
+        output.collect(key, IntWritable(sum))
+
+def printUsage(code):
+    print "wordcount [-m <maps>] [-r <reduces>] <input> <output>"
+    sys.exit(code)
+
+def main(args):
+    conf = JobConf(WordCountMap);
+    conf.setJobName("wordcount");
+ 
+    conf.setOutputKeyClass(Text);
+    conf.setOutputValueClass(IntWritable);
+    
+    conf.setMapperClass(WordCountMap);        
+    conf.setCombinerClass(Summer);
+    conf.setReducerClass(Summer);
+    try:
+        flags, other_args = getopt.getopt(args[1:], "m:r:")
+    except getopt.GetoptError:
+        printUsage(1)
+    if len(other_args) != 2:
+        printUsage(1)
+    
+    for f,v in flags:
+        if f == "-m":
+            conf.setNumMapTasks(int(v))
+        elif f == "-r":
+            conf.setNumReduceTasks(int(v))
+    conf.setInputPath(Path(other_args[0]))
+    conf.setOutputPath(Path(other_args[1]))
+    JobClient.runJob(conf);
+
+if __name__ == "__main__":
+    main(sys.argv)

+ 21 - 0
src/examples/python/compile

@@ -0,0 +1,21 @@
+#!/bin/bash
+
+export HADOOP_HOME=../../..
+
+export CLASSPATH="$HADOOP_HOME/build/classes"
+
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
+# add libs to CLASSPATH
+for f in $HADOOP_HOME/lib/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+
+for f in $HADOOP_HOME/lib/jetty-ext/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
+
+# restore ordinary behaviour
+unset IFS
+jythonc -p org.apache.hadoop.examples -d -j wc.jar -c WordCount.py