Small Code for Big Data

Programming Environment Meetings Colloquium, Centrum voor Wiskunde en Informatica, Amsterdam, The Netherlands

Norbert Crombach

April 26, 2013

Transcript

  1. public static class TokenizerMapper
         extends Mapper<Object, Text, Text, IntWritable> {
       private final static IntWritable one = new IntWritable(1);
       private Text word = new Text();

       public void map(Object key, Text value, Context context)
           throws IOException, InterruptedException {
         StringTokenizer itr = new StringTokenizer(value.toString());
         while (itr.hasMoreTokens()) {
           word.set(itr.nextToken());
           context.write(word, one);
         }
       }
     }

  2. public static class IntSumReducer
         extends Reducer<Text, IntWritable, Text, IntWritable> {
       private IntWritable result = new IntWritable();

       public void reduce(Text key, Iterable<IntWritable> values, Context context)
           throws IOException, InterruptedException {
         int sum = 0;
         for (IntWritable val : values) {
           sum += val.get();
         }
         result.set(sum);
         context.write(key, result);
       }
     }

  3. public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length != 2) {
         System.err.println("Usage: wordcount <in> <out>");
         System.exit(2);
       }
       Job job = new Job(conf, "word count");
       job.setJarByClass(WordCount.class);
       job.setMapperClass(TokenizerMapper.class);
       job.setCombinerClass(IntSumReducer.class);
       job.setReducerClass(IntSumReducer.class);
       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(IntWritable.class);
       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
     }

  4. count: table sum of int;
     total: table sum of float;
     sum_of_squares: table sum of float;
     x: float = input;
     emit count <- 1;
     emit total <- x;
     emit sum_of_squares <- x * x;

  5. Pig

  6. input_lines = LOAD 'input' AS (line:chararray);
     words = FOREACH input_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;
     filtered_words = FILTER words BY word MATCHES '\\w+';
     word_groups = GROUP filtered_words BY word;
     word_count = FOREACH word_groups GENERATE group, COUNT(filtered_words);
     STORE word_count INTO 'output';

  7. public static void main(String[] args) {
       String inPath = args[0];
       String outPath = args[1];
       Properties properties = new Properties();
       AppProps.setApplicationJarClass(properties, Main.class);
       HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);
       Tap inTap = new Hfs(new TextDelimited(), inPath);
       Tap outTap = new Hfs(new TextDelimited(), outPath);
       Fields token = new Fields("token");
       Fields text = new Fields("text");
       RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
       Pipe inPipe = new Each("token", text, splitter, Fields.RESULTS);
       Pipe outPipe = new Pipe("wordcount", inPipe);
       outPipe = new GroupBy(outPipe, token);
       outPipe = new Every(outPipe, Fields.ALL, new Count(), Fields.ALL);
       FlowDef flowDef = FlowDef.flowDef()
         .setName("wordcount")
         .addSource(inPipe, inTap)
         .addTailSink(outPipe, outTap);
       flowConnector.connect(flowDef).complete();
     }

  8. $ lein repl
     user=> (use 'cascalog.playground)
     user=> (bootstrap)
     user=> (?<- (stdout) [?person ?age]
                 (age ?person ?age)
                 (< ?age 30))
     ...
     RESULTS
     -----------------------
     alice   28
     david   25
     emily   25
     gary    28
     kumar   27
     -----------------------
     ...
     user=>

  9. (?<- (stdout) [?person1 !!person2]
          (person ?person1)
          (follows ?person1 !!person2))

     (?<- (stdout) [?person]
          (person ?person)
          (follows ?person !!p2)
          (nil? !!p2))

     (?<- (stdout) [?person ?count]
          (person ?person)
          (follows ?person !!p2)
          (c/!count !!p2))

  10. (ns pem.core
        (:use [cascalog.api]
              [cascalog.more-taps :only (hfs-delimited)])
        (:require [clojure.string :as s]
                  [cascalog.ops :as c])
        (:gen-class))

      (defmapcatop split [line]
        (s/split line #"[\[\]\\\(\),.)\s]+"))

      (defn -main [in out & args]
        (?<- (hfs-delimited out) [?word ?count]
             ((hfs-delimited in) _ ?line)
             (split ?line :> ?word)
             (c/count ?count)))

  11. (ns pem.core
        (:use [cascalog.api]
              [cascalog.more-taps])
        (:require [cascalog [ops :as c] [vars :as v]])
        (:import [org.apache.hadoop.io Text])
        (:gen-class))

      (defn -main [stop in out tfidf & args]
        (let [docs (hfs-wrtseqfile in Text Text :outfields ["key" "value"])
              stop (hfs-delimited stop)
              src (etl-docs-gen docs stop)]
          (?- (hfs-delimited tfidf) (TF-IDF src))
          (?- (hfs-delimited out) (word-count src))))

  12. (defmapcatop tokenize [line]
        (re-seq #"(?i)\b\w\w+\b" line))

      (defn etl-docs-gen [docs stop]
        (<- [?doc-id ?word]
            (docs ?key ?value)
            (str ?key :> ?doc-id)
            (str ?value :> ?body)
            (tokenize ?body :> ?word-dirty)
            ((c/comp s/trim s/lower-case) ?word-dirty :> ?word)
            (stop ?word :> false)))

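Slide 11's -main also calls a word-count function that never appears in the transcript. A minimal sketch of what it could look like, assuming it consumes the [?doc-id ?word] tuples produced by etl-docs-gen (the definition below is an inference, not from the deck):

      ;; Hypothetical reconstruction: count occurrences of each word
      ;; across the whole corpus, ignoring the doc-id field.
      (defn word-count [src]
        (<- [?word ?count]
            (src _ ?word)
            (c/count ?count)))
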
  13. (defn TF-IDF [src]
        (let [n-doc (first (flatten (??- (D src))))]
          (<- [?doc-id ?tf-idf ?tf-word]
              ((TF src) ?doc-id ?tf-word ?tf-count)
              ((DF src) ?tf-word ?df-count)
              (tf-idf-formula ?tf-count ?df-count n-doc :> ?tf-idf))))

  14. (defn D [src]
        (let [src (select-fields src ["?doc-id"])]
          (<- [?n-docs]
              (src ?doc-id)
              (c/distinct-count ?doc-id :> ?n-docs))))

      (defn DF [src]
        (<- [?df-word ?df-count]
            (src ?doc-id ?df-word)
            (c/distinct-count ?doc-id ?df-word :> ?df-count)))

      (defn TF [src]
        (<- [?doc-id ?tf-word ?tf-count]
            (src ?doc-id ?tf-word)
            (c/count ?tf-count)))

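The one definition slides 13 and 14 leave out is tf-idf-formula itself. A minimal sketch, assuming the common tf * log(N / df) weighting with +1 smoothing in the denominator; the exact variant used in the talk is not shown:

      ;; Hypothetical reconstruction: tf-idf weight for one
      ;; (document, word) pair. n-docs is the corpus size
      ;; computed by the D query.
      (defn tf-idf-formula [tf-count df-count n-docs]
        (* tf-count (Math/log (/ (double n-docs) (inc df-count)))))
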
  15. (defproject pem-20130426 "0.1.0-SNAPSHOT"
        :license {:name "Eclipse Public License"
                  :url "http://www.eclipse.org/legal/epl-v10.html"}
        :uberjar-name "pem.jar"
        :aot [pem.core]
        :main pem.core
        :dependencies [[org.clojure/clojure "1.5.1"]
                       [cascalog "1.10.1"]
                       [cascalog-more-taps "0.3.0"]]
        :profiles {:provided {:dependencies [[org.apache.hadoop/hadoop-core "1.0.4"]]}})

  16. $ make data/wikipedia-sample
      $ lein uberjar
      $ hadoop jar target/pem.jar data/stopwords data/wikipedia-sample \
          data/output/wordcount data/output/tfidf

  17. $ cat data/output/tfidf/part-00000 | ag "\tmathematics$" | \
          ruby -Eutf-8 -ne 'puts $_.chomp.split("\t")[0,2].reverse.join("\t")' | \
          sort -n -r
      49.796916454164744  Igor Shafarevich
      27.161954429544405  Pseudomathematics
      27.161954429544405  Nelson Goodman
      22.634962024620336  Johannes Kepler University of Linz
      13.580977214772203  University of Maine at Farmington
      13.580977214772203  Polish Academy of Learning
      13.580977214772203  Michael Gove
      13.580977214772203  Hao Wang (academic)
      13.580977214772203  Farkas Bolyai
      9.053984809848135   Stanisław Zaremba (mathematician)
      ...

  18. $ cat data/output/tfidf/part-00000 | ag "^Igor Shafarevich\t" | \
          ruby -Eutf-8 -ne 'puts $_.chomp.split("\t")[1,2].join("\t")' | \
          sort -n -r
      289.20781088078576  shafarevich
      63.59573868672378   algebraic
      56.234852115708335  russophobia
      49.796916454164744  mathematics
      36.68239604961073   geometry
      31.77039393029212   russian
      28.824333804628424  socialism
      26.590093350718455  socialist
      ...
      10.749259518050817  fields
      10.651000202283678  finite
      10.651000202283678  curves
      10.521923160008535  theorem
      10.480684585603065  extensions
      ...

  19. (defbolt split-sentence ["word"] [tuple collector]
        (let [words (.split (.getString tuple 0) " ")]
          (doseq [w words]
            (emit-bolt! collector [w] :anchor tuple))
          (ack! collector tuple)))

      (defbolt word-count ["word" "count"] {:prepare true}
        [conf context collector]
        (let [counts (atom {})]
          (bolt
           (execute [tuple]
             (let [word (.getString tuple 0)]
               (swap! counts (partial merge-with +) {word 1})
               (emit-bolt! collector [word (@counts word)] :anchor tuple)
               (ack! collector tuple))))))
