Small Code for Big Data

Programming Environment Meetings Colloquium, Centrum voor Wiskunde en Informatica, Amsterdam, The Netherlands

Norbert Crombach

April 26, 2013

Transcript

  1. public static class TokenizerMapper
         extends Mapper<Object, Text, Text, IntWritable> {
       private final static IntWritable one = new IntWritable(1);
       private Text word = new Text();

       public void map(Object key, Text value, Context context)
           throws IOException, InterruptedException {
         StringTokenizer itr = new StringTokenizer(value.toString());
         while (itr.hasMoreTokens()) {
           word.set(itr.nextToken());
           context.write(word, one);
         }
       }
     }

  2. public static class IntSumReducer
         extends Reducer<Text, IntWritable, Text, IntWritable> {
       private IntWritable result = new IntWritable();

       public void reduce(Text key, Iterable<IntWritable> values, Context context)
           throws IOException, InterruptedException {
         int sum = 0;
         for (IntWritable val : values) {
           sum += val.get();
         }
         result.set(sum);
         context.write(key, result);
       }
     }

  3. public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length != 2) {
         System.err.println("Usage: wordcount <in> <out>");
         System.exit(2);
       }
       Job job = new Job(conf, "word count");
       job.setJarByClass(WordCount.class);
       job.setMapperClass(TokenizerMapper.class);
       job.setCombinerClass(IntSumReducer.class);
       job.setReducerClass(IntSumReducer.class);
       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(IntWritable.class);
       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
     }

  4. count: table sum of int;
     total: table sum of float;
     sum_of_squares: table sum of float;
     x: float = input;
     emit count <- 1;
     emit total <- x;
     emit sum_of_squares <- x * x;

  5. Pig

  6. input_lines = LOAD 'input' AS (line:chararray);
     words = FOREACH input_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;
     filtered_words = FILTER words BY word MATCHES '\\w+';
     word_groups = GROUP filtered_words BY word;
     word_count = FOREACH word_groups GENERATE group, COUNT(filtered_words);
     STORE word_count INTO 'output';

  7. public static void main(String[] args) {
       String inPath = args[0];
       String outPath = args[1];
       Properties properties = new Properties();
       AppProps.setApplicationJarClass(properties, Main.class);
       HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);
       Tap inTap = new Hfs(new TextDelimited(), inPath);
       Tap outTap = new Hfs(new TextDelimited(), outPath);
       Fields token = new Fields("token");
       Fields text = new Fields("text");
       RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
       Pipe inPipe = new Each("token", text, splitter, Fields.RESULTS);
       Pipe outPipe = new Pipe("wordcount", inPipe);
       outPipe = new GroupBy(outPipe, token);
       outPipe = new Every(outPipe, Fields.ALL, new Count(), Fields.ALL);
       FlowDef flowDef = FlowDef.flowDef()
         .setName("wordcount")
         .addSource(inPipe, inTap)
         .addTailSink(outPipe, outTap);
       flowConnector.connect(flowDef).complete();
     }

  8. $ lein repl
     user=> (use 'cascalog.playground)
     user=> (bootstrap)
     user=> (?<- (stdout) [?person ?age]
                 (age ?person ?age)
                 (< ?age 30))
     ...
     RESULTS
     -----------------------
     alice   28
     david   25
     emily   25
     gary    28
     kumar   27
     -----------------------
     ...
     user=>

  9. (?<- (stdout) [?person1 !!person2]
          (person ?person1)
          (follows ?person1 !!person2))

     (?<- (stdout) [?person]
          (person ?person)
          (follows ?person !!p2)
          (nil? !!p2))

     (?<- (stdout) [?person ?count]
          (person ?person)
          (follows ?person !!p2)
          (c/!count !!p2))

  10. (ns pem.core
        (:use [cascalog.api]
              [cascalog.more-taps :only (hfs-delimited)])
        (:require [clojure.string :as s]
                  [cascalog.ops :as c])
        (:gen-class))

      (defmapcatop split [line]
        (s/split line #"[\[\]\\\(\),.)\s]+"))

      (defn -main [in out & args]
        (?<- (hfs-delimited out) [?word ?count]
             ((hfs-delimited in) _ ?line)
             (split ?line :> ?word)
             (c/count ?count)))

  11. (ns pem.core
        (:use [cascalog.api]
              [cascalog.more-taps])
        (:require [cascalog [ops :as c] [vars :as v]])
        (:import [org.apache.hadoop.io Text])
        (:gen-class))

      (defn -main [stop in out tfidf & args]
        (let [docs (hfs-wrtseqfile in Text Text :outfields ["key" "value"])
              stop (hfs-delimited stop)
              src (etl-docs-gen docs stop)]
          (?- (hfs-delimited tfidf) (TF-IDF src))
          (?- (hfs-delimited out) (word-count src))))

  12. (defmapcatop tokenize [line]
        (re-seq #"(?i)\b\w\w+\b" line))

      (defn etl-docs-gen [docs stop]
        (<- [?doc-id ?word]
            (docs ?key ?value)
            (str ?key :> ?doc-id)
            (str ?value :> ?body)
            (tokenize ?body :> ?word-dirty)
            ((c/comp s/trim s/lower-case) ?word-dirty :> ?word)
            (stop ?word :> false)))

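Slide 11's -main also calls a word-count function that never appears in the transcript. A minimal sketch of what it could look like, assuming it consumes the [?doc-id ?word] tuples produced by etl-docs-gen (the definition below is an inference, not from the deck):

      ;; Hypothetical reconstruction: count occurrences of each word
      ;; across the whole corpus, ignoring the doc-id field.
      (defn word-count [src]
        (<- [?word ?count]
            (src _ ?word)
            (c/count ?count)))
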
  13. (defn TF-IDF [src]
        (let [n-doc (first (flatten (??- (D src))))]
          (<- [?doc-id ?tf-idf ?tf-word]
              ((TF src) ?doc-id ?tf-word ?tf-count)
              ((DF src) ?tf-word ?df-count)
              (tf-idf-formula ?tf-count ?df-count n-doc :> ?tf-idf))))

  14. (defn D [src]
        (let [src (select-fields src ["?doc-id"])]
          (<- [?n-docs]
              (src ?doc-id)
              (c/distinct-count ?doc-id :> ?n-docs))))

      (defn DF [src]
        (<- [?df-word ?df-count]
            (src ?doc-id ?df-word)
            (c/distinct-count ?doc-id ?df-word :> ?df-count)))

      (defn TF [src]
        (<- [?doc-id ?tf-word ?tf-count]
            (src ?doc-id ?tf-word)
            (c/count ?tf-count)))

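The one definition slides 13 and 14 leave out is tf-idf-formula itself. A minimal sketch, assuming the common tf * log(N / df) weighting with +1 smoothing in the denominator; the exact variant used in the talk is not shown:

      ;; Hypothetical reconstruction: tf-idf weight for one
      ;; (document, word) pair. n-docs is the corpus size
      ;; computed by the D query.
      (defn tf-idf-formula [tf-count df-count n-docs]
        (* tf-count (Math/log (/ (double n-docs) (inc df-count)))))
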
  15. (defproject pem-20130426 "0.1.0-SNAPSHOT"
        :license {:name "Eclipse Public License"
                  :url "http://www.eclipse.org/legal/epl-v10.html"}
        :uberjar-name "pem.jar"
        :aot [pem.core]
        :main pem.core
        :dependencies [[org.clojure/clojure "1.5.1"]
                       [cascalog "1.10.1"]
                       [cascalog-more-taps "0.3.0"]]
        :profiles {:provided {:dependencies [[org.apache.hadoop/hadoop-core "1.0.4"]]}})

  16. $ make data/wikipedia-sample
      $ lein uberjar
      $ hadoop jar target/pem.jar data/stopwords data/wikipedia-sample \
          data/output/wordcount data/output/tfidf

  17. $ cat data/output/tfidf/part-00000 | ag "\tmathematics$" | \
          ruby -Eutf-8 -ne 'puts $_.chomp.split("\t")[0,2].reverse.join("\t")' | \
          sort -n -r
      49.796916454164744  Igor Shafarevich
      27.161954429544405  Pseudomathematics
      27.161954429544405  Nelson Goodman
      22.634962024620336  Johannes Kepler University of Linz
      13.580977214772203  University of Maine at Farmington
      13.580977214772203  Polish Academy of Learning
      13.580977214772203  Michael Gove
      13.580977214772203  Hao Wang (academic)
      13.580977214772203  Farkas Bolyai
      9.053984809848135   Stanisław Zaremba (mathematician)
      ...

  18. $ cat data/output/tfidf/part-00000 | ag "^Igor Shafarevich\t" | \
          ruby -Eutf-8 -ne 'puts $_.chomp.split("\t")[1,2].join("\t")' | \
          sort -n -r
      289.20781088078576  shafarevich
      63.59573868672378   algebraic
      56.234852115708335  russophobia
      49.796916454164744  mathematics
      36.68239604961073   geometry
      31.77039393029212   russian
      28.824333804628424  socialism
      26.590093350718455  socialist
      ...
      10.749259518050817  fields
      10.651000202283678  finite
      10.651000202283678  curves
      10.521923160008535  theorem
      10.480684585603065  extensions
      ...

  19. (defbolt split-sentence ["word"] [tuple collector]
        (let [words (.split (.getString tuple 0) " ")]
          (doseq [w words]
            (emit-bolt! collector [w] :anchor tuple))
          (ack! collector tuple)))

      (defbolt word-count ["word" "count"] {:prepare true}
        [conf context collector]
        (let [counts (atom {})]
          (bolt
           (execute [tuple]
             (let [word (.getString tuple 0)]
               (swap! counts (partial merge-with +) {word 1})
               (emit-bolt! collector [word (@counts word)] :anchor tuple)
               (ack! collector tuple))))))
