
Cascading Through Hadoop for Boulder JUG


An exploration of Cascading, a more fluent API on top of Hadoop.

Matthew McCullough

October 13, 2011

Transcript

  1. // The WordCount Mapper
     public static class TokenizerMapper
         extends Mapper<Object, Text, Text, IntWritable> {

       private final static IntWritable one = new IntWritable(1);
       private Text word = new Text();

       public void map(Object key, Text value, Context context)
           throws IOException, InterruptedException {
         StringTokenizer itr = new StringTokenizer(value.toString());
         while (itr.hasMoreTokens()) {
           word.set(itr.nextToken());
           context.write(word, one);
         }
       }
     }
  2. // The WordCount Reducer
     public static class IntSumReducer
         extends Reducer<Text, IntWritable, Text, IntWritable> {

       private IntWritable result = new IntWritable();

       public void reduce(Text key, Iterable<IntWritable> values, Context context)
           throws IOException, InterruptedException {
         int sum = 0;
         for (IntWritable val : values) {
           sum += val.get();
         }
         result.set(sum);
         context.write(key, result);
       }
     }
  3. // The WordCount main()
     public static void main(String[] args) throws Exception {
       Configuration conf = new Configuration();
       String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
       if (otherArgs.length != 2) {
         System.err.println("Usage: wordcount <in> <out>");
         System.exit(2);
       }
       Job job = new Job(conf, "word count");
       job.setJarByClass(WordCount.class);
       job.setMapperClass(TokenizerMapper.class);
       job.setCombinerClass(IntSumReducer.class);
       job.setReducerClass(IntSumReducer.class);
       job.setOutputKeyClass(Text.class);
       job.setOutputValueClass(IntWritable.class);
       FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
       FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
       System.exit(job.waitForCompletion(true) ? 0 : 1);
     }
  4. package org.apache.hadoop.examples;

     import java.io.BufferedReader;
     import java.io.DataInput;
     import java.io.DataOutput;
     import java.io.IOException;
     import java.io.InputStreamReader;
     import java.util.StringTokenizer;

     import org.apache.hadoop.conf.Configuration;
     import org.apache.hadoop.conf.Configured;
     import org.apache.hadoop.fs.FSDataInputStream;
     import org.apache.hadoop.fs.FileSystem;
     import org.apache.hadoop.fs.Path;
     import org.apache.hadoop.io.IntWritable;
     import org.apache.hadoop.io.Text;
     import org.apache.hadoop.io.WritableComparable;
     import org.apache.hadoop.mapred.FileInputFormat;
     import org.apache.hadoop.mapred.FileOutputFormat;
     import org.apache.hadoop.mapred.InputSplit;
     import org.apache.hadoop.mapred.JobClient;
     import org.apache.hadoop.mapred.JobConf;
     import org.apache.hadoop.mapred.MapReduceBase;
     import org.apache.hadoop.mapred.Mapper;
     import org.apache.hadoop.mapred.MultiFileInputFormat;
     import org.apache.hadoop.mapred.MultiFileSplit;
     import org.apache.hadoop.mapred.OutputCollector;
     import org.apache.hadoop.mapred.RecordReader;
  5. //set the InputFormat of the job to our InputFormat
     job.setInputFormat(MyInputFormat.class);
     // the keys are words (strings)
     job.setOutputKeyClass(Text.class);
     // the values are counts (ints)
     job.setOutputValueClass(IntWritable.class);
     //use the defined mapper
     job.setMapperClass(MapClass.class);
     //use the WordCount Reducer
     job.setCombinerClass(LongSumReducer.class);
     job.setReducerClass(LongSumReducer.class);
     FileInputFormat.addInputPaths(job, args[0]);
     FileOutputFormat.setOutputPath(job, new Path(args[1]));
     JobClient.runJob(job);
     return 0;
   }

   public static void main(String[] args) throws Exception {
     int ret = ToolRunner.run(new MultiFileWordCount(), args);
     System.exit(ret);
   }
 }
  6. public class SimplestPipe1Flip {
       public static void main(String[] args) {
         String inputPath = "data/babynamedefinitions.csv";
         String outputPath = "output/simplestpipe1";
         Scheme sourceScheme = new TextDelimited( new Fields( "name", "definition" ), "," );
         Tap source = new Hfs( sourceScheme, inputPath );
         Scheme sinkScheme = new TextDelimited( new Fields( "definition", "name" ), " ++ " );
         Tap sink = new Hfs( sinkScheme, outputPath, SinkMode.REPLACE );
         Pipe assembly = new Pipe( "flip" );
         Properties properties = new Properties();
         FlowConnector.setApplicationJarClass( properties, SimplestPipe1Flip.class );
         FlowConnector flowConnector = new FlowConnector( properties );
         Flow flow = flowConnector.connect( "flipflow", source, sink, assembly );
         flow.complete();
       }
     }
  7. "Ignoring that Hadoop is as much about analytics as it is about integration leads to a fair number of compromises, including, but not limited to, a loss in quality of life (in trade for a false sense of accomplishment)." -Chris Wensel, Cascading Inventor
  8. "Hadoop is as much about analytics as it is about integration. Ignoring that leads to crazy complex tool chains that typically involve XML." -Chris Wensel, Cascading Inventor
  9. -- Pig script
     Person = LOAD 'people.csv' USING PigStorage(',');
     Names = FOREACH Person GENERATE $2 AS name;
     OrderedNames = ORDER Names BY name ASC;
     GroupedNames = GROUP OrderedNames BY name;
     NameCount = FOREACH GroupedNames GENERATE group, COUNT(OrderedNames);
     STORE NameCount INTO 'names.out';
  10. -- Hive script
      LOAD DATA INPATH 'shakespeare_freq' INTO TABLE shakespeare;
      SELECT * FROM shakespeare WHERE freq > 100 SORT BY freq ASC LIMIT 10;
  11. //Cascading Groovy script
      def cascading = new Cascading()
      def builder = cascading.builder()
      Flow flow = builder.flow("wordcount") {
        source(input, scheme: text())
        tokenize(/[.,]*\s+/)
        group()
        count()
        group(["count"], reverse: true)
        sink(output, delete: true)
      }
  12. public class SimplestPipe1Flip {
        public static void main(String[] args) {
          String inputPath = "data/babynamedefinitions.csv";
          String outputPath = "output/simplestpipe1";
          Scheme sourceScheme = new TextDelimited( new Fields( "name", "definition" ), "," );
          Tap source = new Hfs( sourceScheme, inputPath );
          Scheme sinkScheme = new TextDelimited( new Fields( "definition", "name" ), " ++ " );
          Tap sink = new Hfs( sinkScheme, outputPath, SinkMode.REPLACE );
          Pipe assembly = new Pipe( "flip" );
          Properties properties = new Properties();
          FlowConnector.setApplicationJarClass( properties, SimplestPipe1Flip.class );
          FlowConnector flowConnector = new FlowConnector( properties );
          Flow flow = flowConnector.connect( "flipflow", source, sink, assembly );
          flow.complete();
        }
      }
  13. DAG

  14. [Flow diagram: a Cascade linking several pipe assemblies, each built from Each, GroupBy, CoGroup, Every, and Sub-Assembly elements chained into a DAG]
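
      Those repeated elements chain into a directed acyclic graph. As a minimal sketch (editor's illustration, not a slide from the deck) of how such an assembly is composed with the Cascading 1.x Java API used throughout these examples:

      // Editor's sketch: chaining pipe elements into a DAG (Cascading 1.x).
      Pipe assembly = new Pipe( "wordcount" );
      // Each applies a function to every tuple: split lines into words
      assembly = new Each( assembly, new Fields( "line" ),
          new RegexSplitGenerator( new Fields( "word" ), "\\s+" ) );
      // GroupBy groups the tuple stream on the "word" field
      assembly = new GroupBy( assembly, new Fields( "word" ) );
      // Every applies an aggregator to each group, emitting a "count" field
      assembly = new Every( assembly, new Count( new Fields( "count" ) ) );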
  15. public class SimplestPipe3CoGroup {
        public static void main(String[] args) {
          String inputPathDefinitions = "data/babynamedefinitions.csv";
          String inputPathCounts = "data/babynamecounts.csv";
          String outputPath = "output/simplestpipe3";
          Scheme sourceSchemeDefinitions = new TextDelimited( new Fields( "name", "definition" ), "," );
          Scheme sourceSchemeCounts = new TextDelimited( new Fields( "name", "count" ), "," );
          Tap sourceDefinitions = new Hfs( sourceSchemeDefinitions, inputPathDefinitions );
          Tap sourceCounts = new Hfs( sourceSchemeCounts, inputPathCounts );
          Scheme sinkScheme = new TextDelimited( new Fields( "dname", "count", "definition" ), " ^^^ " );
          Tap sink = new Hfs( sinkScheme, outputPath, SinkMode.REPLACE );
          Pipe definitionspipe = new Pipe( "definitionspipe" );
          Pipe countpipe = new Pipe( "countpipe" );

          //Join the tuple streams on their common field
          Fields commonfields = new Fields( "name" );
          Fields newfields = new Fields( "dname", "definition", "cname", "count" );
          Pipe joinpipe = new CoGroup( definitionspipe, commonfields, countpipe, commonfields, newfields, new InnerJoin() );

          Properties properties = new Properties();
          FlowConnector.setApplicationJarClass( properties, SimplestPipe3CoGroup.class );
          FlowConnector flowConnector = new FlowConnector( properties );
          Map<String, Tap> sources = new HashMap<String, Tap>();
          sources.put( "definitionspipe", sourceDefinitions );
          sources.put( "countpipe", sourceCounts );
          Flow flow = flowConnector.connect( sources, sink, joinpipe );
          flow.complete();
        }
      }
  16. Why a new MR toolkit?
      ㊌ Simpler coding
      ㊌ More logical processing abstractions
      ㊌ Run MapReduce locally (see the sketch below)
      ㊌ Debug jobs with ease
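
      As a hedged sketch of the "run locally" and "debug" points (editor's illustration assuming Cascading 1.x, not a slide from the deck): an Lfs tap reads and writes the local filesystem instead of HDFS, and the Debug filter shown on the next slide prints every tuple, so an assembly can be iterated on without a cluster. The paths and field names here are hypothetical.

      // Editor's sketch: run an assembly against local files with Lfs taps
      // and print each tuple with a Debug filter. Paths are hypothetical.
      Scheme scheme = new TextDelimited( new Fields( "name", "definition" ), "," );
      Tap source = new Lfs( scheme, "data/sample.csv" );
      Tap sink = new Lfs( scheme, "output/local-debug", SinkMode.REPLACE );
      Pipe assembly = new Pipe( "localdebug" );
      assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() );
      Flow flow = new FlowConnector().connect( source, sink, assembly );
      flow.complete();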
  17. public class SimplestPipe1Flip {
        public static void main(String[] args) {
          String inputPath = "data/babynamedefinitions.csv";
          String outputPath = "output/simplestpipe1";
          Scheme sourceScheme = new TextDelimited( new Fields( "name", "definition" ), "," );
          Tap source = new Hfs( sourceScheme, inputPath );
          Scheme sinkScheme = new TextDelimited( new Fields( "definition", "name" ), " ++ " );
          Tap sink = new Hfs( sinkScheme, outputPath, SinkMode.REPLACE );
          Pipe assembly = new Pipe( "flip" );
          //OPTIONAL: Debug the tuple
          //assembly = new Each( assembly, DebugLevel.VERBOSE, new Debug() );
          Properties properties = new Properties();
          FlowConnector.setApplicationJarClass( properties, SimplestPipe1Flip.class );
          FlowConnector flowConnector = new FlowConnector( properties );
          //OPTIONAL: Have the planner use or filter out the debugging statements
          //FlowConnector.setDebugLevel( properties, DebugLevel.VERBOSE );
          Flow flow = flowConnector.connect( "flipflow", source, sink, assembly );
          flow.complete();
        }
      }
  18. "Hadoop is never used alone. The dirty secret is that it is really a huge ETL tool." -Chris Wensel, Cascading Inventor
  19. "When in doubt, look at the Cascading source code. If something is not documented in this User Guide, the source code will give you clear instructions on what to do or expect." -Chris Wensel, Cascading Inventor
  20. $ ls -al
      drwxr-xr-x  15 mccm06  staff  510B Feb 21 14:31 ./
      drwxr-xr-x  20 mccm06  staff  680B Feb 17 15:39 ../
      drwxr-xr-x  10 mccm06  staff  340B Feb 19 01:40 cascading.groovy_git/
      drwxr-xr-x   7 mccm06  staff  238B Feb 19 01:40 cascading.hbase_git/
      drwxr-xr-x   8 mccm06  staff  272B Feb 19 01:40 cascading.jdbc_git/
      drwxr-xr-x   8 mccm06  staff  272B Feb 19 01:39 cascading.load_git/
      drwxr-xr-x   9 mccm06  staff  306B Feb 19 01:39 cascading.memcached_git/
      drwxr-xr-x   9 mccm06  staff  306B Feb 19 01:39 cascading.multitool_git/
      drwxr-xr-x  10 mccm06  staff  340B Feb 19 01:39 cascading.samples_git/
      drwxr-xr-x   8 mccm06  staff  272B Feb 19 01:39 cascading.work_git/
      drwxr-xr-x  14 mccm06  staff  476B Feb 21 14:26 cascading_git/
      drwxr-xr-x  11 mccm06  staff  374B Dec 31 16:16 cascalog_git/
      lrwxr-xr-x   1 mccm06  staff   45B Feb 21 14:31 hadoop -> /Applications/Dev/hadoop-family/hadoop-0.20.1
  21. [javac] cascading_git/src/core/cascading/tap/hadoop/TapIterator.java:52: cannot find symbol
      [javac] symbol  : class JobConf
      [javac] location: class cascading.tap.hadoop.TapIterator
      [javac]     private final JobConf conf;
      [javac]                   ^
      [javac] cascading_git/src/core/cascading/tap/hadoop/TapIterator.java:54: cannot find symbol
      [javac] symbol  : class InputSplit
      [javac] location: class cascading.tap.hadoop.TapIterator
      [javac]     private InputSplit[] splits;
      [javac]             ^
      [javac] cascading_git/src/core/cascading/tap/hadoop/TapIterator.java:56: cannot find symbol
      [javac] symbol  : class RecordReader
      [javac] location: class cascading.tap.hadoop.TapIterator
      [javac]     private RecordReader reader;
      [javac]             ^
      [javac] cascading_git/src/core/cascading/tap/hadoop/TapIterator.java:75: cannot find symbol
      [javac] symbol  : class JobConf
      [javac] location: class cascading.tap.hadoop.TapIterator
      [javac]     public TapIterator( Tap tap, JobConf conf ) throws IOException
      [javac]                                  ^
      [javac] Note: Some input files use or override a deprecated API.
      [javac] Note: Recompile with -Xlint:deprecation for details.
      [javac] Note: Some input files use unchecked or unsafe operations.
      [javac] Note: Recompile with -Xlint:unchecked for details.
      [javac] 100 errors
  22. Buildfile: cascading_git/build.xml

      init:
      [echo] initializing cascading environment...
      [mkdir] Created dir: cascading_git/build/core
      [mkdir] Created dir: cascading_git/build/xml
      [mkdir] Created dir: cascading_git/build/test
      [mkdir] Created dir: cascading_git/build/testresults

      echo-compile-buildnum:

      compile:
      [echo] building cascading...
      [javac] Compiling 238 source files to cascading_git/build/core
      [javac] Note: Some input files use or override a deprecated API.
      [javac] Note: Recompile with -Xlint:deprecation for details.
      [javac] Note: Some input files use unchecked or unsafe operations.
      [javac] Note: Recompile with -Xlint:unchecked for details.
      [copy] Copying 1 file to cascading_git/build/core/cascading
      [javac] Compiling 5 source files to cascading_git/build/xml
      [javac] Compiling 85 source files to cascading_git/build/test
      [javac] Note: Some input files use or override a deprecated API.
      [javac] Note: Recompile with -Xlint:deprecation for details.
      [javac] Note: Some input files use unchecked or unsafe operations.
      [javac] Note: Recompile with -Xlint:unchecked for details.
      [copy] Copying 24 files to cascading_git/build/test

      BUILD SUCCESSFUL
      Total time: 7 seconds
  23. public class SimplestPipe2Sort {
        public static void main(String[] args) {
          String inputPath = "data/babynamedefinitions.csv";
          String outputPath = "output/simplestpipe2";
          Scheme sourceScheme = new TextDelimited( new Fields( "name", "definition" ), "," );
          Tap source = new Hfs( sourceScheme, inputPath );
          Scheme sinkScheme = new TextDelimited( new Fields( "definition", "name" ), " ^^^ " );
          Tap sink = new Hfs( sinkScheme, outputPath, SinkMode.REPLACE );
          Pipe assembly = new Pipe( "sortreverse" );
          Fields groupFields = new Fields( "name" );
          //OPTIONAL: Set the comparator
          //groupFields.setComparator("name", Collections.reverseOrder());
          assembly = new GroupBy( assembly, groupFields );
          Properties properties = new Properties();
          FlowConnector.setApplicationJarClass( properties, SimplestPipe2Sort.class );
          FlowConnector flowConnector = new FlowConnector( properties );
          Flow flow = flowConnector.connect( "sortflow", source, sink, assembly );
          flow.complete();
          //OPTIONAL: Output a debugging diagram
          //flow.writeDOT(outputPath + "/flowdiagram.dot");
        }
      }
  24. def cascading = new Cascading()
      def builder = cascading.builder()
      Flow flow = builder.flow("wordcount") {
        source(input, scheme: text())
        // output a new tuple for each split;
        // the result replaces the stream by default
        tokenize(/[.,]*\s+/)
        // group on the stream
        group()
        // count values in each group;
        // creates a 'count' field by default
        count()
        // group/sort on 'count', reverse the sort order
        group(["count"], reverse: true)
        sink(output, delete: true)
      }
  25. $ groovy wordcount
      INFO - Concurrent, Inc - Cascading 1.2.1 [hadoop-0.19.2+]
      INFO - [wordcount] starting
      INFO - [wordcount] source: Hfs["TextLine[['line']->[ALL]]"]["output/fetched/fetch.txt"]"]
      INFO - [wordcount] sink: Hfs["TextLine[['line']->[ALL]]"]["output/counted"]"]
      INFO - [wordcount] parallel execution is enabled: false
      INFO - [wordcount] starting jobs: 2
      INFO - [wordcount] allocating threads: 1
      INFO - [wordcount] starting step: (1/2) TempHfs["SequenceFile[[0, 'count']]"][wordcount/18750/]
      INFO - [wordcount] starting step: (2/2) Hfs["TextLine[['line']->[ALL]]"]["output/counted"]"]
      INFO - deleting temp path output/counted/_temporary
  26. ㊌ Simple
      ㊌ Functions, filters, and aggregators all use the same syntax. Joins are implicit and natural.
  27. ㊌ Expressive
      ㊌ Logical composition is very powerful, and you can run arbitrary Clojure code in your query with little effort.
  28. ㊌ Query Anything
      ㊌ Query HDFS data, database data, and/or local data by making use of Cascading's "Tap" abstraction.
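
      To make the Tap point concrete, a minimal sketch (editor's illustration assuming Cascading 1.x, not a slide from the deck): the same Scheme can be bound to different storage by swapping only the Tap, while the pipe assembly stays untouched. The paths are hypothetical; modules such as cascading.jdbc, seen in the clone listing above, supply Taps for databases in the same way.

      // Editor's sketch: one Scheme, two Taps. Only the Tap changes.
      Scheme scheme = new TextDelimited( new Fields( "name", "count" ), "," );
      Tap hdfsSource  = new Hfs( scheme, "names/names.csv" );   // HDFS (path hypothetical)
      Tap localSource = new Lfs( scheme, "data/names.csv" );    // local filesystem (path hypothetical)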