
Map Reduce & Ruby

Presentation on Map Reduce and Wukong


Patrick Van Stee

December 14, 2011

Transcript

  1. • Map Reduce In 5 Seconds
     • Hadoop as a Black Box
     • Wukong and Examples
     • Demo Time
  2. import java.io.IOException;
     import java.util.StringTokenizer;

     import org.apache.hadoop.conf.Configuration;
     import org.apache.hadoop.conf.Configured;
     import org.apache.hadoop.fs.Path;
     import org.apache.hadoop.io.IntWritable;
     import org.apache.hadoop.io.LongWritable;
     import org.apache.hadoop.io.Text;
     import org.apache.hadoop.mapreduce.Job;
     import org.apache.hadoop.mapreduce.Mapper;
     import org.apache.hadoop.mapreduce.Reducer;
     import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
     import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
     import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
     import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
     import org.apache.hadoop.util.Tool;
     import org.apache.hadoop.util.ToolRunner;

     public class Sum extends Configured implements Tool {

       public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
         private final Text sumKey = new Text("sum");
         private final IntWritable number = new IntWritable();

         @Override
         protected void map(LongWritable key, Text value, Context context)
             throws IOException, InterruptedException {
           String line = value.toString();
           StringTokenizer tokenizer = new StringTokenizer(line);
           while (tokenizer.hasMoreTokens()) {
             // Round each number and emit it under a single key so the reducer can sum them
             number.set((int) Math.round(Double.parseDouble(tokenizer.nextToken())));
             context.write(sumKey, number);
           }
         }
       }

       public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
         @Override
         protected void reduce(Text key, Iterable<IntWritable> values, Context context)
             throws IOException, InterruptedException {
           int sum = 0;
           for (IntWritable value : values) {
             sum += value.get();
           }
           context.write(key, new IntWritable(sum));
         }
       }

       @Override
       public int run(String[] args) throws Exception {
         Configuration conf = getConf();
         Job job = Job.getInstance(conf, "sum");
         job.setJarByClass(getClass());
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);
         job.setMapperClass(Map.class);
         job.setCombinerClass(Reduce.class);
         job.setReducerClass(Reduce.class);
         job.setInputFormatClass(TextInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.setInputPaths(job, new Path(args[0]));
         FileOutputFormat.setOutputPath(job, new Path(args[1]));
         boolean success = job.waitForCompletion(true);
         return success ? 0 : 1;
       }

       public static void main(String[] args) throws Exception {
         int r = ToolRunner.run(new Sum(), args);
         System.exit(r);
       }
     }
  3. (The same Java Sum class as the previous slide, repeated.) OMG WTF BBQ
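Though not shown on the slide, a class like this would typically be compiled, packaged into a jar (the name sum.jar is assumed here), and launched through Hadoop's standard CLI, with the input and output paths passed through ToolRunner to run():

     hadoop jar sum.jar Sum <input path> <output path>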
  4. require 'wukong'

     class Mapper < Wukong::Streamer::LineStreamer
       # Called once per input line: emit a [key, value] pair
       def process(line)
         yield ['sum', line.to_f.round]
       end
     end

     class Reducer < Wukong::Streamer::ListReducer
       # Called once per key, after all of that key's values have been accumulated
       def finalize
         yield [key, values.map(&:to_i).sum]
       end
     end

     Wukong::Script.new(Mapper, Reducer).run
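For intuition, the two Wukong classes above boil down to the dataflow below. This is a plain-Ruby sketch of that flow, not Wukong or Hadoop itself; numbers.txt is the sample input file from the next slide:

     # Mapper phase: emit one ['sum', rounded_value] record per input line.
     mapped = File.readlines('numbers.txt').map { |line| ['sum', line.to_f.round] }

     # Shuffle/sort: Hadoop groups the emitted records by key between the two phases.
     grouped = mapped.group_by { |key, _| key }

     # Reducer phase: fold each key's values, like Reducer#finalize above.
     grouped.each do |key, records|
       puts [key, records.map { |_, v| v }.inject(0, :+)].join("\t")
     end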
  5. <script> --run=<local or hadoop> <input> <output>

     Test locally with numbers.txt:
     bin/round_and_sum --run=local numbers.txt output

     Run on a 100 node cluster with 100 TB of input:
     bin/round_and_sum --run=hadoop \
       hdfs://datanode/numbers-*.txt \
       hdfs://datanode/output \
       --jobtracker=jobtracker
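As a mental model, the local run approximates the classic streaming pipeline below, with sort standing in for Hadoop's shuffle (an approximation of the idea, not Wukong's exact command):

     cat numbers.txt | <mapper> | sort | <reducer> > output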
  6. ?