Map Reduce & Ruby

Presentation on Map Reduce and Wukong

Patrick Van Stee

December 14, 2011

Transcript

  1. Map Reduce & Ruby

  2. Patrick Van Stee
    @vanstee
    github.com/vanstee
    highgroove.com

  3. • Map Reduce In 5 Seconds
    • Hadoop as a Black Box
    • Wukong and Examples
    • Demo Time

  4. Problem: TBs of data
    Solution: cheap servers, simplified queries, distributed work

  5. [1.2, 3.4, 5.6].  # Input
      map(&:round).    # Map    => [1, 3, 6]
      reduce(:+)       # Reduce => 1 + 3 + 6
    # => 10            # Output

  6. Input → Map → Reduce → Output
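
    A rough sketch of the same flow in plain Ruby (illustrative only;
    Hadoop runs these phases in parallel across many machines): map each
    record to key/value pairs, group ("shuffle") by key, then reduce each
    group.

    # Toy single-machine simulation of the MapReduce data flow.
    input = [1.2, 3.4, 5.6]

    # Map: emit a [key, value] pair per record.
    mapped = input.map { |n| ['sum', n.round] }

    # Shuffle: group the pairs by key.
    grouped = mapped.group_by { |key, _| key }

    # Reduce: fold each group's values into one result per key.
    output = grouped.map do |key, pairs|
      [key, pairs.map { |_, v| v }.inject(0, :+)]
    end
    # => [["sum", 10]]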

  7. Hadoop
    Open Source, written in Java
    Fault Tolerant
    Distributed File System
    Map / Reduce

  8. Framework

  9. Framework
    Data + Script → Hadoop (Map Reduce) → Result

  10. [1.2, 3.4, 5.6].
    map(&:round).
    reduce(:+)
    # => 10

  11. import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    public class Sum extends Configured implements Tool {
      public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final Text KEY = new Text("sum");
        private final IntWritable number = new IntWritable();

        @Override
        protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
          // Round every whitespace-separated number and emit it under a
          // single key so one reducer can add them all up.
          StringTokenizer tokenizer = new StringTokenizer(value.toString());
          while (tokenizer.hasMoreTokens()) {
            number.set((int) Math.round(Double.parseDouble(tokenizer.nextToken())));
            context.write(KEY, number);
          }
        }
      }

      public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
          // Total the rounded values for the key.
          int sum = 0;
          for (IntWritable value : values) {
            sum += value.get();
          }
          context.write(key, new IntWritable(sum));
        }
      }

      @Override
      public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "sum");
        job.setJarByClass(getClass());
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
      }

      public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Sum(), args));
      }
    }

  12. (the same wall of Java as the previous slide)
    OMG
    WTF
    BBQ

  13. Wukong
    Ruby + Hadoop's Streaming API
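
    Under Hadoop Streaming, the mapper and reducer are just programs that
    read lines on stdin and write tab-separated key/value lines on stdout;
    Wukong wraps this protocol. A minimal sketch, with hypothetical file
    names mapper.rb and reducer.rb:

    # mapper.rb -- emit each rounded number under a 'sum' key.
    STDIN.each_line do |line|
      puts ['sum', line.to_f.round].join("\t")
    end

    # reducer.rb -- Hadoop sorts records by key before this runs, so equal
    # keys arrive on adjacent lines; with only one key, just total them.
    sum = 0
    STDIN.each_line do |line|
      _key, value = line.chomp.split("\t")
      sum += value.to_i
    end
    puts ['sum', sum].join("\t")

    The same pipeline can be simulated locally with plain pipes:
    cat numbers.txt | ruby mapper.rb | sort | ruby reducer.rb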

  14. require 'wukong'

    class Mapper < Wukong::Streamer::LineStreamer
      def process(line)
        # Emit each rounded number under a single 'sum' key.
        yield ['sum', line.to_f.round]
      end
    end

    class Reducer < Wukong::Streamer::ListReducer
      def finalize
        # values holds every ['sum', n] record for the key.
        yield [key, values.map(&:last).map(&:to_i).inject(0, :+)]
      end
    end

    Wukong::Script.new(Mapper, Reducer).run

  15. --run=<local or hadoop> <input> <output>

    Test locally with numbers.txt:
    bin/round_and_sum --run=local numbers.txt output

    Run on a 100 node cluster with 100 TB of input:
    bin/round_and_sum --run=hadoop \
      hdfs://datanode/numbers-*.txt \
      hdfs://datanode/output \
      --jobtracker=jobtracker

  16. DEMO

  17. Use the same Gems & Rubies on all workers
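
    One way to check this (a hypothetical diagnostic, not from the deck):
    run a map-only streaming script that reports each worker's environment,
    so version drift surfaces before a real job fails.

    # version_check.rb -- emit this worker's hostname and Ruby version;
    # mismatched lines in the job output mean mismatched workers.
    require 'socket'

    STDIN.read # consume this worker's input split
    puts [Socket.gethostname, RUBY_VERSION].join("\t")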

  18. Ruby is slow...

  19. ?

  20. Hack Night
