
Summingbird: Streaming MapReduce at Twitter

Summingbird is a platform for streaming MapReduce used at Twitter to build aggregations in real time on Storm or in batch on Hadoop. A job is written once and runs without change on either platform. Summingbird can also manage merging the online (real-time) results with offline batch results, so that small real-time errors do not accumulate. Put another way, Summingbird gives eventual consistency in a manner that is easy for the programmer to reason about.

Sam Ritchie

June 20, 2013

Transcript

  1. Streaming MapReduce at Twitter
    Sam Ritchie (@sritchie)

  2. Summingbird

  3. def wordCount[P <: Platform[P]](
         source: Producer[P, Tweet],
         store: P#Store[String, Long]) =
       source
         .flatMap { tweet => tweet.getText.split("\\s+").map(_ -> 1L) }
         .sumByKey(store)

  4. public class WordCount {
       public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
         private final static IntWritable one = new IntWritable(1);
         private Text word = new Text();
         public void map(LongWritable key, Text value, Context context)
             throws IOException, InterruptedException {
           String line = value.toString();
           StringTokenizer tokenizer = new StringTokenizer(line);
           while (tokenizer.hasMoreTokens()) {
             word.set(tokenizer.nextToken());
             context.write(word, one);
           }
         }
       }
       public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
         public void reduce(Text key, Iterable<IntWritable> values, Context context)
             throws IOException, InterruptedException {
           int sum = 0;
           for (IntWritable val : values) {
             sum += val.get();
           }
           context.write(key, new IntWritable(sum));
         }
       }
       public static void main(String[] args) throws Exception {
         Configuration conf = new Configuration();
         Job job = new Job(conf, "wordcount");
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);
         job.setMapperClass(Map.class);
         job.setReducerClass(Reduce.class);
         job.setInputFormatClass(TextInputFormat.class);
         job.setOutputFormatClass(TextOutputFormat.class);
         FileInputFormat.addInputPath(job, new Path(args[0]));
         FileOutputFormat.setOutputPath(job, new Path(args[1]));
         job.waitForCompletion(true);
       }
     }

  5. public class WordCountTopology {
       public static class SplitSentence extends ShellBolt implements IRichBolt {
         public SplitSentence() {
           super("python", "splitsentence.py");
         }
         @Override
         public void declareOutputFields(OutputFieldsDeclarer declarer) {
           declarer.declare(new Fields("word"));
         }
         @Override
         public Map<String, Object> getComponentConfiguration() {
           return null;
         }
       }
       public static class WordCount extends BaseBasicBolt {
         Map<String, Integer> counts = new HashMap<String, Integer>();
         @Override
         public void execute(Tuple tuple, BasicOutputCollector collector) {
           String word = tuple.getString(0);
           Integer count = counts.get(word);
           if (count == null) count = 0;
           count++;
           counts.put(word, count);
           collector.emit(new Values(word, count));
         }
         @Override
         public void declareOutputFields(OutputFieldsDeclarer declarer) {
           declarer.declare(new Fields("word", "count"));
         }
       }
       public static void main(String[] args) throws Exception {
         TopologyBuilder builder = new TopologyBuilder();
         builder.setSpout("spout", new RandomSentenceSpout(), 5);
         builder.setBolt("split", new SplitSentence(), 8)
             .shuffleGrouping("spout");
         builder.setBolt("count", new WordCount(), 12)
             .fieldsGrouping("split", new Fields("word"));
         Config conf = new Config();
         conf.setDebug(true);
         if (args != null && args.length > 0) {
           conf.setNumWorkers(3);
           StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
         } else {
           conf.setMaxTaskParallelism(3);
           LocalCluster cluster = new LocalCluster();
           cluster.submitTopology("word-count", conf, builder.createTopology());
           Thread.sleep(10000);
           cluster.shutdown();
         }
       }
     }

  6. def wordCount[P <: Platform[P]](
         source: Producer[P, Tweet],
         store: P#Store[String, Long]) =
       source
         .flatMap { tweet => tweet.getText.split("\\s+").map(_ -> 1L) }
         .sumByKey(store)

     // Running in Hadoop (via Scalding):
     Scalding.run {
       wordCount[Scalding](
         Scalding.source[Tweet]("source_data_location"),
         Scalding.store[String, Long]("word_count_location")
       )
     }

     // Running in Storm:
     Storm.run {
       wordCount[Storm](new TweetSpout(), new MemcacheStore[String, Long])
     }
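The P#Store projection above works because each platform is modeled as a type carrying its own concrete source and store members. A simplified sketch of the shape (the real Summingbird trait defines more than this):

     trait Platform[P <: Platform[P]] {
       type Source[T]    // platform-specific event source
       type Store[K, V]  // platform-specific aggregation target
     }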

  7. The “Map” of MapReduce:

     def flatMap[T, U](fn: T => List[U]): List[U]
     def map[T, U](fn: T => U): List[U]
     def filter[T](fn: T => Boolean): List[T]
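These combinators behave on plain Scala lists exactly as the signatures suggest; a quick standard-library demonstration, independent of Summingbird:

     val tweets = List("just setting up my twttr", "bears up in my room")

     tweets.flatMap(_.split("\\s+"))     // each input may yield many outputs:
     // List("just", "setting", "up", "my", "twttr", "bears", "up", "in", "my", "room")

     tweets.map(_.length)                // each input yields exactly one output:
     // List(24, 19)

     tweets.filter(_.contains("twttr"))  // keep only matching inputs:
     // List("just setting up my twttr")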

  8. def wordCount[P <: Platform[P]](
         source: Producer[P, Tweet],
         store: P#Store[String, Long]) =
       source
         .flatMap { tweet => tweet.getText.split("\\s+").map(_ -> 1L) }
         .sumByKey(store)

     Producer[P, Tweet]:
       “Just setting up my twttr”        “Bears up in my room”

     flatMap:
       (“just” -> 1), (“setting” -> 1), (“up” -> 1), (“my” -> 1), (“twttr” -> 1)
       (“Bears” -> 1), (“up” -> 1), (“in” -> 1), (“my” -> 1), (“room” -> 1)

     grouped by key:
       (“Bears” -> (1)), (“just” -> (1)), (“setting” -> (1)), (“up” -> (1, 1)),
       (“in” -> (1)), (“my” -> (1, 1)), (“room” -> (1)), (“twttr” -> (1))

     summed into P#Store[String, Long]:
       (“Bears” -> 1), (“just” -> 1), (“setting” -> 1), (“up” -> 2),
       (“in” -> 1), (“my” -> 2), (“room” -> 1), (“twttr” -> 1)

  9. [Diagram: the logical plan of a Summingbird job. A Producer[T] feeds
     parallel flatMap stages (T => List[(K, V)]); their output is grouped by
     key into Reduce stages ((K, List[V]) => (K, V)), which write to a
     Store[K, V] that the Client reads.]

 10. [Diagram: the same logical plan planned twice, once on Storm and once on
     Hadoop, each pipeline with its own Store and both read by the Client.]

 11. [image-only slide]

 12. def wordCount[P <: Platform[P]](
         source: Producer[P, Tweet],
         store: P#Store[(Long, String), Long]) =
       source
         .flatMap { tweet =>
           tweet.getText.split("\\s+")
             .map { word => ((tweet.getHour, word) -> 1L) }
         }
         .sumByKey(store)

     Key:   (Long, String), the (hour, word) pair
     Value: Long, the count
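For intuition, here is that keyed flatMap-and-sum simulated on one hypothetical tweet (the hour and text are made up) using plain collections:

     val hour = 5L
     val pairs = "ape ape escaped".split("\\s+").toList.map { word => ((hour, word), 1L) }
     // List(((5,"ape"),1), ((5,"ape"),1), ((5,"escaped"),1))

     val summed = pairs.groupBy(_._1).map { case (k, vs) => k -> vs.map(_._2).sum }
     // Map((5,"ape") -> 2, (5,"escaped") -> 1)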

 13. [Chart: “ape” count by hour, hours 1–16 (Hadoop)]

 14. [Chart: “ape” by hour (Hadoop), shown again]

 15. [Chart: “ape” by hour (Storm)]

 16. [Chart: “ape” by hour (Faulty Storm); the counts run lower than in the Hadoop chart]

 17. [Chart: “ape” by hour, showing the Hadoop and Storm series and the Merged result]
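Summingbird manages this merge itself; the following is only a minimal sketch of the idea, with the names and the hour-based boundary invented here: trust Hadoop for every hour a completed batch covers, and consult Storm only for hours past that boundary, so real-time errors stop mattering once the batch catches up.

     def mergeByHour(
         offline: Map[Long, Long],  // Hadoop: exact counts per hour
         online: Map[Long, Long],   // Storm: possibly lossy counts per hour
         batchBoundary: Long        // last hour covered by a completed batch
     ): Map[Long, Long] =
       offline.filter { case (hour, _) => hour <= batchBoundary } ++
         online.filter { case (hour, _) => hour > batchBoundary }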

 18. [Chart: “ape” by hour (Merged)]

 19. [Diagram: the Storm and Hadoop pipelines again.]

  20. The “Reduce” of MapReduce
    What can we aggregate?

 21. trait Monoid[V] {
       def zero: V
       def plus(l: V, r: V): V
     }
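The trait alone doesn't capture the contract: an implementation only counts as a monoid if it satisfies identity and associativity. Stated as a property check (a helper written for this transcript, not from the deck):

     def obeysMonoidLaws[V](m: Monoid[V], a: V, b: V, c: V): Boolean =
       m.plus(m.zero, a) == a &&                           // left identity
       m.plus(a, m.zero) == a &&                           // right identity
       m.plus(a, m.plus(b, c)) == m.plus(m.plus(a, b), c)  // associativity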

  22. a + b + c + d
    (a + b) + (c + d)
    (a + b + c) + d
    a + (b + c + d)
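Associativity is exactly the license a platform needs to parallelize: partial sums computed in any grouping combine to the same total. A quick check with plain Longs:

     val xs = (1L to 100L).toList
     val sequential = xs.reduce(_ + _)
     val chunked = xs.grouped(10).toList.map(_.reduce(_ + _)).reduce(_ + _)
     assert(sequential == chunked)  // 5050 either way, whatever the grouping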

 23. [image-only slide]

  24. /**
    * 3 + 3 ==> 6
    */
    class AddingMonoid[T <: Numeric] extends Monoid[T] {
    override val zero = 0
    override def plus(l: T, r: T) = l + r
    }

  25. /**
    * Set(1,2,3) + Set(2,3,4)
    * ==> Set(1,2,3,4)
    */
    class SetMonoid[T] extends Monoid[Set[T]] {
    override val zero = Set.empty[T]
    override def plus(l: Set[T], r: Set[T]) = l.concatenate(r)
    }

  26. /**
    * The value type’s monoid is used recursively:
    *
    * Map("a" -> 1, "b" -> 1) + Map("a" -> 2, "c" -> 1)
    * ==> Map("a" -> 3, "b" -> 1, "c" -> 1)
    */
    class MapMonoid[K, V](vMonoid: Monoid[V]) extends Monoid[Map[K, V]] {
    override val zero = Map.empty[K, V]
    override def plus(x: Map[K, V], y: Map[K, V]) =
    l.foldLeft(r) { (oldMap, (k, leftV)) =>
    oldMap + (r.get(k) match {
    case Some(rightV) => vMonoid.plus(leftV, rightV)
    case None => leftV
    })
    }
    }
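A usage sketch of the map monoid, with an inline Long-addition monoid so the snippet stands alone:

     val longAddition = new Monoid[Long] {
       override val zero = 0L
       override def plus(l: Long, r: Long) = l + r
     }

     val wordCounts = new MapMonoid[String, Long](longAddition)
     wordCounts.plus(Map("a" -> 1L, "b" -> 1L), Map("a" -> 2L, "c" -> 1L))
     // ==> Map("a" -> 3L, "b" -> 1L, "c" -> 1L)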

  27. /**
    * 3 “+” 3 ==> 9 // wtf?
    */
    class MultiplyingMonoid[T <: Numeric] extends Monoid[T] {
    override val zero = 1
    override def plus(l: T, r: T) = l * r
    }

 28. [Diagram: the Storm and Hadoop pipelines again; the monoid is what the Reduce stages run.]

 29. // Timeline service: key (UserID, Hour), value List[Tweet]
     // Impression counts per URL: key TweetID, value CountMinSketch[URL, ImpCount]

  30. https://github.com/twitter/algebird

 31. [Diagram: the Storm and Hadoop pipelines again.]

 32. trait Bijection[A, B] extends (A => B) {
       def apply(a: A): B
       def invert(b: B): A
     }

     trait Injection[A, B] extends (A => B) {
       def apply(a: A): B
       def invert(b: B): Option[A]
     }

 33. class LongToBytes extends Injection[Long, Array[Byte]] {
       val size = 8
       def apply(value: Long) = {
         val buf = ByteBuffer.allocate(size)
         buf.putLong(value)
         buf.array
       }
       override def invert(b: Array[Byte]) =
         try {
           Some(ByteBuffer.wrap(b).getLong)
         } catch {
           case _: Exception => None
         }
     }
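Round-tripping shows why invert returns an Option; malformed input simply yields None:

     val longToBytes = new LongToBytes
     longToBytes.invert(longToBytes(42L))   // Some(42)
     longToBytes.invert(Array[Byte](1, 2))  // None: fewer than 8 bytes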

  34. Injection.connect[CountMinSketch, ThriftCountMinSketch, Array[Byte]]

  35. val injection = Injection.connect[Long, String, Array[Byte], Base64String]
    scala> injection.apply(243L)
    res17: com.twitter.bijection.Base64String = Base64String(MjQz)
    scala> injection.invert(res17)
    res18: Option[Long] = Some(243)

 36. class BijectedMonoid[T, U](monoid: Monoid[T], bij: Bijection[T, U])
         extends Monoid[U] {
       override def zero: U = monoid.zero.as[U]
       override def plus(l: U, r: U): U = monoid.plus(l.as[T], r.as[T]).as[U]
     }

     class MonoidBijection[T, U](bij: Bijection[T, U])
         extends Bijection[Monoid[T], Monoid[U]] {
       override def apply(mon: Monoid[T]) = new BijectedMonoid[T, U](mon, bij)
       override def invert(mon: Monoid[U]) = new BijectedMonoid[U, T](mon, bij.inverse)
     }

  37. scala> (1 to 10).map(_.as[EnglishInt]).foreach(println(_))
    EnglishInt("one")
    EnglishInt("two")
    EnglishInt("three")
    EnglishInt("four")
    EnglishInt("five")
    EnglishInt("six")
    EnglishInt("seven")
    EnglishInt("eight")
    EnglishInt("nine")
    EnglishInt("ten")

  38. val englishMonoid = intMonoid.as[Monoid[EnglishInt]]
    scala> englishMonoid.plus(1.as[EnglishInt], 2.as[EnglishInt])
    res21: EnglishInt = EnglishInt("three")
    scala> englishMonoid.plus(4321.as[EnglishInt], 1234.as[EnglishInt])
    res22: EnglishInt = EnglishInt("five thousand five hundred fifty five")

 39. [image-only slide]

  40. https://github.com/twitter/bijection

 41. [Diagram: the Storm and Hadoop pipelines again.]

 42. [Diagram: the Storm and Hadoop pipelines, now with a single combined
     Store through which the Client reads both.]

 43. trait ReadableStore[-K, +V] extends Closeable {
       def get(k: K): Future[Option[V]] = multiGet(Set(k)).apply(k)
       def multiGet[K1 <: K](ks: Set[K1]): Map[K1, Future[Option[V]]] =
         ks.map { k => (k, get(k)) }.toMap
       override def close { }
     }
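With those mutually recursive defaults, a working store only has to override one method. A toy in-memory store, just to show the contract (Storehaus ships real implementations; Future here is com.twitter.util.Future):

     import com.twitter.util.Future

     class MapStore[K, V](m: Map[K, V]) extends ReadableStore[K, V] {
       override def get(k: K): Future[Option[V]] = Future.value(m.get(k))
     }

     val store = new MapStore(Map("bears" -> 1L))
     store.get("bears")                   // Future(Some(1))
     store.multiGet(Set("bears", "ape"))  // the default multiGet fans out to get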

 44. class ReadableStoreMonoid[K, V](monoid: Monoid[V])
         extends Monoid[ReadableStore[K, V]] {
       def zero = ReadableStore.const(Monoid.zero[V])
       def plus(l: ReadableStore[K, V], r: ReadableStore[K, V]) =
         new ReadableStore[K, V] {
           override def get(k: K) = Monoid.plus(l.get(k), r.get(k))
         }
     }

 45. // Store giving us access to Hadoop data
     val offlineStore: ReadableStore[(Int, String), Long]

     // Store containing Storm data
     val onlineStore: ReadableStore[(Int, String), Long]

     // Boom! Both, summed value-by-value via the store monoid:
     val combinedStore: ReadableStore[(Int, String), Long] =
       Monoid.plus(offlineStore, onlineStore)

     // ...and the same store with a read-through cache in front:
     val cachedStore: ReadableStore[(Int, String), Long] =
       combinedStore.withCache(Cache.empty)

 46. [Diagram: the combined-Store picture again.]

 47. [image-only slide]

  48. https://github.com/twitter/storehaus

 49. [image-only slide]

 50. What’s Next?
     • Open Source the Glue
     • New Execution Platforms
     • Smarter Systems

  51. https://github.com/twitter/algebird
    https://github.com/twitter/bijection
    https://github.com/twitter/storehaus
    https://github.com/twitter/summingbird (July!)