Summingbird: Streaming MapReduce at Twitter

Summingbird is a platform for streaming MapReduce used at Twitter to build aggregations in real time or on Hadoop. When the programmer describes her job, that job can be run without change on Storm or Hadoop. Additionally, Summingbird can manage merging real-time (online) computations with offline batches so that small errors in the real-time layer do not accumulate. Put another way, Summingbird gives eventual consistency in a manner that is easy for the programmer to reason about.

Sam Ritchie

June 20, 2013

  1. def wordCount[P <: Platform[P]]( source: Producer[P, Tweet], store: P#Store[String, Long])

    source .flatMap { tweet => tweet.getText.split("\\s+").map(_ -> 1L) } .sumByKey(store)
  2. public class WordCount { public static class Map extends Mapper<LongWritable,

    public class WordCount { public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); StringTokenizer tokenizer = new StringTokenizer(line); while (tokenizer.hasMoreTokens()) { word.set(tokenizer.nextToken()); context.write(word, one); } } } public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> { public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } context.write(key, new IntWritable(sum)); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = new Job(conf, "wordcount"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); } }
  3. public class WordCountTopology { public static class SplitSentence extends ShellBolt

    public class WordCountTopology { public static class SplitSentence extends ShellBolt implements IRichBolt { public SplitSentence() { super("python", "splitsentence.py"); } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("word")); } @Override public Map<String, Object> getComponentConfiguration() { return null; } } public static class WordCount extends BaseBasicBolt { Map<String, Integer> counts = new HashMap<String, Integer>(); @Override public void execute(Tuple tuple, BasicOutputCollector collector) { String word = tuple.getString(0); Integer count = counts.get(word); if(count==null) count = 0; count++; counts.put(word, count); collector.emit(new Values(word, count)); } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("word", "count")); } } public static void main(String[] args) throws Exception { TopologyBuilder builder = new TopologyBuilder(); builder.setSpout("spout", new RandomSentenceSpout(), 5); builder.setBolt("split", new SplitSentence(), 8) .shuffleGrouping("spout"); builder.setBolt("count", new WordCount(), 12) .fieldsGrouping("split", new Fields("word")); Config conf = new Config(); conf.setDebug(true); if(args!=null && args.length > 0) { conf.setNumWorkers(3); StormSubmitter.submitTopology(args[0], conf, builder.createTopology()); } else { conf.setMaxTaskParallelism(3); LocalCluster cluster = new LocalCluster(); cluster.submitTopology("word-count", conf, builder.createTopology()); Thread.sleep(10000); cluster.shutdown(); } } }
  4. def wordCount[P <: Platform[P]]( source: Producer[P, Tweet], store: P#Store[String, Long])

    source .flatMap { tweet => tweet.getText.split("\\s+").map(_ -> 1L) } .sumByKey(store) // Running in Hadoop (via Scalding): Scalding.run { wordCount[Scalding]( Scalding.source[Tweet]("source_data_location"), Scalding.store[String, Long]("word_count_location") ) } // Running in Storm: Storm.run { wordCount[Storm](new TweetSpout(), new MemcacheStore[String, Long]) }
  5. def flatMap[T, U](fn: T => List[U]): List[U] def map[T, U](fn:

    def flatMap[T, U](fn: T => List[U]): List[U] def map[T, U](fn: T => U): List[U] def filter[T](fn: T => Boolean): List[T] The "Map" of MapReduce
  6. def wordCount[P <: Platform[P]]( source: Producer[P, Tweet], store: P#Store[String, Long])

    source .flatMap { tweet => tweet.getText.split("\\s+").map(_ -> 1L) } .sumByKey(store) "Bears up in my room" ("just" -> 1) ("setting" -> 1) ("up" -> 1) ("my" -> 1) ("twttr" -> 1) "Just setting up my twttr" ("Bears" -> 1) ("up" -> 1) ("in" -> 1) ("my" -> 1) ("room" -> 1) ("Bears" -> (1)) ("just" -> (1)) ("setting" -> (1)) ("up" -> (1, 1)) ("in" -> (1)) ("my" -> (1, 1)) ("room" -> (1)) ("twttr" -> (1)) P#Store[String, Long] Producer[P, Tweet] ("Bears" -> 1) ("just" -> 1) ("setting" -> 1) ("up" -> 2) ("in" -> 1) ("my" -> 2) ("room" -> 1) ("twttr" -> 1)
  7. Producer Store Producer[T] T => List[(K, V)] Store[K, V] Client

    Producer Store Producer[T] T => List[(K, V)] Store[K, V] Client flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce (K, List[V]) => (K, V)
  8. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop
  9. def wordCount[P <: Platform[P]]( source: Producer[P, Tweet], store: P#Store[(Long, String),

    source .flatMap { tweet => tweet.getText.split("\\s+") .map { word => ((tweet.getHour, word) -> 1L) } }.sumByKey(store) Key: (Long, String) Value: Long
  10. 0 2 4 5 7 1 2 3 4 5

    "ape" by Hour (Hadoop)
  11. 0 2 4 5 7 1 2 3 4 5

    "ape" by Hour (Hadoop)
  12. 0 2 4 5 7 1 2 3 4 5

    "ape" by Hour (Storm)
  13. 0 1 3 4 5 1 2 3 4 5

    "ape" by Hour (Faulty Storm)
  14. 0 2 4 5 7 1 2 3 4 5

    "ape" by Hour (Hadoop) "ape" by Hour (Storm) "ape" by Hour (Merged)
  15. 0 2 4 5 7 1 2 3 4 5

    "ape" by Hour (Merged)
  16. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop
  17. a + b + c + d (a + b)

    a + b + c + d (a + b) + (c + d) (a + b + c) + d a + (b + c + d)
  18. /** * 3 + 3 ==> 6 */ class AddingMonoid[T

    /** * 3 + 3 ==> 6 */ class AddingMonoid[T <: Numeric] extends Monoid[T] { override val zero = 0 override def plus(l: T, r: T) = l + r }
  19. /** * Set(1,2,3) + Set(2,3,4) * ==> Set(1,2,3,4) */ class

    /** * Set(1,2,3) + Set(2,3,4) * ==> Set(1,2,3,4) */ class SetMonoid[T] extends Monoid[Set[T]] { override val zero = Set.empty[T] override def plus(l: Set[T], r: Set[T]) = l.concatenate(r) }
  20. /** * The value type’s monoid is used recursively: *

    /** * The value type's monoid is used recursively: * * Map("a" -> 1, "b" -> 1) + Map("a" -> 2, "c" -> 1) * ==> Map("a" -> 3, "b" -> 1, "c" -> 1) */ class MapMonoid[K, V](vMonoid: Monoid[V]) extends Monoid[Map[K, V]] { override val zero = Map.empty[K, V] override def plus(l: Map[K, V], r: Map[K, V]) = l.foldLeft(r) { case (oldMap, (k, leftV)) => oldMap + (k -> (r.get(k) match { case Some(rightV) => vMonoid.plus(leftV, rightV) case None => leftV })) } }
  21. /** * 3 “+” 3 ==> 9 // wtf? */

    /** * 3 "+" 3 ==> 9 // wtf? */ class MultiplyingMonoid[T <: Numeric] extends Monoid[T] { override val zero = 1 override def plus(l: T, r: T) = l * r }
  22. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop
  23. // Timeline Service (UserID, Hour), List[Tweet] // Impression Counts per

    URL [TweetID, CountMinSketch[URL, ImpCount]] Friday, June 21, 13
  24. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop
  25. trait Bijection[A, B] extends (A => B) { def apply(a:

    trait Bijection[A, B] extends (A => B) { def apply(a: A): B def invert(b: B): A } trait Injection[A, B] extends (A => B) { def apply(a: A): B def invert(b: B): Option[A] }
  26. class LongToBytes extends Injection[Long, Array[Byte]] { val size = 8

    class LongToBytes extends Injection[Long, Array[Byte]] { val size = 8 def apply(value: Long) = { val buf = ByteBuffer.allocate(size) buf.putLong(value) buf.array } override def invert(b: Array[Byte]) = try { Some(ByteBuffer.wrap(b).getLong) } catch { case _ => None } }
  27. val injection = Injection.connect[Long, String, Array[Byte], Base64String] scala> injection.apply(243L) res17:

    com.twitter.bijection.Base64String = Base64String(MjQz) scala> injection.invert(res17) res18: Option[Long] = Some(243) Friday, June 21, 13
  28. class BijectedMonoid[T, U](monoid: Monoid[T], bij: Bijection[T, U]) extends Monoid[U] {

    class BijectedMonoid[T, U](monoid: Monoid[T], bij: Bijection[T, U]) extends Monoid[U] { override def zero: U = monoid.zero.as[U] override def plus(l: U, r: U): U = monoid.plus(l.as[T], r.as[T]).as[U] } class MonoidBijection[T, U](monoid: Monoid[T], bij: Bijection[T, U]) extends Bijection[Monoid[T], Monoid[U]] { override def apply(mon: Monoid[T]) = new BijectedMonoid[T, U](mon, bij) override def invert(mon: Monoid[U]) = new BijectedMonoid[U, T](mon, bij.inverse) }
  29. val englishMonoid = intMonoid.as[Monoid[EnglishInt]] scala> englishMonoid.plus(1.as[EnglishInt], 2.as[EnglishInt]) res21: EnglishInt =

    EnglishInt("three") scala> englishMonoid.plus(4321.as[EnglishInt], 1234.as[EnglishInt]) res22: EnglishInt = EnglishInt("five thousand five hundred fifty five") Friday, June 21, 13
  30. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop
  31. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop Store
  32. trait ReadableStore[-K, +V] extends Closeable { def get(k: K): Future[Option[V]]

    trait ReadableStore[-K, +V] extends Closeable { def get(k: K): Future[Option[V]] = multiGet(Set(k)).apply(k) def multiGet[K1 <: K](ks: Set[K1]): Map[K1, Future[Option[V]]] = ks.map { k => (k, self.get(k)) }.toMap override def close { } }
  33. class ReadableStoreMonoid[K, V](monoid: Monoid[V]) extends Monoid[ReadableStore[K, V]] { def zero

    class ReadableStoreMonoid[K, V](monoid: Monoid[V]) extends Monoid[ReadableStore[K, V]] { def zero = ReadableStore.const(Monoid.zero[V]) def plus(l: ReadableStore[K, V], r: ReadableStore[K,V]) = new ReadableStore[K, V] { override def get(k: K) = Monoid.plus(l.get(k), r.get(k)) } }
  34. // Store giving us access to hadoop data val offlineStore:

    // Store giving us access to hadoop data val offlineStore: ReadableStore[(Int, String), Long] // Store containing storm data val onlineStore: ReadableStore[(Int, String), Long] // Boom! Both! val combinedStore: ReadableStore[(Int, String), Long] = Monoid.plus(offlineStore, onlineStore) // Wuh? val cachedStore: ReadableStore[(Int, String), Long] = combinedStore.withCache(Cache.empty)
  35. Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce

    Client Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Producer Store flatMap flatMap flatMap flatMap Producer Producer Reduce Reduce Storm Hadoop Store