Slide 1


Streaming MapReduce at Twitter
Sam Ritchie (@sritchie)

Slide 2


Summingbird

Slide 3


def wordCount[P <: Platform[P]](
    source: Producer[P, Tweet],
    store: P#Store[String, Long]) =
  source
    .flatMap { tweet =>
      tweet.getText.split("\\s+").map(_ -> 1L)
    }
    .sumByKey(store)
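
A minimal sketch of the same logic over a plain in-memory collection (the Tweet case class and the absence of any Platform are simplifications for illustration, not the real Summingbird API):

case class Tweet(getText: String)

def wordCountInMemory(tweets: List[Tweet]): Map[String, Long] =
  tweets
    .flatMap { t => t.getText.split("\\s+").map(_ -> 1L) }
    .groupBy { case (word, _) => word }
    .map { case (word, pairs) => word -> pairs.map(_._2).sum }

// wordCountInMemory(List(Tweet("just setting up my twttr")))
// ==> Map("just" -> 1, "setting" -> 1, "up" -> 1, "my" -> 1, "twttr" -> 1)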

Slide 4


public class WordCount {

  public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "wordcount");
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
  }
}

Slide 5


public class WordCountTopology {

  public static class SplitSentence extends ShellBolt implements IRichBolt {
    public SplitSentence() {
      super("python", "splitsentence.py");
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
      declarer.declare(new Fields("word"));
    }

    @Override
    public Map<String, Object> getComponentConfiguration() {
      return null;
    }
  }

  public static class WordCount extends BaseBasicBolt {
    Map<String, Integer> counts = new HashMap<String, Integer>();

    @Override
    public void execute(Tuple tuple, BasicOutputCollector collector) {
      String word = tuple.getString(0);
      Integer count = counts.get(word);
      if (count == null) count = 0;
      count++;
      counts.put(word, count);
      collector.emit(new Values(word, count));
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
      declarer.declare(new Fields("word", "count"));
    }
  }

  public static void main(String[] args) throws Exception {
    TopologyBuilder builder = new TopologyBuilder();
    builder.setSpout("spout", new RandomSentenceSpout(), 5);
    builder.setBolt("split", new SplitSentence(), 8)
        .shuffleGrouping("spout");
    builder.setBolt("count", new WordCount(), 12)
        .fieldsGrouping("split", new Fields("word"));

    Config conf = new Config();
    conf.setDebug(true);

    if (args != null && args.length > 0) {
      conf.setNumWorkers(3);
      StormSubmitter.submitTopology(args[0], conf, builder.createTopology());
    } else {
      conf.setMaxTaskParallelism(3);
      LocalCluster cluster = new LocalCluster();
      cluster.submitTopology("word-count", conf, builder.createTopology());
      Thread.sleep(10000);
      cluster.shutdown();
    }
  }
}

Slide 6


def wordCount[P <: Platform[P]](
    source: Producer[P, Tweet],
    store: P#Store[String, Long]) =
  source
    .flatMap { tweet =>
      tweet.getText.split("\\s+").map(_ -> 1L)
    }
    .sumByKey(store)

// Running in Hadoop (via Scalding):
Scalding.run {
  wordCount[Scalding](
    Scalding.source[Tweet]("source_data_location"),
    Scalding.store[String, Long]("word_count_location")
  )
}

// Running in Storm:
Storm.run {
  wordCount[Storm](new TweetSpout(), new MemcacheStore[String, Long])
}

Slide 7


The “Map” of MapReduce

def flatMap[T, U](fn: T => List[U]): List[U]
def map[T, U](fn: T => U): List[U]
def filter[T](fn: T => Boolean): List[T]
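
flatMap is the primitive here: both map and filter can be expressed with it. A quick sketch over List, written as standalone functions:

def mapViaFlatMap[T, U](xs: List[T])(fn: T => U): List[U] =
  xs.flatMap { t => List(fn(t)) }          // emit exactly one element per input

def filterViaFlatMap[T](xs: List[T])(fn: T => Boolean): List[T] =
  xs.flatMap { t => if (fn(t)) List(t) else Nil }  // emit zero or one element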

Slide 8


def wordCount[P <: Platform[P]](
    source: Producer[P, Tweet],
    store: P#Store[String, Long]) =
  source
    .flatMap { tweet =>
      tweet.getText.split("\\s+").map(_ -> 1L)
    }
    .sumByKey(store)

Producer[P, Tweet]:
  “Just setting up my twttr”
  “Bears up in my room”

After flatMap:
  (“just” -> 1), (“setting” -> 1), (“up” -> 1), (“my” -> 1), (“twttr” -> 1)
  (“Bears” -> 1), (“up” -> 1), (“in” -> 1), (“my” -> 1), (“room” -> 1)

Grouped by key:
  (“Bears” -> (1)), (“just” -> (1)), (“setting” -> (1)), (“up” -> (1, 1)),
  (“in” -> (1)), (“my” -> (1, 1)), (“room” -> (1)), (“twttr” -> (1))

After sumByKey, in P#Store[String, Long]:
  (“Bears” -> 1), (“just” -> 1), (“setting” -> 1), (“up” -> 2),
  (“in” -> 1), (“my” -> 2), (“room” -> 1), (“twttr” -> 1)

Slide 9


[Diagram: a MapReduce pipeline. A Producer[T] feeds parallel flatMap stages (T => List[(K, V)]); their output is grouped by key and sent to Reduce stages ((K, List[V]) => (K, V)), which write to a Store[K, V] read by the Client.]
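
For word count, the (K, List[V]) => (K, V) reduce step from the diagram is just summing the grouped counts; a minimal sketch:

def reduceStep(word: String, counts: List[Long]): (String, Long) =
  (word, counts.sum)

// reduceStep("up", List(1L, 1L)) ==> ("up", 2L)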

Slide 10


[Diagram: the same Producer -> flatMap -> Reduce -> Store pipeline instantiated twice, once on Storm and once on Hadoop, with a single Client reading from both stores.]

Slide 11



Slide 12


def wordCount[P <: Platform[P]](
    source: Producer[P, Tweet],
    store: P#Store[(Long, String), Long]) =
  source
    .flatMap { tweet =>
      tweet.getText.split("\\s+")
        .map { word => ((tweet.getHour, word) -> 1L) }
    }
    .sumByKey(store)

Key: (Long, String)
Value: Long

Slide 13


[Chart: “ape” count by hour, hours 1-16 (Hadoop).]

Slide 14


[Chart: “ape” count by hour, hours 1-16 (Hadoop).]

Slide 15


[Chart: “ape” count by hour, hours 1-16 (Storm).]

Slide 16


[Chart: “ape” count by hour, hours 1-16 (Faulty Storm); the counts run lower than in the correct series.]

Slide 17


[Chart: “ape” count by hour, hours 1-16, shown three ways: Hadoop, Storm, and Merged.]

Slide 18


[Chart: “ape” count by hour, hours 1-16 (Merged).]

Slide 19


[Diagram: the same Producer -> flatMap -> Reduce -> Store pipeline instantiated twice, once on Storm and once on Hadoop, with a single Client reading from both stores.]

Slide 20


The “Reduce” of MapReduce
What can we aggregate?

Slide 21


trait Monoid[V] {
  def zero: V
  def plus(l: V, r: V): V
}
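
For reference, a hand-rolled instance and a spot-check of the laws that make a Monoid useful here (zero is an identity for plus, and plus is associative):

val longAddition = new Monoid[Long] {
  def zero = 0L
  def plus(l: Long, r: Long) = l + r
}

// identity: plus(zero, v) == v
assert(longAddition.plus(longAddition.zero, 42L) == 42L)
// associativity: (a + b) + c == a + (b + c)
assert(longAddition.plus(longAddition.plus(1L, 2L), 3L) ==
       longAddition.plus(1L, longAddition.plus(2L, 3L)))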

Slide 22


a + b + c + d
(a + b) + (c + d)
(a + b + c) + d
a + (b + c + d)
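
Associativity is what lets each platform partition the work: any chunking of the values can be summed independently and the partial sums combined. A sketch, assuming the Monoid trait above:

def sumChunked[V](vs: List[V], m: Monoid[V], chunkSize: Int): V =
  vs.grouped(chunkSize)                 // split into independent chunks
    .map(_.foldLeft(m.zero)(m.plus))    // each chunk can be summed anywhere
    .foldLeft(m.zero)(m.plus)           // combine the partial sums

// sumChunked(List(1L, 2L, 3L, 4L), longAddition, 2) ==> 10L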

Slide 23



Slide 24


/**
 * 3 + 3 ==> 6
 */
class AddingMonoid[T](implicit num: Numeric[T]) extends Monoid[T] {
  override val zero = num.zero
  override def plus(l: T, r: T) = num.plus(l, r)
}

Slide 25


/**
 * Set(1,2,3) + Set(2,3,4)
 * ==> Set(1,2,3,4)
 */
class SetMonoid[T] extends Monoid[Set[T]] {
  override val zero = Set.empty[T]
  override def plus(l: Set[T], r: Set[T]) = l ++ r
}

Slide 26


/**
 * The value type's monoid is used recursively:
 *
 * Map("a" -> 1, "b" -> 1) + Map("a" -> 2, "c" -> 1)
 * ==> Map("a" -> 3, "b" -> 1, "c" -> 1)
 */
class MapMonoid[K, V](vMonoid: Monoid[V]) extends Monoid[Map[K, V]] {
  override val zero = Map.empty[K, V]
  override def plus(l: Map[K, V], r: Map[K, V]) =
    l.foldLeft(r) { case (oldMap, (k, leftV)) =>
      val newV = oldMap.get(k) match {
        case Some(rightV) => vMonoid.plus(leftV, rightV)
        case None         => leftV
      }
      oldMap + (k -> newV)
    }
}
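
A quick check of the recursive behavior, pairing MapMonoid with the AddingMonoid from the earlier slide:

val mapMonoid = new MapMonoid[String, Int](new AddingMonoid[Int])
assert(
  mapMonoid.plus(Map("a" -> 1, "b" -> 1), Map("a" -> 2, "c" -> 1)) ==
    Map("a" -> 3, "b" -> 1, "c" -> 1))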

Slide 27


/**
 * 3 “+” 3 ==> 9 // wtf?
 */
class MultiplyingMonoid[T](implicit num: Numeric[T]) extends Monoid[T] {
  override val zero = num.one
  override def plus(l: T, r: T) = num.times(l, r)
}

Slide 28


[Diagram: the same Producer -> flatMap -> Reduce -> Store pipeline instantiated twice, once on Storm and once on Hadoop, with a single Client reading from both stores.]

Slide 29


// Timeline Service
// key: (UserID, Hour), value: List[Tweet]

// Impression counts per URL
// key: TweetID, value: CountMinSketch[URL, ImpCount]
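
Approximate structures fit the same mold: a count-min sketch is a 2-D array of counters, and two sketches built with the same dimensions and hash functions merge by element-wise addition. A toy sketch of that merge (hashing omitted; this is not Algebird's implementation):

case class CMS(counts: Vector[Vector[Long]])

def cmsMonoid(depth: Int, width: Int): Monoid[CMS] = new Monoid[CMS] {
  def zero = CMS(Vector.fill(depth, width)(0L))
  def plus(l: CMS, r: CMS) =
    CMS(l.counts.zip(r.counts).map { case (lRow, rRow) =>
      lRow.zip(rRow).map { case (a, b) => a + b }  // add counters cell by cell
    })
}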

Slide 30


https://github.com/twitter/algebird

Slide 31


[Diagram: the same Producer -> flatMap -> Reduce -> Store pipeline instantiated twice, once on Storm and once on Hadoop, with a single Client reading from both stores.]

Slide 32


trait Bijection[A, B] extends (A => B) {
  def apply(a: A): B
  def invert(b: B): A
}

trait Injection[A, B] extends (A => B) {
  def apply(a: A): B
  def invert(b: B): Option[A]
}
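
A hand-rolled instance to make the difference concrete: rendering an Int as a String always works, but parsing back can fail, so it is an Injection rather than a Bijection:

val intToString = new Injection[Int, String] {
  def apply(a: Int) = a.toString
  def invert(b: String) =
    try { Some(b.toInt) } catch { case _: NumberFormatException => None }
}

// intToString(42)            ==> "42"
// intToString.invert("42")   ==> Some(42)
// intToString.invert("ape")  ==> None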

Slide 33


class LongToBytes extends Injection[Long, Array[Byte]] {
  val size = 8

  def apply(value: Long) = {
    val buf = ByteBuffer.allocate(size)
    buf.putLong(value)
    buf.array
  }

  override def invert(b: Array[Byte]) =
    try {
      Some(ByteBuffer.wrap(b).getLong)
    } catch {
      case _: Exception => None
    }
}
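
A round-trip check for the injection above: a valid encoding inverts cleanly, while a buffer of the wrong size fails to parse and maps to None:

val longToBytes = new LongToBytes
assert(longToBytes.invert(longToBytes(42L)) == Some(42L))
assert(longToBytes.invert(Array[Byte](1, 2)) == None)  // underflow caught, invert fails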

Slide 34


Injection.connect[CountMinSketch, ThriftCountMinSketch, Array[Byte]]

Slide 35


val injection =
  Injection.connect[Long, String, Array[Byte], Base64String]

scala> injection.apply(243L)
res17: com.twitter.bijection.Base64String = Base64String(MjQz)

scala> injection.invert(res17)
res18: Option[Long] = Some(243)

Slide 36


class BijectedMonoid[T, U](monoid: Monoid[T], bij: Bijection[T, U]) extends Monoid[U] {
  override def zero: U = monoid.zero.as[U]
  override def plus(l: U, r: U): U = monoid.plus(l.as[T], r.as[T]).as[U]
}

class MonoidBijection[T, U](monoid: Monoid[T], bij: Bijection[T, U])
    extends Bijection[Monoid[T], Monoid[U]] {
  override def apply(mon: Monoid[T]) = new BijectedMonoid[T, U](mon, bij)
  override def invert(mon: Monoid[U]) = new BijectedMonoid[U, T](mon, bij.inverse)
}

Slide 37


scala> (1 to 10).map(_.as[EnglishInt]).foreach(println(_))
EnglishInt("one")
EnglishInt("two")
EnglishInt("three")
EnglishInt("four")
EnglishInt("five")
EnglishInt("six")
EnglishInt("seven")
EnglishInt("eight")
EnglishInt("nine")
EnglishInt("ten")

Slide 38


val englishMonoid = intMonoid.as[Monoid[EnglishInt]]

scala> englishMonoid.plus(1.as[EnglishInt], 2.as[EnglishInt])
res21: EnglishInt = EnglishInt("three")

scala> englishMonoid.plus(4321.as[EnglishInt], 1234.as[EnglishInt])
res22: EnglishInt = EnglishInt("five thousand five hundred fifty five")

Slide 39



Slide 40


https://github.com/twitter/bijection

Slide 41


[Diagram: the same Producer -> flatMap -> Reduce -> Store pipeline instantiated twice, once on Storm and once on Hadoop, with a single Client reading from both stores.]

Slide 42


[Diagram: the Storm and Hadoop pipelines as before, but the Client now reads through a single merged Store combining both platforms' outputs.]

Slide 43


trait ReadableStore[-K, +V] extends Closeable { self =>
  def get(k: K): Future[Option[V]] = multiGet(Set(k)).apply(k)

  def multiGet[K1 <: K](ks: Set[K1]): Map[K1, Future[Option[V]]] =
    ks.map { k => (k, self.get(k)) }.toMap

  override def close { }
}
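
A minimal in-memory implementation sketch, assuming Twitter's com.twitter.util.Future (only get needs overriding; multiGet falls back to the trait's default):

import com.twitter.util.Future

class MapReadableStore[K, V](m: Map[K, V]) extends ReadableStore[K, V] {
  override def get(k: K): Future[Option[V]] = Future.value(m.get(k))
}

// new MapReadableStore(Map("ape" -> 7L)).get("ape") ==> Future(Some(7L))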

Slide 44


class ReadableStoreMonoid[K, V](monoid: Monoid[V]) extends Monoid[ReadableStore[K, V]] {
  def zero = ReadableStore.const(Monoid.zero[V])
  def plus(l: ReadableStore[K, V], r: ReadableStore[K, V]) =
    new ReadableStore[K, V] {
      override def get(k: K) = Monoid.plus(l.get(k), r.get(k))
    }
}
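
For the plus above to typecheck, the Monoid[V] has to be lifted to a Monoid[Future[Option[V]]]; Algebird provides such liftings, but here is a hand-rolled sketch using com.twitter.util.Future:

import com.twitter.util.Future

def liftToFutureOption[V](m: Monoid[V]): Monoid[Future[Option[V]]] =
  new Monoid[Future[Option[V]]] {
    def zero = Future.value(None)
    def plus(l: Future[Option[V]], r: Future[Option[V]]) =
      l.join(r).map {
        case (Some(a), Some(b)) => Some(m.plus(a, b))  // both stores hit: sum them
        case (Some(a), None)    => Some(a)             // only one side has the key
        case (None, b)          => b
      }
  }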

Slide 45


// Store giving us access to hadoop data
val offlineStore: ReadableStore[(Int, String), Long]

// Store containing storm data
val onlineStore: ReadableStore[(Int, String), Long]

// Boom! Both!
val combinedStore: ReadableStore[(Int, String), Long] =
  Monoid.plus(offlineStore, onlineStore)

// Wuh?
val cachedStore: ReadableStore[(Int, String), Long] =
  combinedStore.withCache(Cache.empty)

Slide 46


[Diagram: the Storm and Hadoop pipelines as before, but the Client now reads through a single merged Store combining both platforms' outputs.]

Slide 47



Slide 48


https://github.com/twitter/storehaus

Slide 49



Slide 50


What’s Next?

• Open Source the Glue
• New Execution Platforms
• Smarter Systems

Slide 51


https://github.com/twitter/algebird
https://github.com/twitter/bijection
https://github.com/twitter/storehaus
https://github.com/twitter/summingbird (July!)