Stream processing made easy with riko

8th Light University talk on stream processing with riko

Reuben Cummings

May 30, 2017
Transcript

  1. Stream processing made easy with riko
     8th Light University - Chicago, IL
     May 30, 2017
     by Reuben Cummings
     @reubano #8LU
  2. @reubano #8LU
     Who am I?
     Managing Director, Nerevu Development
     Programming in Python since 2011
     Author of several popular packages
  3. >>> 'abracadabra'[0]
     'a'
     >>> 'hello 8th Light University'.split(' ')[0]
     'hello'
     >>> range(1, 11)[0]
     1
     >>> [{'x': x} for x in range(4)][0]
     {'x': 0}
     >>> ({'x': x} for x in range(4))
     <generator object <genexpr> at 0x103c10830>
     >>> next({'x': x} for x in range(4))
     {'x': 0}
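     Aside (standard Python behavior, not on the slide): a generator is
     single-pass, which is why the genexpr above is recreated before calling
     next. A minimal sketch:

     >>> gen = ({'x': x} for x in range(4))
     >>> next(gen)
     {'x': 0}
     >>> list(gen)   # consuming the rest exhausts it
     [{'x': 1}, {'x': 2}, {'x': 3}]
     >>> list(gen)   # a second pass yields nothing
     []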
  4. >>> [ord(x) for x in 'abracadabra']
     [97, 98, 114, 97, 99, 97, 100, 97, 98, 114, 97]
     >>> [2 * x for x in range(1, 11)]
     [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
     >>> [x for x in range(1, 11) if x > 5]
     [6, 7, 8, 9, 10]
     >>> stream = ({'num': x} for x in range(4))
     >>> sum(s['num'] for s in stream)
     6
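     Because generators chain lazily, the same idea extends to a whole
     pipeline. A pure-Python word count over file.txt (my sketch, not from
     the deck) that foreshadows the framework versions below:

     >>> from collections import Counter
     >>> lines = open('file.txt')
     >>> words = (w.lower() for line in lines for w in line.split(' '))
     >>> counts = Counter(words)  # consumes the stream in constant memory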
  5. $ cat file.txt
     What is Lorem Ipsum? Lorem Ipsum is simply dummy text of the printing
     and typesetting industry. Lorem Ipsum has been the industry's standard
     dummy text ever since the 1500s, when an unknown printer took a galley
     of type and scrambled it to make a type specimen book. It has survived
     not only five centuries, but also the leap...
  6. >>> from mrjob.job import MRJob
     >>> from mrjob.step import MRStep
     >>>
     >>> class MRWordCount(MRJob):
     ...     def steps(self):
     ...         kwargs = {
     ...             'mapper': self.mapper,
     ...             'combiner': self.combiner,
     ...             'reducer': self.reducer}
     ...
     ...         return [MRStep(**kwargs)]
     ...
     ...     def mapper(self, _, line):
  7. >>> class MRWordCount(MRJob):
     ...     def steps(self):
     ...         kwargs = {
     ...             'mapper': self.mapper,
     ...             'combiner': self.combiner,
     ...             'reducer': self.reducer}
     ...
     ...         return [MRStep(**kwargs)]
     ...
     ...     def mapper(self, _, line):
     ...         for word in line.split(' '):
     ...             yield word.lower(), 1
     ...
     ...     def combiner(self, word, counts):
  8. ...         return [MRStep(**kwargs)]
     ...
     ...     def mapper(self, _, line):
     ...         for word in line.split(' '):
     ...             yield word.lower(), 1
     ...
     ...     def combiner(self, word, counts):
     ...         yield word, sum(counts)
     ...
     ...     def reducer(self, word, counts):
     ...         yield word, sum(counts)
     >>>
     >>> if __name__ == '__main__':
     ...     MRWordCount.run()
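     Stitched together, the three slides above form one runnable script
     (saved as hadoop_job.py, per the next slide); the comments are mine:

     # hadoop_job.py - MapReduce word count with mrjob
     from mrjob.job import MRJob
     from mrjob.step import MRStep

     class MRWordCount(MRJob):
         def steps(self):
             kwargs = {
                 'mapper': self.mapper,
                 'combiner': self.combiner,
                 'reducer': self.reducer}

             return [MRStep(**kwargs)]

         def mapper(self, _, line):
             # emit (word, 1) for every whitespace-separated token
             for word in line.split(' '):
                 yield word.lower(), 1

         def combiner(self, word, counts):
             # pre-aggregate counts locally before the shuffle
             yield word, sum(counts)

         def reducer(self, word, counts):
             # final aggregation across all mappers
             yield word, sum(counts)

     if __name__ == '__main__':
         MRWordCount.run()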
  9. $ python hadoop_job.py file.txt
     '1500s'      1
     'a'          2
     'an'         1
     'and'        1
     'been'       1
     'book'       1
     'but'        1
     'centuries'  1
     'dummy'      1
     'ever'       1
     'five'       1
     'galley'     1
  10. >>> from operator import add
      >>> from pyspark.sql import SparkSession
      >>>
      >>> spark = SparkSession.builder.getOrCreate()
      >>> fpath = 'hdfs:///file.txt'
      >>>
      >>> stream = (spark.read.text(fpath).rdd
      ...     .flatMap(lambda line: line.value.split(' '))
      ...     .map(lambda word: (word.lower(), 1))
      ...     .reduceByKey(add)
      ...     .collect())
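      To try this without a Hadoop cluster, the same pipeline runs against a
      local file and a local master (assuming only that pyspark is installed;
      the app name is arbitrary):

      >>> spark = (SparkSession.builder
      ...     .master('local[*]')
      ...     .appName('wordcount')
      ...     .getOrCreate())
      >>> stream = (spark.read.text('file.txt').rdd
      ...     .flatMap(lambda line: line.value.split(' '))
      ...     .map(lambda word: (word.lower(), 1))
      ...     .reduceByKey(add)
      ...     .collect())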
  11. >>> from riko.modules.fetch import pipe
      >>>
      >>> url = 'https://8thlight.com/blog/feed'
      >>> url += '/rss.xml'
      >>> stream = pipe(conf={'url': url})
      >>> item = next(stream)
      >>> item['author']
      'Rabea Gleissner'
      >>> item['published']
      'Fri, 26 May 2017 00:00:00 -0500'
      >>> item['title']
      'How to set up a React project without flipping tables'
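      pipe returns a lazy iterator over feed items, so the rest of the feed
      can be pulled on demand with the standard library (my sketch, not from
      the deck):

      >>> from itertools import islice
      >>> titles = [i['title'] for i in islice(stream, 2)]  # next two titles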
  12. >>> from riko.collections import SyncPipe
      >>>
      >>> frule = {
      ...     'field': 'title',
      ...     'op': 'contains',
      ...     'value': 'erlang'}
      >>>
      >>> stream = (
      ...     SyncPipe('fetch', conf={'url': url})
      ...         .filter(conf={'rule': frule})
      ...         .output)
  13. >>> item = next(stream)
      >>> item['title']
      'The Core of Erlang'
      >>> item['tags'][0]['term']
      'Coding'
      >>> item['link']
      'https://8thlight.com/blog/kofi-gumbs/2017/05/02/core-erlang.html'
  14. >>> conf = {'url': 'file:///file.txt'}
      >>> tconf = {'delimiter': ' '}
      >>> rule = {'transform': 'lower'}
      >>> cconf = {'count_key': 'strtransform'}
      >>>
      >>> stream = (SyncPipe('fetchtext', conf=conf)
      ...     .tokenizer(conf=tconf, emit=True)
      ...     .strtransform(conf={'rule': rule})
      ...     .count(conf=cconf)
      ...     .output)
      >>>
      >>> next(stream)
      {'1500s': 1}
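      Each emitted item is a single {token: count} dict, so the whole tally
      folds into one mapping; a small sketch:

      >>> counts = {}
      >>> for item in stream:
      ...     counts.update(item)  # e.g. {'1500s': 1, 'a': 2, ...}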
  15. >>> from riko.modules import xpathfetchpage
      >>>
      >>> pipe = xpathfetchpage.pipe
      >>> xpath = '/html/body/section/div/div[1]'
      >>> xpath += '/div/div/article/div[3]/div'
      >>> xpath += '/ul[1]/li/a'
      >>>
      >>> xconf = {
      ...     'url': item['link'], 'xpath': xpath}
      >>>
      >>> stream = pipe(conf=xconf)
  16. >>> next(stream)
      {'content': "Two Design Patterns You're Probably Already Using",
       'href': '/blog/becca-nelson/2017/05/22/two-design-patterns-youre-probably-already-using.html'}
  17. >>> kwargs = {'conf': xconf}
      >>> parts = [
      ...     {'value': 'http://8thlight.com'},
      ...     {'subkey': 'href'}]
      >>>
      >>> fconf = {
      ...     'url': {'subkey': 'strconcat'},
      ...     'start': '<p>', 'end': '</p>'}
      >>>
      >>> stream = (
      ...     SyncPipe('xpathfetchpage', **kwargs)
      ...         .strconcat(conf={'part': parts})
      ...         .fetchpage(conf=fconf)
      ...         .output)
  18. (repeat of slide 17)
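      In this chain, strconcat glues the static base URL onto each item's
      href, and fetchpage (via the subkey lookup) then grabs whatever sits
      between the start and end markers at that address. Roughly, per item,
      it is the equivalent of this hypothetical plain-Python helper:

      >>> from urllib.request import urlopen
      >>> def first_paragraph(item):
      ...     link = 'http://8thlight.com' + item['href']  # strconcat
      ...     html = urlopen(link).read().decode('utf-8')  # fetchpage
      ...     start = html.find('<p>') + len('<p>')        # 'start' marker
      ...     return html[start:html.find('</p>', start)]  # 'end' marker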
  19. >>> next(stream)['content'].decode('utf-8')
      'I came into this field from a very non-technical background. And when
      I say <em>very non-technical</em>, I mean that I was an elementary
      fine arts teacher. The most important calculation I performed on a
      daily basis was counting my kindergarteners when they lined up to
      leave to make sure I hadn’t lost any since the beginning of the class
      period.'
  20. >>> from time import monotonic
      >>>
      >>> start = monotonic()
      >>> count = len(list(stream))
      >>> stop = monotonic() - start
      >>> count, stop
      (9, 0.4573155799989763)
  21. >>> kwargs = {'conf': xconf, 'parallel': True}
      >>> start = monotonic()
      >>>
      >>> stream = (
      ...     SyncPipe('xpathfetchpage', **kwargs)
      ...         .strconcat(conf={'part': parts})
      ...         .fetchpage(conf=fconf)
      ...         .output)
      >>>
      >>> count = len(list(stream))
      >>> stop = monotonic() - start
      >>> count, stop
      (10, 0.2804829629985761)
  22. >>> from riko.bado import coroutine, react
      >>> from riko.collections import AsyncPipe
      >>>
      >>> @coroutine
      ... def run(reactor):
      ...     start = monotonic()
      ...     flow = AsyncPipe(
      ...         'xpathfetchpage', **kwargs)
      ...
      ...     stream = yield (flow
      ...         .strconcat(conf={'part': parts})
      ...         .fetchpage(conf=fconf)
      ...         .output)
      ...
  23. >>> @coroutine
      ... def run(reactor):
      ...     start = monotonic()
      ...     flow = AsyncPipe(
      ...         'xpathfetchpage', **kwargs)
      ...
      ...     stream = yield (flow
      ...         .strconcat(conf={'part': parts})
      ...         .fetchpage(conf=fconf)
      ...         .output)
      ...
      ...     count = len(list(stream))
      ...     stop = monotonic() - start
      ...     print((count, stop))
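      The slide stops before the coroutine is launched; riko's docs drive
      such coroutines with react (imported on slide 22), which starts the
      Twisted reactor, runs the flow to completion, and shuts down:

      >>> react(run)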
  24. @reubano #8LU
      Head to Head

                          Spark, etc.   Huginn     riko
      installation        complex       moderate   simple
      push/pull           push          push       pull
      native ingestors    few           many       many
      parallel            ✔             ✔          ✔
      async                                        ✔
      json serializable                 ✔          ✔
      distributed         ✔