
Stream processing made easy with riko


Talk on stream processing given at PyConZA16

Reuben Cummings

October 06, 2016

Transcript

  1. Stream processing made easy with riko
     PyConZA - Cape Town, SA
     Oct 6, 2016
     by Reuben Cummings
     @reubano #PyConZA16
  2. Who am I?
     Managing Director, Nerevu Development
     Programming in Python since 2011
     Author of several popular packages
     Flask over Django, Twisted over Tornado, functions over classes
     @reubano #PyConZA16
  3. >>> 'abracadabra'[0]
     'a'
     >>> range(1, 11)[0]
     1
     >>> 'hello pycon attendees'.split(' ')[0]
     'hello'
     >>> [{'x': x} for x in range(4)][0]
     {'x': 0}
     >>> ({'num': x} for x in range(4))
     <generator object <genexpr> at 0x103c10830>
     >>> next({'num': x} for x in range(4))
     {'num': 0}
  4. >>> [ord(x) for x in 'abracadabra']
     [97, 98, 114, 97, 99, 97, 100, 97, 98, 114, 97]
     >>> [2 * x for x in range(1, 11)]
     [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
     >>> [x for x in range(1, 11) if x > 5]
     [6, 7, 8, 9, 10]
     >>> stream = ({'num': x} for x in range(4))
     >>> sum(s['num'] for s in stream)
     6
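
     Note that the generator is consumed as it is summed; a second pass
     yields nothing. A quick illustration in plain Python:

     >>> stream = ({'num': x} for x in range(4))
     >>> sum(s['num'] for s in stream)
     6
     >>> sum(s['num'] for s in stream)  # the generator is now exhausted
     0
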
  5. What is Lorem Ipsum? Lorem Ipsum is simply dummy text of the printing
     and typesetting industry. Lorem Ipsum has been the industry's standard
     dummy text ever since the 1500s, when an unknown printer took a galley
     of type and scrambled it to make a type specimen book. It has survived
     not only five centuries, but also the leap...
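
     The word-count examples that follow all read the same input file. A
     minimal sketch to create it, assuming file.txt simply holds the Lorem
     Ipsum passage above:

     >>> text = ('Lorem Ipsum is simply dummy text of the printing and '
     ...         'typesetting industry.')  # abbreviated sample text
     >>> with open('file.txt', 'w') as f:
     ...     chars_written = f.write(text)
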
  6. >>> import re
     >>> from mrjob.job import MRJob
     >>> from mrjob.step import MRStep
     >>>
     >>> WORD_RE = re.compile(r"[\w']+")
     >>>
     >>> class MRWordCount(MRJob):
     ...     def steps(self):
     ...         kwargs = {
     ...             'mapper': self.mapper,
     ...             'combiner': self.combiner,
     ...             'reducer': self.reducer}
     ...
     ...         return [MRStep(**kwargs)]
  7. ...     def mapper(self, _, line):
     ...         for word in WORD_RE.findall(line):
     ...             yield word.lower(), 1
     ...
     ...     def combiner(self, word, counts):
     ...         yield word, sum(counts)
     ...
     ...     def reducer(self, word, counts):
     ...         yield word, sum(counts)
     >>>
     >>> if __name__ == '__main__':
     ...     MRWordCount.run()
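
     For context, such a job is normally saved to a file and launched from
     the shell (e.g. python word_count.py file.txt). A sketch of running it
     programmatically instead, assuming the class above lives in a
     word_count.py module (the module name is hypothetical):

     >>> from word_count import MRWordCount
     >>> job = MRWordCount(args=['file.txt'])
     >>> with job.make_runner() as runner:
     ...     runner.run()
     ...     for line in runner.stream_output():
     ...         print(job.parse_output_line(line))
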
  8. >>> from operator import add
     >>> from pyspark.sql import SparkSession
     >>>
     >>> spark = SparkSession.builder.getOrCreate()
     >>>
     >>> stream = (spark.read.text('hdfs://file.txt').rdd
     ...     .flatMap(lambda row: row[0].split(' '))  # each element is a Row
     ...     .map(lambda word: (word.lower(), 1))
     ...     .reduceByKey(add)
     ...     .collect())
     >>>
     >>> stream[0]
     ('"de', 1)
  9. >>> from riko.collections.sync import SyncPipe
     >>>
     >>> url = 'file:///file.txt'
     >>> conf = {'delimiter': ' '}
     >>> rule = {'transform': 'lower'}
     >>>
     >>> stream = (SyncPipe('fetchtext', conf={'url': url})
     ...     .stringtokenizer(conf=conf, emit=True)
     ...     .strtransform(conf={'rule': rule})
     ...     .count(conf={'count_key': 'strtransform'})
     ...     .output)
     >>> next(stream)
     {'"de': 1}
     >>> next(stream)
     {'"lorem': 1}
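
     For comparison, the same count in plain Python. This sketch
     materializes the whole file at once, whereas the riko pipe lazily
     yields one {word: count} item at a time:

     >>> from collections import Counter
     >>> with open('file.txt') as f:
     ...     counts = Counter(f.read().lower().split(' '))
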
  10. >>> from riko.modules.fetchdata import pipe
      >>>
      >>> url = 'data.code4sa.org/resource/6rgz-ak57.json'
      >>> stream = pipe(conf={'url': url})
      >>> next(stream)
      {'crime': 'All theft not mentioned elsewhere', 'incidents': '2266.0',
       'police_station': 'Bellville', 'province': 'WC', 'year': '2014'}
      >>> next(stream)
      {'crime': 'Drug-related crime', 'incidents': '2578.0',
       'police_station': 'Bishop Lavis', 'province': 'WC', 'year': '2014'}
  11. >>> from riko.collections.sync import SyncPipe
      >>>
      >>> sort_conf = {
      ...     'rule': {'sort_key': 'incidents', 'sort_dir': 'desc'}}
      >>> filter_conf = {
      ...     'rule': {'field': 'province', 'op': 'eq', 'value': 'GP'}}
      >>> stream = (SyncPipe('fetchdata', conf={'url': url})
      ...     .filter(conf=filter_conf)
      ...     .sort(conf=sort_conf)
      ...     .truncate(conf={'count': '5'})
      ...     .output)
  12. >>> next(stream)
      {'crime': 'All theft not mentioned elsewhere', 'incidents': '3339.0',
       'police_station': 'Pretoria Central', 'province': 'GP', 'year': '2014'}
      >>> next(stream)
      {'crime': 'Drug-related crime', 'incidents': '3125.0',
       'police_station': 'Eldorado Park', 'province': 'GP', 'year': '2014'}
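
      The same filter/sort/truncate pipeline in plain Python, as a rough
      sketch (records is a hypothetical stand-in for the fetched stream;
      riko keeps incidents as strings, so the sort key casts to float):

      >>> from itertools import islice
      >>> gauteng = (r for r in records if r['province'] == 'GP')
      >>> ranked = sorted(gauteng, key=lambda r: float(r['incidents']),
      ...                 reverse=True)
      >>> top5 = list(islice(ranked, 5))
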
  13. >>> from riko.modules import fetchdata, join
      >>>
      >>> url2 = 'data.code4sa.org/resource/qtx7-xbrs.json'
      >>> stream = fetchdata.pipe(conf={'url': url})
      >>> stream2 = fetchdata.pipe(conf={'url': url2})
      >>> next(stream2)
      {'station': 'Aberdeen', 'sum_2014_2015': '1153'}
      >>> conf = {
      ...     'join_key': 'police_station',
      ...     'other_join_key': 'station'}
      >>> joined = join.pipe(stream, conf=conf, other=stream2)
  14. >>> next(joined)
      {'crime': 'All theft not mentioned elsewhere', 'incidents': '2266.0',
       'police_station': 'Bellville', 'province': 'WC', 'station': 'Bellville',
       'sum_2014_2015': '28989', 'year': '2014'}
      (crime, incidents, police_station, province and year come from the
       stream data; station and sum_2014_2015 come from the stream2 data)
  15. >>> next(joined)
      {'crime': 'Drug-related crime', 'incidents': '2578.0',
       'police_station': 'Bishop Lavis', 'province': 'WC',
       'station': 'Bishop Lavis', 'sum_2014_2015': '24983', 'year': '2014'}
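
      Conceptually, the join matches each item's police_station against the
      other stream's station key. A rough plain-Python sketch (records and
      records2 are hypothetical stand-ins for the two fetched streams):

      >>> lookup = {r['station']: r for r in records2}
      >>> joined = (
      ...     {**r, **lookup[r['police_station']]}
      ...     for r in records if r['police_station'] in lookup)
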
  16. >>> from riko.bado import coroutine, react
      >>> from riko.collections.async import AsyncCollection
      >>>
      >>> sources = [
      ...     {'url': url, 'type': 'fetchdata'},
      ...     {'url': url2, 'type': 'fetchdata'}]
      >>> flow = AsyncCollection(sources)
      >>>
      >>> @coroutine
      ... def run(reactor):
      ...     stream = yield flow.async_fetch()
      ...     print(next(stream))
      >>>
      >>> react(run)
      {'police_station': 'Bellville', 'crime': 'All theft...', 'year': '2014',
       'province': 'WC', 'incidents': '2266.0'}
  17. >>> from riko.collections.sync import SyncCollection
      >>>
      >>> flow = SyncCollection(sources, parallel=True)
      >>> stream = flow.list
      >>> stream[0]
      {'crime': 'All theft not mentioned elsewhere', 'incidents': '2266.0',
       'police_station': 'Bellville', 'province': 'WC', 'year': '2014'}
      >>> stream[-1]
      {'station': 'Tierpoort', 'sum_2014_2015': '327'}
  18. >>> kwargs = {'parallel': True, 'threads': False}
      >>> flow = SyncCollection(sources, **kwargs)
      >>> stream = flow.list
      >>> stream[0]
      {'crime': 'All theft not mentioned elsewhere', 'incidents': '2266.0',
       'police_station': 'Bellville', 'province': 'WC', 'year': '2014'}
      >>> stream[-1]
      {'station': 'Tierpoort', 'sum_2014_2015': '327'}
  19. Head to Head

                         Spark, etc.   Huginn     riko
      installation       complex       moderate   simple
      push/pull          push          push       pull
      native ingestors   few           many       many
      parallel           ✔             ✔          ✔
      async                                       ✔
      distributed        ✔

      @reubano #PyConZA16
  20. >>> from riko.modules.fetchsitefeed import pipe
      >>>
      >>> url = 'arstechnica.com/rss-feeds/'
      >>> stream = pipe(conf={'url': url})
      >>> item = next(stream)
      >>> item.keys()
      dict_keys(['tags', 'summary_detail', 'author.name', 'y:published',
       'content', 'title', 'pubDate', 'id', 'summary', 'authors', 'links',
       'y:id', 'author', 'link', 'published'])
      >>> item['title'], item['author'], item['id']
      ('Gravity doesn’t care about quantum spin', 'Chris Lee',
       'http://arstechnica.com/?p=924009')
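
      Because every module yields a stream of dicts, feed items can be piped
      onward exactly like the earlier examples. A sketch reusing the SyncPipe
      chaining and filter module shown above (the rule values here are
      illustrative):

      >>> from riko.collections.sync import SyncPipe
      >>> filter_conf = {
      ...     'rule': {'field': 'author', 'op': 'eq', 'value': 'Chris Lee'}}
      >>> stream = (SyncPipe('fetchsitefeed', conf={'url': url})
      ...     .filter(conf=filter_conf)
      ...     .truncate(conf={'count': '5'})
      ...     .output)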