Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Quantum of Data: A data science journey

Quantum of Data: A data science journey

A data science talk given at Python Exposé, Nairobi.

Reuben Cummings

April 01, 2017
Tweet

More Decks by Reuben Cummings

Other Decks in Programming

Transcript

  1. Quantum of Data A data science journey Python Exposé •

    Nairobi, Kenya April 1, 2017 by Reuben Cummings
  2. Who am I? Managing Director, Nerevu Development; Founder of Arusha

    Coders; Author of several popular Python packages; reubano on Twitter and GitHub
  3. Ransom Note If you want to see your shortcake again,

    visit bitly.com/pyexpose for further instructions
  4. To obtain the key, first get the number of attendees

    from the previous meetups. decryptme.txt.gpg readme.txt
  5. from html.parser import HTMLParser from itertools import chain def handle_starttag(self,

    tag, attrs): entry = dict(attrs) if entry.get('class') == 'event-rating': self.match = True class AttendanceParser(HTMLParser): def reset(self): HTMLParser.reset(self) self.match = False self.nums = iter([])
  6. from html.parser import HTMLParser from itertools import chain class AttendanceParser(HTMLParser):

    ... def handle_data(self, data): num = data.strip() if self.match and num: self.nums = chain(self.nums, [int(num)]) self.match = False
  7. from urllib.request import urlopen BASE = 'https://www.meetup.com/Python-Nairobi' BASE_URL = '{base}/events/past/?page={page}'

    >>> extract_attendance() def extract_attendance(): parser = AttendanceParser() url = BASE_URL.format(base=BASE, page=0) f = urlopen(url) encoding = f.info().get_content_charset() [parser.feed(line.decode(encoding)) for line in f] return list(parser.nums) [65, 83, 50, 64, 46]
  8. def extract_attendance(): parser = AttendanceParser() # Inner loop to parse

    each line for line in f: parser.feed(line.decode(encoding)) yield from parser.nums # Outer loop to extract each page for page in range(5): url = BASE_URL.format(base=BASE, page=page) f = urlopen(url) encoding = f.info().get_content_charset() >>> len(list(extract_attendance())) 25
  9. Hint #2 This code is available at bitly.com/pyexpose-attendance

    In a shell, enter the command: python extract-attendance.py
  10. >>> printable = [ ...: chr(x) for x in range(150)

    ...: if chr(x).isprintable()] >>> len(printable) >>> [ ...: (x, chr(x)) for x in range(150) ...: if chr(x).isprintable()] >>> ''.join(printable[num] for num in attendance) [(32, ' '), (33, '!'), (34, '"'), (35, '#')...] 'asR`N_WMH.153F24682579(?7' 95
  11. Decrypted message Your shortcake is at a cafe in Nairobi

    that shares an object with a snake in this flickr group https://www.flickr.com/groups/1329313@N21/ Find the first photo taken by the most prolific group member in 2017.
  12. >>> from riko.collections import SyncPipe >>> >>> BASE = 'https://api.flickr.com/services/feeds'

    >>> BASE_URL = '{}/groups_pool.gne?id=1329313@N21' >>> conf = {'url': BASE_URL.format(BASE)} >>> stream = SyncPipe('fetch', conf=conf).output >>> next(stream)
  13. {'author.name': 'Sharon B Mott', 'link': 'https://www.flickr.com/photos/...', 'pubDate': time.struct_time(tm_year=2017, tm_mo,...), 'tags':

    [ {'label': None, 'scheme': 'https://www.flickr.com/photos/tags/', 'term': 'boaconstrictor'}, {'label': None, 'scheme': 'https://www.flickr.com/photos/tags/', 'term': 'boa'}, ... ], 'title': 'Hints of blue', ... }
  14. >>> from datetime import datetime as dt >>> 15 >>>

    stream = ( ...: SyncPipe('fetch', conf=conf) ...: .filter(conf={'rule': rule}) ...: .list) >>> len(stream) >>> rule = { ...: 'field': 'pubDate', ...: 'op': 'after', ...: 'value': dt(2016, 12, 31)}
  15. >>> creators = [ ...: item.get('author.name') for item in stream]

    ['Sharon B Mott', 'baker.cameron43', 'stevekpriest', 'TessaSmits', 'TessaSmits', 'TessaSmits', 'Sharon B Mott', ... ] >>> creators
  16. >>> from collections import Counter >>> Counter({'Jesonis|Photography_On/Off (super busy)': 1,

    'Sabrina Filipiak Vasseur': 3, 'Sharon B Mott': 5, 'TessaSmits': 3, 'baker.cameron43': 2, 'stevekpriest': 1}) >>> c = Counter(creators) >>> c >>> c.most_common(1) [('Sharon B Mott', 5)] >>> top_creator = c.most_common(1)[0][0]
  17. Hint #9 This code is available at bitly.com/pyexpose-flickr

    In a shell, enter the command: python get-flickr-link.py
  18. Hint #10 Your shortcake is at a cafe in Nairobi

    that shares an object with a snake in this flickr group