Data Mining and Processing for fun and profit

Tutorial on data mining given at PyConZA16

Reuben Cummings

October 07, 2016

Transcript

  1. Data Mining and Processing for fun and profit
     PyConZA - Cape Town, SA
     Oct 7, 2016
     by Reuben Cummings
     @reubano #PyConZA16
  2. Who am I?
     • Managing Director, Nerevu Development
     • Lead organizer of Arusha Coders
     • Author of several popular packages
     @reubano #PyConZA16
  3. Topics & Format •data and data mining •code samples and

    interactive exercises •hands-on (don't be a spectator) @reubano #PyConZA16
  4. Organization
     structured:
       country    capital
       S Africa   Joburg
       Tanzania   Dodoma
       Rwanda     Kigali
     unstructured:
       "O God of all creation. Bless this our land and nation. Justice be our shield..."
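
     In Python terms (a minimal sketch, not from the deck; the variable names are illustrative), the structured example maps naturally onto a list of dicts that can be queried by field, while the unstructured example is just free text with no schema:

     >>> # structured: rows with named fields, ready for programmatic queries
     >>> capitals = [
     ...     {'country': 'S Africa', 'capital': 'Joburg'},
     ...     {'country': 'Tanzania', 'capital': 'Dodoma'},
     ...     {'country': 'Rwanda', 'capital': 'Kigali'}]
     >>> capitals[1]['capital']
     'Dodoma'
     >>> # unstructured: free text you would have to parse before querying
     >>> anthem = "O God of all creation. Bless this our land and nation. Justice be our shield..."
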
  5. Storage
     flat/text:
       greeting,loc
       hello,world
       good bye,moon
       welcome,stars
       what's up,sky
     binary:
       00105e0 b0e6 04...
       00105f0 e4e7 04...
       0010600 0be8 04...
       00105f0 e4e7 04...
       00105e0 b0e6 04...
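
     A minimal sketch of the same contrast in Python (file names are illustrative, not from the deck): the flat/text form is human-readable CSV, while a binary serialization of the same rows, here a pickle, is opaque bytes like the hex dump above:

     >>> import csv, pickle
     >>>
     >>> rows = [['greeting', 'loc'], ['hello', 'world'], ['good bye', 'moon']]
     >>> with open('greetings.csv', 'w', newline='') as f:
     ...     csv.writer(f).writerows(rows)  # human-readable text
     ...
     >>> with open('greetings.pkl', 'wb') as f:
     ...     pickle.dump(rows, f)  # binary, only meaningful to a program that knows the format
     ...
     >>> open('greetings.csv').read().splitlines()[0]
     'greeting,loc'
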
  6. Visualizing Data
     [Bar chart: Cumulative Independent African Countries*, x-axis 1930-1960, y-axis 0-100]
     * Note: above data is made up
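
     A minimal sketch of how such a chart could be drawn with matplotlib (not part of the deck); the values below are placeholders, in keeping with the slide's note that the data is made up:

     # illustrative, made-up values per the slide's note
     import matplotlib.pyplot as plt

     decades = [1930, 1940, 1950, 1960]
     cumulative = [2, 4, 10, 27]

     plt.bar(decades, cumulative, width=8)
     plt.title('Cumulative Independent African Countries')
     plt.ylim(0, 100)
     plt.show()
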
  7. >>> from urllib.request import urlopen
     >>> from ijson import items
     >>>
     >>> # crime-summary.json in repo
     >>> url = 'http://data.code4sa.org/resource/qtx7-xbrs.json'
     >>> f = urlopen(url)
     >>> data = items(f, 'item')
     >>> next(data)
     {'station': 'Aberdeen', 'sum_2014_2015': '1153'}
  8. >>> from csv import DictReader
     >>> from io import open
     >>> from os import path as p
     >>>
     >>> url = p.abspath('filtered-crime-stats.csv')
     >>> f = open(url)
     >>> data = DictReader(f)
     >>> next(data)
     {'Crime': 'All theft not mentioned elsewhere', 'Incidents': '3397',
      'Police Station': 'Durban Central', 'Province': 'KZN', 'Year': '2014'}
  9. >>> from xlrd import open_workbook
     >>>
     >>> url = p.abspath('filtered-crime-stats.xlsx')
     >>> book = open_workbook(url)
     >>> sheet = book.sheet_by_index(0)
     >>> sheet.row_values(0)
     ['Province', 'Police Station', 'Crime', 'Year', 'Incidents']
     >>> sheet.row_values(1)
     ['KZN', 'Durban Central', 'All theft not mentioned elsewhere', 2014.0, 3397.0]
  10. >>> import requests
      >>> from bs4 import BeautifulSoup
      >>>
      >>> url = 'https://github.com/reubano/pyconza-tutorial/raw/master/migrants.html'
      >>> r = requests.get(url)
      >>> soup = BeautifulSoup(r.text, 'html.parser')
      >>>
      >>> def get_data(table):
      ...     for row in table.findAll('tr')[1:]:
      ...         header = row.findAll('th')
      ...         td = row.findAll('td')
      ...         columns = header or td
      ...         yield [c.getText() for c in columns]
  11. >>> table = soup.find('table')
      >>> data = get_data(table)
      >>> next(data)
      ['Mediterranean', '82', '346', ... ]
      >>> next(data)
      ['', 'January', 'February', ... ]
  12. >>> import itertools as it
      >>>
      >>> records = [
      ...     {'a': 'item', 'amount': 200},
      ...     {'a': 'item', 'amount': 300},
      ...     {'a': 'item', 'amount': 400}]
      >>> key = 'amount'
      >>> first = records[0]
      >>> value = sum(r.get(key, 0) for r in records)
      >>> dict(it.chain(first.items(), [(key, value)]))
      {'a': 'item', 'amount': 900}
  13. >>> import itertools as it
      >>> from operator import itemgetter
      >>>
      >>> records = [
      ...     {'item': 'a', 'amount': 200},
      ...     {'item': 'b', 'amount': 200},
      ...     {'item': 'c', 'amount': 400}]
      >>> keyfunc = itemgetter('amount')
      >>> sorted_records = sorted(records, key=keyfunc)
      >>> grouped = it.groupby(sorted_records, keyfunc)
      >>> data = ((key, list(group)) for key, group in grouped)
      >>> next(data)
      (200, [{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'b'}])
  14. FS   ('All theft not mentioned elsewhere', 2940)
      GP   ('Drug-related crime', 5229)
      KZN  ('Drug-related crime', 4571)
      WC   ('Common assault', 2188)
  15. >>> from csv import DictReader
      >>> from io import open
      >>> from os import path as p
      >>> from itertools import groupby
      >>> from operator import itemgetter
      >>>
      >>> url = p.abspath('filtered-crime-stats.csv')
      >>> f = open(url)
      >>> data = DictReader(f)
      >>> keyfunc = itemgetter('Province')
      >>> records = sorted(data, key=keyfunc)
      >>> grouped = groupby(records, keyfunc)
  16. >>> for key, group in grouped:
      ...     print(key)
      ...     keyfunc = itemgetter('Crime')
      ...     sub_records = sorted(group, key=keyfunc)
      ...     sub_grouped = groupby(sub_records, keyfunc)
      ...     low_count, low_key = 0, None
      ...
      ...     for sub_key, sg in sub_grouped:
      ...         count = sum(int(s['Incidents']) for s in sg)
      ...         if not low_count or count < low_count:
      ...             low_count = count
      ...             low_key = sub_key
      ...
      ...     print((low_key, low_count))
  17. >>> from urllib.request import urlopen
      >>> from meza.io import read_json
      >>>
      >>> # crime-summary
      >>> url = 'http://data.code4sa.org/resource/qtx7-xbrs.json'
      >>> f = urlopen(url)
      >>> records = read_json(f)
      >>> next(records)
      {'station': 'Aberdeen', 'sum_2014_2015': '1153'}
      >>> next(records)
      {'station': 'Acornhoek', 'sum_2014_2015': '5047'}
  18. >>> from io import StringIO
      >>> from meza.io import read_csv
      >>>
      >>> f = StringIO('greeting,location\nhello,world\n')
      >>> records = read_csv(f)
      >>> next(records)
      {'greeting': 'hello', 'location': 'world'}
  19. >>> from os import path as p
      >>> from meza import io
      >>>
      >>> url = p.abspath('crime-summary.json')
      >>> records = io.read(url)
      >>> next(records)
      {'station': 'Aberdeen', 'sum_2014_2015': '1153'}
      >>> url2 = p.abspath('filtered-crime-stats.csv')
      >>> records = io.join(url, url2)
      >>> next(records)
      {'station': 'Aberdeen', 'sum_2014_2015': '1153'}
  20. >>> from io import open
      >>> from meza.io import read_xls
      >>>
      >>> url = p.abspath('filtered-crime-stats.xlsx')
      >>> records = read_xls(url, sanitize=True)
      >>> next(records)
      {'crime': 'All theft not mentioned elsewhere', 'incidents': '3397.0',
       'police_station': 'Durban Central', 'province': 'KZN', 'year': '2014.0'}
  21. >>> from meza.io import read_html
      >>>
      >>> url = p.abspath('migrants.html')
      >>> records = read_html(url, sanitize=True)
      >>> next(records)
      {'': 'Mediterranean', 'april': '1,244', 'august': '684', 'december': '203',
       'february': '346', 'january': '82', 'july': '230', 'june': '\xa010',
       ... 'total_to_date': '3,760'}
  22. >>> from meza.process import aggregate
      >>>
      >>> records = [
      ...     {'a': 'item', 'amount': 200},
      ...     {'a': 'item', 'amount': 300},
      ...     {'a': 'item', 'amount': 400}]
      >>>
      >>> aggregate(records, 'amount', sum)
      {'a': 'item', 'amount': 900}
  23. >>> from meza.process import group
      >>>
      >>> records = [
      ...     {'item': 'a', 'amount': 200},
      ...     {'item': 'b', 'amount': 200},
      ...     {'item': 'c', 'amount': 400}]
      >>>
      >>> grouped = group(records, 'amount')
      >>> next(grouped)
      (200, [{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'b'}])
  24. >>> from meza.io import read_csv
      >>> from meza.process import detect_types, type_cast
      >>>
      >>> url = p.abspath('filtered-crime-stats.csv')
      >>> raw = read_csv(url)
      >>> records, result = detect_types(raw)
      >>> result['types']
      [{'id': 'Incidents', 'type': 'int'},
       {'id': 'Crime', 'type': 'text'},
       {'id': 'Province', 'type': 'text'},
       {'id': 'Year', 'type': 'int'},
       {'id': 'Police Station', 'type': 'text'}]
  25. >>> casted = type_cast(records, **result)
      >>> next(casted)
      {'Crime': 'All theft not mentioned elsewhere', 'Incidents': 3397,
       'Police Station': 'Durban Central', 'Province': 'KZN', 'Year': 2014}
  26. >>> from meza.process import normalize
      >>>
      >>> records = [
      ...     {'color': 'blue', 'setosa': 5, 'versi': 6},
      ...     {'color': 'red', 'setosa': 5, 'versi': 6}]
      >>> kwargs = {
      ...     'data': 'length',
      ...     'column': 'species',
      ...     'rows': ['setosa', 'versi']}
      >>> data = normalize(records, **kwargs)
      >>> next(data)
      {'color': 'blue', 'length': 5, 'species': 'setosa'}
  27. Head to head
                      pandas     meza
      installation    complex    simple
      size            large      small
      memory usage    high       low
      speed           fast       fast*
      functions       very many  many
      input/output    many       many
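
      For context (a minimal sketch, not from the deck; assumes pandas is installed), here is the column-sum task from slide 22 done with each library:

      >>> import pandas as pd
      >>> from meza.process import aggregate
      >>>
      >>> records = [
      ...     {'a': 'item', 'amount': 200},
      ...     {'a': 'item', 'amount': 300},
      ...     {'a': 'item', 'amount': 400}]
      >>> aggregate(records, 'amount', sum)  # meza works on any iterable of dicts
      {'a': 'item', 'amount': 900}
      >>> int(pd.DataFrame(records)['amount'].sum())  # pandas loads everything into a DataFrame
      900
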
  28. {'Police Station': 'Park Road', 'Incidents': 2940, 'Province': 'FS',
       'Crime': 'All theft not mentioned elsewhere', 'Year': 2014}
      {'Police Station': 'Eldorado Park', 'Incidents': 5229, 'Province': 'GP',
       'Crime': 'Drug-related crime', 'Year': 2014}
      {'Police Station': 'Durban Central', 'Incidents': 4571, 'Province': 'KZN',
       'Crime': 'Drug-related crime', 'Year': 2014}
      {'Police Station': 'Mitchells Plain', 'Incidents': 2188, 'Province': 'WC',
       'Crime': 'Common assault', 'Year': 2014}
  29. >>> from meza.io import read_csv
      >>> from meza.process import (
      ...     aggregate, group, detect_types, type_cast)
      >>>
      >>> url = p.abspath('filtered-crime-stats.csv')
      >>> raw = read_csv(url)
      >>> records, result = detect_types(raw)
      >>> casted = type_cast(records, **result)
      >>> grouped = group(casted, 'Province')
  30. >>> for key, _group in grouped:
      ...     sub_grouped = group(_group, 'Crime')
      ...     aggs = (
      ...         aggregate(sg[1], 'Incidents', sum)
      ...         for sg in sub_grouped)
      ...
      ...     keyfunc = itemgetter('Incidents')
      ...     print(min(aggs, key=keyfunc))