
Fan-in and Fan-out: The crucial components of concurrency by Brett Slatkin

PyCon 2014
April 11, 2014

Transcript

  1. Fan-in and Fan-out
    The crucial components of concurrency
    Brett Slatkin
    Google Inc
    PyCon 2014

  2. Agenda
    •Goal
    •Definitions
    •The old way
    •The new way
    •It’s everywhere
    •Links

  3. Reference
    Slides & code
    github.com/bslatkin/pycon2014
    Me
    onebigfluke.com
    @haxor

  4. Why do we need Tulip?
    PEP 3156 – asyncio

  5. Definitions

  6. Fan-out
    When one thread of control spawns one or more new
    threads of control.

  7. Fan-in
    When one thread of control gathers results from one or
    more separate threads of control.
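
    The same two motions show up in ordinary standard-library code. As a minimal sketch (not from the deck), using concurrent.futures with an illustrative work() function:

        from concurrent.futures import ThreadPoolExecutor, as_completed

        def work(n):
            return n * n

        def fan_out_fan_in(items):
            with ThreadPoolExecutor(max_workers=3) as pool:
                futures = [pool.submit(work, n) for n in items]      # Fan-out: spawn workers
                return [f.result() for f in as_completed(futures)]   # Fan-in: gather results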

  8. Building a web crawler
    The old way

  9. Retrieve a URL
    Fetch

  10. def fetch(url):
        response = urlopen(url)
        assert response.status == 200
        data = response.read()
        assert data
        text = data.decode('utf-8')
        return text

  11. >>> fetch('http://example.com')
      '\n\n\n...'

  12. Find all URLs on a page
    Extract

  13. def extract(url):
        data = fetch(url)
        found_urls = set()
        for match in URL_EXPR.finditer(data):
            found = match.group('url')
            found_urls.add(found)
        return url, data, found_urls

  14. >>> extract('http://example.com')
      (
          'http://example.com',
          '\n...',
          set([
              'http://example.com/foo',
              'http://example.com/bar',
              ...
          ])
      )

  15. Breadth-first search of links
    Crawl

  16. def crawl(to_fetch=[]):
        results = []
        for depth in range(MAX_DEPTH + 1):
            batch = extract_multi(to_fetch)
            to_fetch = []
            for url, data, found in batch:
                results.append((depth, url, data))
                to_fetch.extend(found)
        return results

  17. def extract_multi(to_fetch):
        results = []
        for url in to_fetch:
            x = extract(url)
            results.append(x)
        return results

  18. >>> crawl(['http://example.com'])
      [
          ('http://example.com', '', set([...])),
          ('.../bar', '', set([...])),
          ('.../foo', '', set([...])),
          ...
      ]

  19. Many simultaneous fetches
    Crawl in parallel

  20. def crawl_parallel(url):
        fetchq = Queue()
        result = []
        f = lambda: fetcher(fetchq, result)
        for _ in range(3): Thread(target=f).start()
        fetchq.put((0, url))
        fetchq.join()
        return result

  21. def fetcher(fetchq, result):
        while True:
            depth, url = fetchq.get()
            try:
                if depth > MAX_DEPTH: continue
                _, data, found = extract(url)
                result.append((depth, url, data))  # GIL makes list.append thread-safe
                for url in found:
                    fetchq.put((depth + 1, url))
            finally:
                fetchq.task_done()

  22. >>> crawl_parallel('http://example.com')
      [
          ('http://example.com', '...', set([...])),
          ('.../bar', '...', set([...])),
          ('.../foo', '...', set([...])),
          ...
      ]
      # Same output, much faster

  23. Many simultaneous crawls
    Concurrent crawls

  24. Possible, but complex
    See example #12 here:
    github.com/bslatkin/pycon2014
    ~200 lines of code
    Makes no sense

  25. Building a web crawler
    The new way

  26. Retrieve a URL
    Fetch

  27. # Old way
      def fetch(url):
          response = urlopen(url)
          try:
              assert response.status == 200
              data = response.read()
              assert data
              text = data.decode('utf-8')
              return text
          finally:
              pass

  28. @asyncio.coroutine
      def fetch_async(url):
          response = yield from request('get', url)  # request() is an async HTTP client call (aiohttp)
          try:
              assert response.status == 200
              data = yield from response.read()
              assert data
              text = data.decode('utf-8')
              return text
          finally:
              response.close()

  29. Find all URLs on a page
    Extract

  30. # Old way
      def extract(url):
          data = fetch(url)
          found_urls = set()
          for match in URL_EXPR.finditer(data):
              found = match.group('url')
              found_urls.add(found)
          return url, data, found_urls

  31. @asyncio.coroutine
      def extract_async(url):
          data = yield from fetch_async(url)
          found_urls = set()
          for match in URL_EXPR.finditer(data):
              found = match.group('url')
              found_urls.add(found)
          return url, data, found_urls

  32. Breadth-first search of links
    Crawl

  33. # Old way
      def crawl(to_fetch=[]):
          results = []
          for depth in range(MAX_DEPTH + 1):
              batch = extract_multi(to_fetch)
              to_fetch = []
              for url, data, found in batch:
                  results.append((depth, url, data))
                  to_fetch.extend(found)
          return results

  34. @asyncio.coroutine
      def crawl_async(to_fetch=[]):
          results = []
          for depth in range(MAX_DEPTH + 1):
              batch = yield from ex_multi_async(to_fetch)
              to_fetch = []
              for url, data, found in batch:
                  results.append((depth, url, data))
                  to_fetch.extend(found)
          return results

  35. # Old way
      def extract_multi(to_fetch):
          results = []
          for url in to_fetch:
              x = extract(url)
              results.append(x)
          return results

  36. @asyncio.coroutine
      def ex_multi_async(to_fetch):
          results = []
          for url in to_fetch:
              x = yield from extract_async(url)
              results.append(x)
          return results

  37. Many simultaneous fetches
    Crawl in parallel

  38. @asyncio.coroutine
      def ex_multi_async(to_fetch):
          results = []
          for url in to_fetch:
              x = yield from extract_async(url)
              results.append(x)
          return results

  39. @asyncio.coroutine
      def ex_multi_async(to_fetch):
          futures, results = [], []
          for url in to_fetch:
              futures.append(extract_async(url))       # Fan-out: start every fetch
          for future in asyncio.as_completed(futures):
              results.append((yield from future))      # Fan-in: gather as each finishes
          return results

  40. @asyncio.coroutine  # No changes
      def crawl_async(to_fetch=[]):
          results = []
          for depth in range(MAX_DEPTH + 1):
              batch = yield from ex_multi_async(to_fetch)
              to_fetch = []
              for url, data, found in batch:
                  results.append((depth, url, data))
                  to_fetch.extend(found)
          return results

  41. >>> crawl_async(['http://example.com'])
      [
          ('http://example.com', '...', set([...])),
          ('.../bar', '...', set([...])),
          ('.../foo', '...', set([...])),
          ...
      ]
      # Same output, much faster, 4 line delta
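
    A coroutine like crawl_async only runs when an event loop drives it; the >>> transcript above elides that step. A minimal way to run it with Python 3.4's asyncio:

        import asyncio

        loop = asyncio.get_event_loop()
        result = loop.run_until_complete(crawl_async(['http://example.com']))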

  42. Many simultaneous crawls
    Concurrent crawls

  43. class MyServer(ServerHttpProtocol):
          @asyncio.coroutine
          def handle_request(self, message, payload):
              data = yield from payload.read()
              url = get_url_param(data)
              result = yield from crawl_async([url])
              response = Response(self.writer, 200)
              response.write(get_message(result))
              response.write_eof()
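
    The slide shows only the request handler. Assuming the aiohttp-era ServerHttpProtocol above, one way to bind it to a port is asyncio's standard create_server call:

        loop = asyncio.get_event_loop()
        server = loop.run_until_complete(
            loop.create_server(lambda: MyServer(), '127.0.0.1', 8080))
        loop.run_forever()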

  44. It’s everywhere

  45. SQL
      SELECT Customer.id, SUM(Order.cost)
      FROM Customer, Order
      WHERE Customer.id = Order.customer_id  -- Fan-out
      GROUP BY Customer.id                   -- Fan-in

  46. Map Reduce
      def map(text):
          for word in WORD_EXPR.findall(text):
              yield (word, 1)  # Fan-out

      def reduce(word, count_iter):
          total = 0
          for count in count_iter:
              total += count
          yield (word, total)  # Fan-in
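
    The deck shows map and reduce but not the driver that connects them; the shuffle step between the two is what gathers the fanned-out pairs back together. A rough sketch, assuming the map() and reduce() defined above are in scope:

        from collections import defaultdict

        def map_reduce(texts):
            groups = defaultdict(list)
            for text in texts:                  # Fan-out: one map() per input
                for word, count in map(text):
                    groups[word].append(count)  # Shuffle: group counts by key
            return [pair                        # Fan-in: one reduce() per key
                    for word, counts in groups.items()
                    for pair in reduce(word, counts)]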

  47. Measurement
    •Histograms
    •Reservoir samplers
    •Profilers
    •Estimators
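
    None of these are spelled out in the deck, but the shape is the same: many samples produced independently (fan-out), one aggregate built from them (fan-in). A rough sketch of the histogram case, with an illustrative sample_latency() standing in for a real measurement:

        import random
        from collections import Counter
        from concurrent.futures import ThreadPoolExecutor

        def sample_latency(url):
            return random.expovariate(1 / 100)  # fake latency in ms, for illustration

        def latency_histogram(urls, bucket_ms=50):
            with ThreadPoolExecutor(max_workers=3) as pool:
                samples = list(pool.map(sample_latency, urls))    # Fan-out: one probe per URL
            return Counter(int(s // bucket_ms) for s in samples)  # Fan-in: counts per bucket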

  48. Links
    •PEP 3156 – asyncio
    •Google App Engine’s NDB library (by Guido)
    •C# async / await – Promises & ES7 generators
    •Rob Pike: “Concurrency is not Parallelism”
    •Slides: github.com/bslatkin/pycon2014
    •Me: onebigfluke.com – @haxor
