
Fan-in and Fan-out: The crucial components of concurrency by Brett Slatkin

PyCon 2014

April 11, 2014

Transcript

  1. Fan-in: when one thread of control gathers results from one or more
     separate threads of control.
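
As a standalone illustration of the two terms (my sketch, not from the slides): fan-out is one thread of control starting several workers, and fan-in is gathering their results back in one place. A minimal version with threads and a queue:

    # Minimal fan-out/fan-in sketch (illustration only, not slide code).
    from queue import Queue
    from threading import Thread

    def worker(n, results):
        results.put(n * n)  # each worker computes its piece independently

    results = Queue()
    workers = [Thread(target=worker, args=(n, results)) for n in range(5)]
    for t in workers:       # fan-out: one thread of control becomes many
        t.start()
    for t in workers:
        t.join()
    print([results.get() for _ in range(5)])  # fan-in: one thread gathers every result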
  2. def fetch(url):
         response = urlopen(url)
         assert response.status == 200
         data = response.read()
         assert data
         text = data.decode('utf-8')
         return text
  3. def extract(url):
         data = fetch(url)
         found_urls = set()
         for match in URL_EXPR.finditer(data):
             found = match.group('url')
             found_urls.add(found)
         return url, data, found_urls
  4. def crawl(to_fetch=[]):
         results = []
         for depth in range(MAX_DEPTH + 1):
             batch = extract_multi(to_fetch)
             to_fetch = []
             for url, data, found in batch:
                 results.append((depth, url, data))
                 to_fetch.extend(found)
         return results
  5. def extract_multi(to_fetch):
         results = []
         for url in to_fetch:
             x = extract(url)
             results.append(x)
         return results
  6. def crawl_parallel(url):
         fetchq = Queue()
         result = []
         f = lambda: fetcher(fetchq, result)
         for _ in range(3):
             Thread(target=f).start()
         fetchq.put((0, url))
         fetchq.join()
         return result
  7. def fetcher(fetchq, result):
         while True:
             depth, url = fetchq.get()
             try:
                 if depth > MAX_DEPTH:
                     continue
                 _, data, found = extract(url)
                 result.append((depth, url, data))  # GIL
                 for url in found:
                     fetchq.put((depth + 1, url))
             finally:
                 fetchq.task_done()
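
In the two slides above, putting the found URLs back on fetchq is the fan-out, and fetchq.join() plus the shared result list are the fan-in. A hedged usage sketch (the URL is a placeholder; MAX_DEPTH, URL_EXPR, fetch and extract come from the earlier slides):

    # Hypothetical driver for the threaded crawler above (not slide code).
    # Note: the fetcher threads block forever on fetchq.get(), so a real
    # script would start them as daemon threads (or send a sentinel) to exit.
    pages = crawl_parallel('http://example.com')  # placeholder start URL
    for depth, url, data in pages:
        print(depth, url, len(data))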
  8. # Old way
     def fetch(url):
         response = urlopen(url)
         try:
             assert response.status == 200
             data = response.read()
             assert data
             text = data.decode('utf-8')
             return text
         finally:
             pass
  9. @asyncio.coroutine
     def fetch_async(url):
         response = yield from request('get', url)
         try:
             assert response.status == 200
             data = yield from response.read()
             assert data
             text = data.decode('utf-8')
             return text
         finally:
             response.close()
  10. # Old way
      def extract(url):
          data = fetch(url)
          found_urls = set()
          for match in URL_EXPR.finditer(data):
              found = match.group('url')
              found_urls.add(found)
          return url, data, found_urls
  11. @asyncio.coroutine
      def extract_async(url):
          data = yield from fetch_async(url)
          found_urls = set()
          for match in URL_EXPR.finditer(data):
              found = match.group('url')
              found_urls.add(found)
          return url, data, found_urls
  12. # Old way
      def crawl(to_fetch=[]):
          results = []
          for depth in range(MAX_DEPTH + 1):
              batch = extract_multi(to_fetch)
              to_fetch = []
              for url, data, found in batch:
                  results.append((depth, url, data))
                  to_fetch.extend(found)
          return results
  13. @asyncio.coroutine
      def crawl_async(to_fetch=[]):
          results = []
          for depth in range(MAX_DEPTH + 1):
              batch = yield from ex_multi_async(to_fetch)
              to_fetch = []
              for url, data, found in batch:
                  results.append((depth, url, data))
                  to_fetch.extend(found)
          return results
  14. # Old way
      def extract_multi(to_fetch):
          results = []
          for url in to_fetch:
              x = extract(url)
              results.append(x)
          return results
  15. @asyncio.coroutine
      def ex_multi_async(to_fetch):
          results = []
          for url in to_fetch:
              x = yield from extract_async(url)
              results.append(x)
          return results
  16. @asyncio.coroutine
      def ex_multi_async(to_fetch):
          results = []
          for url in to_fetch:
              x = yield from extract_async(url)
              results.append(x)
          return results
  17. @asyncio.coroutine
      def ex_multi_async(to_fetch):
          futures, results = [], []
          for url in to_fetch:
              futures.append(extract_async(url))
          for future in asyncio.as_completed(futures):
              results.append((yield from future))
          return results
  18. @asyncio.coroutine  # No changes
      def crawl_async(to_fetch=[]):
          results = []
          for depth in range(MAX_DEPTH + 1):
              batch = yield from ex_multi_async(to_fetch)
              to_fetch = []
              for url, data, found in batch:
                  results.append((depth, url, data))
                  to_fetch.extend(found)
          return results
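
To actually run the coroutine version, a driver along these lines (Python 3.4-era asyncio; not shown on the slides) would work:

    # Hypothetical driver for crawl_async above; the start URL is a placeholder.
    import asyncio

    loop = asyncio.get_event_loop()
    results = loop.run_until_complete(crawl_async(['http://example.com']))
    for depth, url, data in results:
        print(depth, url, len(data))
    loop.close()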
  19. class MyServer(ServerHttpProtocol):
          @asyncio.coroutine
          def handle_request(self, message, payload):
              data = yield from payload.read()
              url = get_url_param(data)
              result = yield from crawl_async([url])
              response = Response(self.writer, 200)
              response.write(get_message(result))
              response.write_eof()
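
The handler alone does not start a server; one hedged way to wire it up with the standard asyncio event loop (the host, the port, and the assumption that MyServer() takes no constructor arguments are mine, not from the slide):

    # Hypothetical startup code for MyServer above (not slide code).
    # Assumes ServerHttpProtocol and Response come from the aiohttp of that
    # era and that get_url_param/get_message are defined in the talk's repo.
    import asyncio

    loop = asyncio.get_event_loop()
    server = loop.run_until_complete(
        loop.create_server(lambda: MyServer(), '127.0.0.1', 8080))
    try:
        loop.run_forever()
    finally:
        server.close()
        loop.close()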
  20. Map Reduce

      def map(text):
          for word in WORD_EXPR.finditer(text):
              yield (word.group(0), 1)  # Fan-out: emit (word, 1) per match

      def reduce(word, count_iter):
          total = 0
          for count in count_iter:
              total += count
          yield (word, total)  # Fan-in: gather all counts for one word
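
The slide only shows the map and reduce steps; a minimal word-count driver that wires them together (my addition, assuming WORD_EXPR is something like re.compile(r'\w+') and that the map and reduce generators above are defined in the same module):

    # Hypothetical word-count driver for the map/reduce sketch above.
    from collections import defaultdict

    def word_count(texts):
        groups = defaultdict(list)
        for text in texts:                    # fan-out: map each document
            for word, count in map(text):
                groups[word].append(count)
        totals = {}
        for word, counts in groups.items():   # fan-in: reduce per word
            for w, total in reduce(word, counts):
                totals[w] = total
        return totals

    print(word_count(['the cat sat on the mat']))  # {'the': 2, 'cat': 1, ...}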
  21. Links
      • PEP 3156 – asyncio
      • Google App Engine’s NDB library (by Guido)
      • C# async / await – Promises & ES7 generators
      • Rob Pike: “Concurrency is not Parallelism”
      • Slides: github.com/bslatkin/pycon2014
      • Me: onebigfluke.com – @haxor