def extract(url):
    """Fetch *url* and collect the URLs referenced in its content.

    Returns:
        A ``(url, data, found_urls)`` tuple: the URL that was requested,
        the text returned by ``fetch``, and the set of ``'url'`` group
        values from every ``URL_EXPR`` match in that text.
    """
    data = fetch(url)
    # A set, so a link that appears several times is reported once.
    found_urls = {hit.group('url') for hit in URL_EXPR.finditer(data)}
    return url, data, found_urls
def crawl_parallel(url):
    """Crawl starting at *url* using three worker threads.

    A shared queue is seeded with ``(0, url)`` (presumably ``(depth, url)``;
    confirm against ``fetcher``) and the call blocks until every queued
    item has been marked done.

    Returns:
        The list that the ``fetcher`` workers were handed to fill in.
    """
    work_queue = Queue()
    collected = []
    # NOTE(review): the workers are non-daemon and nothing here tells them
    # to stop once the queue drains; confirm fetcher terminates on its own.
    for _ in range(3):
        Thread(target=fetcher, args=(work_queue, collected)).start()
    work_queue.put((0, url))
    work_queue.join()
    return collected
# Old way
def fetch(url):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address passed straight to ``urlopen``.

    Returns:
        The response body as ``str``.

    Raises:
        AssertionError: If the status is not 200 or the body is empty.
        UnicodeDecodeError: If the body is not valid UTF-8.
        Any exception raised by ``urlopen`` itself propagates unchanged.
    """
    response = urlopen(url)
    try:
        # NOTE: these checks are ``assert`` statements, so they are skipped
        # when Python runs with -O.
        assert response.status == 200
        data = response.read()
        assert data
        text = data.decode('utf-8')
        return text
    finally:
        # Always release the connection, including when a check above fails.
        response.close()
@asyncio.coroutine
def fetch_async(url):
    """Coroutine: GET *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address passed to ``request('get', url)``.

    Returns:
        The response body as ``str``.

    Raises:
        AssertionError: If the status is not 200 or the body is empty.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = yield from request('get', url)
    try:
        assert response.status == 200
        data = yield from response.read()
        assert data
        # Plain ASCII quotes: the typographic quotes that were here are not
        # valid string delimiters in Python.
        text = data.decode('utf-8')
        return text
    finally:
        # Close the response whether or not the checks above passed.
        response.close()
# Old way
def extract(url):
    """Fetch *url* synchronously and gather the URLs found in its text.

    Returns:
        A ``(url, data, found_urls)`` tuple, where ``data`` is what
        ``fetch`` returned and ``found_urls`` is the set of ``'url'`` group
        values over all ``URL_EXPR`` matches in ``data``.
    """
    data = fetch(url)
    # Build the set in one pass; duplicates collapse automatically.
    found_urls = {m.group('url') for m in URL_EXPR.finditer(data)}
    return url, data, found_urls
@asyncio.coroutine
def extract_async(url):
    """Coroutine: fetch *url* and gather the URLs found in its text.

    Returns:
        A ``(url, data, found_urls)`` tuple, where ``data`` is the result of
        ``fetch_async(url)`` and ``found_urls`` is the set of ``'url'`` group
        values over all ``URL_EXPR`` matches in ``data``.
    """
    data = yield from fetch_async(url)
    # Build the set in one pass; duplicates collapse automatically.
    found_urls = {m.group('url') for m in URL_EXPR.finditer(data)}
    return url, data, found_urls
# Old way
def crawl(to_fetch=None):
    """Crawl level by level, starting from the URLs in *to_fetch*.

    Each pass hands the current batch of URLs to ``extract_multi``, records
    what came back, and queues every URL found on those pages for the next
    pass. ``MAX_DEPTH + 1`` passes are made in total.

    Args:
        to_fetch: URLs to fetch at depth 0. Defaults to no URLs.

    Returns:
        A list of ``(depth, url, data)`` tuples, in the order the batches
        were processed.
    """
    # None sentinel instead of a shared ``[]`` default, so no list object is
    # ever reused across calls.
    pending = [] if to_fetch is None else to_fetch
    results = []
    for depth in range(MAX_DEPTH + 1):
        batch = extract_multi(pending)
        pending = []
        for url, data, found in batch:
            results.append((depth, url, data))
            pending.extend(found)
    return results
@asyncio.coroutine
def ex_multi_async(to_fetch):
    """Coroutine: run ``extract_async`` for every URL in *to_fetch*.

    Returns:
        A list with one ``extract_async`` result per URL, appended in the
        order ``asyncio.as_completed`` hands the finished work back.
    """
    pending = [extract_async(url) for url in to_fetch]
    results = []
    for finished in asyncio.as_completed(pending):
        outcome = yield from finished
        results.append(outcome)
    return results
@asyncio.coroutine
def crawl_async(to_fetch=None):
    """Coroutine: crawl level by level, starting from the URLs in *to_fetch*.

    Each pass hands the current batch of URLs to ``ex_multi_async``, records
    what came back, and queues every URL found on those pages for the next
    pass. ``MAX_DEPTH + 1`` passes are made in total.

    Args:
        to_fetch: URLs to fetch at depth 0. Defaults to no URLs.

    Returns:
        A list of ``(depth, url, data)`` tuples, in the order the batches
        were processed.
    """
    # Same loop structure as the synchronous crawl(); only the batch
    # extraction is delegated with ``yield from``.
    # None sentinel instead of a shared ``[]`` default, so no list object is
    # ever reused across calls.
    pending = [] if to_fetch is None else to_fetch
    results = []
    for depth in range(MAX_DEPTH + 1):
        batch = yield from ex_multi_async(pending)
        pending = []
        for url, data, found in batch:
            results.append((depth, url, data))
            pending.extend(found)
    return results
# Map Reduce
def map(text):
    """Yield a ``(word, 1)`` pair for every ``WORD_EXPR`` match in *text*.

    Args:
        text: String to scan.

    Yields:
        ``(matched_text, 1)`` tuples, one per match, in match order.
    """
    # Assumes WORD_EXPR is a compiled ``re`` pattern (confirm): finditer then
    # yields match objects, so emit the matched text, not the match object,
    # letting equal words share the same key downstream.
    for match in WORD_EXPR.finditer(text):
        yield (match.group(), 1)  # Fan-out


def reduce(word, count_iter):
    """Yield a single ``(word, total)`` pair summing the counts for *word*.

    Args:
        word: Key the counts belong to.
        count_iter: Iterable of numeric counts; may be empty (total is 0).

    Yields:
        Exactly one ``(word, total)`` tuple.
    """
    yield (word, sum(count_iter))  # Fan-in