Slide 51
Slide 51 text
TPL Dataflow approach
/// <summary>
/// Builds the scrape context and creates the TPL Dataflow blocks of the scraping pipeline.
/// </summary>
/// <remarks>
/// NOTE(review): in this snippet the blocks are only created — nothing links them together,
/// posts <paramref name="startUrl"/> into the pipeline, or awaits completion. Presumably that
/// wiring is shown elsewhere; confirm against the full source.
/// </remarks>
/// <param name="startUrl">Passed to the <c>ScrapeContext</c> constructor.</param>
/// <param name="basePath">Not referenced in the visible body; possibly part of the elided constructor arguments — TODO confirm.</param>
/// <param name="maxDepth">Not referenced in the visible body; possibly part of the elided constructor arguments — TODO confirm.</param>
/// <param name="translateTo">Not referenced in the visible body; possibly part of the elided constructor arguments — TODO confirm.</param>
/// <param name="stayInDomain">Not referenced in the visible body; possibly part of the elided constructor arguments — TODO confirm.</param>
async Task ScrapeAsync(string startUrl, string basePath, int maxDepth, string translateTo, bool stayInDomain)
{
// The "…" is an elision in the snippet: the remaining constructor arguments are not shown.
var context = new ScrapeContext(startUrl, …);
// One options instance handed to every Create*Block helper below. For an execution block,
// MaxDegreeOfParallelism caps concurrent message processing and BoundedCapacity caps buffered messages.
var dataflowOptions = new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 20, BoundedCapacity = 100 };
// Set up dataflow blocks
var fetchHtmlBlock = CreateFetchHtmlBlock(dataflowOptions, context);
// The delegate returns its argument unchanged. In TPL Dataflow this constructor argument is the
// cloning function, so an identity delegate means targets receive the same instance, not a copy.
// NOTE(review): BroadcastBlock is generic in TPL Dataflow; the type argument appears to have
// been lost from this snippet — restore it from the original source.
var htmlBroadcaster = new BroadcastBlock(x => x);
var retrieveHtmlLinksBlock = CreateRetrieveHtmlLinksBlock(dataflowOptions, context);
var retrieveHtmlImageLinksBlock = CreateRetrieveHtmlImageLinksBlock(dataflowOptions);
var downloadImageBlock = CreateDownloadImageBlock(dataflowOptions);
var translateHtmlBlock = CreateTranslateHtmlBlock(dataflowOptions, context);
var replaceToLocalLinksBlock = CreateReplaceToLocalLinksBlock(dataflowOptions);
var saveHtmlBlock = CreateSaveHtmlBlock(dataflowOptions);
}
Tamir Dresher
/// <summary>
/// Creates the transform block that downloads the HTML for an incoming item.
/// </summary>
/// <remarks>
/// An item is downloaded only when its depth does not exceed <c>context.MaxDepth</c> and its URL
/// is successfully added to <c>_processedUrls</c>. In every other case — depth exceeded, URL not
/// added, or an exception during the attempt (which is logged) — the block emits a placeholder
/// <c>HtmlProcessingData("", "", -1, context)</c> instead of faulting.
/// NOTE(review): downstream blocks presumably filter out the placeholder (depth -1) — confirm.
/// NOTE(review): TransformBlock is generic in TPL Dataflow; the type arguments appear to have
/// been lost from this snippet — restore them from the original source.
/// </remarks>
/// <param name="options">Execution options passed to the created block.</param>
/// <param name="context">Supplies <c>MaxDepth</c> and is attached to every produced result.</param>
/// <returns>The configured transform block.</returns>
TransformBlock CreateFetchHtmlBlock(ExecutionDataflowBlockOptions options, ScrapeContext context)
{
    return new TransformBlock(async request =>
    {
        try
        {
            // Depth is checked first, so TryAdd is only attempted for items within the depth limit.
            var shouldDownload = request.depth <= context.MaxDepth
                && _processedUrls.TryAdd(request.Url, true);

            if (shouldDownload)
            {
                var pageHtml = await DownloadHtmlAsync(request.Url);
                return new HtmlProcessingData(pageHtml, request.Url, request.depth, context);
            }
        }
        catch (Exception fetchError)
        {
            // Log and fall through to the placeholder so one bad page does not fault the block.
            _logger.LogError(fetchError, "Error fetching html {url}", request?.Url);
        }

        // Skipped or failed: emit the placeholder result.
        return new HtmlProcessingData("", "", -1, context);
    }, options);
}