Slide 30
Customer's spider!
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

class SpiderBot(CrawlSpider):
    name = 'bot'

    rules = (
        Rule(
            # Follow only links found in <h3> elements inside <li> items
            SgmlLinkExtractor(restrict_xpaths=('//li//h3',)),
            process_links='filter_links',
            callback='parse_item',
        ),
    )

    def filter_links(self, links):
        # Keep only the first 5 links, if available
        return links[:5]

    def __init__(self, filename=None, *args, **kwargs):
        super(SpiderBot, self).__init__(*args, **kwargs)
        # Get start urls from a file passed in as a spider argument;
        # urls_from_file() is a project helper not shown on this slide
        self.start_urls = urls_from_file(filename) if filename else []

    def parse_item(self, response):
        selector = Selector(response)
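
The filename argument is a regular Scrapy spider argument, so this spider can be started with, for example, scrapy crawl bot -a filename=urls.txt. The urls_from_file() helper is not shown on the slide; a minimal sketch, assuming it simply reads one URL per line:

# Hypothetical stand-in for the helper used in __init__ above:
# read start URLs from a plain-text file, one URL per line.
def urls_from_file(filename):
    with open(filename) as f:
        return [line.strip() for line in f if line.strip()]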