Slide 44
Slide 44 text
Writing a Crawler — CrawlSpider
class IMDBSpider(CrawlSpider):
    """Example CrawlSpider that starts from the IMDb Top chart page.

    The ``rules`` tuple declares which links are extracted and which
    spider method (named by ``callback``) handles the matching pages.
    The ``'regular expression'`` patterns and the ``"..."`` XPath
    expressions are placeholders to be replaced with real ones.
    """

    name = 'imdbspider'
    start_urls = ['http://www.imdb.com/chart/top']

    rules = (
        # Links matching ``category.php`` but not ``subsection.php``.
        # No callback is given for this rule.
        Rule(SgmlLinkExtractor(allow=(r'category\.php', ),
                               deny=(r'subsection\.php', ))),
        # Pages handled by ``parse_review``.
        # NOTE(review): ``parse_review`` is not defined in this snippet;
        # it must exist on the spider for this rule to do anything useful.
        Rule(SgmlLinkExtractor(allow=('regular expression', )),
             callback='parse_review'),
        # Pages handled by ``parse_movie`` below.
        Rule(SgmlLinkExtractor(allow=('regular expression', )),
             callback='parse_movie'),
    )

    def parse_movie(self, response):
        """Build a ``MovieItem`` from a movie page response and yield it.

        The title, year and rating fields are filled from XPath queries
        against ``response``; the XPath strings are placeholders.
        """
        sel = Selector(response)
        movie_item = MovieItem()
        movie_item['title'] = sel.xpath("...").extract()
        movie_item['year'] = sel.xpath("...").extract()
        movie_item['rating'] = sel.xpath("...").extract()
        yield movie_item
Bob Mingshen Sun ANSR Lab Group Study June 1, 2015 44 / 56