1. Crawling movie information

http://www.imdb.cn/nowplaying/{num}    # listing-page URL pattern

http://www.imdb.cn/title/tt{num}    # detail page for a single movie

First, collect each movie's URL and title.
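Before writing the spider, it is worth checking what the detail-page pattern will and will not match. A minimal sketch using plain re (the example paths are made up):

import re

# the same pattern the crawl rule below uses for detail pages
DETAIL = re.compile(r"/title/tt\d+$")

for path in ["/title/tt1234567", "/title/tt1234567/reviews", "/nowplaying/3"]:
    # search() because LinkExtractor matches anywhere in the URL
    print(path, bool(DETAIL.search(path)))
# only the first path matches, so only movie detail pages reach the callback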

Create a new project:

scrapy startproject imdb

Edit items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ImdbItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    video_title = Field()     # page title
    video_rating = Field()    # rating
    video_name = Field()      # film name
    video_alias = Field()     # alternate titles
    video_director = Field()  # director(s)
    video_actor = Field()     # leading actors
    video_length = Field()    # runtime
    video_language = Field()  # language
    video_year = Field()      # release date
    video_type = Field()      # genre
    video_color = Field()     # color / black-and-white
    video_area = Field()      # country/region
    video_voice = Field()     # audio/voice
    video_summary = Field()   # plot summary
    video_url = Field()       # detail-page URL
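One thing to know about Scrapy Items: they behave like dicts but reject keys that were not declared as Field(), which catches typos early. A quick illustration (the values are placeholders):

from imdb.items import ImdbItem

item = ImdbItem()
item['video_title'] = 'some title'  # fine: declared above
# item['video_titel'] = '...'       # would raise KeyError: undeclared field
print(dict(item))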

Create a spider file movie.py under the spiders directory:

# -*- coding: utf-8 -*-
from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from imdb.items import ImdbItem


class ImdbSpider(CrawlSpider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    rules = (
        # follow every movie detail page and hand it to parse_imdb
        Rule(LinkExtractor(allow=r"/title/tt\d+$"), callback="parse_imdb", follow=True),
    )

    def start_requests(self):
        # seed the crawl with the "now playing" listing pages
        for i in range(1, 20):
            url = "http://www.imdb.cn/nowplaying/" + str(i)
            yield Request(url=url, callback=self.parse)

    def parse_imdb(self, response):
        item = ImdbItem()
        try:
            item['video_title'] = "".join(
                response.xpath('//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract())
            item['video_rating'] = "".join(
                response.xpath('//*[@class="fk-3"]/div[@class="hdd"]/span/i/text()').extract())
            # the <li> entries are not in a fixed order, so check each label
            content = response.xpath('//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li').extract()
            for i in range(0, len(content)):
                if "片名" in content[i]:  # film name
                    if i == 0:
                        item['video_name'] = "".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[1]/a/text()').extract())
                if "别名" in content[i]:  # alias
                    if i == 1:
                        item['video_alias'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()').extract())
                if "导演" in content[i]:  # director
                    if i == 1:
                        item['video_director'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[2]/a/text()').extract())
                    elif i == 2:
                        item['video_director'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()').extract())
                if "主演" in content[i]:  # starring
                    if i == 2:
                        item['video_actor'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[3]/a/text()').extract())
                    if i == 3:
                        item['video_actor'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[4]/a/text()').extract())
                if "上映时间" in content[i]:  # release date, genre, and color share one <li>
                    if i == 4:
                        item['video_year'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a[1]/text()').extract())
                        a = response.xpath('//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a').extract()
                        length = len(a) - 1
                        try:
                            # the last link is color / black-and-white
                            item['video_color'] = "".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()').extract()[length])
                        except Exception:
                            item['video_color'] = ""
                        try:
                            # the links in between are the genres
                            genres = "|".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[5]/a/text()').extract()[1:length])
                            parts = genres.split(":")  # drop anything after a colon
                            if len(parts) > 0:
                                item['video_type'] = parts[0]
                            else:
                                item['video_type'] = ""
                        except Exception:
                            item['video_type'] = ""
                    if i == 5:
                        item['video_year'] = "".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()').extract())
                        a = response.xpath('//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a').extract()
                        length = len(a) - 1
                        try:
                            item['video_color'] = "".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()').extract()[length])
                        except Exception:
                            item['video_color'] = ""
                        try:
                            genres = "|".join(response.xpath(
                                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a/text()').extract()[1:length])
                            parts = genres.split(":")
                            if len(parts) > 0:
                                item['video_type'] = parts[0]
                            else:
                                item['video_type'] = ""
                        except Exception:
                            item['video_type'] = ""
                if "国家" in content[i]:  # country and audio
                    if i == 5:
                        item['video_area'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[1]/text()').extract())
                        item['video_voice'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[6]/a[2]/text()').extract())
                    if i == 6:
                        item['video_area'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[1]/text()').extract())
                        item['video_voice'] = "|".join(response.xpath(
                            '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[7]/a[2]/text()').extract())
            item['video_length'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/text()').extract()).replace(" ", "")
            item['video_language'] = "".join(response.xpath(
                '//*[@class="fk-3"]/div[@class="bdd clear"]/ul/li[@class="nolink"]/a/text()').extract())
            item['video_summary'] = "".join(response.xpath(
                '//*[@class="fk-4 clear"]/div[@class="bdd clear"]/i/text()').extract()).strip().replace("\n", "")
            item['video_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)

Create a launcher file run.py under the spiders directory:

vim run.py

# coding:utf-8

from scrapy import cmdline

cmdline.execute("scrapy crawl imdb".split())

2. Depth-limited crawling

Create a new project:

scrapy startproject douban

In Scrapy, crawl depth is capped with the DEPTH_LIMIT setting in settings.py, e.g. DEPTH_LIMIT = 5. Depth is measured relative to the initial request URL.
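To see the bookkeeping in action: Scrapy records each response's depth in response.meta['depth'] (0 for start URLs, +1 for every followed link), and requests past DEPTH_LIMIT are silently dropped. A minimal sketch with a hypothetical spider that just logs depths:

import scrapy

class DepthDemoSpider(scrapy.Spider):
    # hypothetical spider, only for watching DEPTH_LIMIT work
    name = 'depth_demo'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/tag/']

    def parse(self, response):
        # depth 0 for start URLs, +1 for each followed link
        self.logger.info("depth=%s url=%s", response.meta.get('depth', 0), response.url)
        for href in response.xpath('//a/@href').extract()[:5]:
            yield scrapy.Request(response.urljoin(href), callback=self.parse)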

Edit settings.py:

DEPTH_LIMIT = 4

# Douban has anti-crawling measures, so set a download delay

DOWNLOAD_DELAY = 2

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'    # spoof a regular browser User-Agent
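Two related settings may be worth adding (optional, not part of the original setup): Scrapy already waits a random 0.5x-1.5x multiple of DOWNLOAD_DELAY unless RANDOMIZE_DOWNLOAD_DELAY is turned off, and AutoThrottle can adapt the delay to the server's response times:

DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True    # on by default: vary each wait to look less robotic
AUTOTHROTTLE_ENABLED = True        # optional: adjust the delay to observed latencies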

Edit items.py:

from scrapy import Item, Field


# music
class MusicItem(Item):
    music_name = Field()
    music_alias = Field()
    music_singer = Field()
    music_time = Field()
    music_rating = Field()
    music_votes = Field()
    music_tags = Field()
    music_url = Field()


# music reviews
class MusicReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_music = Field()
    review_time = Field()
    review_url = Field()

Create the spider file music.py:

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from douban.items import MusicItem, MusicReviewItem


class ReviewSpider(CrawlSpider):
    name = 'review'
    allowed_domains = ['music.douban.com']
    start_urls = ['https://music.douban.com/subject/1406522/']
    rules = (
        # review listing pages: follow only, no callback
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
        # individual review pages: parse them
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_review(self, response):
        try:
            item = MusicReviewItem()
            item['review_title'] = "".join(response.xpath('//*[@property="v:summary"]/text()').extract())
            content = "".join(
                response.xpath('//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace('\n', " ")
            item['review_author'] = "".join(response.xpath('//*[@property="v:reviewer"]/text()').extract())
            item['review_music'] = "".join(response.xpath('//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(response.xpath('//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)

The launcher file run.py:

# -*- coding: utf-8 -*-
from scrapy import cmdline

cmdline.execute("scrapy crawl review -o review.json".split())

The -o flag exports the scraped items to review.json.
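Scrapy infers the export format from the file extension, so the same flag covers other formats as well. Note that in the Scrapy versions this tutorial targets, -o appends to an existing file, so delete the old review.json between runs or the JSON output becomes invalid:

scrapy crawl review -o review.jl     # JSON lines, one object per line
scrapy crawl review -o review.csv    # CSV
scrapy crawl review -o review.xml    # XML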

3. Combining multiple spiders

Suppose we now need to crawl music details as well as music reviews, and movie details as well as movie reviews. Does every task really need its own project? That would mean four separate projects, one each for music, music reviews, movies, and movie reviews, with heavily duplicated code that is hard to maintain. Instead, we can keep several spiders in one project, as sketched below.
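The layout we are about to build looks roughly like this (a sketch; the file names match the steps below):

multi/
├── scrapy.cfg
└── multi/
    ├── items.py              # MusicItem, MusicReviewItem, VideoItem, VideoReviewItem
    ├── settings.py           # shared User-Agent and download delay
    └── spiders/
        ├── musicspider.py    # name = 'music'
        └── videospider.py    # name = 'video'

Running scrapy list inside the project should print both spider names.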

Create a new project:

scrapy startproject multi

Edit settings.py:

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
DOWNLOAD_DELAY = 2

Edit items.py:

from scrapy import Item, Field


# music
class MusicItem(Item):
    music_name = Field()
    music_alias = Field()
    music_singer = Field()
    music_time = Field()
    music_rating = Field()
    music_votes = Field()
    music_tags = Field()
    music_url = Field()


# music reviews
class MusicReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_music = Field()
    review_time = Field()
    review_url = Field()


# movies
class VideoItem(Item):
    video_name = Field()
    video_alias = Field()
    video_actor = Field()
    video_year = Field()
    video_time = Field()
    video_rating = Field()
    video_votes = Field()
    video_tags = Field()
    video_url = Field()
    video_director = Field()
    video_type = Field()
    video_bigtype = Field()
    video_area = Field()
    video_language = Field()
    video_length = Field()
    video_writer = Field()
    video_desc = Field()
    video_episodes = Field()


# movie reviews
class VideoReviewItem(Item):
    review_title = Field()
    review_content = Field()
    review_author = Field()
    review_video = Field()
    review_time = Field()
    review_url = Field()

Create two spider files under the spiders directory. First, videospider.py:

# -*- coding: utf-8 -*-
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from multi.items import VideoItem, VideoReviewItem

# patterns for fields that only appear as plain text inside div#info
AREA = re.compile(r"制片国家/地区: (.+?)\n")       # country/region
ALIAS = re.compile(r"又名: (.+?)\n")              # alternate titles
LANGUAGE = re.compile(r"语言: (.+?)\n")           # language
EPISODES = re.compile(r"集数: (.+?)\n")           # episode count
LENGTH = re.compile(r"单集片长: (.+?)\n")         # per-episode runtime


class VideoSpider(CrawlSpider):
    name = 'video'
    allowed_domains = ['movie.douban.com']
    start_urls = [
        'https://movie.douban.com/tag/',
        'https://movie.douban.com/tag/?view=cloud',
    ]
    rules = (
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?start=\d+$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_video", follow=True),
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_video(self, response):
        item = VideoItem()
        try:
            item["video_url"] = response.url
            item["video_name"] = ''.join(response.xpath(
                '//*[@id="content"]/h1/span[@property="v:itemreviewed"]/text()').extract())
            try:
                item["video_year"] = ''.join(response.xpath(
                    '//*[@id="content"]/h1/span[@class="year"]/text()').extract()).replace("(", "").replace(")", "")
            except Exception as e:
                self.logger.error('Exception: %s', e)
                item['video_year'] = ''
            introduction = response.xpath('//*[@id="link-report"]/span[@property="v:summary"]/text()').extract()
            if introduction:
                item["video_desc"] = ''.join(introduction).strip().replace("\r\n", " ")
            else:
                item["video_desc"] = ''.join(response.xpath(
                    '//*[@id="link-report"]/span/text()').extract()).strip().replace("\r\n", " ")
            item["video_director"] = "|".join(response.xpath(
                '//*[@id="info"]/span/span/a[@rel="v:directedBy"]/text()').extract())
            item["video_writer"] = "|".join(response.xpath('//*[@id="info"]/span[2]/span[2]/a/text()').extract())
            item["video_actor"] = "|".join(response.xpath("//a[@rel='v:starring']/text()").extract())
            item["video_type"] = "|".join(response.xpath('//*[@id="info"]/span[@property="v:genre"]/text()').extract())
            # several fields exist only as plain text in div#info, so fall back to the regexes
            info = "".join(response.xpath("//div[@id='info']").extract())
            area = AREA.search(info)
            if area is not None:
                item["video_area"] = "|".join([a.strip() for a in area.group(1).split("/")])
            else:
                item['video_area'] = ''
            alias = ALIAS.search(info)
            if alias is not None:
                item["video_alias"] = "|".join([a.strip() for a in alias.group(1).split("/")])
            else:
                item["video_alias"] = ""
            language = LANGUAGE.search(info)
            episodes = EPISODES.search(info)
            length = LENGTH.search(info)
            if language is not None:
                item["video_language"] = "|".join([lang.strip() for lang in language.group(1).split("/")])
            else:
                item['video_language'] = ''
            if length is not None:
                item["video_length"] = "|".join([runtime.strip() for runtime in length.group(1).split("/")])
            else:
                item["video_length"] = "".join(response.xpath(
                    '//*[@id="info"]/span[@property="v:runtime"]/text()').extract())
            item['video_time'] = "/".join(response.xpath(
                '//*[@id="info"]/span[@property="v:initialReleaseDate"]/text()').extract())
            if episodes is not None:
                item['video_bigtype'] = "电视剧"  # has an episode count, so it's a TV series
                item["video_episodes"] = "|".join([ep.strip() for ep in episodes.group(1).split("/")])
            else:
                item['video_bigtype'] = "电影"  # otherwise treat it as a movie
                item['video_episodes'] = ''
            item['video_tags'] = "|".join(response.xpath(
                '//*[@class="tags"]/div[@class="tags-body"]/a/text()').extract())
            try:
                item['video_rating'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/strong/text()').extract())
                item['video_votes'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/div/div[@class="rating_sum"]/a/span/text()').extract())
            except Exception as error:
                item['video_rating'] = '0'
                item['video_votes'] = '0'
                self.logger.error(error)
            yield item
        except Exception as error:
            self.logger.error(error)

    def parse_review(self, response):
        try:
            item = VideoReviewItem()
            item['review_title'] = "".join(response.xpath('//*[@property="v:summary"]/text()').extract())
            content = "".join(
                response.xpath('//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace("\n", " ")
            item['review_author'] = "".join(response.xpath('//*[@property="v:reviewer"]/text()').extract())
            item['review_video'] = "".join(response.xpath('//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(response.xpath('//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)

Next, musicspider.py:

# -*- coding: utf-8 -*-
import re

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from multi.items import MusicItem, MusicReviewItem


class MusicSpider(CrawlSpider):
    name = "music"
    allowed_domains = ['music.douban.com']
    start_urls = [
        'https://music.douban.com/tag/',
        'https://music.douban.com/tag/?view=cloud',
    ]
    rules = (
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))$")),
        Rule(LinkExtractor(allow=r"/tag/((\d+)|([\u4e00-\u9fa5]+)|(\w+))\?start=\d+\&type=T$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/reviews\?sort=time\&start=\d+$")),
        Rule(LinkExtractor(allow=r"/subject/\d+/$"), callback="parse_music", follow=True),
        Rule(LinkExtractor(allow=r"/review/\d+/$"), callback="parse_review", follow=True),
    )

    def parse_music(self, response):
        item = MusicItem()
        try:
            item['music_name'] = response.xpath('//*[@id="wrapper"]/h1/span/text()').extract()[0]
            content = "".join(response.xpath('//*[@id="info"]').extract())
            info = response.xpath('//*[@id="info"]/span').extract()
            item['music_alias'] = ""
            item['music_singer'] = ""
            item['music_time'] = ""
            # the field order varies per album, so match each label by position
            for i in range(0, len(info)):
                if "又名" in info[i]:  # alias
                    if i == 0:
                        item['music_alias'] = response.xpath('//*[@id="info"]/text()').extract()[1] \
                            .replace("\xa0", "").replace("\n", "").rstrip()
                    elif i == 1:
                        item['music_alias'] = response.xpath('//*[@id="info"]/text()').extract()[2] \
                            .replace("\xa0", "").replace("\n", "").rstrip()
                    elif i == 2:
                        item['music_alias'] = response.xpath('//*[@id="info"]/text()').extract()[3] \
                            .replace("\xa0", "").replace("\n", "").rstrip()
                    else:
                        item['music_alias'] = ""
                if "表演者" in info[i]:  # performer
                    if i == 0:
                        item['music_singer'] = "|".join(
                            response.xpath('//*[@id="info"]/span[1]/span/a/text()').extract())
                    elif i == 1:
                        item['music_singer'] = "|".join(
                            response.xpath('//*[@id="info"]/span[2]/span/a/text()').extract())
                    elif i == 2:
                        item['music_singer'] = "|".join(
                            response.xpath('//*[@id="info"]/span[3]/span/a/text()').extract())
                    else:
                        item['music_singer'] = ""
                if "发行时间" in info[i]:  # release date, plain text only
                    nbsp = re.findall(r"\n发行时间:(.*?)\n", content, re.S)
                    item['music_time'] = "".join(nbsp).replace("\xa0", "").replace("\n", "").replace(" ", "")
            try:
                item['music_rating'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/strong/text()').extract())
                item['music_votes'] = "".join(response.xpath(
                    '//*[@class="rating_self clearfix"]/div/div[@class="rating_sum"]/a/span/text()').extract())
            except Exception as error:
                item['music_rating'] = '0'
                item['music_votes'] = '0'
                self.logger.error(error)
            item['music_tags'] = "|".join(response.xpath('//*[@id="db-tags-section"]/div/a/text()').extract())
            item['music_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)

    def parse_review(self, response):
        try:
            item = MusicReviewItem()
            item['review_title'] = "".join(response.xpath('//*[@property="v:summary"]/text()').extract())
            content = "".join(
                response.xpath('//*[@id="link-report"]/div[@property="v:description"]/text()').extract())
            item['review_content'] = content.strip().replace("\n", " ")
            item['review_author'] = "".join(response.xpath('//*[@property="v:reviewer"]/text()').extract())
            item['review_music'] = "".join(response.xpath('//*[@class="main-hd"]/a[2]/text()').extract())
            item['review_time'] = "".join(response.xpath('//*[@class="main-hd"]/p/text()').extract())
            item['review_url'] = response.url
            yield item
        except Exception as error:
            self.logger.error(error)

Finally, create the launcher file run.py. Note that cmdline.execute() hands control to Scrapy and never returns, so chaining two calls would only ever run the first spider; CrawlerProcess runs both spiders in one process instead:

# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# cmdline.execute() exits the interpreter after the first crawl,
# so use CrawlerProcess to run both spiders in the same process
process = CrawlerProcess(get_project_settings())
process.crawl("music")
process.crawl("video")
process.start()