1. Approach: the crawl uses CrawlSpider, extracting article URLs with a Rule/LinkExtractor and setting follow=True so the crawler keeps following matched links across the whole site.
rules = (
    Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_all', follow=True),
    # Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_pachong', follow=True),
)
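As a quick sanity check of the allow pattern (LinkExtractor applies it with a regex search over each extracted URL), the same expression can be exercised with Python's re module. The example URLs below are illustrative only, not taken from the site:

import re

# The pattern used in the Rule above: article pages ending in "<digits>.html".
pattern = re.compile(r'\d+\.html$')

print(bool(pattern.search('http://cuiqingcai.com/4380.html')))   # True  -> handled by parse_all
print(bool(pattern.search('http://cuiqingcai.com/category/1')))  # False -> not extracted by this Rule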
2. The spider:

# coding: utf-8
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import CuiqingcaiItem


class myspider(CrawlSpider):
    name = 'cqc'
    allowed_domains = ['cuiqingcai.com']
    count_all = 0
    url_all = []
    start_urls = ['http://cuiqingcai.com']
    label_tags = [u'爬虫', 'scrapy', 'selenium']

    rules = (
        Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_all', follow=True),
        # Rule(LinkExtractor(allow=(r'\d+\.html$',)), callback='parse_pachong', follow=True),
    )

    '''
    # Store only crawler-related articles (title contains one of label_tags) in the database
    def parse_pachong(self, response):
        print_tag = False
        title_name = u""
        for tag in self.label_tags:
            title_name = response.xpath('//header/h1[1][@class="article-title"]/a/text()').extract()[0]
            if tag in title_name.lower():
                print_tag = True
        if print_tag:
            self.count_all += 1
            self.url_all.append(response.url)
            item = CuiqingcaiItem()
            item['url'] = response.url
            item['title'] = title_name
            return item
    '''

    # Store every article on the site (exported to a JSON file)
    def parse_all(self, response):
        titles = response.xpath('//header/h1[1][@class="article-title"]/a/text()').extract()
        title_name = titles[0] if titles else None
        item = CuiqingcaiItem()
        item['url'] = response.url
        item['title'] = title_name
        return item
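The comment on parse_all mentions writing the whole-site data to a JSON file, but the project pipeline shown in section 3 targets MongoDB. Scrapy's built-in feed export already covers the JSON case (scrapy crawl cqc -o all_pages.json); alternatively, here is a minimal JSON-lines pipeline sketch, where the file name and class name are assumptions rather than part of the original project:

import json

# Hypothetical JSON-lines pipeline; enable it via ITEM_PIPELINES if used.
class JsonWriterPipeline(object):
    def open_spider(self, spider):
        self.file = open('all_pages.json', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        # One JSON object per line; ensure_ascii=False keeps Chinese titles readable.
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item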
3. The pipeline:

from pymongo import MongoClient

from . import settings
from .items import CuiqingcaiItem


class CuiqingcaiPipeline(object):
    def __init__(self):
        # Connect to the local MongoDB instance and select the database and
        # collection named in settings.py.
        cn = MongoClient('127.0.0.1', 27017)
        db = cn[settings.Mongodb_DBNAME]
        self.table = db[settings.Mongodb_DBTable]

    def process_item(self, item, spider):
        if isinstance(item, CuiqingcaiItem):
            try:
                self.table.insert_one(dict(item))
            except Exception:
                pass
        return item
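The pipeline relies on two custom settings, Mongodb_DBNAME and Mongodb_DBTable, and has to be enabled through ITEM_PIPELINES. A sketch of the corresponding settings.py entries, assuming the project package is named cuiqingcai (the actual database and collection names are not given in the original):

# settings.py (sketch; the database and collection names are assumptions)
Mongodb_DBNAME = 'cuiqingcai'
Mongodb_DBTable = 'articles'

ITEM_PIPELINES = {
    'cuiqingcai.pipelines.CuiqingcaiPipeline': 300,
}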
4. The item:

import scrapy


class CuiqingcaiItem(scrapy.Item):
    title = scrapy.Field()  # article title
    url = scrapy.Field()    # page URL
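For reference, a Scrapy item behaves like a dict, which is what the pipeline depends on when calling dict(item). The values below are purely illustrative:

# Illustrative only: how an item is filled and what dict(item) yields.
item = CuiqingcaiItem()
item['title'] = u'Example article title'
item['url'] = 'http://cuiqingcai.com/1234.html'
print(dict(item))  # {'title': 'Example article title', 'url': 'http://cuiqingcai.com/1234.html'}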