# Scrapy notes
# -*- coding: utf-8 -*-
# author : seven
# time : 2017/7/21
#1. Install scrapy, using the Douban mirror (faster from inside China):
pip install -i https://pypi.douban.com/simple/ scrapy
#2. Create a new scrapy project: scrapy startproject ArticleScrapy
#3. Create a spider inside the project:
cd ArticleScrapy
scrapy genspider jobbole blog.jobbole.com
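#scrapy genspider writes a spider skeleton to ArticleScrapy/spiders/jobbole.py, roughly like this (generated template, the exact output may differ slightly between scrapy versions):
# -*- coding: utf-8 -*-
import scrapy


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/']

    def parse(self, response):
        pass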
#4. Set up debugging
#4.1 Create a main.py entry point in the ArticleScrapy root directory with the following code:
from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))  # add main.py's directory to sys.path
execute(['scrapy', 'crawl', 'jobbole'])  # run the spider: the first two arguments are always 'scrapy' 'crawl', the third is the spider's name attribute
#4.2 Then edit settings.py:
ROBOTSTXT_OBEY = False
#4.3 Set a breakpoint inside parse() in spiders/jobbole.py and launch main.py in debug mode
#4.4 Debug the page at a given url from the command line:
scrapy shell http://blog.jobbole.com/11
title = response.xpath('//*xxsadadadada')   # any expression you want to test
title_content = title.extract()[0]
#5. XPath syntax (all indexes start at 1)
article                  : selects all child nodes of the article element
/article                 : selects the root article element
article/a                : selects all a elements that are children of article
//div                    : selects all div elements, wherever they appear in the document
article//div             : selects all div elements that are descendants of article, at any depth below it
//@class                 : selects all class attributes (use //*[@class] for the elements that carry one)
//@title                 : selects all title attributes
article//span[contains(@class,'test_class')] : span elements under article whose class value contains "test_class"
/article/div[1]          : selects the first div child of article (indexes start at 1)
/article/div[last()]     : selects the last div child of article
/article/div[last()-1]   : selects the second-to-last div child of article
//div[@lang='eng']       : selects all div elements whose lang attribute is 'eng'
/div/*                   : selects all child elements of div
//*                      : selects all elements
//div[@*]                : selects all div elements that have at least one attribute
//div/a | //div/p        : all a and p elements under div elements
//span | //ul            : selects all span and ul elements
article/div/p | //span   : selects all p elements under div children of article, plus all span elements
article/p/text()         : gets the text of the p elements
article/p/html()         : gets the html content (not standard XPath; in scrapy, call extract() on the selector instead)
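#These expressions can be tried directly on a Selector; the HTML below is a made-up snippet, purely for illustration:
from scrapy.selector import Selector

html = '<article><div class="test_class"><p>hello</p><span>world</span></div></article>'
sel = Selector(text=html)
print(sel.xpath('//article//div').extract())                     # every div that is a descendant of article
print(sel.xpath('//article/div[1]/p/text()').extract_first(''))  # 'hello'
print(sel.xpath('//div[@*]').extract())                          # divs that carry at least one attribute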
#6. CSS selectors, same syntax as on the front end
response.css('#tt .test h2::text').extract()[0]    # get the text
response.css('.test a::attr(href)').extract()[0]   # get the value of the href attribute
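#Same idea on a standalone Selector, with another made-up snippet:
from scrapy.selector import Selector

sel = Selector(text='<div id="tt"><div class="test"><h2>Hello</h2><a href="/post/1">more</a></div></div>')
print(sel.css('#tt .test h2::text').extract()[0])   # 'Hello'
print(sel.css('.test a::attr(href)').extract()[0])  # '/post/1'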
#7. Basic demo:
# -*- coding: utf-8 -*-
import scrapy
import re


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/112109/']

    def parse(self, response):
        # Note: extract_first(default) vs extract()[0]: the former returns the default you passed in
        # (e.g. 0 or '') when nothing matches, the latter raises an exception, so when you know there
        # is only one result, prefer extract_first
        # Extracting with xpath
        # title
        title = response.xpath('//*[@id="post-112109"]/div[1]/h1').extract_first('')
        # publish date
        create_date = response.xpath('//*[@id="post-112109"]/div[2]/p/text()').extract_first('').strip().replace('·', '').strip()
        # number of up-votes
        zan_nums = int(response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first(0))
        # number of bookmarks
        fav_str = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract_first('')
        zan_tmp = re.match(r".*?(\d+).*", fav_str)
        if zan_tmp:
            fav_nums = zan_tmp.group(1)
        else:
            fav_nums = 0
        # number of comments
        comment_str = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first('').strip()
        comment_tmp = re.match(r".*(\d+).*", comment_str)
        if comment_tmp:
            comment_nums = comment_tmp.group(1)
        else:
            comment_nums = 0
        # article body
        content = response.xpath('//div[@class="entry"]').extract_first('')

        # Extracting the same fields with css
        # title
        title2 = response.css('.entry-header h1::text').extract()[0]
        # publish date
        create_date2 = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
        # number of up-votes
        zan_nums2 = int(response.css('.vote-post-up h10::text').extract()[0])
        # number of bookmarks
        fav_str2 = response.css('.bookmark-btn::text').extract()[0]
        zan_tmp2 = re.match(r".*?(\d+).*", fav_str2)
        if zan_tmp2:
            fav_nums2 = zan_tmp2.group(1)
        else:
            fav_nums2 = 0
        # number of comments
        comment_str2 = response.css('a[href="#article-comment"] span::text').extract()[0].strip()
        comment_tmp2 = re.match(r".*(\d+).*", comment_str2)
        if comment_tmp2:
            comment_nums2 = comment_tmp2.group(1)
        else:
            comment_nums2 = 0
        # article body
        content2 = response.css('.entry').extract()[0]
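#A quick illustration of the extract_first note above, on a throwaway Selector (toy HTML, not from the site):
from scrapy.selector import Selector

sel = Selector(text='<div class="entry"><h1>title</h1></div>')
print(sel.css('h1::text').extract_first('default'))  # 'title'
print(sel.css('h2::text').extract_first('default'))  # 'default' - nothing matches, so the default comes back
# sel.css('h2::text').extract()[0] would raise an IndexError instead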
#8. Advanced demo
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # 1. Extract the url of every article on the list page and hand it to scrapy to download and parse
        post_urls = response.css('#archive .floated-thumb .post-thumb a::attr(href)').extract()
        for post_url in post_urls:
            # response.url is the current page url, used as the base when resolving relative links
            yield Request(url=parse.urljoin(response.url, post_url), callback=self.parse_detail)
        # 2. Extract the next-page link and hand it to scrapy to download
        next_urls = response.css('a.next.page-numbers::attr(href)').extract_first('')  # <a class='next page-numbers'></a>
        if next_urls:
            yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)
        else:
            print('end')

    def parse_detail(self, response):
        '''Extract the content of a single article'''
        # Note: extract_first(default) vs extract()[0]: the former returns the default you passed in
        # when nothing matches, the latter raises an exception, so when you know there is only one
        # result, prefer extract_first
        # Extracting with xpath
        # title
        title = response.xpath('//*[@id="post-112109"]/div[1]/h1/text()').extract_first('')
        # publish date
        create_date = response.xpath('//*[@id="post-112109"]/div[2]/p/text()').extract_first('').strip().replace('·', '').strip()
        # number of up-votes
        zan_nums = int(response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first(0))
        # number of bookmarks
        fav_str = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract_first('')
        zan_tmp = re.match(r".*?(\d+).*", fav_str)
        if zan_tmp:
            fav_nums = int(zan_tmp.group(1))
        else:
            fav_nums = 0
        # number of comments
        comment_str = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first('').strip()
        comment_tmp = re.match(r".*(\d+).*", comment_str)
        if comment_tmp:
            comment_nums = int(comment_tmp.group(1))
        else:
            comment_nums = 0
        # article body
        content = response.xpath('//div[@class="entry"]').extract_first('')

        # Extracting the same fields with css
        # title
        title2 = response.css('.entry-header h1::text').extract()[0]
        # publish date
        create_date2 = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
        # number of up-votes
        zan_nums2 = int(response.css('.vote-post-up h10::text').extract()[0])
        # number of bookmarks
        fav_str2 = response.css('.bookmark-btn::text').extract()[0]
        zan_tmp2 = re.match(r".*?(\d+).*", fav_str2)
        if zan_tmp2:
            fav_nums2 = zan_tmp2.group(1)
        else:
            fav_nums2 = 0
        # number of comments
        comment_str2 = response.css('a[href="#article-comment"] span::text').extract()[0].strip()
        comment_tmp2 = re.match(r".*(\d+).*", comment_str2)
        if comment_tmp2:
            comment_nums2 = comment_tmp2.group(1)
        else:
            comment_nums2 = 0
        # article body
        content2 = response.css('.entry').extract()[0]
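#How parse.urljoin behaves with the relative and absolute links handled above (example urls only):
from urllib import parse

print(parse.urljoin('http://blog.jobbole.com/all-posts/', '/112109/'))
# -> 'http://blog.jobbole.com/112109/'
print(parse.urljoin('http://blog.jobbole.com/all-posts/', 'http://blog.jobbole.com/112110/'))
# -> absolute urls are returned unchanged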
#9. Configure items and pipelines, and set up image downloading
#spiders/jobbole.py:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ArticleScrapy.items import JobBoleArticleItem


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # 1. Extract the url of every article on the list page and hand it to scrapy to download and parse
        post_nodes = response.css('#archive .floated-thumb .post-thumb a')
        for post_node in post_nodes:
            image_url = post_node.css('img::attr(src)').extract_first('')
            post_url = post_node.css('::attr(href)').extract_first('')
            # response.url is the base url for urljoin; meta passes front_image_url on to the response seen by parse_detail
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail,
                          meta={'front_image_url': image_url})
        # 2. Extract the next-page link and hand it to scrapy to download
        next_urls = response.css('a.next.page-numbers::attr(href)').extract_first('')  # <a class='next page-numbers'></a>
        if next_urls:
            yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)

    def parse_detail(self, response):
        '''Extract the content of a single article'''
        article_item = JobBoleArticleItem()
        image_url = response.meta.get('front_image_url', '')  # cover image url passed over from parse
        # Note: extract_first(default) vs extract()[0]: the former returns the default you passed in
        # when nothing matches, the latter raises an exception, so when you know there is only one
        # result, prefer extract_first
        # Extracting with css
        # title
        title2 = response.css('.entry-header h1::text').extract_first('')
        # publish date
        create_date2 = response.css('.entry-meta-hide-on-mobile::text').extract_first('').strip().replace('·', '').strip()
        # number of up-votes
        zan_nums2 = int(response.css('.vote-post-up h10::text').extract_first(0))
        # number of bookmarks
        fav_str2 = response.css('.bookmark-btn::text').extract_first('')
        zan_tmp2 = re.match(r".*?(\d+).*", fav_str2)
        if zan_tmp2:
            fav_nums2 = zan_tmp2.group(1)
        else:
            fav_nums2 = 0
        # number of comments
        comment_str2 = response.css('a[href="#article-comment"] span::text').extract_first('').strip()
        comment_tmp2 = re.match(r".*(\d+).*", comment_str2)
        if comment_tmp2:
            comment_nums2 = comment_tmp2.group(1)
        else:
            comment_nums2 = 0
        # article body
        content2 = response.css('.entry').extract()[0]

        # Fill the item
        article_item['title'] = title2
        article_item['url'] = response.url
        article_item['create_date'] = create_date2
        article_item['front_image_url'] = [image_url]  # must be a list: the image download pipeline expects a list of urls
        article_item['zan_nums'] = zan_nums2
        article_item['comment_nums'] = comment_nums2
        article_item['fav_nums'] = fav_nums2
        article_item['content'] = content2
        yield article_item
#After ITEM_PIPELINES is configured in settings, every item yielded above is passed to the pipelines for further processing
#items.py:
# -*- coding: utf-8 -*-
import scrapy


class ArticlescrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()    # url of the cover image on the list page
    front_image_path = scrapy.Field()   # local path the cover image is downloaded to
    zan_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    content = scrapy.Field()
#pipelines.py keeps the default content
#settings.py:
import os

ITEM_PIPELINES = {
    # item-to-pipeline routing; the number behind each entry is the order, lower numbers run first
    'ArticleScrapy.pipelines.ArticlescrapyPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,   # download images (needs Pillow: pip install pillow)
}
IMAGES_URLS_FIELD = "front_image_url"  # tells the ImagesPipeline configured above which item field holds the image urls to download
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # directory the images from IMAGES_URLS_FIELD are saved to
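#The following sections import get_md5 from ArticleScrapy.utils.common, a helper file that is not shown in these notes.
#A minimal sketch of ArticleScrapy/utils/common.py (assumed implementation, not the original file):
import hashlib


def get_md5(url):
    '''hash the url into a fixed-length hex string, used as url_object_id'''
    if isinstance(url, str):
        url = url.encode('utf-8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()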
#10. Save the data to json
#spiders/jobbole.py:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ArticleScrapy.items import JobBoleArticleItem
from ArticleScrapy.utils.common import get_md5
from datetime import datetime


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # 1. Extract the url of every article on the list page and hand it to scrapy to download and parse
        post_nodes = response.css('#archive .floated-thumb .post-thumb a')
        for post_node in post_nodes:
            image_url = post_node.css('img::attr(src)').extract_first('')
            post_url = post_node.css('::attr(href)').extract_first('')
            # response.url is the base url for urljoin; meta passes front_image_url on to the response seen by parse_detail
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail,
                          meta={'front_image_url': image_url})
        # 2. Extract the next-page link and hand it to scrapy to download (commented out here)
        # next_urls = response.css('a.next.page-numbers::attr(href)').extract_first('')  # <a class='next page-numbers'></a>
        # if next_urls:
        #     yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)

    def parse_detail(self, response):
        '''Extract the content of a single article'''
        article_item = JobBoleArticleItem()
        image_url = response.meta.get('front_image_url', '')  # cover image url passed over from parse
        # Note: extract_first(default) vs extract()[0]: the former returns the default you passed in
        # when nothing matches, the latter raises an exception, so when you know there is only one
        # result, prefer extract_first
        # Extracting with xpath (kept for reference)
        # title = response.xpath('//*[@id="post-112109"]/div[1]/h1/text()').extract_first('')
        # create_date = response.xpath('//*[@id="post-112109"]/div[2]/p/text()').extract_first('').strip().replace('·', '').strip()
        # zan_nums = int(response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first(0))
        # fav_str = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract_first('')
        # zan_tmp = re.match(r".*?(\d+).*", fav_str)
        # if zan_tmp:
        #     fav_nums = int(zan_tmp.group(1))
        # else:
        #     fav_nums = 0
        # comment_str = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first('').strip()
        # comment_tmp = re.match(r".*(\d+).*", comment_str)
        # if comment_tmp:
        #     comment_nums = int(comment_tmp.group(1))
        # else:
        #     comment_nums = 0
        # content = response.xpath('//div[@class="entry"]').extract_first('')
        # Extracting with css
        # title
        title2 = response.css('.entry-header h1::text').extract()[0]
        # publish date
        create_date2 = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
        # number of up-votes
        zan_nums2 = int(response.css('.vote-post-up h10::text').extract()[0])
        # number of bookmarks
        fav_str2 = response.css('.bookmark-btn::text').extract()[0]
        zan_tmp2 = re.match(r".*?(\d+).*", fav_str2)
        if zan_tmp2:
            fav_nums2 = zan_tmp2.group(1)
        else:
            fav_nums2 = 0
        # number of comments
        comment_str2 = response.css('a[href="#article-comment"] span::text').extract()[0].strip()
        comment_tmp2 = re.match(r".*(\d+).*", comment_str2)
        if comment_tmp2:
            comment_nums2 = comment_tmp2.group(1)
        else:
            comment_nums2 = 0
        # article body
        content2 = response.css('.entry').extract()[0]

        # Fill the item
        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title2
        article_item['url'] = response.url
        try:
            create_date2 = datetime.strptime(create_date2, '%Y/%m/%d').date()
        except Exception as e:
            create_date2 = datetime.now().date()
        article_item['create_date'] = create_date2
        article_item['front_image_url'] = [image_url]  # must be a list: the image download pipeline expects a list of urls
        article_item['zan_nums'] = zan_nums2
        article_item['comment_nums'] = comment_nums2
        article_item['fav_nums'] = fav_nums2
        article_item['content'] = content2
        yield article_item
#After ITEM_PIPELINES is configured in settings, every item yielded above is passed to the pipelines for further processing
#items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
import datetime


class ArticlescrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()    # url of the cover image on the list page
    front_image_path = scrapy.Field()   # local path the cover image is downloaded to
    zan_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    content = scrapy.Field()
#pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
import codecs
import json


class ArticlescrapyPipeline(object):
    '''Runs after ArticleImagePipelines, so front_image_path has already been filled in by the time the item arrives here'''
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    '''Custom pipeline that writes items to a json file'''
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        '''called automatically when the spider finishes'''
        self.file.close()


class JsonExporterPipeline(object):
    '''Use scrapy's built-in JsonItemExporter to write a json file'''
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


# Our own image pipeline, subclassing ImagesPipeline
class ArticleImagePipelines(ImagesPipeline):
    def item_completed(self, results, item, info):
        image_file_path = ''
        for ok, value in results:  # results is a list of (success, info) tuples
            # 'path' is the local path the image was saved to (filled in by ImagesPipeline);
            # each item currently has a single image, so the loop runs only once
            image_file_path = value['path']
        item['front_image_path'] = image_file_path
        return item  # must return the item so the next pipeline can receive it
#settings.py:
import os

ITEM_PIPELINES = {
    # item-to-pipeline routing; the number behind each entry is the order, lower numbers run first
    # 'ArticleScrapy.pipelines.ArticlescrapyPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,   # download images
    'ArticleScrapy.pipelines.ArticleImagePipelines': 1,
    'ArticleScrapy.pipelines.JsonExporterPipeline': 2,
}
IMAGES_URLS_FIELD = "front_image_url"  # tells the ImagesPipeline which item field holds the image urls to download
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # directory the downloaded images are saved to
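#A quick way to sanity-check the two exports after a crawl (file names as configured in the pipelines above):
import json

# article.json is written by JsonWithEncodingPipeline, one JSON object per line
with open('article.json', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line)['title'])

# articleexporter.json is written by JsonItemExporter as a single JSON array
with open('articleexporter.json', encoding='utf-8') as f:
    print(len(json.load(f)))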
#11. Save the data to a database
#Install the mysql driver first:
pip install mysqlclient
#spiders/jobbole.py:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ArticleScrapy.items import JobBoleArticleItem
from ArticleScrapy.utils.common import get_md5
from datetime import datetime


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # 1. Extract the url of every article on the list page and hand it to scrapy to download and parse
        post_nodes = response.css('#archive .floated-thumb .post-thumb a')
        for post_node in post_nodes:
            image_url = post_node.css('img::attr(src)').extract_first('')
            post_url = post_node.css('::attr(href)').extract_first('')
            # response.url is the base url for urljoin; meta passes front_image_url on to the response seen by parse_detail
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail,
                          meta={'front_image_url': image_url})
        # 2. Extract the next-page link and hand it to scrapy to download (commented out here)
        # next_urls = response.css('a.next.page-numbers::attr(href)').extract_first('')  # <a class='next page-numbers'></a>
        # if next_urls:
        #     yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)

    def parse_detail(self, response):
        '''Extract the content of a single article'''
        article_item = JobBoleArticleItem()
        image_url = response.meta.get('front_image_url', '')  # cover image url passed over from parse
        # Note: extract_first(default) vs extract()[0]: the former returns the default you passed in
        # when nothing matches, the latter raises an exception, so when you know there is only one
        # result, prefer extract_first
        # Extracting with xpath (kept for reference)
        # title = response.xpath('//*[@id="post-112109"]/div[1]/h1/text()').extract_first('')
        # create_date = response.xpath('//*[@id="post-112109"]/div[2]/p/text()').extract_first('').strip().replace('·', '').strip()
        # zan_nums = int(response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract_first(0))
        # fav_str = response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract_first('')
        # zan_tmp = re.match(r".*?(\d+).*", fav_str)
        # if zan_tmp:
        #     fav_nums = int(zan_tmp.group(1))
        # else:
        #     fav_nums = 0
        # comment_str = response.xpath('//a[@href="#article-comment"]/span/text()').extract_first('').strip()
        # comment_tmp = re.match(r".*(\d+).*", comment_str)
        # if comment_tmp:
        #     comment_nums = int(comment_tmp.group(1))
        # else:
        #     comment_nums = 0
        # content = response.xpath('//div[@class="entry"]').extract_first('')
        # Extracting with css
        # title
        title2 = response.css('.entry-header h1::text').extract()[0]
        # publish date
        create_date2 = response.css('.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
        # number of up-votes
        zan_nums2 = int(response.css('.vote-post-up h10::text').extract()[0])
        # number of bookmarks
        fav_str2 = response.css('.bookmark-btn::text').extract()[0]
        zan_tmp2 = re.match(r".*?(\d+).*", fav_str2)
        if zan_tmp2:
            fav_nums2 = zan_tmp2.group(1)
        else:
            fav_nums2 = 0
        # number of comments
        comment_str2 = response.css('a[href="#article-comment"] span::text').extract()[0].strip()
        comment_tmp2 = re.match(r".*(\d+).*", comment_str2)
        if comment_tmp2:
            comment_nums2 = comment_tmp2.group(1)
        else:
            comment_nums2 = 0
        # article body
        content2 = response.css('.entry').extract()[0]

        # Fill the item
        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title2
        article_item['url'] = response.url
        try:
            create_date2 = datetime.strptime(create_date2, '%Y/%m/%d').date()
        except Exception as e:
            create_date2 = datetime.now().date()
        article_item['create_date'] = create_date2
        article_item['front_image_url'] = [image_url]  # must be a list: the image download pipeline expects a list of urls
        article_item['zan_nums'] = zan_nums2
        article_item['comment_nums'] = comment_nums2
        article_item['fav_nums'] = fav_nums2
        article_item['content'] = content2
        yield article_item
#After ITEM_PIPELINES is configured in settings, every item yielded above is passed to the pipelines for further processing
#items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
import datetime


class ArticlescrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class JobBoleArticleItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field()    # url of the cover image on the list page
    front_image_path = scrapy.Field()   # local path the cover image is downloaded to
    zan_nums = scrapy.Field()
    comment_nums = scrapy.Field()
    fav_nums = scrapy.Field()
    content = scrapy.Field()
#pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi
import codecs
import json
import MySQLdb
import MySQLdb.cursors


class ArticlescrapyPipeline(object):
    '''Runs after ArticleImagePipelines, so front_image_path has already been filled in by the time the item arrives here'''
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    '''Custom pipeline that writes items to a json file'''
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        '''called automatically when the spider finishes'''
        self.file.close()


class JsonExporterPipeline(object):
    '''Use scrapy's built-in JsonItemExporter to write a json file'''
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlPipelines(object):
    '''Store items in the database, approach 1: synchronous inserts, low throughput'''
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'scrapy_spider', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = "insert into jobbole_article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,zan_nums,fav_nums,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(insert_sql, (item['title'], item['create_date'], item['url'], item['url_object_id'],
                                         item['front_image_url'][0], item['front_image_path'],
                                         item['comment_nums'], item['zan_nums'], item['fav_nums'], item['content']))
        self.conn.commit()
        return item


class MysqlTwistedPipelines(object):
    '''Store items in the database, approach 2: asynchronous inserts via Twisted; the database settings live in settings.py'''
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        '''Called automatically by scrapy; reads the settings.py variables, then __init__ runs with the returned pool'''
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor
        )
        # adbapi turns the mysql calls into asynchronous operations
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
        return cls(dbpool)  # cls is MysqlTwistedPipelines; instantiating it runs __init__

    def process_item(self, item, spider):
        '''Run the mysql insert asynchronously through Twisted'''
        query = self.dbpool.runInteraction(self.do_insert, item)  # the first argument is our own insert function
        query.addErrback(self.handle_error)  # custom error handler
        return item

    def do_insert(self, cursor, item):
        '''The actual insert logic'''
        insert_sql = "insert into jobbole_article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,zan_nums,fav_nums,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cursor.execute(insert_sql, (item['title'], item['create_date'], item['url'], item['url_object_id'],
                                    item['front_image_url'][0], item['front_image_path'],
                                    item['comment_nums'], item['zan_nums'], item['fav_nums'], item['content']))

    def handle_error(self, failure):
        '''Handle exceptions raised by the asynchronous insert'''
        print(failure)


# Our own image pipeline, subclassing ImagesPipeline
class ArticleImagePipelines(ImagesPipeline):
    def item_completed(self, results, item, info):
        image_file_path = ''
        for ok, value in results:  # results is a list of (success, info) tuples
            # 'path' is the local path the image was saved to (filled in by ImagesPipeline);
            # each item currently has a single image, so the loop runs only once
            image_file_path = value['path']
        item['front_image_path'] = image_file_path
        return item  # must return the item so the next pipeline can receive it
#settings.py:
import os

ITEM_PIPELINES = {
    # item-to-pipeline routing; the number behind each entry is the order, lower numbers run first
    # 'ArticleScrapy.pipelines.ArticlescrapyPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,   # download images
    'ArticleScrapy.pipelines.ArticleImagePipelines': 1,
    # 'ArticleScrapy.pipelines.JsonExporterPipeline': 2,
    # 'ArticleScrapy.pipelines.MysqlPipelines': 2,
    'ArticleScrapy.pipelines.MysqlTwistedPipelines': 2,
}
IMAGES_URLS_FIELD = "front_image_url"  # tells the ImagesPipeline which item field holds the image urls to download
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # directory the downloaded images are saved to
# MySQL settings read by MysqlTwistedPipelines.from_settings (listed again at the end of these notes)
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'scrapy_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
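#The two MySQL pipelines assume a jobbole_article table already exists; these notes never create it.
#A one-off sketch that builds it by hand (column names follow the INSERT statement above; the column types and lengths are assumptions):
import MySQLdb

conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'scrapy_spider', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS jobbole_article (
        url_object_id    VARCHAR(50)  NOT NULL PRIMARY KEY,
        title            VARCHAR(200) NOT NULL,
        create_date      DATE,
        url              VARCHAR(300),
        front_image_url  VARCHAR(300),
        front_image_path VARCHAR(200),
        comment_nums     INT,
        zan_nums         INT,
        fav_nums         INT,
        content          LONGTEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()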
#12. Using ItemLoader:
#spiders/jobbole.py:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy.http import Request
from urllib import parse
from ArticleScrapy.items import JobBoleArticleItem, ArticleItemLoader
from ArticleScrapy.utils.common import get_md5
from datetime import datetime
from scrapy.loader import ItemLoader


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    def parse(self, response):
        # 1. Extract the url of every article on the list page and hand it to scrapy to download and parse
        post_nodes = response.css('#archive .floated-thumb .post-thumb a')
        for post_node in post_nodes:
            image_url = post_node.css('img::attr(src)').extract_first('')
            post_url = post_node.css('::attr(href)').extract_first('')
            # response.url is the base url for urljoin; meta passes front_image_url on to the response seen by parse_detail
            yield Request(url=parse.urljoin(response.url, post_url),
                          callback=self.parse_detail,
                          meta={'front_image_url': image_url})
        # 2. Extract the next-page link and hand it to scrapy to download (commented out here)
        # next_urls = response.css('a.next.page-numbers::attr(href)').extract_first('')  # <a class='next page-numbers'></a>
        # if next_urls:
        #     yield Request(url=parse.urljoin(response.url, next_urls), callback=self.parse)

    def parse_detail(self, response):
        '''Extract the content of a single article'''
        image_url = response.meta.get('front_image_url', '')  # cover image url passed over from parse
        # Extract everything through an ItemLoader
        itemLoader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
        # itemLoader.add_xpath()  # value selected via an xpath expression
        # itemLoader.add_value()  # add a value directly
        itemLoader.add_css('title', '.entry-header h1::text')  # value selected via css
        itemLoader.add_value('url', response.url)
        itemLoader.add_value('url_object_id', get_md5(response.url))
        itemLoader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
        itemLoader.add_value('front_image_url', [image_url])
        itemLoader.add_css('zan_nums', '.vote-post-up h10::text')
        itemLoader.add_css('comment_nums', 'a[href="#article-comment"] span::text')
        itemLoader.add_css('fav_nums', '.bookmark-btn::text')
        itemLoader.add_css('content', '.entry')
        article_item = itemLoader.load_item()
        yield article_item
#After ITEM_PIPELINES is configured in settings, every item yielded above is passed to the pipelines for further processing
#items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
from datetime import datetime
import re
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticlescrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


def add_jobbole(value):
    return value + '-jobbole'


def date_convert(date):
    '''parse a formatted date string into a date'''
    try:
        result = datetime.strptime(date, '%Y/%m/%d').date()
    except Exception as e:
        result = datetime.now().date()
    return result


def get_nums(value):
    tmp = re.match(r".*?(\d+).*", value)
    if tmp:
        result = int(tmp.group(1))
    else:
        result = 0
    return result


def remove_content_text(value):
    '''drop the "N comments" entry from the tag values'''
    if '评论' in value:
        return ''
    else:
        return value


def return_value(value):
    return value


class ArticleItemLoader(ItemLoader):
    '''Custom ItemLoader'''
    # every field loaded through this loader takes the first value by default,
    # so each field doesn't have to define its own output_processor
    default_output_processor = TakeFirst()


class JobBoleArticleItem(scrapy.Item):
    # data = scrapy.Field(
    #     input_processor=MapCompose(lambda x: x + 'lxj', add_jobbole),  # input_processor pre-processes the value; MapCompose takes several functions (lambdas work too) and applies them in order
    #     output_processor=TakeFirst()  # take the first value from the list, similar to extract_first()
    # )
    # tag = scrapy.Field(
    #     input_processor=MapCompose(remove_content_text),  # the tag list contains an unwanted "N comments" entry; this drops it before Join runs (input_processor first, then Join)
    #     output_processor=Join(',')  # tag is already a list in the spider; Join glues it together with ","; this overrides the default output_processor in ArticleItemLoader
    # )
    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + 'lxj', add_jobbole)
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        # front_image_url is referenced by IMAGES_URLS_FIELD in settings and must stay a list,
        # so override the default output_processor from ArticleItemLoader
        output_processor=MapCompose(return_value)  # does nothing, returns the given list unchanged
    )
    front_image_path = scrapy.Field()
    zan_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    content = scrapy.Field()
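#The processors can be tried outside Scrapy as plain callables; the sample values below are made up:
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from ArticleScrapy.items import get_nums

print(MapCompose(get_nums)(['8 收藏', '']))        # -> [8, 0], get_nums applied to every value
print(TakeFirst()(['', None, 'first non-empty']))  # -> 'first non-empty'
print(Join(',')(['python', 'scrapy']))             # -> 'python,scrapy'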
#pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi
import codecs
import json
import MySQLdb
import MySQLdb.cursors


class ArticlescrapyPipeline(object):
    '''Runs after ArticleImagePipelines, so front_image_path has already been filled in by the time the item arrives here'''
    def process_item(self, item, spider):
        return item


class JsonWithEncodingPipeline(object):
    '''Custom pipeline that writes items to a json file'''
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        '''called automatically when the spider finishes'''
        self.file.close()


class JsonExporterPipeline(object):
    '''Use scrapy's built-in JsonItemExporter to write a json file'''
    def __init__(self):
        self.file = open('articleexporter.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlPipelines(object):
    '''Store items in the database, approach 1: synchronous inserts, low throughput'''
    def __init__(self):
        self.conn = MySQLdb.connect('127.0.0.1', 'root', 'root', 'scrapy_spider', charset='utf8', use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = "insert into jobbole_article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,zan_nums,fav_nums,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(insert_sql, (item['title'], item['create_date'], item['url'], item['url_object_id'],
                                         item['front_image_url'][0], item['front_image_path'],
                                         item['comment_nums'], item['zan_nums'], item['fav_nums'], item['content']))
        self.conn.commit()
        return item


class MysqlTwistedPipelines(object):
    '''Store items in the database, approach 2: asynchronous inserts via Twisted; the database settings live in settings.py'''
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        '''Called automatically by scrapy; reads the settings.py variables, then __init__ runs with the returned pool'''
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor
        )
        # adbapi turns the mysql calls into asynchronous operations
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
        return cls(dbpool)  # cls is MysqlTwistedPipelines; instantiating it runs __init__

    def process_item(self, item, spider):
        '''Run the mysql insert asynchronously through Twisted'''
        query = self.dbpool.runInteraction(self.do_insert, item)  # the first argument is our own insert function
        query.addErrback(self.handle_error)  # custom error handler
        return item

    def do_insert(self, cursor, item):
        '''The actual insert logic'''
        insert_sql = "insert into jobbole_article(title,create_date,url,url_object_id,front_image_url,front_image_path,comment_nums,zan_nums,fav_nums,content) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cursor.execute(insert_sql, (item['title'], item['create_date'], item['url'], item['url_object_id'],
                                    item['front_image_url'][0], item['front_image_path'],
                                    item['comment_nums'], item['zan_nums'], item['fav_nums'], item['content']))

    def handle_error(self, failure):
        '''Handle exceptions raised by the asynchronous insert'''
        print(failure)


# Our own image pipeline, subclassing ImagesPipeline
class ArticleImagePipelines(ImagesPipeline):
    def item_completed(self, results, item, info):
        if 'front_image_url' in item:  # only items that actually had an image to download
            image_file_path = ''
            for ok, value in results:  # results is a list of (success, info) tuples
                # 'path' is the local path the image was saved to (filled in by ImagesPipeline);
                # each item currently has a single image, so the loop runs only once
                image_file_path = value['path']
            item['front_image_path'] = image_file_path
        return item  # must return the item so the next pipeline can receive it
#settings.py:
import os

ITEM_PIPELINES = {
    # item-to-pipeline routing; the number behind each entry is the order, lower numbers run first
    # 'ArticleScrapy.pipelines.ArticlescrapyPipeline': 300,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,   # download images
    'ArticleScrapy.pipelines.ArticleImagePipelines': 1,
    # 'ArticleScrapy.pipelines.JsonExporterPipeline': 2,
    # 'ArticleScrapy.pipelines.MysqlPipelines': 2,
    'ArticleScrapy.pipelines.MysqlTwistedPipelines': 2,
}
IMAGES_URLS_FIELD = "front_image_url"  # tells the ImagesPipeline which item field holds the image urls to download
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')  # directory the downloaded images are saved to
# MySQL settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'scrapy_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'