Scrapy: Scraping Dynamic Content, Downloading Images, and Plugging in Selenium
Scraping dynamic content
On some pages, part of the data is loaded asynchronously via AJAX, so a crawler that simply fetches the page HTML will not pick it up.
In that case we can get the data by hitting the underlying data API directly.
The workflow: open the developer tools on the target page and go to the XHR tab under Network, refresh the page to capture the asynchronous requests, inspect the request URLs to work out how the data is paged, then analyze the returned fields and extract the ones you need.
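For example, you can probe the endpoint seen in the XHR panel before writing the spider. This is a minimal sketch, assuming the image.so.com endpoint still returns the same JSON structure that the spider below relies on (a 'list' key whose entries carry fields such as group_title and qhimg_url):

from json import loads
from urllib.parse import urlencode
from urllib.request import urlopen

# URL pattern observed in the XHR panel; 'sn' is the result offset and grows in steps of 30.
base_url = 'http://image.so.com/zj?'
params = {'ch': 'beauty', 'listtype': 'new', 'temp': 1, 'sn': 30}

with urlopen(base_url + urlencode(params)) as resp:
    data = loads(resp.read().decode('utf-8'))

# Inspect the payload to decide which fields are worth extracting.
print(data.keys())
for elem in data.get('list', []):
    print(elem.get('group_title'), elem.get('qhimg_url'))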
Below is a spider that scrapes images from image.so.com (360 Images). Because we build the URLs to crawl ourselves, the start_urls list is no longer needed, and we override the spider's start_requests method instead.
from json import loads
from urllib.parse import urlencode

import scrapy

from img360.items import ImgItem


class ImageSpider(scrapy.Spider):
    name = 'image'
    allowed_domains = ['image.so.com']

    def start_requests(self):
        base_url = 'http://image.so.com/zj?'
        param = {'ch': 'beauty', 'listtype': 'new', 'temp': 1}
        for page in range(10):
            # 'sn' is the result offset; each page of the JSON API returns 30 entries.
            param['sn'] = page * 30
            full_url = base_url + urlencode(param)
            yield scrapy.Request(url=full_url, callback=self.parse)

    def parse(self, response):
        # The response body is JSON rather than HTML, so parse it with json.loads.
        model_dict = loads(response.text)
        for elem in model_dict['list']:
            item = ImgItem()
            item['title'] = elem['group_title']
            item['tag'] = elem['tag']
            item['width'] = elem['cover_width']
            item['height'] = elem['cover_height']
            item['url'] = elem['qhimg_url']
            yield item
Downloading the images locally
Scrapy ships with built-in support for downloading images. All we need to do is subclass ImagesPipeline in the pipelines file, override its get_media_requests, item_completed and file_path methods, and configure the download path (IMAGES_STORE) in settings. With that in place, every image is saved to local disk as its item is scraped. The concrete implementation is shown in the code below.
import logging

import pymongo
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

logger = logging.getLogger('SaveImgPipeline')


class SaveImgPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Issue a download request for the image URL carried by the item.
        yield Request(url=item['url'])

    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples, one per request.
        if not results[0][0]:
            raise DropItem('Download failed')
        logger.debug('Download finished')
        return item

    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of the image URL.
        filename = request.url.split('/')[-1]
        return filename


class SaveToMongoPipeline(object):

    def __init__(self, mongo_url, db_name, coll_name):
        self.mongo_url = mongo_url
        self.db_name = db_name
        self.coll_name = coll_name
        self.client = None
        self.db = None
        self.coll = None

    def process_item(self, item, spider):
        self.coll.insert_one(dict(item))
        return item

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.db_name]
        self.coll = self.db[self.coll_name]

    def close_spider(self, spider):
        self.client.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings.get('MONGO_URL'),
                   crawler.settings.get('MONGO_DB'),
                   crawler.settings.get('MONGO_COLL'))
settings.py for this project, including the extra item pipelines and the downloader middleware configured on top of the defaults:
BOT_NAME = 'img360'
SPIDER_MODULES = ['img360.spiders']
NEWSPIDER_MODULE = 'img360.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)' \
             ' Chrome/67.0.3396.62 Safari/537.36'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 2
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOADER_MIDDLEWARES = {
    'img360.middlewares.TaoDaoDownloaderMiddleware': 543,
}
IMAGES_STORE = './resourses/'
ITEM_PIPELINES = {
    'img360.pipelines.SaveImgPipeline': 300,
    'img360.pipelines.SaveToMongoPipeline': 301,
}
LOG_LEVEL = 'DEBUG'
MONGO_URL = 'mongodb://47.98.172.171:27017'
MONGO_DB = 'image360'
MONGO_COLL = 'beauty'
items.py:
import scrapy


class ImgItem(scrapy.Item):
    title = scrapy.Field()
    tag = scrapy.Field()
    width = scrapy.Field()
    height = scrapy.Field()
    url = scrapy.Field()
StringIO
StringIO() gives you a mutable string buffer. Use it when a string has to be built up through repeated concatenation: a normal str is immutable, so ordinary concatenation keeps producing new string objects and leaves a lot of useless intermediate data in memory.
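A minimal sketch of the contrast between the two approaches (the list of parts is made up purely for illustration):

from io import StringIO

parts = ['scrapy', 'selenium', 'pymongo'] * 1000

# Plain str concatenation: str is immutable, so every += builds a brand-new
# string object and the discarded intermediates pile up in memory.
joined = ''
for part in parts:
    joined += part

# StringIO is a mutable in-memory text buffer: write() appends to it without
# rebuilding the whole string each time.
buffer = StringIO()
for part in parts:
    buffer.write(part)
joined2 = buffer.getvalue()

assert joined == joined2

In practice ''.join(parts) is the other common idiom for the same job.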
Packet-capture tools
Wireshark: a powerful, general-purpose packet-capture tool. Charles: captures HTTP/HTTPS only.
Packet-capture guide: the Taobao search spider below builds its request URLs from parameters observed in the browser in exactly this way, and uses StringIO to piece the product title together.
from io import StringIO
from urllib.parse import urlencode
import re

import scrapy

from taobao.items import GoodsItem  # assumed import: GoodsItem is defined in the project's items.py


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    allowed_domains = ['www.taobao.com']

    def start_requests(self):
        base_url = 'https://s.taobao.com/search?'
        params = {}
        for keyword in ['ipad', 'iphone', '小米手机']:
            params['q'] = keyword
            for page in range(10):
                # 's' is the result offset; Taobao search pages in steps of 44.
                params['s'] = page * 44
                full_url = base_url + urlencode(params)
                yield scrapy.Request(url=full_url, callback=self.parse)

    def parse(self, response):
        goods_list = response.xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div')
        for goods in goods_list:
            item = GoodsItem()
            item['price'] = goods.xpath('div[2]/div[1]/div[1]/strong/text()').extract_first()
            item['deal'] = goods.xpath('div[2]/div[1]/div[2]/text()').extract_first()
            segments = goods.xpath('div[2]/div[2]/a/text()').extract()
            # The title is split across several text nodes; strip whitespace
            # from each one and join them with a StringIO buffer.
            title = StringIO()
            for segment in segments:
                title.write(re.sub(r'\s', '', segment))
            item['title'] = title.getvalue()
            yield item
Adding cookies to a request
Log in manually first and copy the cookies from the browser, then attach those cookies to your own requests; that lets the crawler bypass the login step.
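A minimal sketch of how this might look in Scrapy; the spider name and the cookie keys and values are placeholders, not real credentials:

import scrapy


class LoginRequiredSpider(scrapy.Spider):
    name = 'login_required'  # hypothetical spider, only to show the cookies= parameter

    # Cookies copied out of the browser's developer tools after logging in manually.
    cookies = {
        'sessionid': 'paste-your-session-id-here',
        'csrftoken': 'paste-your-csrf-token-here',
    }

    def start_requests(self):
        # scrapy.Request accepts a cookies dict and sends it with the request,
        # so the site sees what looks like an already logged-in session.
        yield scrapy.Request('https://example.com/account',
                             cookies=self.cookies,
                             callback=self.parse)

    def parse(self, response):
        self.logger.info('Fetched %s with the saved login cookies', response.url)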
Replacing Scrapy's downloader with Selenium
First, a quick look at how a request travels from the engine to the downloader in Scrapy. In the normal flow the engine hands the URL to the downloader, and the downloader fetches the page. To plug in Selenium we have to interrupt that flow and let Selenium do the fetching instead, which is done by overriding process_request in a downloader middleware. process_request can end in three ways:
return None: Scrapy's built-in downloader goes ahead and downloads the page.
return a Request object: the downloader is not used; the new request is handed back to the scheduler.
return a Response object: the downloader is bypassed and processing moves on to the next stage with this response.
So all we need to do is run Selenium inside this method and return a Response built from the page it has already loaded.
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from scrapy import signals
from scrapy.http import HtmlResponse


class Img360SpiderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Img360DownloaderMiddleware(object):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TaoDaoDownloaderMiddleware(object):

    def __init__(self, timeout=None):
        self.timeout = timeout
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1000, 600)
        self.browser.set_page_load_timeout(self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        # Let Selenium render the page, then hand Scrapy a ready-made response
        # so the built-in downloader is bypassed.
        try:
            self.browser.get(request.url)
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=10)
A good blog recommended for learning web crawling: Cui Qingcai (崔庆才)'s 静觅 blog, together with its GitHub code.