爬虫爬取糗事百科

xiaoxiao · 2022-06-11 · 21

直接上代码

# Scrapy project for scraping qiushibaike.com text posts.
# The original post pastes four project files on one line; they are
# reformatted below with section markers matching the original layout.

# ---- spiders ----
import re
from urllib.parse import urljoin

from scrapy import Spider
from scrapy.http import Request

# NOTE(review): the pasted snippet omitted these imports (re, urljoin,
# Request, Spider); they are required for the spider to run.


class ExampleSpider(Spider):
    """Crawl qiushibaike listing pages and follow each post's detail page."""

    name = 'qiushi'

    def __init__(self):
        # Holds the listing-page URL so parse() can re-request it with
        # meta page > 0, which makes the Selenium middleware click "next".
        self.lit = []

    def start_requests(self):
        url_str = "https://www.qiushibaike.com/text/page/3/"
        # page == "0" tells the middleware to just load the URL as-is.
        yield Request(url=url_str, callback=self.parse,
                      meta={"page": "0"}, dont_filter=True)

    def parse(self, response):
        # Listing page: collect detail links, then re-yield the same URL
        # so the middleware advances to the next page via the browser.
        a = re.search("page", response.url)
        url_strs = response.url
        self.lit.append(url_strs)
        content_block = response.xpath('//div[@class="article block untagged mb15 typs_hot"]')
        for i in content_block:
            url = i.xpath('//a[@class="contentHerf"]/@href').extract()
            for j in url:
                url_str = urljoin('https://www.qiushibaike.com', j)
                yield Request(url=url_str, callback=self.parse_detail,
                              meta={"page": "0"})
        if a:
            # page == "2" makes the middleware click the "next" button
            # after loading, so this same URL renders the following page.
            yield Request(url=self.lit[0], callback=self.parse,
                          meta={'page': "2"}, dont_filter=True)
            self.lit = []

    def parse_detail(self, response):
        # Detail page: extract author, body text and the two counters.
        author = response.xpath('//div[@class="author clearfix"]/a[2]/h2/text()').extract()
        content = response.xpath('//*[@id="single-next-link"]/div/text()').extract()
        haoxiao_num = response.xpath('//div[@class="stats"]/span[@class="stats-vote"]/i/text()').extract()
        pinglun_num = response.xpath('//div[@class="stats"]/span[@class="stats-comments"]/i/text()').extract()
        item = QiushibaikeItem()
        item["author"] = "".join(author)
        item["content"] = "".join(content)
        item["haoxiao_num"] = "".join(haoxiao_num)
        item["pinglun_num"] = "".join(pinglun_num)
        yield item


# ---- items ----
import scrapy


class QiushibaikeItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
    haoxiao_num = scrapy.Field()  # "好笑" (funny) vote count
    pinglun_num = scrapy.Field()  # comment count


# ---- middleware (中间件) ----
import time

from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions
from scrapy.http import HtmlResponse


class SeleniumMiddlewares(object):
    """Downloader middleware that renders pages with a Firefox browser.

    meta["page"] > 0 means: after loading, scroll down and click the
    pagination "next" button before handing the HTML back to Scrapy.
    """

    def __init__(self):
        self.options = FOptions()
        # self.options.add_argument("-headless")
        self.browser = webdriver.Firefox(
            executable_path="/home/hello/Downloads/geckodriver",
            firefox_options=self.options)

    def process_request(self, request, spider):
        if int(request.meta["page"]) > 0:
            self.browser.get(request.url)
            # Scroll to the bottom so the pagination control is present.
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(1)
            button = self.browser.find_element_by_xpath('//span[@class="next"]')
            button.click()
            time.sleep(2)
        else:
            self.browser.get(request.url)
            time.sleep(2)
        # Short-circuit the download: return the browser-rendered HTML.
        return HtmlResponse(url=self.browser.current_url,
                            body=self.browser.page_source,
                            encoding="utf-8",
                            request=request)

    def spider_closed(self, spider):
        # FIX: the original never quit the browser, leaking a Firefox
        # process per crawl. Connect this to the spider_closed signal
        # (e.g. via from_crawler) or call it when the crawl ends.
        self.browser.quit()


# ---- pipelines ----
import sqlite3


class QiushibaikePipeline(object):
    """Persist scraped items into a local SQLite database."""

    def __init__(self):
        self.conn = sqlite3.connect("qiushibaike.db")
        self.cursor = self.conn.cursor()
        self.cursor.execute(
            "create table IF NOT EXISTS qiushi("
            "author varchar(200),content varchar(500),"
            "haoxiao_num varchar(100),pinglun_num varchar(100))")

    def process_item(self, item, spider):
        # FIX: the original interpolated scraped text into the SQL with
        # %-formatting, which breaks on any quote character and is an
        # SQL-injection hole. Use parameterized placeholders instead.
        self.cursor.execute(
            "insert into qiushi values(?,?,?,?)",
            (item["author"], item["content"],
             item["haoxiao_num"], item["pinglun_num"]))
        self.conn.commit()
        return item
转载请注明原文地址: https://www.6miu.com/read-4930454.html

最新回复(0)