Python爬取糗事百科-多进程方法

xiaoxiao2021-02-28  6

# Regular-expression approach: scrape qiushibaike.com pages with a process pool.
import requests
import re
import time
from multiprocessing import Pool

headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.146 Safari/537.36'
    ),
}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block its worker (and pool.map) indefinitely.
_REQUEST_TIMEOUT = 10

# Patterns are raw strings so that ``\d`` reaches the regex engine instead of
# being treated as an (invalid) string escape, and are compiled once per
# process rather than looked up on every call.
_NAME_RE = re.compile(r'<h2>(.*?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<div class="content">.*?<span>(.*?)</span>', re.S)
_LAUGH_RE = re.compile(
    r'<span class="stats-vote">.*?<i class="number">(\d+)</i>.*?</span>',
    re.S,
)
_COMMENT_RE = re.compile(
    r'<span class="stats-comments">.*?<i class="number">(\d+)</i>.*?</span>',
    re.S,
)


def spider(url):
    """Download one listing page and print a dict for every post found on it.

    Args:
        url: Address of the page to fetch.

    Each printed dict has the keys ``name``, ``content``, ``laugth`` and
    ``comment``. Nothing is returned. Exceptions raised by ``requests``
    (including a timeout after ``_REQUEST_TIMEOUT`` seconds) propagate to
    the caller.
    """
    req = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    text = req.text

    names = _NAME_RE.findall(text)
    contents = _CONTENT_RE.findall(text)
    laughts = _LAUGH_RE.findall(text)
    comments = _COMMENT_RE.findall(text)

    # zip() stops at the shortest list, so a post missing one of the four
    # fields shortens (and may misalign) the output instead of raising.
    for name, content, laught, comment in zip(names, contents, laughts, comments):
        data = {
            'name': name.strip(),
            'content': content.strip(),
            'laugth': laught,  # key spelling kept so existing output is unchanged
            'comment': comment,
        }
        print(data)


if __name__ == '__main__':
    time1 = time.time()
    urls = ['https://www.qiushibaike.com/8hr/page/{}/'.format(i) for i in range(1, 14)]
    # The context manager shuts the worker processes down once map() is done.
    with Pool(processes=4) as pool:
        pool.map(spider, urls)
        time2 = time.time()
    print(time2 - time1)
转载请注明原文地址: https://www.6miu.com/read-2000106.html

最新回复(0)