#正则表达式法
import re
import time
from multiprocessing import Pool

import requests
# HTTP headers passed to requests.get() by spider(). The User-Agent is a
# desktop Chrome string; presumably this is so the site serves the scraper the
# same HTML a browser would get -- confirm against the target site.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
    ),
}
# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely on a stalled connection and hang the
# worker process that called it.
_REQUEST_TIMEOUT = 10

# Patterns are raw strings so that `\d` reaches the regex engine unchanged
# instead of being parsed as an (invalid) string escape. re.S lets `.` match
# newlines, so a field may span several lines of HTML. Compiled once here
# rather than on every spider() call.
_NAME_RE = re.compile(r'<h2>(.*?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<div class="content">.*?<span>(.*?)</span>', re.S)
_LAUGH_RE = re.compile(
    r'<span class="stats-vote">.*?<i class="number">(\d+)</i>.*?</span>', re.S
)
_COMMENT_RE = re.compile(
    r'<span class="stats-comments">.*?<i class="number">(\d+)</i>.*?</span>', re.S
)


def _parse_page(text):
    """Extract post records from the HTML of one listing page.

    Args:
        text: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys 'name', 'content', 'laugth' and
        'comment'. 'name' and 'content' are stripped of surrounding
        whitespace; the two counts are the matched digit strings.
    """
    names = _NAME_RE.findall(text)
    contents = _CONTENT_RE.findall(text)
    laughs = _LAUGH_RE.findall(text)
    comments = _COMMENT_RE.findall(text)

    # NOTE(review): each pattern is matched over the whole page independently
    # and the results are paired purely by position. zip() stops at the
    # shortest list, and if any post lacks one of the fields the following
    # records would combine values from different posts.
    return [
        {
            'name': name.strip(),
            'content': content.strip(),
            # Key spelling kept as-is so the printed output is unchanged.
            'laugth': laugh,
            'comment': comment,
        }
        for name, content, laugh, comment in zip(names, contents, laughs, comments)
    ]


def spider(url):
    """Download one listing page and print every post record found on it.

    Args:
        url: Address of the page to fetch.

    Returns:
        None. Each record is written to stdout with print().

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
    """
    response = requests.get(url=url, headers=headers, timeout=_REQUEST_TIMEOUT)
    for data in _parse_page(response.text):
        print(data)
if __name__ == '__main__':
    # Wall-clock start, used to report how long the whole crawl took.
    time1 = time.time()

    # Listing pages 1 through 13 (range's upper bound is exclusive).
    urls = [
        'https://www.qiushibaike.com/8hr/page/{}/'.format(page)
        for page in range(1, 14)
    ]

    # Fetch the pages with 4 worker processes. The with-block guarantees the
    # workers are shut down when the block exits, even if a task raises;
    # pool.map() blocks until every URL has been processed, so no work is
    # cut short by the cleanup.
    with Pool(processes=4) as pool:
        pool.map(spider, urls)

    time2 = time.time()
    # Elapsed time in seconds.
    print(time2 - time1)