import re
import urllib.request
# Patterns are compiled once at import time instead of on every loop iteration.
# One "entry" is everything between an author header and its vote counter.
_JOKE_ENTRY_RE = re.compile(
    r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">',
    re.S,
)
_JOKE_AUTHOR_RE = re.compile(r"<h2>(.*?)</h2>", re.S)
_JOKE_CONTENT_RE = re.compile(r'<div class="content">\n<span>(.*?)</span>', re.S)

# Browser-like User-Agent sent with every request.
_JOKE_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"


def _iter_jokes(html):
    """Yield ``(username, joke)`` pairs for each complete entry in *html*.

    An entry that lacks either the ``<h2>`` author tag or the content
    ``<span>`` is skipped, so one malformed entry does not abort the page.
    """
    for entry in _JOKE_ENTRY_RE.finditer(html):
        fragment = entry.group(1)
        author = _JOKE_AUTHOR_RE.search(fragment)
        content = _JOKE_CONTENT_RE.search(fragment)
        if author is None or content is None:
            continue
        yield author.group(1), content.group(1)


def jokeCrawler(url, *, timeout=10.0):
    """Fetch *url* and return a dict mapping author name to joke text.

    The page is requested with a browser-like User-Agent, decoded as UTF-8,
    and scanned with regular expressions. Each joke is printed as it is
    found. If the same author appears more than once, the last joke wins.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up; passed straight to ``urllib.request.urlopen``.

    Returns:
        A dict of ``{username: joke}``; empty when no entry matches.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    request = urllib.request.Request(url, headers={"User-Agent": _JOKE_USER_AGENT})
    # The context manager closes the response even if read/decode fails.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        html = response.read().decode("utf-8")

    jokes = {}
    for username, joke in _iter_jokes(html):
        print(joke)
        jokes[username] = joke
    return jokes
# Crawl one listing page and print every author together with their joke.
url = "https://www.qiushibaike.com/text/page/2/"
info = jokeCrawler(url)
for author, joke in info.items():
    print(author, joke)
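# This code is quite old; no comments were written at the time, and the author has not added any since.
# When reposting, please credit the original source: https://www.6miu.com/read-4931656.html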