#一个python正则表达式的练习
#抓取糗事百科一个页面,输出标题和内容
import urllib.request
import re
# Page to scrape and the browser-like headers sent with the request.
url = "https://www.qiushibaike.com/hot/page/1"
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
    'Referer': 'https://www.qiushibaike.com/',
    'Connection': 'keep-alive',
}

# Every ".*?" is a lazy "match anything" and re.S lets "." cross newlines.
# The two capture groups are returned by findall() as a 2-tuple per match:
#   item[0] -> text between <h2> and </h2> that follows the "author" div
#   item[1] -> text between <span> and </span> that follows ="content"
pattern = re.compile(
    r'<div.*?class="author.*?>.*?<img.*?<h2>(.*?)'
    r'</h2>.*?</a>.*?="content".*?<span>(.*?)</span>.*?="stats"',
    re.S,
)


def fetch_html(page_url=url, headers=header, timeout=10):
    """Download *page_url* and return the response body as text.

    The URL and headers are wrapped in a ``Request`` because a bare
    ``urlopen(url)`` sends no custom headers.

    Args:
        page_url: Address to fetch.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait on the socket before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The body decoded with the charset declared in the response's
        Content-Type header, or UTF-8 when none is declared.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    req = urllib.request.Request(page_url, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        charset = resp.headers.get_content_charset() or "utf-8"
        # read() yields bytes; decode them into str for regex matching.
        return resp.read().decode(charset)


def parse_items(html):
    """Return a list of ``(heading, content)`` tuples matched in *html*.

    An empty list is returned when nothing matches.
    """
    return pattern.findall(html)


def print_items(items):
    """Print each ``(heading, content)`` pair, numbered from 1."""
    for index, (heading, text) in enumerate(items, start=1):
        print("第", index, "个段子")
        print(heading, text)
        print("----------------")


def main():
    """Fetch the configured page and print every entry found on it."""
    print_items(parse_items(fetch_html()))


if __name__ == "__main__":
    main()