import re
import urllib.request
_JUUBAO_ROOT = 'http://www.juubao.com'


def _fetch_page(url):
    """Download ``url`` and return the response body decoded as UTF-8."""
    # The context manager closes the HTTP response even if reading or
    # decoding raises.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _parse_deals(page, pprice):
    """Extract the deals from one listing page whose coupon is >= ``pprice``.

    :param page: HTML text of a listing page.
    :param pprice: minimum coupon value a deal must have to be kept.
    :return: list of ``(title, goods_url, coupon_url, discount, price)``
        tuples in page order.
    """
    # De-duplicate while keeping first-seen order (dicts preserve insertion
    # order); a plain set() would shuffle the links and break the pairing
    # with the titles/prices scraped below.
    goods_urls = list(dict.fromkeys(
        re.findall(r'http://www\.juubao\.com/item-\d{1,8}\.html', page)))

    # Each match looks like '<img alt="TITLE" src'; drop the 10-character
    # prefix '<img alt="' and the 5-character suffix '" src'.
    titles = [tag[10:-5]
              for tag in re.findall(r'<img alt=.{1,100}" src', page)]

    # Two-step extraction: locate the snippet, then take the first number
    # that occurs inside it.
    discounts = [
        float(re.search(r'\d{1,4}', snippet).group())
        for snippet in re.findall(r'get_cupon[\s\S]{5,10}\d{1,4}', page)
    ]
    prices = [
        float(re.search(r'\d{1,5}\.\d{1,2}', snippet).group())
        for snippet in re.findall(
            r'price-current[\s\S]{10,15}\d{1,5}\.\d{1,2}', page)
    ]

    # Only every second coupon link is used -- presumably each link occurs
    # twice in the page markup (TODO confirm against the live site).
    coupon_paths = re.findall(r'/jump-index-id-\d{3,9}\.html', page)
    coupon_urls = [_JUUBAO_ROOT + path for path in coupon_paths[::2]]

    # zip() stops at the shortest list, so a page on which one of the
    # patterns matched fewer times no longer raises IndexError.
    return [
        deal
        for deal in zip(titles, goods_urls, coupon_urls, discounts, prices)
        if deal[3] >= pprice
    ]


def juubao(n, pprice, *, fetch=None):
    """Crawl the first ``n`` listing pages of juubao.com and report deals.

    Every deal whose coupon value is at least ``pprice`` is printed and
    collected.

    :param n: number of listing pages to crawl (pages 1..n inclusive).
    :param pprice: minimum coupon value for a deal to be reported.
    :param fetch: optional callable ``fetch(url) -> str`` returning the HTML
        of ``url``; defaults to a UTF-8 download via ``urllib.request``.
    :return: list of ``(title, goods_url, coupon_url, discount, price)``
        tuples for the reported deals, in crawl order.
    """
    if fetch is None:
        fetch = _fetch_page

    matches = []
    # range(1, n + 1) so that exactly n pages are visited.
    for page_no in range(1, n + 1):
        url = 'http://www.juubao.com/index-index-p-{}.html'.format(page_no)
        for deal in _parse_deals(fetch(url), pprice):
            print(
                "商品的标题:%s,商品url:%s,领券地址:%s 折扣价:%.2f,现价:%.2f" % deal)
            matches.append(deal)
    return matches
# Script entry point: when this file is executed directly, run the crawler
# with n=10 and a minimum coupon value of 50.
if __name__ == '__main__':
    juubao(n=10, pprice=50)