更新一版,相比上一版,执行效率有所提升,原因在于使用了 BeautifulSoup(html, 'lxml')。这一版还把 Python 环境升级到了 Python 3.6.4。
# coding=utf-8
"""Scan tl.cyg.changyou.com character listings and report every listing
whose pet list contains an 傲云苍龙 at variation level 7 or higher."""
import ast
import sys
import urllib.request

from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Bug fix: the original did only ``import urllib``, which in Python 3
    does not import the ``urllib.request`` submodule it then used.
    """
    req = urllib.request.Request(url)
    # A browser-like User-Agent avoids trivial bot blocking.
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = urllib.request.urlopen(req)
    return response.read().decode('utf-8')


def get_goods(url):
    """Return the goods-detail URLs found on one search result page
    (typically 20 per page)."""
    soup = BeautifulSoup(get_html(url), 'lxml')
    return [dt.a.get('href')
            for dt in soup.find_all('dt', attrs={'class': 'title'})]


def get_all_pages(url):
    """Return the full list of paginated search URLs.

    *url* must end with ``page_num=``; page 1 is fetched first to read
    the pagination links and determine the total page count.
    """
    soup = BeautifulSoup(get_html(url + '1'), 'lxml')
    page_numbers = [int(a.string)
                    for a in soup.find_all('a', attrs={'class': 'num'})]
    max_page = max(page_numbers)  # total number of result pages
    return [url + str(i) for i in range(1, max_page + 1)]


def pet_info(good_url):
    """Inspect one goods-detail page for qualifying pets.

    Returns "no_pet" when the character owns no pets, "matching" when
    any pet is 傲云苍龙 with variation level >= 7, otherwise None.
    """
    start_str = "charObj = "
    end_str = ',"items"'
    html = get_html(good_url)
    start = html.index(start_str) + len(start_str)
    # Anchor the terminator search after the start marker so an earlier
    # occurrence elsewhere in the page cannot truncate the object.
    end = html.index(end_str, start)
    role_json = html[start:end] + "}"
    # Security fix: literal_eval only parses Python literals; unlike
    # eval() it cannot execute arbitrary code embedded in a fetched page.
    role_dict = ast.literal_eval(role_json)
    pet_list = role_dict["petList"]
    if not pet_list:
        return "no_pet"
    for pet in pet_list:
        petname = pet["petVarLevelExplain"]  # pet (珍兽) name
        petvarlevel = pet["petVarLevel"]     # variation level
        if petname == "傲云苍龙" and petvarlevel >= 7:
            return "matching"
    return None


if __name__ == "__main__":
    url = "http://tl.cyg.changyou.com/goods/public?world_id=0&profession=8&price=11-2345&level=110-119&have_chosen=profession*8 price*11-2345 level*110-119&page_num="
    pages_list = get_all_pages(url)
    print("共:", len(pages_list), "页")
    for page_index, page_url in enumerate(pages_list):
        p_goods = get_goods(page_url)
        print("开始分析第", page_index + 1, "页,共有", len(p_goods), "个商品")
        for g, good_url in enumerate(p_goods):
            if pet_info(good_url) == "matching":
                print(g + 1, "--match--", good_url)
            else:
                # Overwrite the same console line as a lightweight progress bar.
                sys.stdout.write(str(g + 1) + "/" + str(len(p_goods)) + ",NO match" + "\r")
                sys.stdout.flush()
# Blog note kept from the post: "今天又更新一版。" — another updated version follows below.
# coding=utf-8
"""Version 2 of the pet scanner: builds the search URL from a filter
dict via urllib.parse and throttles each request with a 5 s delay."""
import ast
import sys
import time
from urllib import request, parse

from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url*, return the UTF-8 decoded body, then sleep 5 s to
    throttle the request rate against the site."""
    req = request.Request(url)
    # A browser-like User-Agent avoids trivial bot blocking.
    req.add_header('User-Agent', 'Mozilla/5.0')
    response = request.urlopen(req)
    html = response.read().decode('utf-8')
    time.sleep(5)
    return html


def get_goods(url):
    """Return the character-goods detail URLs found on one search
    result page (typically 20 per page)."""
    soup = BeautifulSoup(get_html(url), 'lxml')
    return [dt.a.get('href')
            for dt in soup.find_all('dt', attrs={'class': 'title'})]


def get_all_pages(url):
    """Return the full list of paginated search URLs.

    *url* must end with ``page_num=``; page 1 is fetched first to read
    the pagination links and determine the total page count.
    """
    soup = BeautifulSoup(get_html(url + '1'), 'lxml')
    page_numbers = [int(a.string)
                    for a in soup.find_all('a', attrs={'class': 'num'})]
    max_page = max(page_numbers)  # total number of result pages
    return [url + str(i) for i in range(1, max_page + 1)]


def pet_info(good_url):
    """Inspect one goods-detail page for qualifying pets.

    Returns "no_pet" when the character owns no pets, "matching" when
    any pet is 傲云苍龙 at exactly variation level 7, otherwise None.
    """
    start_str = "charObj = "
    end_str = ',"items"'
    html = get_html(good_url)
    start = html.index(start_str) + len(start_str)
    # Anchor the terminator search after the start marker so an earlier
    # occurrence elsewhere in the page cannot truncate the object.
    end = html.index(end_str, start)
    role_json = html[start:end] + "}"
    # Security fix: literal_eval only parses Python literals; unlike
    # eval() it cannot execute arbitrary code embedded in a fetched page.
    role_dict = ast.literal_eval(role_json)
    pet_list = role_dict["petList"]
    if not pet_list:
        return "no_pet"
    for pet in pet_list:
        petname = pet["petVarLevelExplain"]  # pet (珍兽) name
        petvarlevel = pet["petVarLevel"]     # variation level, 0: baby ... 9: adult
        # pet["petXingGe"] (temperament: 0 胆小, 1 谨慎, 2 忠诚, 3 精明,
        # 4 勇猛) also exists but is not used by this filter.
        if petname == "傲云苍龙" and petvarlevel == 7:
            return "matching"
    return None


if __name__ == "__main__":
    host_url = "http://tl.cyg.changyou.com/goods/public?world_id=0&"
    # Search filters: class, price range, level range, plus the "*-0"
    # range fields the site expects to be present.
    info_dict = {
        'profession': 8,
        'price': '222-2333',
        'level': '110-119',
        'xinfa': '4001-0',
        'xiulian': '10001-0',
        'jinjiexiulian': '3001-0',
        'equipscore': '100001-400000',
    }
    # The site also wants a GBK percent-encoded copy of the query string
    # passed back in the have_chosen parameter.
    tt = parse.urlencode(info_dict)
    info_dict['have_chosen'] = parse.quote(tt.encode('GBK'))
    url = host_url + parse.urlencode(info_dict) + "&page_num="
    pages_list = get_all_pages(url)
    print("共:", len(pages_list), "页")
    for page_index, page_url in enumerate(pages_list):
        p_goods = get_goods(page_url)
        print("开始分析第", page_index + 1, "页,共有", len(p_goods), "个商品")
        for g, good_url in enumerate(p_goods):
            if pet_info(good_url) == "matching":
                print(g + 1, "--match--", good_url)
            else:
                # Overwrite the same console line as a lightweight progress bar.
                sys.stdout.write(str(g + 1) + "/" + str(len(p_goods)) + ",NO match" + "\r")
                sys.stdout.flush()