Scraping P2P platform data from 网贷之家 (wdzj.com) with Python


The P2P platform data on 网贷之家 is fairly easy to obtain: no user login is required, so the main task is analyzing the page source and extracting the information you need. The scraper for this site is quite simple. It mainly uses the urllib package to fetch pages, BeautifulSoup to parse them, and finally regular expressions to extract the data.
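To make the structure obvious before the full listing, here is a minimal sketch of that fetch-parse-extract pattern. The listing URL and the itemTitle class are taken from the script below; treat this as an illustration, not a guarantee of the site's current markup:

import re
import urllib.request
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}                 # the site expects a browser-like UA
url = "https://www.wdzj.com/dangan/search?filter=e1"    # the listing page the full script starts from

req = urllib.request.Request(url, headers=headers)      # step 1: fetch with urllib
html = urllib.request.urlopen(req)
soup = BeautifulSoup(html, 'lxml')                      # step 2: parse with BeautifulSoup

for div in soup.find_all("div", {"class": "itemTitle"}):
    for link in div.find_all("a", {"target": "_blank"}):
        href = link.attrs.get("href", "")
        if re.search(r"\S", href):                      # step 3: filter/extract with a regex
            print(href)                                 # relative path of one platform's detail page

With that pattern in mind, here is the full source: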

# -*- coding: utf-8 -*-
"""
Created on Wed Aug  8 18:22:26 2018
@author: 95647
"""
import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
lists = []
domains = "https://www.wdzj.com"


def get_platform_site(url, lists):
    """Collect the detail-page links of all platforms on one listing page."""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html, 'lxml')
    title = bsObj.findAll("div", {'class': 'itemTitle'})
    for titles in title:
        links = titles.findAll("a", {'target': '_blank'})
        for link in links:
            if 'href' in link.attrs:
                lists.append(link.attrs['href'])
    return lists


def pages_num(url):
    """Get the total number of listing pages for one platform category."""
    req = urllib.request.Request(url, headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html, 'lxml')
    pages = bsObj.findAll("a", text='尾页')  # the "last page" link carries the page count
    for page in pages:
        if "currentnum" in page.attrs:
            return page.attrs["currentnum"]
    return None


def conditions(i):
    """Build the list of platform links for one operating status."""
    lists = []
    url_ = r"""https://www.wdzj.com/dangan/search?filter=e%s""" % str(i)
    all_pages_num = int(pages_num(url_))
    for num in range(1, all_pages_num + 1):
        url = url_ + "&currentPage=%s" % str(num)
        lists = get_platform_site(url, lists)
    return lists


operations = conditions(1)           # platforms in normal operation
# close_transitions = conditions(2)  # closed or transformed platforms
# in_problems = conditions(3)        # problem platforms


def plat_profile(lists):
    """Scrape the profile data of every platform in the list."""
    global domains
    plat_profile = []
    for site in lists:
        plat_info = []
        url = domains + site
        req = urllib.request.Request(url, headers=headers)
        html = urlopen(req)
        bsObj = BeautifulSoup(html, 'lxml')
        plat_name = bsObj.findAll('h1')[0].attrs["alt"]  # platform name
        t_l = bsObj.findAll("div", {"class": "pt-info"})[0].get_text()
        time_s = ""
        location = ""
        if len(t_l) > 0:
            t_l = re.split("上线", t_l)
            time_s = t_l[0].strip()    # launch date
            location = t_l[1].strip()  # region the platform belongs to
        common_data = bsObj.findAll("b", {"class": "tab_common_data"})
        yield0 = ""    # default in case the field is missing
        duration = ""  # default in case the field is missing
        for data in common_data:
            text = data.parent.get_text()
            if len(re.findall(".*月.*", text)) > 0:
                duration = text.strip()  # average loan term
            if len(re.findall(".*%.*", text)) > 0:
                yield0 = text.strip()    # average yield
        rates = ""  # default, so the append below cannot hit an unbound name
        rates_ = bsObj.find("div", {"class": "dpxximg"})
        if rates_ is not None and "data-pl" in rates_.attrs:
            rates = rates_.attrs["data-pl"]  # user rating
        plat_pro = bsObj.findAll("div", {"class": "bgbox-bt zzfwbox"})
        plat_pro = BeautifulSoup(str(plat_pro), "lxml")
        L1 = []
        L2 = []
        zzzj = ""
        gqss = ""
        yhtg = ""
        rzjl = ""
        jgxh = ""
        ICP = ""
        zdtb = ""
        zqzr = ""
        tbbz = ""
        bzms = ""
        for div in plat_pro.findAll("div", {"class": "l"}):
            L1.append(div.get_text().strip())
        for div in plat_pro.findAll("div", {"class": "r"}):
            L2.append(div.get_text().strip())
        for slzz in L1:  # filing information of the platform
            if slzz == "注册资金":
                zzzj = L2[L1.index(slzz)]
            if slzz == "股权上市":
                gqss = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "银行存管":
                yhtg = L2[L1.index(slzz)]
            if slzz == "融资记录":
                rzjl = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "监管协会":
                jgxh = L2[L1.index(slzz)].replace(" ", "")
            if slzz == "ICP号":
                ICP = L2[L1.index(slzz)]
            if slzz == "自动投标":
                zdtb = L2[L1.index(slzz)]
            if slzz == "债券转让":  # note: the CSV header below uses 债权转让; adjust if the site label differs
                zqzr = L2[L1.index(slzz)]
            if slzz == "投标保障":
                tbbz = L2[L1.index(slzz)]
            if slzz == "保障模式":
                bzms = L2[L1.index(slzz)]
        # crude but clear: append the fields one by one; there is room for optimization here
        plat_info.append(plat_name)
        plat_info.append(time_s)
        plat_info.append(location)
        plat_info.append(duration)
        plat_info.append(yield0)
        plat_info.append(rates)
        plat_info.append(zzzj)
        plat_info.append(gqss)
        plat_info.append(yhtg)
        plat_info.append(rzjl)
        plat_info.append(jgxh)
        plat_info.append(ICP)
        plat_info.append(zdtb)
        plat_info.append(zqzr)
        plat_info.append(tbbz)
        plat_info.append(bzms)
        plat_profile.append(plat_info)
        print("------------->" + plat_name + str(lists.index(site)))  # progress output per platform
    return plat_profile


# reuse the list fetched above instead of crawling the listing pages a second time;
# the conditions() argument selects the category (1 = platforms in normal operation)
plat_data = plat_profile(operations)
name = ['平台名称', '上线时间', '区域', '投资期限', '平均收益率', '评分',
        '注册资金', '股权上市', '银行存管', '融资记录', '监管协会',
        'ICP号', '自动投标', '债权转让', '投标保障', '保障模式']
operating = pd.DataFrame(columns=name, data=plat_data)
operating.to_csv(r"C:\Users\95647\Desktop\operating.csv")  # path to save the csv file
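As the commented-out lines in the script suggest, the other two platform categories go through the same pipeline by changing the conditions() argument. A short usage sketch, run after the script above (the output file names are my own placeholders):

# same pipeline for the other two categories:
# 2 = closed/transformed platforms, 3 = problem platforms
for flag, label in [(2, "close_transitions"), (3, "in_problems")]:
    data = plat_profile(conditions(flag))
    pd.DataFrame(columns=name, data=data).to_csv("%s.csv" % label)  # placeholder file names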

I had not yet learned how to use pandas when I wrote this code, so I solved the problem crudely with lists and dictionaries; readers who are comfortable with pandas can use it to optimize the code, along the lines of the sketch below.
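A minimal sketch of that optimization: build one dict per platform and let pandas infer the columns, so column order can no longer drift out of sync with the data; dict(zip(L1, L2)) also replaces the ten if-branches over the filing labels. The to_row helper and the sample values here are hypothetical, for illustration only:

import pandas as pd

def to_row(plat_name, time_s, location, duration, yield0, rates, filing):
    """Hypothetical helper: one platform as a dict keyed by column name."""
    row = {'平台名称': plat_name, '上线时间': time_s, '区域': location,
           '投资期限': duration, '平均收益率': yield0, '评分': rates}
    row.update(filing)  # filing = dict(zip(L1, L2)) from the label/value columns
    return row

# sample values, for illustration only
rows = [to_row("示例平台", "2015-01-01", "北京", "3个月", "9.5%", "4.0",
               {'注册资金': "5000万元", '银行存管': "有"})]
operating = pd.DataFrame(rows)  # columns are inferred from the dict keys
operating.to_csv("operating.csv", index=False, encoding="utf-8-sig")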

The scraper runs quickly: collecting the information of 100 normally operating platforms took only about five minutes. (Screenshots of the run and of part of the results appeared here in the original post.)
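That speed means the script fires requests back to back. A small optional safeguard, not in the original script: wrap the fetch in a helper that pauses between requests and retries on transient errors. The function name, delay, and retry count below are my own choices:

import time
import urllib.request
from urllib.error import HTTPError, URLError

def fetch(url, headers, retries=3, delay=1.0):
    """Fetch a page with a polite pause between requests and a few retries."""
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            html = urllib.request.urlopen(req, timeout=10).read()
            time.sleep(delay)  # pause so the site is not hammered
            return html
        except (HTTPError, URLError) as err:
            print("attempt %d failed for %s: %s" % (attempt + 1, url, err))
            time.sleep(delay * (attempt + 1))  # back off a little more each time
    return None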

If you run into problems or have other ideas, feel free to reach me on QQ: 956471511. I also wrote a post on scraping data from the Renrendai site, for anyone interested: 手把手教你用python爬取人人贷借款人数据 (a hands-on guide to scraping Renrendai borrower data with Python).

