import os
import time
import datetime
import codecs
#from lxml import etree
from selenium import webdriver
#import csv
#import urllib.request
from bs4 import BeautifulSoup as bp
import re
import io
import sys
import pandas
# To handle dynamic pages safely, cope with anti-scraping measures, and run
# inside the corporate network, urllib was abandoned entirely in favour of a
# python3.5 + selenium + phantomjs stack.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') # re-wrap stdout as gb18030 so console output works when run from cmd (no-popup mode must run under cmd)
def zsgg(i): # collect the CMB (招商) announcement links matching the filters
    """Scrape page *i* of the China Merchants Bank notice list and return the
    announcement links that pass the date and keyword filters.

    Reads globals: words0 (keywords that must appear), words1 (keywords that
    must NOT appear), yeardate / monthdate (date filter; monthdate == 'none'
    matches the whole year).
    Sets and returns global zsim1: a list of cleaned relative links such as
    '/Notice/xxx.htm', consumed by zsbc().
    """
    global zsim1
    global words0,words1
    global yeardate,monthdate
    url = 'http://cc.cmbchina.com/notice/?PageNo=%s'%i # %-formatting turns the page number into the list URL; one call per page
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    # Build the date-filter regex, injecting the requested year (and month).
    if monthdate != 'none':
        pattern = '<a href="(/Notice/.*?htm"[\s]*[.\n]*.*?target="_blank"[.\n]*[\s]*title=.*?)>[.\n]*[\s]*.*?</a></span><span.*?>\[%s-%s-\d{2}\]</span>'%(yeardate,monthdate)
    else:
        pattern = '<a href="(/Notice/.*?htm"[\s]*[.\n]*.*?target="_blank"[.\n]*[\s]*title=.*?)>[.\n]*[\s]*.*?</a></span><span.*?>\[%s-\d{2}-\d{2}\]</span>'% yeardate
    print('所得正则表达式是!!!!!!!!!!!',pattern) # show the formatted regex for debugging
    urlzs = re.findall(pattern,html0) # anchors (link + title markup) that carry a matching date
    print('*******************************************************筛选过时间的公告地址有**********************************************************************',urlzs)
    # Keep entries containing at least one required keyword, deduplicated.
    zsim = []
    for word0 in words0:
        for urlg0 in urlzs:
            if word0 in urlg0 and urlg0 not in zsim:
                zsim.append(urlg0)
    # BUGFIX: the original popped from zsim while iterating it, which skips the
    # element right after every removal; rebuild the list instead.
    for word1 in words1:
        zsim = [urlg1 for urlg1 in zsim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link.
    zsim1 = []
    for zsim0 in zsim:
        zsim1.append(''.join(re.findall('(/Notice/.*?htm)',zsim0)))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',zsim1)
    driver.quit()
    return zsim1
def zsbc(): # save each matching CMB announcement
    """Walk pages 1-8 of the CMB notice list, screenshot each matching
    announcement, and append its text plus link to D:\招行公告.doc.

    Relies on zsgg(i) populating the global zsim1 with relative links.
    """
    global zsim1 # result list of zsgg(): links already filtered by date and keywords
    for i in range(1,9): # pages 1..8 (range excludes the stop value)
        zsgg(i) # filter the announcement links on page i
        # BUGFIX: enumerate() gives the true position; zsim1.index(url)
        # returned the first occurrence, which is wrong for duplicate links.
        for n, url in enumerate(zsim1):
            print('=======================================================这是第几个公告==============================================================',n)
            preurl = 'http://cc.cmbchina.com' # the scraped links are relative; prepend the site root
            url = preurl + url
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
            gg = driver.page_source
            driver.save_screenshot('D:\招商银行%s%s.png'%(i,n)) # screenshot; name carries page (i) and position (n) to stay unique
            driver.quit()
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'mainnotice'})[0] # the announcement body container
            content = soup.text.strip()
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # local timestamp, e.g. "2017-01-09 14:21:09"
            string0 = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n) # separator for follow-up runs
            string = '\n\n————————————招商银行公告%s%s内容如下————————————:\n'%(i,n) # separator for the first run
            string = string + xrtime + '\n\n'
            string0 = string0 + xrtime + '\n\n'
            if (os.path.exists('D:\招行公告.doc')): # existing file: use the follow-up separator
                with open('D:\招行公告.doc','a+',encoding='utf-8') as f:
                    f.write(string0 + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
            else: # first run: the file is created with the initial separator
                with open('D:\招行公告.doc','a+',encoding='utf-8') as f:
                    f.write(string + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def pfgg(i): # collect the SPDB (浦发) announcement links matching the filters
    """Scrape page *i* of the SPDB notice list and return the announcement
    links that pass the date and keyword filters.

    *i* is a page suffix string ('', '_1', ...), see pfbc().
    Reads globals words0/words1 and yeardate/monthdate (monthdate == 'none'
    matches the whole year). Sets and returns global pfim1: a list of
    absolute announcement URLs.
    """
    global pfim1
    global words0,words1
    global yeardate,monthdate
    url = 'http://ccc.spdb.com.cn/news/zxgg/index%s.shtml'%i
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    urlpf = []
    # Build the date-filter regex, injecting the requested year (and month).
    if monthdate != 'none':
        pattern = '<a (href="./%s%s.*?shtml" .*?>.*?)</a>'%(yeardate,monthdate)
    else:
        pattern = '<a (href="./%s.*?shtml" .*?>.*?)</a>'% yeardate
    print('得到的正则表达式是======================================================',pattern)
    allurl = re.findall(pattern,html0) # date-filtered anchors, e.g. href="./201707/t20170704_340566.shtml" ...
    # Expand the relative './...' href into an absolute URL.
    # (renamed loop variable: the original reused the parameter name i)
    for anchor in allurl:
        urlpf.append(re.sub('href=".','http://ccc.spdb.com.cn/news/zxgg',anchor))
    print('*******************************************************筛选时间后的公告地址有**********************************************************************',urlpf)
    # Keep entries containing at least one required keyword, deduplicated.
    pfim = []
    for word0 in words0:
        for urlg0 in urlpf:
            if word0 in urlg0 and urlg0 not in pfim:
                pfim.append(urlg0)
    # BUGFIX: rebuild instead of popping while iterating (popping skips the
    # element right after every removal).
    for word1 in words1:
        pfim = [urlg1 for urlg1 in pfim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link.
    pfim1 = []
    for pfim0 in pfim:
        pfim1.append(''.join(re.findall('(http://ccc.spdb.com.cn/news/zxgg/.*?shtml)',pfim0)))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',pfim1)
    driver.quit()
    return pfim1
def pfbc(): # save each matching SPDB announcement
    """Walk the SPDB notice pages (suffixes '', '_1'..'_8'), screenshot each
    matching announcement, and append its text plus link to D:\浦发公告.doc.

    Relies on pfgg(i) populating the global pfim1 with absolute URLs.
    """
    global pfim1 # result list of pfgg(): links already filtered by date and keywords
    linshi = ['','_1','_2','_3','_4','_5','_6','_7','_8'] # SPDB page suffixes: first page has no suffix
    for i in linshi:
        pfgg(i) # filter the announcement links on this page
        # BUGFIX: enumerate() gives the true position; pfim1.index(url)
        # returned the first occurrence, which is wrong for duplicate links.
        for n, url in enumerate(pfim1):
            print('=======================================================这是第几个公告==============================================================',n)
            # pfgg already returns absolute URLs, so no prefix is needed here
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
            gg = driver.page_source
            driver.save_screenshot('D:\浦发银行%s%s.png'%(i,n)) # screenshot; name carries page suffix (i) and position (n)
            driver.quit()
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'main_body_left_noticehead'})[0] # the announcement body container
            content = soup.text.strip()
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # local timestamp
            string0 = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n) # separator for follow-up runs
            string = '\n\n————————————浦发银行公告%s%s内容如下————————————:\n'%(i,n) # separator for the first run
            string = string + xrtime + '\n\n'
            string0 = string0 + xrtime + '\n\n'
            if (os.path.exists('D:\浦发公告.doc')): # existing file: use the follow-up separator
                with open('D:\浦发公告.doc','a+',encoding='utf-8') as f:
                    f.write(string0 + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
            else: # first run: the file is created with the initial separator
                with open('D:\浦发公告.doc','a+',encoding='utf-8') as f:
                    f.write(string + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def gfgg(i): # collect the CGB (广发) announcement links matching the filters
    """Scrape page *i* of the China Guangfa Bank notice list and return the
    announcement links that pass the date and keyword filters.

    Reads globals words0/words1 and yeardate/monthdate (monthdate == 'none'
    matches the whole year). Sets and returns global gfim1: a list of
    cleaned relative links such as '/Info/12345678'.
    """
    global gfim1
    global words0,words1
    global yeardate,monthdate
    url = 'http://card.cgbchina.com.cn/Channel/11586528?_tp_t2343=%s'%i
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    # Build the date-filter regex. BUGFIX: the date part is now a
    # NON-capturing group (?:...) — with two capturing groups re.findall
    # returns tuples, and the .find() keyword test below would crash.
    if monthdate != 'none':
        pattern = '<a href="(/Info/.*?" target="_blank">.*?)</a><span[.\n]*[\s]*[.\n]*class=" gray">(?:%s-%s-\d{2})</span>'%(yeardate,monthdate)
    else:
        pattern = '<a href="(/Info/.*?" target="_blank">.*?)</a><span[.\n]*[\s]*[.\n]*class=" gray">(?:%s-\d{2}-\d{2})</span>'% yeardate
    print('得到的正则表达式是======================================================',pattern)
    urlgf = re.findall(pattern,html0) # date-filtered anchors (link + title markup)
    print('*******************************************************筛选过时间公告地址有**********************************************************************',urlgf)
    # Keep entries containing at least one required keyword, deduplicated.
    gfim = []
    for word0 in words0:
        for urlg0 in urlgf:
            if word0 in urlg0 and urlg0 not in gfim:
                gfim.append(urlg0)
    # BUGFIX: rebuild instead of popping while iterating (popping skips the
    # element right after every removal).
    for word1 in words1:
        gfim = [urlg1 for urlg1 in gfim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link.
    gfim1 = []
    for gfim0 in gfim:
        gfim1.append(''.join(re.findall('(/Info/\d{8})',gfim0)))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',gfim1)
    driver.quit()
    return gfim1
def gfbc(): # save each matching CGB announcement
    """Walk pages 1-8 of the CGB notice list, screenshot each matching
    announcement, and append its text plus link to D:\广发公告.doc.

    CGB announcement bodies are split across several 'fLayout' divs, so the
    text is collected into a list and written piece by piece.
    Relies on gfgg(i) populating the global gfim1 with relative links.
    """
    global gfim1 # result list of gfgg(): links already filtered by date and keywords
    for i in range(1,9): # pages 1..8 (range excludes the stop value)
        gfgg(i) # filter the announcement links on page i
        # BUGFIX: enumerate() gives the true position; gfim1.index(url)
        # returned the first occurrence, which is wrong for duplicate links.
        for n, url in enumerate(gfim1):
            print('=======================================================这是第几个公告==============================================================',n)
            preurl = 'http://card.cgbchina.com.cn' # the scraped links are relative; prepend the site root
            url = preurl + url
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
            gg = driver.page_source
            driver.save_screenshot('D:\广发银行%s%s.png'%(i,n)) # screenshot; name carries page (i) and position (n)
            driver.quit()
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'cred_text'})[0] # CGB pages nest the body awkwardly
            soup = soup.find_all('div',{'class':'fLayout'})
            c = [str(s.text.strip()) for s in soup] # one text chunk per fLayout div
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',c)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # local timestamp
            string0 = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n) # separator for follow-up runs
            string = '\n\n————————————广发银行公告%s%s内容如下————————————:\n'%(i,n) # separator for the first run
            string = string + xrtime + '\n\n'
            string0 = string0 + xrtime + '\n\n'
            if (os.path.exists('D:\广发公告.doc')): # existing file: use the follow-up separator
                with open('D:\广发公告.doc','a+',encoding='utf-8') as f:
                    f.write(string0)
                    for j in c:
                        f.write(j)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
            else: # first run: the file is created with the initial separator
                with open('D:\广发公告.doc','a+',encoding='utf-8') as f:
                    f.write(string)
                    for j in c:
                        f.write(j)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def gdgg(i): # collect the CEB (光大) announcement links matching the filters
    """Scrape page *i* of the China Everbright Bank notice list and return
    the announcement links that pass the date and keyword filters.

    Reads globals words0/words1 and yeardate/monthdate (monthdate == 'none'
    matches the whole year). Sets and returns global gdim1: a list of
    cleaned relative links such as '/home/bulletin/content/xxx.htm'.
    """
    global gdim1
    global words0,words1
    global yeardate,monthdate
    url = 'http://xyk.cebbank.com/home/bulletin/search/category/NC_NOTICE/list.htm?searchTitle=&actSearch=2&pageNo=%s'%i
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    # Build the date-filter regex, injecting the requested year (and month).
    if monthdate != 'none':
        pattern = '<a href="(/home/bulletin/content/.*?htm">[.\n]*[\s]*<span>[.\n]*[\s]*.*?[.\n]*[\s]*)</span>[.\n]*[\s]*<em class="time">%s-%s-\d{2}</em>'%(yeardate,monthdate)
    else:
        pattern = '<a href="(/home/bulletin/content/.*?htm">[.\n]*[\s]*<span>[.\n]*[\s]*.*?[.\n]*[\s]*)</span>[.\n]*[\s]*<em class="time">%s-\d{2}-\d{2}</em>'% yeardate
    print('得到的正则表达式是======================================================',pattern)
    urlgd = re.findall(pattern,html0) # date-filtered anchors, e.g. <a href="/home/bulletin/content/35213.htm">
    print('*******************************************************筛选过时间的公告地址有**********************************************************************',urlgd)
    # Keep entries containing at least one required keyword, deduplicated.
    gdim = []
    for word0 in words0:
        for urlg0 in urlgd:
            if word0 in urlg0 and urlg0 not in gdim:
                gdim.append(urlg0)
    # BUGFIX: rebuild instead of popping while iterating (popping skips the
    # element right after every removal).
    for word1 in words1:
        gdim = [urlg1 for urlg1 in gdim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link.
    gdim1 = []
    for gdim0 in gdim:
        gdim1.append(''.join(re.findall('(/home/bulletin/content/.*?htm)',gdim0)))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',gdim1)
    driver.quit()
    return gdim1
def gdbc(): # save each matching CEB announcement
    """Walk pages 1-8 of the CEB notice list, screenshot each matching
    announcement, and append its text plus link to D:\光大公告.doc.

    Relies on gdgg(i) populating the global gdim1 with relative links.
    """
    global gdim1 # result list of gdgg(): links already filtered by date and keywords
    for i in range(1,9): # pages 1..8 (range excludes the stop value)
        gdgg(i) # filter the announcement links on page i
        # BUGFIX: enumerate() gives the true position; gdim1.index(url)
        # returned the first occurrence, which is wrong for duplicate links.
        for n, url in enumerate(gdim1):
            print('=======================================================这是第几个公告==============================================================',n)
            preurl = 'http://xyk.cebbank.com' # the scraped links are relative; prepend the site root
            url = preurl + url
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
            gg = driver.page_source
            driver.save_screenshot('D:\光大银行%s%s.png'%(i,n)) # screenshot; name carries page (i) and position (n)
            driver.quit()
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'bulletin_content'})[0] # the announcement body container
            content = soup.text.strip()
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # local timestamp
            string0 = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n) # separator for follow-up runs
            string = '\n\n————————————光大银行公告%s%s内容如下————————————:\n'%(i,n) # separator for the first run
            string = string + xrtime + '\n\n'
            string0 = string0 + xrtime + '\n\n'
            if (os.path.exists('D:\光大公告.doc')): # existing file: use the follow-up separator
                with open('D:\光大公告.doc','a+',encoding='utf-8') as f:
                    f.write(string0 + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
            else: # first run: the file is created with the initial separator
                with open('D:\光大公告.doc','a+',encoding='utf-8') as f:
                    f.write(string + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def zxgg(): # collect the CITIC (中信) announcement links matching the filters; no paging needed
    """Scrape the single CITIC Bank notice page and return the announcement
    links that pass the date and keyword filters.

    Reads globals words0/words1 and yeardate/monthdate (monthdate == 'none'
    matches the whole year). CITIC encodes only the last two digits of the
    year in its URLs, so yeardate is truncated accordingly.
    Sets and returns global zxim1: a list of cleaned relative links such as
    '/gonggao/news_1707xx.shtml'.
    """
    global zxim1
    global words0,words1
    global yeardate,monthdate
    url = 'https://creditcard.ecitic.com/gonggao/#'
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    yeardate0 = yeardate[-2:] # last two digits of the year, e.g. '2017' -> '17' (was built char by char)
    print('是否有过滤出年份============================================',yeardate0)
    # Build the date-filter regex, injecting the two-digit year (and month).
    if monthdate != 'none':
        pattern = '<a .*?href="(/gonggao/news_%s%s.*?.shtml" .*?>[\s]*.*?)[\s]*</a>'%(yeardate0,monthdate)
    else:
        pattern = '<a .*?href="(/gonggao/news_%s.*?.shtml" .*?>[\s]*.*?)[\s]*</a>'% yeardate0
    print('得到的正则表达式是======================================================',pattern)
    urlzx = re.findall(pattern,html0) # date-filtered anchors (link + title markup)
    print('*******************************************************筛选过时间的公告地址有**********************************************************************',urlzx)
    # Keep entries containing at least one required keyword, deduplicated.
    zxim = []
    for word0 in words0:
        for urlg0 in urlzx:
            if word0 in urlg0 and urlg0 not in zxim:
                zxim.append(urlg0)
    # BUGFIX: rebuild instead of popping while iterating (popping skips the
    # element right after every removal).
    for word1 in words1:
        zxim = [urlg1 for urlg1 in zxim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link.
    zxim1 = []
    for zxim0 in zxim:
        zxim1.append(''.join(re.findall('(/gonggao/news_.*?.shtml)',zxim0)))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',zxim1)
    driver.quit()
    return zxim1
def zxbc(): # save each matching CITIC announcement
    """Screenshot each matching CITIC announcement and append its text plus
    link to D:\中信公告.doc. The CITIC notice page is single-page, so no
    page loop is needed.

    Relies on zxgg() populating the global zxim1 with relative links.
    """
    global zxim1 # result list of zxgg(): links already filtered by date and keywords
    zxgg()
    # BUGFIX: enumerate() gives the true position; zxim1.index(url)
    # returned the first occurrence, which is wrong for duplicate links.
    for n, url in enumerate(zxim1):
        print('=======================================================这是第几个公告==============================================================',n)
        preurl = 'https://creditcard.ecitic.com' # the scraped links are relative; prepend the site root
        url = preurl + url
        print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
        driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
        driver.get(url)
        driver.maximize_window()
        time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
        gg = driver.page_source
        driver.save_screenshot('D:\中信银行%s.png'%n) # screenshot; single page, so position (n) alone is unique
        driver.quit()
        soup = bp(gg,'html.parser')
        soup = soup.find_all('div',{'class':'gonggao_con_left'})[0] # the announcement body container
        content = soup.text.strip()
        print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
        xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # local timestamp
        string0 = '\n\n————————————本次爬取公告%s如下————————————:\n'%n # separator for follow-up runs
        string = '\n\n————————————中信银行公告%s内容如下————————————:\n'%n # separator for the first run
        string = string + xrtime + '\n\n'
        string0 = string0 + xrtime + '\n\n'
        if (os.path.exists('D:\中信公告.doc')): # existing file: use the follow-up separator
            with open('D:\中信公告.doc','a+',encoding='utf-8') as f:
                f.write(string0 + content)
                f.write('\n\n本公告的网页链接是:' + url + '\n\n')
        else: # first run: the file is created with the initial separator
            with open('D:\中信公告.doc','a+',encoding='utf-8') as f:
                f.write(string + content)
                f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def xygg(i): # collect the CIB (兴业) announcement links matching the filters
    """Scrape page *i* of the CIB notice list and return the announcement
    links that pass the date and keyword filters.

    CIB mixes two link formats on the notice index: relative
    '/news/notice/...' hrefs and absolute promotion links under
    '/promotion/national/'; both are collected.
    Reads globals words0/words1 and yeardate/monthdate (monthdate == 'none'
    matches the whole year). Sets and returns global xyim1: a list of
    absolute announcement URLs.
    """
    global words0,words1
    global xyim1
    global yeardate,monthdate
    urlxy = []
    url = 'http://creditcard.cib.com.cn/news/notice/index.html_91759631%s.html'%i # CIB pages backwards from the last page
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    # Build the date-filter regex for the relative-link format.
    if monthdate != 'none':
        pattern = '<a (href="/news/notice/.*?/%s%s.*?html".*?>[.\n]*[\s]*.*?)<'%(yeardate,monthdate)
    else:
        pattern = '<a (href="/news/notice/.*?/%s.*?html".*?>[.\n]*[\s]*.*?)<'% yeardate
    print('得到的正则表达式是======================================================',pattern)
    allurl0 = re.findall(pattern,html0) # e.g. <a href="/news/notice/2017/20170803.html" target="_blank">...
    # Expand the relative href into an absolute URL.
    for anchor in allurl0:
        urlxy.append(re.sub('href="','http://creditcard.cib.com.cn',anchor))
    # Second CIB format: already-absolute promotion links.
    allurl1 = re.findall('<a href="(http://creditcard.cib.com.cn/promotion/national/2.*?html">[.\n]*[\s]*.*?)<',html0)
    urlxy.extend(allurl1)
    print('*******************************************************筛选过时间的公告地址有**********************************************************************',urlxy)
    # Keep entries containing at least one required keyword, deduplicated.
    xyim = []
    for word0 in words0:
        for urlg0 in urlxy:
            if word0 in urlg0 and urlg0 not in xyim:
                xyim.append(urlg0)
    # BUGFIX: rebuild instead of popping while iterating (popping skips the
    # element right after every removal).
    for word1 in words1:
        xyim = [urlg1 for urlg1 in xyim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link;
    # `or` falls through to the promotion-link format when the notice
    # format finds nothing.
    xyim1 = []
    for xyim0 in xyim:
        urlxyg0 = (re.findall('(http://creditcard.cib.com.cn/news/notice/.*?html)',xyim0) or re.findall('(http://creditcard.cib.com.cn/promotion/national/.*?html)',xyim0))
        xyim1.append(''.join(urlxyg0))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',xyim1)
    driver.quit()
    return xyim1
def xybc(): # save each matching CIB announcement
    """Read the total page count from the CIB notice index pager, then walk
    the pages from last to first (the site numbers them backwards),
    screenshot each matching announcement, and append its text plus link
    to D:\兴业公告.doc.

    Relies on xygg(i) populating the global xyim1 with absolute URLs.
    """
    global xyim1 # result list of xygg(): links already filtered by date and keywords
    # CIB only exposes the "共N页" pager after opening the notice index.
    url = 'http://creditcard.cib.com.cn/news/notice/index.html_917596318.html'
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # let the dynamic page render before reading it
    html = driver.page_source
    print('_____________________________________________________对应html是_____________________________________________________',html)
    driver.quit()
    soup = bp(html,'html.parser')
    soup = soup.find_all('div',{'class':'page_arw'})[0] # pager container
    soup = soup.find_all('span')[0]
    soup = soup.string.strip() # pager text like '第1页/共8页'
    print(soup)
    s = re.findall('.*?/共(\d+)页',soup)[0] # total page count as a string
    print('总共有多少页_____________________________________________________',s)
    # Descending page numbers; renamed from `list`, which shadowed the builtin.
    pages = list(range(int(s), 0, -1))
    print('最终结果的列表=======================================',pages)
    for i in pages:
        xygg(i) # filter the announcement links on page i
        # BUGFIX: enumerate() gives the true position; xyim1.index(url)
        # returned the first occurrence, which is wrong for duplicate links.
        for n, url in enumerate(xyim1):
            print('=======================================================这是第几个公告==============================================================',n)
            # xygg already returns absolute URLs, so no prefix is needed here
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
            gg = driver.page_source
            driver.save_screenshot('D:\兴业银行%s%s.png'%(i,n)) # screenshot; name carries page (i) and position (n)
            driver.quit()
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'fl add'})[0] # the announcement body container
            content = soup.text.strip()
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) # local timestamp
            string0 = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n) # separator for follow-up runs
            string = '\n\n————————————兴业银行公告%s%s内容如下————————————:\n'%(i,n) # separator for the first run
            string = string + xrtime + '\n\n'
            string0 = string0 + xrtime + '\n\n'
            if (os.path.exists('D:\兴业公告.doc')): # existing file: use the follow-up separator
                with open('D:\兴业公告.doc','a+',encoding='utf-8') as f:
                    f.write(string0 + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
            else: # first run: the file is created with the initial separator
                with open('D:\兴业公告.doc','a+',encoding='utf-8') as f:
                    f.write(string + content)
                    f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def msgg(i): # collect the CMBC (民生) announcement links matching the filters
    """Scrape page *i* of the China Minsheng Bank notice list and return the
    announcement links that pass the date and keyword filters.

    Reads globals words0/words1 and yeardate/monthdate (monthdate == 'none'
    matches the whole year). Sets and returns global msim1: a list of
    cleaned relative links such as '/home/cn/web/customer/import/...shtml'.
    """
    global msim1
    global words0,words1
    global yeardate,monthdate
    url = 'https://creditcard.cmbc.com.cn/home/cn/web/customer/import/list_%s.shtml'%i
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    time.sleep(5) # BUGFIX: wait BEFORE reading page_source so the dynamic page has rendered
    html0 = driver.page_source
    # Build the date-filter regex, injecting the requested year (and month).
    if monthdate != 'none':
        pattern = '<a href="(/home/cn/web/customer/import/.*?shtml" target="_blank" title=".*?)" class="ms-textEllipsis">[.\n]*[\s]*<span class="mediadate">%s-%s-\d{2}</span>'%(yeardate,monthdate)
    else:
        pattern = '<a href="(/home/cn/web/customer/import/.*?shtml" target="_blank" title=".*?)" class="ms-textEllipsis">[.\n]*[\s]*<span class="mediadate">%s-\d{2}-\d{2}</span>'% yeardate
    print('得到的正则表达式是======================================================',pattern)
    urlms = re.findall(pattern,html0) # date-filtered anchors (link + title markup)
    print('*******************************************************筛选过时间的公告地址有**********************************************************************',urlms)
    # Keep entries containing at least one required keyword, deduplicated.
    msim = []
    for word0 in words0:
        for urlg0 in urlms:
            if word0 in urlg0 and urlg0 not in msim:
                msim.append(urlg0)
    # BUGFIX: rebuild instead of popping while iterating (popping skips the
    # element right after every removal).
    for word1 in words1:
        msim = [urlg1 for urlg1 in msim if word1 not in urlg1]
    # Strip the surrounding text/markup, keeping only the bare link.
    msim1 = []
    for msim0 in msim:
        msim1.append(''.join(re.findall('(/home/cn/web/customer/import/.*?shtml)',msim0)))
    print('*******************************************************包含关键字的公告地址有**********************************************************************',msim1)
    driver.quit()
    # BUGFIX: return the cleaned list msim1, not the raw msim, matching the
    # other *gg() functions and what msbc() expects.
    return msim1
def msbc(): #对于最新公告的逐个保存
    """Fetch every announcement collected by msgg() and archive it.

    For listing pages 1-8, each matching announcement is opened with
    PhantomJS, screenshotted to D:\ and its text appended (with a
    timestamped header and the source URL) to D:\民生公告.doc.
    """
    global msim1
    for i in range(1,9):
        msgg(i)   # refreshes the global msim1 for page i
        # BUG FIX: enumerate() yields the true position even if msim1 ever
        # holds duplicate paths; list.index() always returned the first hit.
        for n, url in enumerate(msim1):
            print('=======================================================这是第几个公告==============================================================',n)
            url = 'https://creditcard.cmbc.com.cn' + url   # paths are site-relative
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            # Announcements are dynamic pages, so selenium is required here too.
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            gg = driver.page_source   # rendered announcement HTML
            time.sleep(5)   # let the dynamic page finish loading
            driver.save_screenshot('D:\民生银行%s%s.png'%(i,n))   # per-announcement screenshot
            driver.quit()
            # The announcement body lives in <div class="container detailBox">.
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'container detailBox'})[0]
            content = soup.text.strip()
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # A pre-existing archive gets the generic header, a fresh one the
            # bank-specific header (mirrors the original branch behaviour).
            if os.path.exists('D:\民生公告.doc'):
                header = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n)
            else:
                header = '\n\n————————————民生银行公告%s%s内容如下————————————:\n'%(i,n)
            with open('D:\民生公告.doc','a+',encoding='utf-8') as f:
                f.write(header + xrtime + '\n\n' + content)
                f.write('\n\n本公告的网页链接是:' + url + '\n\n')
def hxgg(i): #抓取满足要求的公告地址
    """Scrape page *i* of the Huaxia Bank credit-card announcement list.

    Reads the module-level globals words0/words1 (wanted / unwanted title
    keywords) and yeardate/monthdate (publication period; monthdate 'none'
    means "any month of that year").

    Side effect: fills the global list ``hxim1`` with the matching
    announcement URLs (absolute, unlike Minsheng's relative paths).
    Returns ``hxim1``.
    """
    global hxim1
    global words0,words1
    global yeardate,monthdate
    # Listing URL; %s is the page number, one page per call.
    url = 'http://creditcard.hxb.com.cn/card/cn/customerservice/importantinfor/list_%s.shtml'%i
    # Pages are rendered dynamically, so a headless browser is required.
    driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
    driver.get(url)
    driver.maximize_window()
    html0 = driver.page_source
    time.sleep(5)   # give the dynamic page time to finish rendering
    # Build a date-filtering regex; the capture group grabs href + title.
    if monthdate != 'none':
        pattern = '<a href="(http://creditcard.hxb.com.cn/card/cn/customerservice/importantinfor/.*?shtml">.*?)</a>[.\n]*[\s]*</div>[.\n]*[\s]*<div class="list_right">%s-%s-\d{2}</div>'%(yeardate,monthdate)
    else:
        pattern = '<a href="(http://creditcard.hxb.com.cn/card/cn/customerservice/importantinfor/.*?shtml">.*?)</a>[.\n]*[\s]*</div>[.\n]*[\s]*<div class="list_right">%s-\d{2}-\d{2}</div>'% yeardate
    urlhx = re.findall(pattern,html0)   # fragments whose date matches
    print('*******************************************************筛选过时间的公告地址有**********************************************************************',urlhx)
    # Keep fragments whose title contains at least one wanted keyword,
    # avoiding duplicates when several keywords hit the same title.
    hxim = []
    for word0 in words0:
        for frag in urlhx:
            if frag.find(word0) != -1 and frag not in hxim:
                hxim.append(frag)
    # BUG FIX: the original popped elements from hxim while iterating over
    # it, which skips the entry right after each removal.  Rebuilding the
    # list reliably drops every fragment containing an unwanted keyword.
    for word1 in words1:
        hxim = [frag for frag in hxim if frag.find(word1) == -1]
    # Extract just the announcement URL from each surviving fragment.
    hxim1 = []
    for frag in hxim:
        link = ''.join(re.findall('(http://creditcard.hxb.com.cn/card/cn/customerservice/importantinfor/.*?shtml)',frag))
        hxim1.append(link)
    print('*******************************************************包含关键字的公告地址有**********************************************************************',hxim1)
    driver.quit()
    return hxim1
def hxbc(): #对于最新公告的逐个保存
    """Fetch every announcement collected by hxgg() and archive it.

    For listing pages 1-8, each matching announcement is opened with
    PhantomJS, screenshotted to D:\ and its text appended (with a
    timestamped header and the source URL) to D:\华夏公告.doc.
    Huaxia URLs are already absolute, so no prefix is prepended.
    """
    global hxim1
    for i in range(1,9):
        hxgg(i)   # refreshes the global hxim1 for page i
        # BUG FIX: enumerate() yields the true position even if hxim1 ever
        # holds duplicate URLs; list.index() always returned the first hit.
        for n, url in enumerate(hxim1):
            print('=======================================================这是第几个公告==============================================================',n)
            print('_____________________________________________________抓取到的公告有______________________________________________________________',url)
            # Announcements are dynamic pages, so selenium is required here too.
            driver = webdriver.PhantomJS(executable_path='D:/phantomjs/bin/phantomjs.exe')
            driver.get(url)
            driver.maximize_window()
            gg = driver.page_source   # rendered announcement HTML
            time.sleep(5)   # let the dynamic page finish loading
            driver.save_screenshot('D:\华夏银行%s%s.png'%(i,n))   # per-announcement screenshot
            driver.quit()
            # The announcement body lives in <div class="Details">.
            soup = bp(gg,'html.parser')
            soup = soup.find_all('div',{'class':'Details'})[0]
            content = soup.text.strip()
            print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<公告内容如下:>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n',content)
            xrtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            # A pre-existing archive gets the generic header, a fresh one the
            # bank-specific header (mirrors the original branch behaviour).
            if os.path.exists('D:\华夏公告.doc'):
                header = '\n\n————————————本次爬取公告%s%s如下————————————:\n'%(i,n)
            else:
                header = '\n\n————————————华夏银行公告%s%s内容如下————————————:\n'%(i,n)
            with open('D:\华夏公告.doc','a+',encoding='utf-8') as f:
                f.write(header + xrtime + '\n\n' + content)
                f.write('\n\n本公告的网页链接是:' + url + '\n\n')
if __name__ == '__main__':
    # NOTE: the original `global` statements were removed — `global` is a
    # no-op at module level; the assignments below already create the
    # module-level names that the scraper functions declare as global.
    bank = input('\n\n——请输入希望查找的银行——:\n')
    bank = bank.split(',')
    yeardate = input('\n\n——请输入希望搜寻的的年份(如2017)——:\n')
    monthdate = input('\n\n——请输入希望搜寻的月份(如07)——:\n')
    words0 = input('\n\n——请输入公告题目中希望包含的关键字(若有多个请以中文逗号分隔)——:\n')
    words0 = words0.split(',')
    words1 = input('\n\n——请输入公告题目中不希望包含的关键字(若有多个请以中文逗号分隔)——:\n')
    words1 = words1.split(',')
    # Dispatch table replaces the long if/elif chain; an unrecognised bank
    # name falls back to crawling every supported bank (original behaviour).
    handlers = {
        '招商银行': zsbc,
        '浦发银行': pfbc,
        '中信银行': zxbc,
        '民生银行': msbc,
        '兴业银行': xybc,
        '广发银行': gfbc,
        '光大银行': gdbc,
        '华夏银行': hxbc,
    }
    for ban in bank:
        print(ban)
        handler = handlers.get(ban)
        if handler is not None:
            handler()
        else:
            for run in (zsbc, pfbc, zxbc, msbc, xybc, gfbc, gdbc, hxbc):
                run()
    # Keep the console window open; don't bind the result to a name that
    # shadows the builtin `exit` (as the original did).
    input("运行完毕!请输入任意键退出……")