4. 微信公众号文章接口地址可以在微信公众号后台新建图文消息时,通过“超链接”功能获取(按 F12 打开浏览器开发者工具查看对应的请求)。
5.搜索公众号名称
搜索会返回所有相关的公众号信息,不过这里只取第一个结果做测试;有兴趣的话,也可以把其余结果全部获取。
6.获取要爬取的公众号的fakeid
7.选定要爬取的公众号,获取文章接口地址
8.文章列表翻页及内容获取
完整代码
login.py
# -*- coding:utf-8 -*-
"""Log in to https://mp.weixin.qq.com/ with Selenium and save the cookies.

The script opens Chrome, fills in the login form, waits for the login to
complete and writes the session cookies to ``cookies.txt`` as a JSON object
mapping each cookie name to its value.
"""
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

LOGIN_URL = 'https://mp.weixin.qq.com/'
COOKIE_FILE = 'cookies.txt'

# XPaths of the login-form elements the script interacts with.
FORM_XPATH = '//*[@id="header"]/div[2]/div/div/form'
ACCOUNT_INPUT_XPATH = FORM_XPATH + '/div[1]/div[1]/div/span/input'
PASSWORD_INPUT_XPATH = FORM_XPATH + '/div[1]/div[2]/div/span/input'
# NOTE(review): presumably the "remember me" checkbox label - confirm
# against the live page.
FORM_LABEL_XPATH = FORM_XPATH + '/div[3]/label'
SUBMIT_LINK_XPATH = FORM_XPATH + '/div[4]/a'


def _fill(driver, xpath, text):
    """Clear the input located by *xpath* and type *text* into it."""
    field = driver.find_element(By.XPATH, xpath)
    field.clear()
    field.send_keys(text)


def main():
    """Run the login flow and dump the resulting cookies to COOKIE_FILE."""
    driver = webdriver.Chrome()  # requires a Chrome driver (chromedriver)
    try:
        driver.get(LOGIN_URL)

        _fill(driver, ACCOUNT_INPUT_XPATH, '账号')
        time.sleep(2)
        _fill(driver, PASSWORD_INPUT_XPATH, '密码')
        time.sleep(2)
        driver.find_element(By.XPATH, FORM_LABEL_XPATH).click()
        time.sleep(2)
        driver.find_element(By.XPATH, SUBMIT_LINK_XPATH).click()
        # NOTE(review): the long pause presumably leaves time for the login
        # (and any manual verification step) to finish - confirm.
        time.sleep(15)

        cookies = driver.get_cookies()  # cookies of the logged-in session
        print(cookies)

        # Flatten Selenium's list of cookie dicts into {name: value}.
        cookie = {item.get('name'): item.get('value') for item in cookies}
        with open(COOKIE_FILE, 'w') as cookie_file:
            cookie_file.write(json.dumps(cookie))
    finally:
        # quit() ends the whole WebDriver session even when a step above
        # raised, so no browser / driver process is left behind.
        driver.quit()


if __name__ == '__main__':
    main()

# (The next listing in this article is weixin.py.)
# -*- coding:utf-8 -*-
"""Search mp.weixin.qq.com for a keyword and print each hit's title and URL.

The script reads the session cookies from ``cookies.txt`` (a JSON object of
cookie name -> value), takes the numeric ``token`` from the final URL of a
request to the site root, and then pages through the search endpoint three
results at a time.
"""
import json
import random
import re
import time

import requests

QUERY = 'java'  # keyword to search for

COOKIE_FILE = 'cookies.txt'
BASE_URL = 'https://mp.weixin.qq.com/'
SEARCH_URL = ('https://mp.weixin.qq.com/cgi-bin/operate_appmsg'
              '?sub=check_appmsg_copyright_stat')
PAGE_SIZE = 3   # results requested per call ("count" form field)
PAGE_DELAY = 5  # seconds to pause after each page

HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/59.0.3071.104 Safari/537.36'),
    'Referer': ('https://mp.weixin.qq.com/cgi-bin/appmsg'
                '?t=media/appmsg_edit_v2&action=edit&isNew=1&share=1'
                '&type=10&lang=zh_CN&token=1066113728'),
    'Host': 'mp.weixin.qq.com',
}

TOKEN_RE = re.compile(r'token=(\d+)')


def _load_cookies(path=COOKIE_FILE):
    """Return the cookie dict stored as JSON in *path*."""
    with open(path, 'r') as cookie_file:
        return json.load(cookie_file)


def _fetch_token(cookies):
    """Return the ``token`` value found in the final URL of the site root.

    Raises:
        RuntimeError: if the URL carries no ``token=<digits>`` parameter.
    """
    response = requests.get(BASE_URL, cookies=cookies)
    match = TOKEN_RE.search(str(response.url))
    if match is None:
        raise RuntimeError(
            'no token found in %s; the cookies in %s may be missing or '
            'expired' % (response.url, COOKIE_FILE))
    return match.group(1)


def _search(token, cookies, query, begin):
    """POST one search request starting at offset *begin*; return the JSON."""
    data = {
        'token': token,
        'lang': 'zh_CN',
        'f': 'json',
        'ajax': '1',
        'random': random.random(),
        'url': query,
        'begin': str(begin),
        'count': str(PAGE_SIZE),
    }
    response = requests.post(SEARCH_URL, cookies=cookies, data=data,
                             headers=HEADERS)
    return response.json()


def main(query=QUERY):
    """Print the title and URL of every search result for *query*.

    Pass a different keyword to search for something other than QUERY.

    Raises:
        RuntimeError: if no token can be obtained or the first response has
            no ``total`` field.
    """
    cookies = _load_cookies()
    token = _fetch_token(cookies)

    # The first request is only used to learn the total number of results.
    total = _search(token, cookies, query, 0).get('total')
    if total is None:
        raise RuntimeError('search response contains no "total" field')

    # One page more than total // PAGE_SIZE so a trailing partial page is
    # not dropped.
    pages = int(total) // PAGE_SIZE + 1
    for page in range(pages):
        result = _search(token, cookies, query, page * PAGE_SIZE)
        # "list" can be absent (e.g. past the last result); treat as empty.
        for item in result.get('list') or []:
            print(item.get('title'))  # article title
            print(item.get('url'))    # article URL
        time.sleep(PAGE_DELAY)


if __name__ == '__main__':
    main()