公众号文章的爬虫

xiaoxiao2021-02-28  93

微信在4月的时候更新了一版,以前的公众号爬虫没有那么好用了。现在微信开始针对个人账号,频繁的访问会导致账号被封,查看公众号历史的时候会出现“页面无法打开”的提示,但是大概两天之后又可以查看了。所以现在的做法就是控制访问频率+多微信账号进行采集。以前制作的公众号采集站好久没打理了,整理下代码吧。

获取文章链接:Get_list.py

# -*- coding: UTF-8 -*-
# Get_list.py -- pops a start URL from a Redis list, fetches that page,
# extracts article entries from its embedded ``msgList`` blob and pushes one
# task per not-yet-seen article onto another Redis list.
#
# NOTE(review): the published listing had lost all line breaks and
# indentation. The nesting below was reconstructed from the statement order;
# places where the original nesting is ambiguous are marked individually.
# The code targets Python 2 (urllib2 / cookielib / byte strings).
import re
import urllib2
import cookielib
import json
import time
import sys
from Unique import Redis
import base64
import redis

sys.setrecursionlimit(999999999)

REDIS = Redis()
TASK_SCHEDUL = 'task::mweb'
REDIS_URL = 'redis://xxxx:6379'
REDIS_HOST = 'xxxx'
REDIS_PORT = 6379

# Template of the task payload pushed to TASK_SCHEDUL.
# NOTE(review): values are interpolated without any JSON escaping, so a value
# containing a double quote would yield an invalid payload. run() strips
# double quotes from the blob before parsing, which masks this for that path.
_TASK_TEMPLATE = ('{"url":"%s","time":"%s","cover_img":"%s","title":"%s",'
                  '"source_url":"%s","flag":"%s"}')


def from_settings():
    """Return a Redis client for db 4 on REDIS_HOST:REDIS_PORT."""
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password='xxxx', db=4)


rediscli = from_settings()


def from_settings1():
    """Return a Redis client for db 10 on REDIS_HOST:REDIS_PORT (settings)."""
    return redis.Redis(host=REDIS_HOST, port=REDIS_PORT, password='', db=10)


rediscli1 = from_settings1()


def get_start_url():
    """Pop the next start URL from the ``task::getmes`` Redis list.

    Any ``&f=json`` fragment is removed from the popped value.

    Returns:
        The cleaned URL, or None when the list is empty (the original code
        raised TypeError from ``re.sub`` in that case).
    """
    tps = rediscli.rpop('task::getmes')
    print('tttttt----%s' % tps)
    if tps is None:
        return None
    return re.sub(r'&f=json', '', tps)


def create_cookie(Url):
    """Build the module-global ``opener`` and prime its cookie jar.

    ``Url`` is requested once through a cookie-aware opener so that any
    cookies set by the server are stored for later requests made through the
    same opener.

    Returns:
        ``Url`` with ``&f=json`` appended.
    """
    global opener
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    # Only the cookies matter here; the body is never read, so release the
    # connection right away instead of leaking it.
    response = opener.open(Url, timeout=5)
    response.close()
    return Url + '&f=json'


def get_recent_ten_list(url):
    """Fetch ``url`` with a mobile WeChat User-Agent and return the raw body.

    Uses the module-global ``opener``, so create_cookie() must have been
    called first.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Mobile/10B329 MicroMessenger/5.0.1'}
    request = urllib2.Request(url, headers=headers)
    response2 = opener.open(request, timeout=5)
    try:
        return response2.read()
    finally:
        response2.close()


def _first_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None."""
    match = re.search(pattern, text)
    if match is None:
        return None
    return match.group(1)


def _push_task(url, pub_time, cover_img, title, source_url, flag):
    """Push one article task onto the TASK_SCHEDUL Redis list."""
    rediscli.lpush(TASK_SCHEDUL, _TASK_TEMPLATE % (
        url, pub_time, cover_img, title, source_url, flag))


def format_data(datas):
    """Extract article entries from a message-list blob and queue new ones.

    ``datas`` is searched for ``comm_msg_info...}},`` chunks. For each chunk
    the cover article and, when ``is_multi`` is ``'1'``, every item of
    ``multi_app_msg_item_list`` is looked up in the ``REDIS`` de-duplication
    store (keyed by the base64 of its URL). Articles not found there are
    pushed onto TASK_SCHEDUL; the cover gets flag "1" and list items get
    flags "2", "3", ...

    A chunk missing any required field is reported and skipped. The original
    code read ``id`` and ``datetime`` outside its guarded region, so one bad
    chunk aborted the whole batch; they are now treated like the other
    required fields.

    Returns:
        ``[is_continue, is_friend, last_id, content_list, url_list]`` where
        ``is_continue`` is always 1, ``is_friend`` is always 0, ``last_id``
        is the ``id`` of the last processed chunk (0 if none was processed),
        ``content_list`` holds the cover digests / list-item titles of the
        new articles, and ``url_list`` holds their URLs.
    """
    data = datas
    print('format')
    is_continue = 1
    is_friend = 0
    temp_b = 0
    url_list = []
    content_list = []

    # Fields every chunk must provide, mapped to the pattern that finds them.
    required_patterns = (
        ('content_url', r'content_url:(.*?),'),
        ('cover', r'cover:(.*?),'),
        ('title', r'title:(.*?),'),
        ('digest', r'digest:(.*?),'),
        ('is_multi', r'is_multi:(.*?),'),
        ('id', r'id:(.*?),'),
        ('datetime', r'datetime:(.*?),'),
    )

    msg_list = re.findall(r'comm_msg_info(.*?)}},', data)
    print(len(msg_list))
    for x, raw_entry in enumerate(msg_list):
        entry = str(raw_entry)

        # Cover article fields. Some accounts publish entries in a different
        # format; those lack one or more fields and are skipped.
        fields = dict(
            (name, _first_group(pattern, entry))
            for name, pattern in required_patterns)
        missing = [name for name, _ in required_patterns
                   if fields[name] is None]
        if missing:
            print('json err___.%s' % ('missing ' + ', '.join(missing)))
            continue

        index_cover_img = fields['cover']
        cover_title = fields['title']
        cover_digest = fields['digest']
        is_multi = fields['is_multi']
        comm_msg_info = fields['id']
        pub_time = fields['datetime']

        # source_url is optional; fall back to an empty string.
        c_source_url = _first_group(r'source_url:(.*?),', entry)
        if c_source_url is None:
            c_source_url = ''

        # The URL carries backslashes and "amp;" residue; strip them to get
        # the final cover URL.
        content_url = re.sub(r'\\|amp;', '', fields['content_url'])

        Key_cover = base64.encodestring(content_url)
        if not REDIS.getkey(Key_cover):
            print('%s_____' % (x + 1))
            print('封面标题--->%s' % cover_title.decode('utf8', 'ignore').encode('utf8'))
            print('封面url---->%s' % content_url)
            content_list.append(cover_digest.decode('utf8', 'ignore').encode('utf8'))
            url_list.append(content_url)
            print(u"当前封面ID------->>%s" % comm_msg_info)
            if content_url:
                _push_task(content_url, pub_time, index_cover_img,
                           cover_title, c_source_url, '1')
        else:
            print('当前封面已采集---:%s' % cover_title.decode('utf8', 'ignore').encode('utf8'))

        # Both branches of the original ``if temp_b >= temp_a`` assigned the
        # same value, so this is simply "remember the latest id".
        temp_b = comm_msg_info

        if is_multi == '1':
            temp_more_content = ''.join(
                re.findall(r'multi_app_msg_item_list(.*?)],', entry))
            sub_items = re.findall(r'{(.*?)}', temp_more_content)
            for z, sub_item in enumerate(sub_items):
                temp_multi_url = ''.join(re.findall(r'content_url:(.*?),', sub_item))
                list_title = ''.join(re.findall(r'title:(.*?),', sub_item))
                # Final URL of the list item, cleaned like the cover URL.
                multi_url = re.sub(r'\\|amp;', '', temp_multi_url)
                list_cover_img = ''.join(re.findall(r'cover:(.*?),', sub_item))
                Key_cover2 = base64.encodestring(multi_url)
                l_source_url = ''.join(re.findall(r'source_url:(.*?),', sub_item))
                if not REDIS.getkey(Key_cover2):
                    print('列表标题----->%s' % list_title.decode('utf8', 'ignore').encode('utf8'))
                    content_list.append(list_title.decode('utf8', 'ignore').encode('utf8'))
                    url_list.append(multi_url)
                    if multi_url:
                        _push_task(multi_url, pub_time, list_cover_img,
                                   list_title, l_source_url, (z + 2))
                else:
                    print('列表已采集----:%s' % list_title)

        # NOTE(review): assumed to run once per chunk; the original nesting
        # of this separator line could not be recovered.
        print('----------分--------割-------线------------')

    return [is_continue, is_friend, temp_b, content_list, url_list]


def retry_history(count):
    """Pop a fresh start URL and return the raw page body fetched for it.

    ``count`` is accepted but not used. Returns None when the start-URL
    queue is empty.
    """
    url = get_start_url()
    if url is None:
        return None
    Json_url = create_cookie(url)
    data = get_recent_ten_list(Json_url)
    return data


def run():
    """Process one start URL: fetch the page, extract msgList, queue tasks.

    Returns without doing anything when the start-URL queue is empty.
    """
    ddd = 0
    url = get_start_url()
    if url is None:
        print('task::getmes is empty, nothing to do')
        return
    url_biz = re.findall(r'biz=(.*?)&', url)
    re_biz = ''.join(url_biz)
    Json_url = create_cookie(url)
    print(Json_url)
    data = get_recent_ten_list(Json_url)
    msglist = re.findall(r"var msgList = '(.*?)';", data)
    # Strip quotes, "amp;" residue, backslashes and whitespace so that
    # format_data() can match bare ``key:value,`` pairs.
    data = re.sub(r'"|amp;|\\|\s+', '', ''.join(msglist))
    # NOTE(review): as published this replaces a space with a space; the
    # pattern was possibly an HTML entity that the blog rendered -- confirm.
    data = re.sub(r' ', ' ', data)
    history_length = 0  # total number of articles published by the account
    ddd += 1
    try:
        History_args = format_data(data)
        rediscli.set('flag', '1')
    except Exception as e:
        print("格式化数据错误--->%s" % e)
    # NOTE(review): the published listing ends here; this statement looks
    # truncated (format string without an argument) and its original nesting
    # is unknown. It is kept verbatim at function level.
    print('当前-------%s')
转载请注明原文地址: https://www.6miu.com/read-72067.html

最新回复(0)