[Python] Counting Word Frequencies in a Personal Sina Weibo Account, with Bar Charts


Original source: 【Python】统计个人新浪微博词频并给出相应的柱状图, by 西风独自凉.

This article shows how to compute word-frequency statistics for a personal Sina Weibo account and draw the corresponding bar charts, using Python 2.7. It has three parts: using the Sina Weibo API, text filtering and word segmentation, and word-frequency counting.

1. Using the Sina Weibo API

First, register a developer account on the Sina Weibo open platform at http://open.weibo.com/development/ to obtain your personal APP_KEY and APP_SECRET, then download and install the Python SDK. The approach below needs no manual verification on each run; the script authorizes itself and can simply be executed.

# -*- coding: UTF-8 -*-
from weibo import APIClient
import urllib, httplib
import operator
import numpy as np
import matplotlib.pyplot as plt

class iWInsightor(object):
    def __init__(self, ID, PW):
        self.ACCOUNT = ID
        self.PASSWORD = PW
        self.CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'
        self.APP_KEY = 'XXXXXXX'    # yours
        self.APP_SECRET = 'XXXXXX'  # yours
        self.client = APIClient(app_key=self.APP_KEY, app_secret=self.APP_SECRET,
                                redirect_uri=self.CALLBACK_URL)
        self.url = self.client.get_authorize_url()
        self.get_Authorization()

    # replay the login form POST to obtain the OAuth2 authorization code
    def get_code(self):
        conn = httplib.HTTPSConnection('api.weibo.com')
        postdata = urllib.urlencode({
            'client_id': self.APP_KEY, 'response_type': 'code',
            'redirect_uri': self.CALLBACK_URL, 'action': 'submit',
            'userId': self.ACCOUNT, 'passwd': self.PASSWORD,
            'isLoginSina': 0, 'from': '', 'regCallback': '',
            'state': '', 'ticket': '', 'withOfficalFlag': 0})
        conn.request('POST', '/oauth2/authorize', postdata,
                     {'Referer': self.url,
                      'Content-Type': 'application/x-www-form-urlencoded'})
        res = conn.getresponse()
        location = res.getheader('location')
        code = location.split('=')[1]
        conn.close()
        return code

    def get_Authorization(self):
        code = self.get_code()
        r = self.client.request_access_token(code)
        self.client.set_access_token(r.access_token, r.expires_in)

    # post a status update (input assumed GBK-encoded, e.g. from a Windows console)
    def post_weibo(self, message):
        self.client.post.statuses__update(status=message.decode('gbk'))

    # get the current user's uid
    def getCurrentUid(self):
        try:
            return self.client.account.get_uid.get()['uid']
        except Exception:
            print 'get userid failed'
            return

    # get the accounts the user follows, as (screen name, gender) pairs
    def getFocus(self, userid):
        focuses = self.client.get.friendships__friends(uid=userid, count=200)
        Resfocus = []
        for focus in focuses["users"]:
            try:
                Resfocus.append((focus["screen_name"], focus["gender"]))
            except Exception:
                print 'get focus failed'
                return
        return Resfocus

    # get the user's tags, sorted by descending weight
    def getTags(self, userid):
        try:
            tags = self.client.tags.get(uid=userid)
        except Exception:
            print 'get tags failed'
            return
        userTags = []
        sortedT = sorted(tags, key=operator.attrgetter('weight'), reverse=True)
        for tag in sortedT:
            for item in tag:
                if item != 'weight':
                    userTags.append(tag[item])
        return userTags

    # fetch the user's statuses and append their text to a file, one per line
    def getWeibo(self, userid, infile):
        contents = self.client.get.statuses__user_timeline(uid=userid, count=100)
        f = open(infile, 'a')
        for content in contents.statuses:
            try:
                f.write(content.text.encode('utf-8'))  # statuses arrive as unicode
                f.write('\n')
            except Exception:
                print 'get text failed'
        f.close()

    # label each bar with its height
    def autolabel(self, rects):
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2., 1.03*height,
                     '%s' % float(height))

    # bar chart of the gender ratio among the accounts the user follows
    def getSexplot(self, userid, m, f, n):
        res = self.client.get.users__show(uid=userid)
        ind = np.arange(1, 4)
        width = 0.25
        plt.subplot(111)
        rects1 = plt.bar(left=ind, height=(m, f, n), width=width, align='center')
        plt.ylabel('The Focus Number')
        plt.title('Sex Analysis (effective samples: %d)' % (m + f + n))
        plt.xticks(ind, ("Male", "Female", "Unknown"))
        self.autolabel(rects1)
        plt.legend((rects1,), ("User: %s" % res["screen_name"],))
        plt.show()

if __name__ == '__main__':
    usrID = raw_input('Sina Weibo username: ')
    usrPW = raw_input('Sina Weibo password: ')
    AppClient = iWInsightor(usrID, usrPW)
    userid = AppClient.getCurrentUid()
    infile = "E://data/weibo.dat"  # where the fetched statuses are saved
    AppClient.getWeibo(userid, infile)
    #Focus = AppClient.getFocus(userid)
    #m = f = n = 0
    #for i in Focus:
    #    if i[1] == "m":
    #        m += 1
    #    elif i[1] == "f":
    #        f += 1
    #    else:
    #        n += 1
    #AppClient.getSexplot(userid, m, f, n)
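The get_code method above replays Weibo's login form POST so that no manual authorization is needed on each run, but the form fields it submits are whatever the login page used at the time and have changed more than once. If it breaks, a fallback is the SDK's documented two-step OAuth2 flow done by hand. This is a minimal sketch, assuming the same sinaweibopy APIClient interface used in the class above:

# -*- coding: UTF-8 -*-
# Manual OAuth2 flow -- a sketch, assuming the standard sinaweibopy client.
from weibo import APIClient
import webbrowser

APP_KEY = 'XXXXXXX'    # yours
APP_SECRET = 'XXXXXX'  # yours
CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'

client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET,
                   redirect_uri=CALLBACK_URL)
webbrowser.open(client.get_authorize_url())  # log in and authorize in the browser
code = raw_input('Paste the "code" parameter from the redirect URL: ')
r = client.request_access_token(code)        # exchange the code for a token
client.set_access_token(r.access_token, r.expires_in)
print client.account.get_uid.get()['uid']    # the client is now authorized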
2. Text Filtering and Word Segmentation

Weibo posts are full of material that contributes nothing to a word-frequency count: English letters and digits, Chinese punctuation, and assorted decorative symbols. These have to be filtered out before segmentation, and you can add any other symbols or words you want removed.

Compared with English, Chinese sentences have one interesting property: English puts spaces between words and Chinese does not. Word segmentation is therefore a basic step in Chinese text processing. I use jieba (结巴分词), which lets you load a user-defined dictionary (useful because the stock dictionary is bound to miss some words); it is available at https://github.com/fxsjy/jieba.

# -*- coding: UTF-8 -*-
import string
import jieba

extra_dict = 'F://NLP/iWInsightor/jieba/mydict.dict'  # user-defined dictionary
jieba.load_userdict(extra_dict)

def filter_str(instr):
    # characters to strip: ASCII punctuation, spaces, digits and letters,
    # plus common Chinese punctuation marks
    deEstr = string.punctuation + ' ' + string.digits + string.letters
    deCstr = u',。《》【】()!?★”“、:…'
    destr = deEstr + deCstr
    outstr = ''
    for char in instr.decode('utf-8'):
        if char not in destr:
            outstr += char
    return outstr

fp_in = open('F://NLP/iWInsightor/weibo.dat', 'rb')         # raw statuses
fp_out = open('F://NLP/iWInsightor/weibo_filter.dat', 'a')  # filtered, segmented text
for line in fp_in:
    str_delete = filter_str(line)
    seg_list = jieba.cut(str_delete, cut_all=True)          # full mode
    str_join = ' '.join(seg_list)
    fp_out.write(str_join.encode('utf-8'))
fp_in.close()
fp_out.close()
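Before trusting the counts, it helps to see what jieba's two modes actually produce. The snippet below runs the sample sentence from jieba's own README through full mode (cut_all=True, used above, which emits every dictionary word it finds, overlapping ones included) and accurate mode (cut_all=False, which commits to a single segmentation). If full mode inflates your frequencies with fragments, accurate mode is the first thing to try:

# -*- coding: UTF-8 -*-
# Full vs. accurate mode -- the sample sentence is the one from jieba's README.
import jieba

sent = u'我来到北京清华大学'
print 'Full mode:     ' + '/ '.join(jieba.cut(sent, cut_all=True))
print 'Accurate mode: ' + '/ '.join(jieba.cut(sent, cut_all=False))
# Full mode:     我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
# Accurate mode: 我/ 来到/ 北京/ 清华大学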
3. Word-Frequency Counting

Word-frequency counting just means tallying how many times each word occurs in a text, which is easy to do with a Python dict. I use matplotlib to draw a bar chart of the top-K high-frequency words. One thing to watch out for is the display of Chinese characters in the figure: matplotlib's settings have to be adjusted before plotting. I won't go into detail here (Google has the full story), but a minimal sketch of one common fix follows the code below.

# -*- coding: UTF-8 -*-
import numpy
import pylab

def getstr(word, count):
    return word + ',' + str(count)

# collect every segmented word longer than one byte from the filtered file
def get_wordlist(infile):
    wordlist = []
    for line in open(infile).readlines():
        if len(line) > 1:
            for word in line.split(' '):
                if len(word) > 1:
                    wordlist.append(word)
    return wordlist

# count occurrences with a dict and write 'word,count' lines, most frequent first
def get_wordcount(wordlist, outfile):
    out = open(outfile, 'w')
    wordcnt = {}
    for i in wordlist:
        if i in wordcnt:
            wordcnt[i] += 1
        else:
            wordcnt[i] = 1
    worddict = wordcnt.items()
    worddict.sort(key=lambda a: -a[1])
    for word, cnt in worddict:
        # the filtered file holds UTF-8 bytes; re-encode to GBK for the result file
        out.write(getstr(word.decode('utf-8').encode('gbk'), cnt) + '\n')
    out.close()
    return wordcnt

# bar chart of words occurring more than 5 times; len(key) > 3 keeps words of
# at least two Chinese characters (each is 3 bytes in UTF-8)
def barGraph(wcDict):
    wordlist = []
    for key, val in wcDict.items():
        if val > 5 and len(key) > 3:
            wordlist.append((key.decode('utf-8'), val))
    wordlist.sort()
    keylist = [key for key, val in wordlist]
    vallist = [val for key, val in wordlist]
    barwidth = 0.5
    xVal = numpy.arange(len(keylist))
    pylab.xticks(xVal + barwidth/2.0, keylist, rotation=45)
    pylab.bar(xVal, vallist, width=barwidth, color='y')
    pylab.title(u'微博词频分析图')
    pylab.show()

if __name__ == '__main__':
    myfile = 'F://NLP/iWInsightor/weibo_filter.dat'
    outfile = 'F://NLP/iWInsightor/result.dat'
    wordlist = get_wordlist(myfile)
    wordcnt = get_wordcount(wordlist, outfile)
    barGraph(wordcnt)
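A side note on the counting loop in get_wordcount: Python 2.7's standard library ships collections.Counter, which does the same tally in one call and answers the top-K query directly with most_common. A minimal sketch, assuming wordlist is the list returned by get_wordlist above:

# -*- coding: UTF-8 -*-
# Counting with the standard library -- a sketch; 'wordlist' is assumed to be
# the output of get_wordlist above.
from collections import Counter

wordcnt = Counter(wordlist)
for word, cnt in wordcnt.most_common(20):  # top-20 words, most frequent first
    print '%s,%d' % (word, cnt)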
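As for the Chinese-display problem mentioned above: by default matplotlib picks a font with no CJK glyphs, so a title like u'微博词频分析图' comes out as empty boxes. A minimal sketch of one common fix, assuming a CJK font such as SimHei is installed (substitute whatever Chinese font your system actually has), executed before any plotting calls:

# -*- coding: UTF-8 -*-
# Let matplotlib render Chinese labels -- a sketch; 'SimHei' is an assumption,
# use any installed CJK font name.
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # try a CJK-capable font first
matplotlib.rcParams['axes.unicode_minus'] = False    # keep minus signs rendering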
At this point the job is done. Below is the bar chart produced from my own Weibo word frequencies. All of this was done in spare moments, and it still has plenty of rough edges.
