[Python] Counting Word Frequencies in a Personal Sina Weibo Account, with Bar Charts


Original source: 【Python】统计个人新浪微博词频并给出相应的柱状图, by 西风独自凉.

This article shows how to compute word-frequency statistics for a personal Sina Weibo account and draw the corresponding bar charts, using Python 2.7. It has three parts: using the Sina Weibo API, text filtering and word segmentation, and word-frequency counting.

1. Using the Sina Weibo API

First, register a developer account on the Sina Weibo open platform at http://open.weibo.com/development/ to obtain your personal APP_KEY and APP_SECRET, then download and install the Python SDK. The approach below needs no manual verification on each run; the script authorizes itself and can simply be executed.

# -*- coding: UTF-8 -*-
from weibo import APIClient
import urllib, httplib
import operator
import numpy as np
import matplotlib.pyplot as plt

class iWInsightor(object):
    def __init__(self, ID, PW):
        self.ACCOUNT = ID
        self.PASSWORD = PW
        self.CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'
        self.APP_KEY = 'XXXXXXX'    # yours
        self.APP_SECRET = 'XXXXXX'  # yours
        self.client = APIClient(app_key=self.APP_KEY, app_secret=self.APP_SECRET,
                                redirect_uri=self.CALLBACK_URL)
        self.url = self.client.get_authorize_url()
        self.get_Authorization()

    # replay the login form POST to obtain the OAuth2 authorization code
    def get_code(self):
        conn = httplib.HTTPSConnection('api.weibo.com')
        postdata = urllib.urlencode({
            'client_id': self.APP_KEY, 'response_type': 'code',
            'redirect_uri': self.CALLBACK_URL, 'action': 'submit',
            'userId': self.ACCOUNT, 'passwd': self.PASSWORD,
            'isLoginSina': 0, 'from': '', 'regCallback': '',
            'state': '', 'ticket': '', 'withOfficalFlag': 0})
        conn.request('POST', '/oauth2/authorize', postdata,
                     {'Referer': self.url,
                      'Content-Type': 'application/x-www-form-urlencoded'})
        res = conn.getresponse()
        location = res.getheader('location')
        code = location.split('=')[1]
        conn.close()
        return code

    def get_Authorization(self):
        code = self.get_code()
        r = self.client.request_access_token(code)
        self.client.set_access_token(r.access_token, r.expires_in)

    # post a status update (input assumed GBK-encoded, e.g. from a Windows console)
    def post_weibo(self, message):
        self.client.post.statuses__update(status=message.decode('gbk'))

    # get the current user's uid
    def getCurrentUid(self):
        try:
            return self.client.account.get_uid.get()['uid']
        except Exception:
            print 'get userid failed'
            return

    # get the accounts the user follows, as (screen name, gender) pairs
    def getFocus(self, userid):
        focuses = self.client.get.friendships__friends(uid=userid, count=200)
        Resfocus = []
        for focus in focuses["users"]:
            try:
                Resfocus.append((focus["screen_name"], focus["gender"]))
            except Exception:
                print 'get focus failed'
                return
        return Resfocus

    # get the user's tags, sorted by descending weight
    def getTags(self, userid):
        try:
            tags = self.client.tags.get(uid=userid)
        except Exception:
            print 'get tags failed'
            return
        userTags = []
        sortedT = sorted(tags, key=operator.attrgetter('weight'), reverse=True)
        for tag in sortedT:
            for item in tag:
                if item != 'weight':
                    userTags.append(tag[item])
        return userTags

    # fetch the user's statuses and append their text to a file, one per line
    def getWeibo(self, userid, infile):
        contents = self.client.get.statuses__user_timeline(uid=userid, count=100)
        f = open(infile, 'a')
        for content in contents.statuses:
            try:
                f.write(content.text.encode('utf-8'))  # statuses arrive as unicode
                f.write('\n')
            except Exception:
                print 'get text failed'
        f.close()

    # label each bar with its height
    def autolabel(self, rects):
        for rect in rects:
            height = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2., 1.03*height,
                     '%s' % float(height))

    # bar chart of the gender ratio among the accounts the user follows
    def getSexplot(self, userid, m, f, n):
        res = self.client.get.users__show(uid=userid)
        ind = np.arange(1, 4)
        width = 0.25
        plt.subplot(111)
        rects1 = plt.bar(left=ind, height=(m, f, n), width=width, align='center')
        plt.ylabel('The Focus Number')
        plt.title('Sex Analysis (effective samples: %d)' % (m + f + n))
        plt.xticks(ind, ("Male", "Female", "Unknown"))
        self.autolabel(rects1)
        plt.legend((rects1,), ("User: %s" % res["screen_name"],))
        plt.show()

if __name__ == '__main__':
    usrID = raw_input('Sina Weibo username: ')
    usrPW = raw_input('Sina Weibo password: ')
    AppClient = iWInsightor(usrID, usrPW)
    userid = AppClient.getCurrentUid()
    infile = "E://data/weibo.dat"  # where the fetched statuses are saved
    AppClient.getWeibo(userid, infile)
    #Focus = AppClient.getFocus(userid)
    #m = f = n = 0
    #for i in Focus:
    #    if i[1] == "m":
    #        m += 1
    #    elif i[1] == "f":
    #        f += 1
    #    else:
    #        n += 1
    #AppClient.getSexplot(userid, m, f, n)
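The get_code method above replays Weibo's login form POST so that no manual authorization is needed on each run, but the form fields it submits are whatever the login page used at the time and have changed more than once. If it breaks, a fallback is the SDK's documented two-step OAuth2 flow done by hand. This is a minimal sketch, assuming the same sinaweibopy APIClient interface used in the class above:

# -*- coding: UTF-8 -*-
# Manual OAuth2 flow -- a sketch, assuming the standard sinaweibopy client.
from weibo import APIClient
import webbrowser

APP_KEY = 'XXXXXXX'    # yours
APP_SECRET = 'XXXXXX'  # yours
CALLBACK_URL = 'https://api.weibo.com/oauth2/default.html'

client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET,
                   redirect_uri=CALLBACK_URL)
webbrowser.open(client.get_authorize_url())  # log in and authorize in the browser
code = raw_input('Paste the "code" parameter from the redirect URL: ')
r = client.request_access_token(code)        # exchange the code for a token
client.set_access_token(r.access_token, r.expires_in)
print client.account.get_uid.get()['uid']    # the client is now authorized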
2. Text Filtering and Word Segmentation

Weibo posts are full of material that contributes nothing to a word-frequency count: English letters and digits, Chinese punctuation, and assorted decorative symbols. These have to be filtered out before segmentation, and you can add any other symbols or words you want removed.

Compared with English, Chinese sentences have one interesting property: English puts spaces between words and Chinese does not. Word segmentation is therefore a basic step in Chinese text processing. I use jieba (结巴分词), which lets you load a user-defined dictionary (useful because the stock dictionary is bound to miss some words); it is available at https://github.com/fxsjy/jieba.

# -*- coding: UTF-8 -*-
import string
import jieba

extra_dict = 'F://NLP/iWInsightor/jieba/mydict.dict'  # user-defined dictionary
jieba.load_userdict(extra_dict)

def filter_str(instr):
    # characters to strip: ASCII punctuation, spaces, digits and letters,
    # plus common Chinese punctuation marks
    deEstr = string.punctuation + ' ' + string.digits + string.letters
    deCstr = u',。《》【】()!?★”“、:…'
    destr = deEstr + deCstr
    outstr = ''
    for char in instr.decode('utf-8'):
        if char not in destr:
            outstr += char
    return outstr

fp_in = open('F://NLP/iWInsightor/weibo.dat', 'rb')         # raw statuses
fp_out = open('F://NLP/iWInsightor/weibo_filter.dat', 'a')  # filtered, segmented text
for line in fp_in:
    str_delete = filter_str(line)
    seg_list = jieba.cut(str_delete, cut_all=True)          # full mode
    str_join = ' '.join(seg_list)
    fp_out.write(str_join.encode('utf-8'))
fp_in.close()
fp_out.close()
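Before trusting the counts, it helps to see what jieba's two modes actually produce. The snippet below runs the sample sentence from jieba's own README through full mode (cut_all=True, used above, which emits every dictionary word it finds, overlapping ones included) and accurate mode (cut_all=False, which commits to a single segmentation). If full mode inflates your frequencies with fragments, accurate mode is the first thing to try:

# -*- coding: UTF-8 -*-
# Full vs. accurate mode -- the sample sentence is the one from jieba's README.
import jieba

sent = u'我来到北京清华大学'
print 'Full mode:     ' + '/ '.join(jieba.cut(sent, cut_all=True))
print 'Accurate mode: ' + '/ '.join(jieba.cut(sent, cut_all=False))
# Full mode:     我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
# Accurate mode: 我/ 来到/ 北京/ 清华大学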
3. Word-Frequency Counting

Word-frequency counting just means tallying how many times each word occurs in a text, which is easy to do with a Python dict. I use matplotlib to draw a bar chart of the top-K high-frequency words. One thing to watch out for is the display of Chinese characters in the figure: matplotlib's settings have to be adjusted before plotting. I won't go into detail here (Google has the full story), but a minimal sketch of one common fix follows the code below.

# -*- coding: UTF-8 -*-
import numpy
import pylab

def getstr(word, count):
    return word + ',' + str(count)

# collect every segmented word longer than one byte from the filtered file
def get_wordlist(infile):
    wordlist = []
    for line in open(infile).readlines():
        if len(line) > 1:
            for word in line.split(' '):
                if len(word) > 1:
                    wordlist.append(word)
    return wordlist

# count occurrences with a dict and write 'word,count' lines, most frequent first
def get_wordcount(wordlist, outfile):
    out = open(outfile, 'w')
    wordcnt = {}
    for i in wordlist:
        if i in wordcnt:
            wordcnt[i] += 1
        else:
            wordcnt[i] = 1
    worddict = wordcnt.items()
    worddict.sort(key=lambda a: -a[1])
    for word, cnt in worddict:
        # the filtered file holds UTF-8 bytes; re-encode to GBK for the result file
        out.write(getstr(word.decode('utf-8').encode('gbk'), cnt) + '\n')
    out.close()
    return wordcnt

# bar chart of words occurring more than 5 times; len(key) > 3 keeps words of
# at least two Chinese characters (each is 3 bytes in UTF-8)
def barGraph(wcDict):
    wordlist = []
    for key, val in wcDict.items():
        if val > 5 and len(key) > 3:
            wordlist.append((key.decode('utf-8'), val))
    wordlist.sort()
    keylist = [key for key, val in wordlist]
    vallist = [val for key, val in wordlist]
    barwidth = 0.5
    xVal = numpy.arange(len(keylist))
    pylab.xticks(xVal + barwidth/2.0, keylist, rotation=45)
    pylab.bar(xVal, vallist, width=barwidth, color='y')
    pylab.title(u'微博词频分析图')
    pylab.show()

if __name__ == '__main__':
    myfile = 'F://NLP/iWInsightor/weibo_filter.dat'
    outfile = 'F://NLP/iWInsightor/result.dat'
    wordlist = get_wordlist(myfile)
    wordcnt = get_wordcount(wordlist, outfile)
    barGraph(wordcnt)
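A side note on the counting loop in get_wordcount: Python 2.7's standard library ships collections.Counter, which does the same tally in one call and answers the top-K query directly with most_common. A minimal sketch, assuming wordlist is the list returned by get_wordlist above:

# -*- coding: UTF-8 -*-
# Counting with the standard library -- a sketch; 'wordlist' is assumed to be
# the output of get_wordlist above.
from collections import Counter

wordcnt = Counter(wordlist)
for word, cnt in wordcnt.most_common(20):  # top-20 words, most frequent first
    print '%s,%d' % (word, cnt)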
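As for the Chinese-display problem mentioned above: by default matplotlib picks a font with no CJK glyphs, so a title like u'微博词频分析图' comes out as empty boxes. A minimal sketch of one common fix, assuming a CJK font such as SimHei is installed (substitute whatever Chinese font your system actually has), executed before any plotting calls:

# -*- coding: UTF-8 -*-
# Let matplotlib render Chinese labels -- a sketch; 'SimHei' is an assumption,
# use any installed CJK font name.
import matplotlib
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # try a CJK-capable font first
matplotlib.rcParams['axes.unicode_minus'] = False    # keep minus signs rendering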
At this point the job is done. Below is the bar chart produced from my own Weibo word frequencies. All of this was done in spare moments, and it still has plenty of rough edges.
