结合词频统计的功能,联想到它可以应用于企业审批流程回退意见的词频分析,这也是企业流程绩效分析的扩展之一。
技术路线:jieba分词,wordcloud绘制特定形状词云
# bpmRejectAnalyzeV1.py
"""Word-frequency analysis of approval-workflow rejection comments.

Reads rejection comments from a text file, segments them with jieba, keeps
the nouns that are not stop words, prints the most frequent ones and draws
a word cloud shaped by a mask image.
"""
from collections import Counter
from os import path

import jieba
import jieba.posseg as pseg
import matplotlib.pyplot as plt
from wordcloud import WordCloud

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread was removed in SciPy 1.2; _load_mask() falls back
    # to matplotlib's reader when it is unavailable.
    imread = None


def getTxt(txt):
    """Return the lines of the UTF-8 text file *txt* as a list of strings.

    Line terminators are kept, exactly as ``file.readlines()`` returns them.
    """
    with open(txt, 'r', encoding='utf-8') as f:
        reject_list = f.readlines()
    return reject_list


def segmentWords(txtlist):
    """Segment each comment and return the nouns that are not stop words.

    Args:
        txtlist: iterable of comment strings; whitespace-only entries are
            skipped.

    Returns:
        A flat list of words, in order of appearance, whose jieba
        part-of-speech flag is exactly ``'n'`` and which are not listed in
        ``stopwords.txt`` (one stop word per line, read from the current
        working directory).
    """
    # Use a context manager so the stop-word file is closed deterministically.
    with open('stopwords.txt', encoding='utf-8') as stop_file:
        stop_words = {line.strip() for line in stop_file}

    newslist = []
    for subject in txtlist:
        if subject.isspace():
            continue
        for word, flag in pseg.cut(subject):
            if word not in stop_words and flag == 'n':
                newslist.append(word)
    return newslist


def _load_mask(image_path):
    """Read *image_path* into an array suitable for use as a WordCloud mask."""
    if imread is not None:
        return imread(image_path)
    image = plt.imread(image_path)
    if image.dtype.kind == 'f':
        # matplotlib returns PNG pixels as floats in [0, 1]; rescale to the
        # 0-255 unsigned-byte range that the mask is expected to use.
        image = (image * 255).round().astype('uint8')
    return image


def drawPlant(newslist):
    """Draw a word cloud of *newslist*, save it and show it on screen.

    The cloud is shaped by ``mickey.png`` located next to this script,
    limited to 40 words, written to ``wordcloud.jpg`` in the current working
    directory and then displayed with matplotlib.
    """
    d = path.dirname(__file__)
    mask_image = _load_mask(path.join(d, "mickey.png"))
    # WordCloud.generate() takes one string; it is given the words joined
    # by single spaces.
    content = ' '.join(newslist)
    wordcloud = WordCloud(font_path='simhei.ttf',
                          background_color="white",
                          mask=mask_image,
                          max_words=40).generate(content)
    # Display the generated image:
    plt.imshow(wordcloud)
    plt.axis("off")
    wordcloud.to_file('wordcloud.jpg')
    plt.show()


def countWords(newslist, top_n=100):
    """Print the *top_n* most frequent words of *newslist* as ``word:count``.

    Words are printed in descending order of frequency; words with equal
    counts keep their first-seen order. If there are fewer than *top_n*
    distinct words, all of them are printed.
    """
    # most_common() simply returns fewer items when the vocabulary is small,
    # so short inputs no longer raise IndexError.
    for word, count in Counter(newslist).most_common(top_n):
        print("{}:{}".format(word, count))


def main():
    """Run the analysis on ``bpmreject.txt``: count words, then draw the cloud."""
    txtlist = getTxt('bpmreject.txt')
    wordlist = segmentWords(txtlist)
    countWords(wordlist)
    drawPlant(wordlist)


if __name__ == "__main__":
    main()
