招聘数据分析<三>

xiaoxiao2021-02-28  58

词云分析:

## Job-posting analysis, part 3: industry breakdown and word segmentation.
##
## Packages used:
##   readxl     - read the Excel workbook
##   ggplot2    - bar charts
##   jiebaR     - Chinese word segmentation
##   wordcloud2 - word clouds
library(readxl)
library(ggplot2)
library(jiebaR)
library(wordcloud2)

options(scipen = 200)  # avoid scientific notation when printing numbers

jobinfo <- read_excel("jobinfo.xlsx")  # raw job-posting data
str(jobinfo)                           # inspect the structure

## Convert the minimum / maximum salary columns from character to numeric.
jobinfo$最低薪资 <- as.numeric(jobinfo$最低薪资)
jobinfo$最高薪资 <- as.numeric(jobinfo$最高薪资)

## Colours for the plots below.
col1 <- rgb(32, 40, 51, maxColorValue = 255)   # a near-black
col2 <- rgb(172, 22, 34, maxColorValue = 255)  # a dark red
col3 <- "indianred "                           # a light red (literal kept as written)
col4 <- "dimgrey"                              # a grey

## ---- Industry category analysis -------------------------------------------

## Frequency of each industry category as a data.frame with columns
## `hangye` (category) and `Freq` (count). The intermediate variable must be
## the bare symbol `hangye` so that the resulting column gets that name.
hangye <- jobinfo$行业类别
hangye <- as.data.frame(ftable(hangye))

## View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(hangye)
}

## The five most frequent industries are shown in a bar chart; the long tail
## of remaining categories is shown as a word cloud instead.
hangye <- hangye[order(-hangye$Freq), ]
hangye$hangye <- as.character(hangye$hangye)

## Shorten the label of the 4th most frequent category to "快速消费品".
## NOTE(review): the row position is hard-coded; the original comment says this
## row is the fast-moving-consumer-goods category with a parenthesised suffix.
## Confirm this still holds for the current data set.
if (nrow(hangye) >= 4) {
  hangye$hangye[4] <- "快速消费品"
}

## `hangye` is already sorted by descending frequency; head() also avoids the
## all-NA padding rows that `[1:5, ]` produces when there are fewer than 5.
top5 <- head(hangye, 5)

## Column names double as the axis labels, so they stay in Chinese. Building
## the columns under the names used in aes() means the plot reads them from
## the data frame rather than from same-named global variables.
industry_df <- data.frame(
  行业 = factor(top5$hangye, levels = top5$hangye),
  频数 = top5$Freq,
  check.names = FALSE
)
ggplot(industry_df, aes(x = 行业, y = 频数)) +
  geom_bar(stat = "identity", fill = col4)

## Word cloud of the "other" industries: zero out the top five and the
## placeholder category "无" so that they do not dominate the cloud.
excluded_industries <- c("无", top5$hangye)
hangye$Freq[hangye$hangye %in% excluded_industries] <- 0
wordcloud2(hangye, size = 0.2, shape = "diamond")

## ---- Job description segmentation -----------------------------------------

mixseg <- worker()  # segmentation engine with default settings

## Segment every job description. Missing descriptions are skipped. Collecting
## the per-posting results in a list and flattening once avoids re-copying the
## whole token vector on every iteration.
descriptions <- as.character(jobinfo$描述)
descriptions <- descriptions[!is.na(descriptions)]
token_list <- lapply(descriptions, function(txt) mixseg[txt])

## Show the segmentation of the first description as a sanity check.
if (length(token_list) > 0) {
  print(token_list[[1]])
}

## All tokens from all descriptions, as one character vector.
fenci <- as.character(unlist(token_list, use.names = FALSE))

## ---- Normalise letter case and merge synonyms ------------------------------
## Map spelling / case variants and near-synonyms onto one canonical token.
## Names are the tokens to replace, values are their replacements. No
## replacement value is itself a name, so a single lookup pass gives the same
## result as applying the replacements one after another.
synonyms <- c(
  "excel" = "Excel", "EXCEL" = "Excel",
  "r" = "R",
  "spss" = "SPSS", "Spss" = "SPSS",
  "python" = "Python",
  "Matlab" = "MATLAB", "matlab" = "MATLAB",
  "java" = "Java",
  "Sql" = "SQL", "sql" = "SQL",
  "sas" = "SAS",
  "WORD" = "Word", "word" = "Word",
  "ppt" = "PPT", "Ppt" = "PPT",
  "Office" = "office",
  "spark" = "Spark", "SPARK" = "Spark",
  "STATA" = "Stata", "stata" = "Stata",
  "HADOOP" = "hadoop", "Hadoop" = "hadoop",
  "Eviews" = "EViews", "EVIEWS" = "EViews", "eviews" = "EViews",
  "实时" = "按时", "及时" = "按时",
  "大学本科" = "本科",
  "挖掘" = "数据挖掘",
  "思维" = "逻辑思维", "逻辑" = "逻辑思维",
  "协作" = "合作", "合作伙伴" = "合作",
  "敬业" = "敬业精神",
  "执行" = "执行力",
  "编写" = "撰写",
  "专业本科" = "专业",
  "快" = "快速"
)
needs_replacement <- fenci %in% names(synonyms)
fenci[needs_replacement] <- unname(synonyms[fenci[needs_replacement]])

## ---- Remove stop-words ------------------------------------------------------
## The stop-word list is split across two text files (the original author did
## this because of its size), so filtering is done in two passes.

min_freq <- 500  # only tokens occurring more often than this are studied

thetable <- table(fenci)            # token frequencies
dftable <- as.data.frame(thetable)  # columns: fenci (token), Freq (count)
dftable <- dftable[dftable$Freq > min_freq, ]

## Read a stop-word file with the same read.table() settings for both lists
## and flatten it to a plain vector.
read_stopwords <- function(path) {
  unlist(
    read.table(path, stringsAsFactors = FALSE, fileEncoding = "GB2312"),
    use.names = FALSE
  )
}

## First stop-word list. Membership filtering drops each stop-word row exactly
## once, even if the list contains duplicated or missing entries.
stopwords <- read_stopwords("stopwords.txt")
dftable1 <- dftable[!(as.character(dftable$fenci) %in% stopwords), ]

## Second stop-word list.
newstopwords <- read_stopwords("newstopwords.txt")
dftable2 <- dftable1[!(as.character(dftable1$fenci) %in% newstopwords), ]

## ---- High-frequency keyword chart -------------------------------------------

## Tokens sorted by descending frequency; keep the top 11.
freq_sorted <- dftable2[order(-dftable2$Freq), ]
order11 <- head(freq_sorted, 11)
if (interactive()) {
  View(order11)
}

## The original author drops the most frequent token because it was "能力"
## (ability), a generic term that subsumes the other keywords.
## NOTE(review): this removes whichever token ranks first; confirm it is still
## "能力" for the current data.
order10 <- order11[-1, ]

## Bar chart of the remaining high-frequency keywords, in rank order. The
## column names match the names used in aes().
keywords <- as.character(order10$fenci)
keyword_df <- data.frame(
  分词 = factor(keywords, levels = keywords),
  频数 = order10$Freq,
  check.names = FALSE
)
ggplot(keyword_df, aes(x = 分词, y = 频数)) +
  geom_bar(stat = "identity", fill = col4) +
  labs(x = "描述关键词")

## ---- Word cloud without the high-frequency keywords -------------------------

## Zero the frequency of the top 11 tokens so the cloud shows the rest.
redftable2 <- dftable2
is_top_keyword <- as.character(redftable2$fenci) %in% as.character(order11$fenci)
redftable2$Freq[is_top_keyword] <- 0

wordcloud2(redftable2)
转载请注明原文地址: https://www.6miu.com/read-79347.html

最新回复(0)