招聘数据分析<二>

xiaoxiao2021-02-28  82

回归分析:

# ---------------------------------------------------------------------------
# Regression analysis of (log) average salary on job-posting attributes.
#
# Inputs : "temp.rda" in the working directory. It is the intermediate result
#          of the descriptive-statistics script and is expected to provide the
#          data frames `jobinfo` (raw categorical columns) and `jobinfo.new`
#          (response `对数平均薪资` plus predictor columns).
#          NOTE(review): the hard-coded coefficient positions below imply that
#          `jobinfo.new` arrives with the response in column 1 followed by 12
#          software-skill columns -- confirm against the script that writes
#          temp.rda.
# Outputs: `lm.fit`, the per-group coefficient data frames (`edu.coef`,
#          `software.coef`, `district.coef`, `scale.coef`, `type.coef`), five
#          bar charts, and two salary interval estimates (`income1`,
#          `income2`).
#
# Set your own working directory first and place temp.rda in it:
#   setwd(.....)
# ---------------------------------------------------------------------------

# ggplot2 draws all charts in this script.
library(ggplot2)

# Load the intermediate result of "descriptive statsics.R".
load("temp.rda")

# Plot colours ---------------------------------------------------------------
col1 <- rgb(32, 40, 51, maxColorValue = 255)   # a shade of black
col2 <- rgb(172, 22, 34, maxColorValue = 255)  # a dark red
# Trailing space kept exactly as in the original script.
# NOTE(review): confirm the colour-name lookup tolerates it on your R version.
col3 <- "indianred "                           # a light red
col4 <- "dimgrey"                              # a grey

# Helpers --------------------------------------------------------------------

# Append one 0/1 dummy column per entry of `categories` to `target`; the
# column is 1 where `source_col` equals that category and 0 otherwise.
# Columns are added in the order given, named after the category itself.
add_dummies <- function(target, source_col, categories) {
  for (category in categories) {
    target[[category]] <- ifelse(source_col == category, 1, 0)
  }
  target
}

# Convert p-values to significance stars: "***" (p <= 0.001), "**" (<= 0.01),
# "*" (<= 0.05), otherwise "". Missing p-values get "".
p_to_stars <- function(p) {
  stars <- rep("", length(p))
  known <- !is.na(p)
  stars[known & p <= 0.05] <- "*"
  stars[known & p <= 0.01] <- "**"
  stars[known & p <= 0.001] <- "***"
  stars
}

# Build the three-column data frame used by each coefficient bar chart.
#   fit          fitted lm object
#   stars        character vector parallel to coef(fit) ("estimate + stars")
#   idx          positions of the wanted coefficients in coef(fit)
#   group_name   name of the third (categorical) column
#   labels       category labels; defaults to the coefficient names
#   order_levels if TRUE, factor levels follow ascending coefficient size
#   order_rows   if TRUE, rows are also sorted by ascending coefficient size
build_coef_frame <- function(fit, stars, idx, group_name,
                             labels = names(coef(fit))[idx],
                             order_levels = FALSE, order_rows = FALSE) {
  estimates <- unname(coef(fit)[idx])
  level_order <- if (order_levels) labels[order(estimates)] else labels
  out <- data.frame(estimates, stars[idx], factor(labels, levels = level_order))
  colnames(out) <- c("系数", "显著性水平", group_name)
  row.names(out) <- NULL
  if (order_rows) {
    out <- out[order(out[["系数"]]), ]
  }
  out
}

# Dummy variables --------------------------------------------------------------
# Each categorical attribute of `jobinfo` becomes a set of 0/1 columns in
# `jobinfo.new` (e.g. column 北京 is 1 when the region is 北京, else 0).

# Region
jobinfo.new <- add_dummies(
  jobinfo.new, jobinfo$地区,
  c("北京", "河北", "上海", "深圳", "山西", "陕西")
)

# Company type
jobinfo.new <- add_dummies(
  jobinfo.new, jobinfo$公司类别,
  c("合资", "外资", "上市公司", "民营公司", "国企", "非营利机构", "创业公司", "事业单位")
)

# Company size
jobinfo.new <- add_dummies(
  jobinfo.new, jobinfo$公司规模,
  c("少于50人", "50-150人", "150-500人", "500-1000人",
    "1000-5000人", "5000-10000人", "10000人以上")
)

# Education
jobinfo.new <- add_dummies(
  jobinfo.new, jobinfo$学历,
  c("中专", "高中", "大专", "本科", "硕士", "博士")
)

# Required experience
jobinfo.new$经验要求 <- jobinfo$经验

# Model ------------------------------------------------------------------------
# Baselines: region = 河北, company type = 国企, company size = 少于50人.
# No education dummy is dropped, so its baseline is "none of the six levels".
lm.fit <- lm(对数平均薪资 ~ . - 河北 - 国企 - 少于50人, data = jobinfo.new)

# Inspect the regression result (explicit print so it also shows under source()).
fit_summary <- summary(lm.fit)
print(fit_summary)

# The coefficient positions used below (2:13 software, 14:18 region, 19:25
# company type, 26:31 company size, 32:37 education) assume exactly 38
# coefficients; fail loudly instead of silently plotting the wrong terms.
stopifnot(length(coef(lm.fit)) == 38L)

# Significance stars, derived from the fitted model rather than typed by hand.
# Indexing by coefficient name yields NA (-> "") for any term lm() dropped.
# `sign` keeps its original name even though it masks base::sign().
p_values <- fit_summary$coefficients[, "Pr(>|t|)"][names(coef(lm.fit))]
sign <- p_to_stars(p_values)
significance <- paste0(as.character(round(coef(lm.fit), 3)), sign)

# Education --------------------------------------------------------------------
edu.coef <- build_coef_frame(lm.fit, significance, 32:37, "学历")

# Bar chart of the education coefficients; the first three labels sit below
# their bars and the last three above.
print(
  ggplot(data = edu.coef, aes(x = 学历, y = 系数)) +
    geom_bar(fill = c(rep(col4, 5), col3), stat = "identity", width = 0.6) +
    annotate(
      "text", x = 1:6,
      y = c(edu.coef$系数[1:3] - 0.02, edu.coef$系数[4:6] + 0.02),
      label = edu.coef$显著性水平
    ) +
    labs(x = "学历要求")
)

# Software skills ----------------------------------------------------------------
# Levels and rows are both sorted by coefficient, so bar order, fill order and
# annotation order all agree.
software.coef <- build_coef_frame(
  lm.fit, significance, 2:13, "软件要求",
  order_levels = TRUE, order_rows = TRUE
)

# Bar chart of the software-skill coefficients (smallest and largest in red).
print(
  ggplot(data = software.coef, aes(x = 软件要求, y = 系数)) +
    geom_bar(fill = c(col3, rep(col4, 10), col3), stat = "identity", width = 0.6) +
    annotate(
      "text", x = 1:12,
      y = c(software.coef$系数[1:5] - 0.007, software.coef$系数[6:12] + 0.007),
      label = software.coef$显著性水平
    )
)

# Region -------------------------------------------------------------------------
# Levels are sorted by coefficient but rows are NOT (as in the original).
# NOTE(review): the fill vector below is supplied in row order while bars are
# drawn in level order, so the grey/red pattern may not land on the intended
# bars -- confirm whether the rows were meant to be sorted like the software
# and company-type frames.
district.coef <- build_coef_frame(
  lm.fit, significance, 14:18, "地区",
  order_levels = TRUE
)

# Bar chart of the region coefficients.
print(
  ggplot(data = district.coef, aes(x = 地区, y = 系数)) +
    geom_bar(fill = c(rep(col4, 2), rep(col3, 3)), stat = "identity", width = 0.6) +
    geom_text(label = district.coef$显著性水平, vjust = -0.4, size = 5)
)

# Company size ---------------------------------------------------------------------
# Labels are given explicitly instead of being taken from the coefficient names.
scale.coef <- build_coef_frame(
  lm.fit, significance, 26:31, "公司规模",
  labels = c("50-150人", "150-500人", "500-1000人",
             "1000-5000人", "5000-10000人", "10000人以上")
)

# Bar chart of the company-size coefficients.
print(
  ggplot(data = scale.coef, aes(x = 公司规模, y = 系数)) +
    geom_bar(fill = c(col4, col3, rep(col4, 4)), stat = "identity", width = 0.6) +
    geom_text(label = scale.coef$显著性水平, vjust = -0.4, size = 5)
)

# Company type ---------------------------------------------------------------------
type.coef <- build_coef_frame(
  lm.fit, significance, 19:25, "公司类别",
  order_levels = TRUE, order_rows = TRUE
)

# Bar chart of the company-type coefficients (largest in red).
print(
  ggplot(data = type.coef, aes(x = 公司类别, y = 系数)) +
    geom_bar(fill = c(rep(col4, 6), col3), stat = "identity", width = 0.6) +
    annotate(
      "text", x = 1:7,
      y = c(type.coef$系数[1:2] - 0.007, type.coef$系数[3:7] + 0.007),
      label = type.coef$显著性水平
    )
)

# Prediction -----------------------------------------------------------------------
# The profiles below are positional: 12 software flags, 6 region dummies,
# 8 company-type dummies, 7 company-size dummies, 6 education dummies and the
# experience value -- 40 predictors in the column order of `jobinfo.new`.
predictor_names <- names(jobinfo.new)[-1]
stopifnot(length(predictor_names) == 40L)

# Profile 1 ("career newcomer"): per the original author's description, knows
# R and Python, bachelor's degree, no experience, listed company of 87 people
# in Shanghai. Which software column is which depends on temp.rda.
profile1 <- c(
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,  # software-skill flags
  0, 0, 1, 0, 0, 0,                    # region: 上海
  0, 0, 1, 0, 0, 0, 0, 0,              # company type: 上市公司
  0, 1, 0, 0, 0, 0, 0,                 # company size: 50-150人
  0, 0, 0, 1, 0, 0,                    # education: 本科
  0                                    # required experience
)
new.data1 <- as.data.frame(matrix(profile1, 1, 40))
colnames(new.data1) <- predictor_names
interval1 <- predict(lm.fit, new.data1, interval = "confidence")  # interval estimate
income1 <- exp(interval1)  # back-transform log salary to salary
print(income1)

# Profile 2 ("high flyer"): per the original author's description, knows R,
# Java, SAS and Python, master's degree, 7 years of experience, start-up of
# 150-500 people in Beijing.
profile2 <- c(
  1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,  # software-skill flags
  1, 0, 0, 0, 0, 0,                    # region: 北京
  0, 0, 0, 0, 0, 0, 1, 0,              # company type: 创业公司
  0, 0, 1, 0, 0, 0, 0,                 # company size: 150-500人
  0, 0, 0, 0, 1, 0,                    # education: 硕士
  7                                    # required experience
)
new.data2 <- as.data.frame(matrix(profile2, 1, 40))
colnames(new.data2) <- predictor_names
interval2 <- predict(lm.fit, new.data2, interval = "confidence")  # interval estimate
income2 <- exp(interval2)  # back-transform log salary to salary
print(income2)
转载请注明原文地址: https://www.6miu.com/read-78113.html

最新回复(0)