R 爬虫入门

xiaoxiao2021-02-28  94

library(xml2)
library(rvest)

# Scrape job postings from a zhipin.com search for "数据分析" ("data
# analysis") and collect them in the data frame `total`, one row per posting.
#
# Result columns: name, salary, city, experience, degree (all character).
# Pages that cannot be fetched or parsed are skipped with a warning, so
# `total` is always created, possibly with zero rows.

# The search URL is assembled as <site1><page><site2><page>.
site1 <- "https://www.zhipin.com/c101280600/h_101280600/?query=数据分析&page="
site2 <- "&ka=page-"
n_pages <- 30L

# Empty result with the target schema; also the fallback when nothing could
# be scraped at all.
empty_total <- data.frame(
  name = character(),
  salary = character(),
  city = character(),
  experience = character(),
  degree = character(),
  stringsAsFactors = FALSE
)
job_columns <- names(empty_total)

# Convert one parsed result page into a data frame with `job_columns`.
#
# @param html A document as returned by `read_html()`.
# @return A data frame with one row per posting, or NULL when the page has no
#   postings or its markup does not yield exactly one salary and three detail
#   fields per posting.
parse_job_page <- function(html) {
  # Taking html_text() of the <h3> and splitting on spaces does not work,
  # because some job titles contain spaces themselves. Work on the raw markup
  # instead: drop the opening <h3> tag, then everything from the salary <span>
  # up to the closing </h3>.
  job <- as.character(html_nodes(html, "div.info-primary>h3.name"))
  job <- gsub("<h3 class=\"name\">", "", job)
  job_name <- trimws(gsub("<(span.*?)(class.*?)>(.*?)</h3>", " ", job))

  salary <- html_text(
    html_nodes(html, "div.info-primary>h3.name>span.red"),
    trim = TRUE
  )

  # The detail paragraph separates its fields with empty <em> tags; turn
  # those into spaces, strip the <p> wrapper and split on spaces.
  job_msg <- as.character(html_nodes(html, "div.info-primary>p"))
  job_msg <- gsub("<(em.*?)(class.*?)></em>", " ", job_msg)
  job_msg <- gsub("<(.?p)>", "", job_msg)
  fields <- strsplit(job_msg, " ")

  # Size everything by the number of postings actually found instead of
  # assuming a fixed count per page, and refuse to build a mis-shaped table.
  n_jobs <- length(job_name)
  n_detail_cols <- length(job_columns) - 2L
  well_formed <- n_jobs > 0 &&
    length(salary) == n_jobs &&
    length(fields) == n_jobs &&
    all(lengths(fields) == n_detail_cols)
  if (!well_formed) {
    return(NULL)
  }

  details <- matrix(unlist(fields), nrow = n_jobs, byrow = TRUE)
  page_df <- data.frame(job_name, salary, details, stringsAsFactors = FALSE)
  names(page_df) <- job_columns
  page_df
}

# Collect per-page results in a preallocated list and bind once at the end,
# rather than growing a data frame inside the loop.
pages <- vector("list", n_pages)
for (page in seq_len(n_pages)) {
  site <- paste0(site1, page, site2, page)

  html <- tryCatch(
    read_html(site),
    error = function(e) {
      warning(
        "Failed to fetch page ", page, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(html)) {
    next
  }

  page_df <- parse_job_page(html)
  if (is.null(page_df)) {
    warning(
      "Skipping page ", page, ": no postings or unexpected markup",
      call. = FALSE
    )
    next
  }
  pages[[page]] <- page_df
}

# Skipped pages are left as NULL entries; drop them before binding.
pages <- Filter(Negate(is.null), pages)
total <- if (length(pages) > 0) do.call(rbind, pages) else empty_total
转载请注明原文地址: https://www.6miu.com/read-66306.html

最新回复(0)