数据预处理demo

xiaoxiao2021-02-28  109

R笔记:

# Data preprocessing demo: read a loosely formatted text file, split every
# record into fields, standardise the rows and coerce the columns to their
# proper types. Bare names on a line of their own (txt, I, dat, ...) are kept
# on purpose: they print the intermediate result when the script is run.

# step(1): Reading data ----
# readLines() is used because the rows in the data file are not uniformly
# formatted.
txt <- readLines("Data_Hw2.txt")
txt

# step(2): Selecting lines containing data ----
# Lines that start with "//" are comment lines; everything else is kept.
I <- grepl("^//", txt)
I
dat <- txt[!I]
dat

# step(3): Split lines into separate fields ----
(fieldList <- strsplit(dat, split = ";"))
str(fieldList)

# step(4): Standardize rows ----

# Standardise the fields of a single record.
#
# x: character vector holding the fields of one record (one element of the
#    list returned by strsplit()).
#
# Returns a character vector of length 3:
#   [1] the first field that contains a letter, NA if there is none
#   [2] the second field converted to a number and rounded, NA if it is not
#       numeric (as.numeric() then also emits a coercion warning)
#   [3] the third field when the record has exactly three fields, NA otherwise
# Records with fewer than two fields give three NAs.
assignFields <- function(x) {
  out <- rep(NA_character_, 3)
  n_fields <- length(x)
  if (n_fields > 1) {
    # Use the first field containing a letter as the first column. Taking
    # only the first match avoids a length mismatch when several fields
    # contain letters, and the guard avoids a zero-length replacement error
    # when none does.
    alpha_idx <- which(grepl("[[:alpha:]]", x))
    if (length(alpha_idx) > 0) {
      out[1] <- x[alpha_idx[1]]
    }
    out[2] <- round(as.numeric(x[2]))
    # The third column is only filled from a complete three-field record.
    if (n_fields == 3) {
      out[3] <- x[3]
    }
  }
  out
}

# lapply() applies the function to every element of the list.
standardFields <- lapply(fieldList, assignFields)
standardFields

# step(5): Transform the list into a data.frame ----
# unlist() produces one vector with all atomic components; it is copied into
# a matrix row by row, and the matrix is then coerced into a data.frame.
# as.character() keeps this working when there are no records at all.
M <- matrix(as.character(unlist(standardFields)), ncol = 3, byrow = TRUE)
colnames(M) <- c("Gender", "Age", "weight")
M
# Drop records that carry no data at all (e.g. blank lines in the file)
# instead of keeping a hard-coded number of rows; drop = FALSE keeps M a
# matrix even when a single row remains.
M <- M[rowSums(!is.na(M)) > 0, , drop = FALSE]
M
# stringsAsFactors = FALSE prevents R from turning the columns into factors.
deltons <- as.data.frame(M, stringsAsFactors = FALSE)
deltons

# step(6): Normalize and coerce to correct types ----
str(deltons)
# Gender values starting with "m" (any case) become "man", all other
# non-missing values become "woman"; a missing gender stays NA.
J <- grepl("^m", deltons$Gender, ignore.case = TRUE)
J
gender <- rep("woman", length(J))
gender[J] <- "man"
gender[is.na(deltons$Gender)] <- NA_character_
deltons$Gender <- gender
# Accept a decimal comma in the weight before converting it to a number.
deltons$weight <- gsub(",", ".", deltons$weight)
deltons$Age <- as.integer(deltons$Age)
deltons$weight <- as.numeric(deltons$weight)
deltons
str(deltons)
转载请注明原文地址: https://www.6miu.com/read-71498.html

最新回复(0)