Step 6: Machine Learning
Decision Trees
>>Introduction to Decision Trees
When building a decision tree, first convert categorical features to numeric codes:
Use pandas Categorical via the Categorical.from_array method (deprecated in newer pandas, where pandas.Categorical(...) does the same thing).
numpy.bincount counts how often each value occurs in an array of non-negative integers, similar to pandas' value_counts().
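A quick comparison of the two (a minimal sketch with made-up codes):

```python
import numpy as np
import pandas as pd

codes = np.array([0, 1, 1, 2, 2, 2])
print(np.bincount(codes))               # [1 2 3]: counts for the values 0, 1, 2
print(pd.Series(codes).value_counts())  # same counts, sorted by frequency
```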
```python
# Convert a single column from text categories to numbers
# (Categorical.from_array is deprecated in newer pandas; pandas.Categorical(...) is equivalent)
col = pandas.Categorical.from_array(income["workclass"])
income["workclass"] = col.codes

cats = ['education', 'marital_status', 'occupation', 'relationship',
        'race', 'sex', 'native_country', 'high_income']
for cat in cats:
    col = pandas.Categorical.from_array(income[cat])
    income[cat] = col.codes

# ID3
# Create a dictionary to hold the tree
tree = {}
nodes = []

def id3(data, target, columns, tree):
    unique_targets = pandas.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    # Pure node: every row shares one label, so store it as a leaf
    if len(unique_targets) == 1:
        if unique_targets[0] == 1:
            tree['label'] = 1
        else:
            tree['label'] = 0
        return

    # find_best_column (defined in an earlier step) returns the column
    # whose median split gives the highest information gain
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()

    tree['column'] = best_column
    tree['median'] = column_median

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]

    # Recurse into each branch, storing the subtree in the parent dict
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])

# Call the function on our data to set the counters properly
id3(data, "high_income", ["age", "marital_status"], tree)

# Pretty-print the ID3 tree
def print_with_depth(string, depth):
    prefix = "    " * depth
    print("{0}{1}".format(prefix, string))

def print_node(tree, depth):
    if "label" in tree:
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)
        return
    print_with_depth("{0} > {1}".format(tree["column"], tree["median"]), depth)
    branches = [tree["left"], tree["right"]]
    for branch in branches:
        print_node(branch, depth + 1)

print_node(tree, 0)

# Predict a single row with the decision tree
def predict(tree, row):
    if "label" in tree:
        return tree["label"]
    column = tree["column"]
    median = tree["median"]
    if row[column] <= median:
        return predict(tree["left"], row)
    else:
        return predict(tree["right"], row)

print(predict(tree, data.iloc[0]))

# Use a lambda to predict over an entire DataFrame
def batch_predict(tree, df):
    return df.apply(lambda x: predict(tree, x), axis=1)

predictions = batch_predict(tree, new_data)
```

>>Applying Decision Trees
Some parameters that control overfitting in a decision tree: max_depth, min_samples_split, min_samples_leaf, and max_leaf_nodes (the code below tunes min_samples_split and max_depth).
Advantages and disadvantages of decision trees:
Advantages: easy to interpret, relatively fast to fit and make predictions, able to handle multiple types of data, able to pick up nonlinearities in the data, and usually fairly accurate.
Disadvantage: a tendency to overfit.
```python
# Train a decision tree with scikit-learn
from sklearn.tree import DecisionTreeClassifier

columns = ["age", "workclass", "education_num", "marital_status", "occupation",
           "relationship", "race", "sex", "hours_per_week", "native_country"]

# Set random_state to 1 to make sure the results are consistent
clf = DecisionTreeClassifier(random_state=1)
clf.fit(income[columns], income['high_income'])

# Split the data: 80% training set, 20% test set
import numpy
import math

numpy.random.seed(1)
income = income.reindex(numpy.random.permutation(income.index))
train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# Compute the AUC on the test set
from sklearn.metrics import roc_auc_score

clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
error = roc_auc_score(test['high_income'], predictions)

# Constrain the tree to prevent overfitting
clf = DecisionTreeClassifier(random_state=1, min_samples_split=13, max_depth=7)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)
# Compare the train AUC and the test AUC to see how well the model fits,
# then adjust the parameters accordingly
```

>>Random Forest
Blending models that differ substantially from each other tends to give the biggest gains, e.g. combining a decision tree with logistic regression.
If the two models' predictive power (AUC) differs a lot, a plain blend may not improve much; weight the two models instead, as in the sketch below.
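A minimal sketch of such a weighted blend, reusing the train/test split and columns from above; the 0.7/0.3 weights are made up and would normally be tuned on a validation set:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

# Fit both models on the same training data
tree_clf = DecisionTreeClassifier(random_state=1, min_samples_split=13, max_depth=7)
tree_clf.fit(train[columns], train["high_income"])
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(train[columns], train["high_income"])

# Blend the positive-class probabilities with hand-picked weights
blended = (0.7 * tree_clf.predict_proba(test[columns])[:, 1]
           + 0.3 * lr_clf.predict_proba(test[columns])[:, 1])
print(roc_auc_score(test["high_income"], blended))
```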
Where random forests come from: to make the individual trees in the ensemble differ from one another, introduce variation with bagging.
numpy.random.choice() randomly selects items from a list; numpy.random.choice(columns, 2) picks two items from columns.
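A quick illustration (column names borrowed from the income data):

```python
import numpy

numpy.random.seed(1)
columns = ["age", "workclass", "education_num", "marital_status"]
# choice samples WITH replacement by default; pass replace=False for distinct picks
print(numpy.random.choice(columns, 2))
print(numpy.random.choice(columns, 2, replace=False))
```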
Explanation of random forests: documentation
Key random forest parameters: n_estimators (number of trees), max_features, min_samples_leaf, and random_state (see the scikit-learn code below).
Compared with a single decision tree, a random forest is much better at reducing overfitting.
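A minimal sketch of how to check this, comparing train and test AUC for an unconstrained tree against a forest (n_estimators=150 is an arbitrary choice; reuses train/test/columns from above):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

for model in [DecisionTreeClassifier(random_state=1),
              RandomForestClassifier(n_estimators=150, random_state=1)]:
    model.fit(train[columns], train["high_income"])
    train_auc = roc_auc_score(train["high_income"], model.predict(train[columns]))
    test_auc = roc_auc_score(test["high_income"], model.predict(test[columns]))
    # A large train/test gap signals overfitting; the forest's gap is usually smaller
    print(type(model).__name__, train_auc, test_auc)
```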
Advantages and disadvantages of random forests:
Advantages: highly accurate and far less prone to overfitting than a single tree.
Disadvantages: harder to interpret, and slower to train and predict.
```python
# Build a random forest by hand: bagging plus random splits
tree_count = 10
bag_proportion = .6

predictions = []
for i in range(tree_count):
    # Sample 60% of the training rows, with replacement
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    # Note: max_features='auto' was removed in scikit-learn 1.3; use 'sqrt' there
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2,
                                 splitter='random', max_features='auto')
    clf.fit(bag[columns], bag["high_income"])
    predictions.append(clf.predict_proba(test[columns])[:, 1])

# Average the predicted probabilities across the trees, then round to 0/1
combined = numpy.sum(predictions, axis=0) / 10
rounded = numpy.round(combined)
print(roc_auc_score(test["high_income"], rounded))

# Build a random forest with scikit-learn
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=5, random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train['high_income'])
pre = clf.predict(test[columns])
auc = roc_auc_score(test['high_income'], pre)
```
Note: decision trees and random forests can do both classification and regression (RandomForestClassifier / RandomForestRegressor).
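For the regression case, a minimal sketch with RandomForestRegressor on made-up data:

```python
import numpy
from sklearn.ensemble import RandomForestRegressor

numpy.random.seed(1)
X = numpy.random.rand(200, 3)       # made-up features
y = 2 * X[:, 0] + X[:, 1] ** 2      # made-up continuous target

reg = RandomForestRegressor(n_estimators=10, random_state=1)
reg.fit(X, y)
print(reg.predict(X[:5]))           # continuous predictions, not class labels
```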
Machine Learning Project
Dataset: Lending Club
>>Data Cleaning
To replace certain values in a dataset with others, use replace; for a DataFrame you generally pass a nested dictionary (outer key = column, inner dict = old value to new value).
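For example (made-up values, mirroring the label mapping used later):

```python
import pandas as pd

df = pd.DataFrame({"loan_status": ["Fully Paid", "Charged Off", "Fully Paid"]})
# Outer key selects the column; the inner dict maps old values to new ones
df = df.replace({"loan_status": {"Fully Paid": 1, "Charged Off": 0}})
print(df)
```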
numpy.nan is the missing value. Series.unique() counts NaN as a value: unique() on [1, 2, NaN] returns three values, and NaN is not excluded.
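A quick demonstration:

```python
import numpy as np
import pandas as pd

s = pd.Series([1, 2, np.nan])
print(s.unique())           # [ 1.  2. nan]: three values, NaN is included
print(s.dropna().unique())  # [1. 2.]: drop NaN first to exclude it
```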
```python
# Preprocess the data
import pandas as pd

loans_2007 = pd.read_csv('LoanStats3a.csv', skiprows=1)  # skip the first row
half_count = len(loans_2007) // 2
# Drop any column where 50% or more of the values are missing
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007 = loans_2007.drop(['desc', 'url'], axis=1)
loans_2007.to_csv('loans_2007.csv', index=False)

loans_2007 = pd.read_csv("loans_2007.csv")
loans_2007 = loans_2007.drop_duplicates()
print(loans_2007.iloc[0])
print(loans_2007.shape[1])

# Drop columns that leak information from the future, are redundant,
# or would need other data before they could become useful features
drop_cols = ["id", "member_id", "funded_amnt", "funded_amnt_inv",
             "grade", "sub_grade", "emp_title", "issue_d"]
loans_2007 = loans_2007.drop(drop_cols, axis=1)
loans_2007 = loans_2007.drop(["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt",
                              "total_pymnt_inv", "total_rec_prncp"], axis=1)
loans_2007 = loans_2007.drop(["total_rec_int", "total_rec_late_fee", "recoveries",
                              "collection_recovery_fee", "last_pymnt_d",
                              "last_pymnt_amnt"], axis=1)

# Build the label: keep only the unambiguous rows,
# then map the statuses to a positive and a negative class
loans_2007 = loans_2007[(loans_2007['loan_status'] == 'Fully Paid') |
                        (loans_2007['loan_status'] == 'Charged Off')]
re = {'loan_status': {'Fully Paid': 1, 'Charged Off': 0}}
loans_2007 = loans_2007.replace(re)

# Drop single-value columns: if a column holds only one
# distinct value after removing NaN, delete it
drop_columns = []
for col in loans_2007.columns.tolist():
    uni_value = loans_2007[col].dropna().unique()
    if len(uni_value) == 1:
        drop_columns.append(col)
loans_2007 = loans_2007.drop(drop_columns, axis=1)
```

>>Preparing the Features
Use select_dtypes to pick out the columns of a DataFrame with a particular dtype and build a new DataFrame from them.
get_dummies converts categorical variables into numeric dummy variables, similar to OneHotEncoder.
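For example (a made-up column in the style of the loans data):

```python
import pandas as pd

df = pd.DataFrame({"term": ["36 months", "60 months", "36 months"]})
# One indicator column per category: term_36 months, term_60 months
print(pd.get_dummies(df["term"], prefix="term"))
```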
```python
# Read in the data and count the missing (NaN) values in each column
import pandas as pd

loans = pd.read_csv('filtered_loans_2007.csv')
null_counts = loans.isnull().sum()
print(null_counts)
# Output:
# title                    10
# revol_util               50
# last_credit_pull_d        2
# pub_rec_bankruptcies    697

# Remove the column with many missing values, then drop the remaining rows with NaN
loans = loans.drop('pub_rec_bankruptcies', axis=1)
loans = loans.dropna(axis=0)
print(loans.dtypes.value_counts())
# Output:
# object     11
# float64    10
# int64       1

# Select and inspect the object-dtype columns
object_columns_df = loans.select_dtypes(include=['object'])
print(object_columns_df.iloc[0])

# Explore the categorical columns
cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
for c in cols:
    print(loans[c].value_counts())

# Map employment length to ordered integers
mapping_dict = {
    "emp_length": {
        "10+ years": 10, "9 years": 9, "8 years": 8, "7 years": 7,
        "6 years": 6, "5 years": 5, "4 years": 4, "3 years": 3,
        "2 years": 2, "1 year": 1, "< 1 year": 0, "n/a": 0
    }
}
loans = loans.drop(['last_credit_pull_d', 'addr_state', 'title', 'earliest_cr_line'], axis=1)
# Strip the trailing '%' and convert the percentage strings to floats
loans['int_rate'] = loans['int_rate'].str.rstrip('%').astype('float64')
loans['revol_util'] = loans['revol_util'].str.rstrip('%').astype('float64')
loans = loans.replace(mapping_dict)

# One-hot encode the remaining categorical columns
cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)
```

>>Making Predictions
To handle a heavily imbalanced class distribution (here there are about six times as many positive samples as negative ones), two options (see the sketch after this list):
1. Resample: downsample the positive class, or construct additional negative samples. Use oversampling and undersampling to ensure that the classifier gets input that has a balanced number of each class.
2. Assign different weights to the labels when fitting the model, e.g. by setting the class_weight parameter to balanced. This tells the classifier to penalize misclassifications of the less prevalent class more than the other class.
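A minimal sketch of both options (the 6:1 weight dict is hypothetical; real weights would be tuned):

```python
import pandas as pd
from sklearn.linear_model import LogisticRegression

loans = pd.read_csv('cleaned_loans_2007.csv')

# Option 1: undersample the majority (positive) class so the labels balance out
positives = loans[loans['loan_status'] == 1]
negatives = loans[loans['loan_status'] == 0]
balanced = pd.concat([positives.sample(len(negatives), random_state=1), negatives])
print(balanced['loan_status'].value_counts())

# Option 2: keep every row but weight errors on the rare class more heavily;
# class_weight also accepts an explicit dict, not just 'balanced'
lr = LogisticRegression(class_weight={0: 6, 1: 1}, max_iter=1000)
lr.fit(loans.drop('loan_status', axis=1), loans['loan_status'])
```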
```python
import pandas as pd
import numpy

loans = pd.read_csv('cleaned_loans_2007.csv')
print(loans.info())

# Compute TP, TN, FP, FN
# (predictions here is assumed to come from an earlier model)
tn = sum((predictions == 0) & (loans['loan_status'] == 0))
tp = sum((predictions == 1) & (loans['loan_status'] == 1))
fn = sum((predictions == 0) & (loans['loan_status'] == 1))
fp = sum((predictions == 1) & (loans['loan_status'] == 0))

# Choose the evaluation metric based on the class distribution and the use case.
# Baseline: predict that all loans will be paid off on time.
predictions = pd.Series(numpy.ones(loans.shape[0]))
fp = sum((predictions == 1) & (loans['loan_status'] == 0))
tp = sum((predictions == 1) & (loans['loan_status'] == 1))
fn = sum((predictions == 0) & (loans['loan_status'] == 1))
tn = sum((predictions == 0) & (loans['loan_status'] == 0))
fpr = fp / (fp + tn)
tpr = tp / (tp + fn)
print(fpr, tpr)

# Fit a logistic regression
from sklearn.linear_model import LogisticRegression

features = loans.drop('loan_status', axis=1)
target = loans['loan_status']
lr = LogisticRegression()
lr.fit(features, target)
predictions = lr.predict(features)

# Cross-validation
# (the original sklearn.cross_validation module was removed in scikit-learn 0.20;
# sklearn.model_selection is its replacement, and KFold now takes n_splits)
from sklearn.model_selection import cross_val_predict, KFold

lr = LogisticRegression()
kf = KFold(n_splits=3, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
tp = sum((predictions == 1) & (target == 1))
tn = sum((predictions == 0) & (target == 0))
fp = sum((predictions == 1) & (target == 0))
fn = sum((predictions == 0) & (target == 1))
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(fpr, tpr)

# Counter the class imbalance: penalize errors on the rare negative class more
lr = LogisticRegression(class_weight='balanced')
kf = KFold(n_splits=3, shuffle=True, random_state=1)
predictions = cross_val_predict(lr, features, target, cv=kf)
tp = sum((predictions == 1) & (target == 1))
tn = sum((predictions == 0) & (target == 0))
fp = sum((predictions == 1) & (target == 0))
fn = sum((predictions == 0) & (target == 1))
tpr = tp / (tp + fn)
fpr = fp / (fp + tn)
print(fpr, tpr)
```