A Binary Classification Example


Importing the Data

import numpy as np
from matplotlib import pyplot
from pandas import read_csv
from pandas.plotting import scatter_matrix
from pandas import set_option
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Load the dataset (the Sonar data has no header row)
filename = '/home/hadoop/下载/sonar.all-data.csv'
data = read_csv(filename, header=None)

Analyzing the Data

Descriptive Statistics

# Dataset dimensions, types, and a first look at the rows
print(data.shape)
print(data.dtypes)
print(data.head(20))

# Descriptive statistics (3-digit display precision)
set_option('display.precision', 3)
print(data.describe())

# Class distribution (column 60 holds the label)
print(data.groupby(60).size())

Data Visualization. Examining the data distribution through a variety of charts can provide inspiration for solving the problem.

# Histograms
data.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()

# Density plots
data.plot(kind='density', subplots=True, layout=(8, 8), sharex=False, legend=False, fontsize=1)
pyplot.show()

The density plots show that most attributes exhibit some degree of skew, so a Box-Cox transform might improve model accuracy. Box-Cox is a commonly used statistical transform for cases where a continuous response variable does not follow a normal distribution; it can, to some extent, reduce unobservable error and the correlation between predictor variables while bringing the data closer to a normal distribution.
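The post does not actually implement the transform, but as a minimal sketch, scikit-learn's PowerTransformer can apply this family of power transforms. Box-Cox requires strictly positive inputs, so if any attribute contains zeros, the Yeo-Johnson variant shown here is the safer choice:

from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson accepts zero and negative values, unlike Box-Cox
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X_power = pt.fit_transform(data.values[:, 0:60].astype(float))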

# Correlation matrix plot
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(), vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
pyplot.show()

Separating a Validation Dataset

# Hold out a validation set
array = data.values
X = array[:, 0:60].astype(float)
Y = array[:, 60]
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

Evaluating Algorithms

# Baseline settings for evaluating algorithms
num_folds = 10
seed = 7
scoring = 'accuracy'

Linear algorithms: logistic regression (LR) and linear discriminant analysis (LDA)
Nonlinear algorithms: classification and regression trees (CART), support vector machine (SVM), naive Bayes (NB), and k-nearest neighbors (KNN)

# Evaluate algorithms on the raw data
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['NB'] = GaussianNB()
models['SVM'] = SVC()

results = []
for key in models:
    # shuffle must be enabled for random_state to take effect
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Compare algorithms with a box plot
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()

We suspect that the uneven distributions of the raw attributes cause some algorithms to perform poorly, so the data is standardized and the algorithms are evaluated again. To keep the transform consistent and avoid leaking information from the validation folds during cross-validation, a Pipeline is used to streamline the processing.

# Evaluate algorithms on standardized data
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())])
pipelines['ScalerLDA'] = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])
pipelines['ScalerNB'] = Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVC())])

results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Box plot comparison (labeled with pipelines.keys(), matching the results being plotted)
fig = pyplot.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(pipelines.keys())
pyplot.show()

Algorithm Tuning. The evaluation shows that k-nearest neighbors (KNN) and the support vector machine (SVM) are worth optimizing further.

Tuning the K-Nearest Neighbors Algorithm

# Tuning: KNN
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]}
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

Tuning the Support Vector Machine. The SVM has two important parameters: C (the penalty coefficient) and kernel (the kernel function). The defaults are C=1.0 and kernel='rbf' (radial basis function). Both parameters are tuned below.

# Tuning: SVM
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train).astype(float)
param_grid = {}
param_grid['C'] = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
# 'precomputed' is not searched here: it expects a precomputed kernel (Gram) matrix, not raw features
param_grid['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
model = SVC()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

Ensemble Methods. Besides tuning, another way to improve accuracy is to use ensembles. Four algorithms are compared below to push accuracy further.
Bagging: random forest (RF) and extra trees (ET)
Boosting: AdaBoost (AB) and stochastic gradient boosting (GBM)

# Ensemble methods
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostClassifier())])
ensembles['ScaledGBM'] = Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingClassifier())])
ensembles['ScaledRF'] = Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestClassifier())])
ensembles['ScaledET'] = Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesClassifier())])

results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
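For parity with the earlier comparisons, the ensemble results can be visualized with the same kind of box plot:

# Compare ensemble methods with a box plot
fig = pyplot.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(ensembles.keys())
pyplot.show()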

Finalizing the Model. The evaluations above show that the support vector machine (SVM) achieves the best accuracy, so it is adopted as the final model. The model is trained on the training data and assessed against the held-out validation set. Because SVM performed best on standardized data during evaluation, the training data is standardized first, and the identical transform is applied to the validation set.

# Finalize the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = SVC(C=1.5, kernel='rbf')
model.fit(X=rescaledX, y=Y_train)

# Evaluate on the validation set (apply the same scaler fitted on the training data)
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

PS: The higher a norm's exponent, the more it emphasizes large values and ignores small ones.
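A quick numeric illustration of that note: as p grows, the p-norm of a vector is increasingly dominated by its largest component.

import numpy as np

v = np.array([1.0, 2.0, 10.0])
for p in (1, 2, 4, np.inf):
    # 1-norm = 13.0, 2-norm ≈ 10.25, 4-norm ≈ 10.00, inf-norm = 10.0
    print(p, np.linalg.norm(v, ord=p))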

