Learning Data Mining with Python (《Python数据挖掘入门与实践》), Chapter 5: building your own transformers and some feature-engineering techniques. The code and the output of each part are given below.
Data download:
http://archive.ics.uci.edu/ml/machine-learning-databases/internet_ads/
Rather than searching for a subset of features that performs well together, we simply look for individual features that perform well on their own, judged by the accuracy each one can achieve by itself. There are many ways to measure the correlation between a single feature and a class: the chi-squared test (χ²), mutual information, information entropy, and so on.
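The code below assumes an adult DataFrame already loaded from the UCI Adult dataset (http://archive.ics.uci.edu/ml/machine-learning-databases/adult/). A minimal sketch; the file path is a placeholder and the full column list is an assumption based on the dataset's documentation:

import pandas as pd

adult_filename = "adult.data"  # placeholder path; point this at your download
adult = pd.read_csv(adult_filename, header=None,
                    names=["Age", "Work-Class", "fnlwgt", "Education", "Education-Num",
                           "Marital-Status", "Occupation", "Relationship", "Race", "Sex",
                           "Capital-gain", "Capital-loss", "Hours-per-week",
                           "Native-Country", "Earnings-Raw"])
adult.dropna(how="all", inplace=True)  # the file ends with blank lines; drop them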
X = adult[["Age","Education-Num","Capital-gain","Capital-loss","Hours-per-week"]].values#税前收入分类y = (adult["Earnings-Raw"] == ' >50K').values#使用SelectKBest转换器,用卡方函数打分,初始化转换器from sklearn.feature_selection import SelectKBestfrom sklearn.feature_selection import chi2transformer = SelectKBest(score_func=chi2,k=3)#使用卡方函数,选择分类效果好的3个特征Xt_chi2 = transformer.fit_transform(X,y)#生成分类效果好的三个特征print(transformer.scores_)#每一个特征的相关性 [8.60061182e+03 2.40142178e+03 8.21924671e+07 1.37214589e+06 6.47640900e+03]使用皮尔逊相关系数计算相关性
Next, measure correlation with the Pearson correlation coefficient.

from scipy.stats import pearsonr
import numpy as np

# SciPy's pearsonr expects two one-dimensional arrays, so wrap it in a
# function that can handle a multidimensional X, column by column
def multivariate_pearsonr(X, y):
    scores, pvalues = [], []
    for column in range(X.shape[1]):
        cur_score, cur_p = pearsonr(X[:, column], y)
        scores.append(abs(cur_score))  # keep the magnitude; the sign only indicates direction
        pvalues.append(cur_p)
    return (np.array(scores), np.array(pvalues))

# Now it can be plugged into the transformer exactly as before
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
Xt_pearsonr = transformer.fit_transform(X, y)
print(transformer.scores_)

Output:

[0.2340371  0.33515395 0.22332882 0.15052631 0.22968907]

# Compare which feature set performs better
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring="accuracy")
scores_pearsonr = cross_val_score(clf, Xt_pearsonr, y, scoring="accuracy")

# Print the mean prediction accuracy of each
print("Chi-squared: {0:.1f}%".format(np.mean(scores_chi2) * 100))
print("Pearson: {0:.1f}%".format(np.mean(scores_pearsonr) * 100))

Output:

Chi-squared: 82.9%
Pearson: 77.1%

When features correlate strongly with one another, or are redundant, they make the algorithm's job harder, so it is often worth creating new features from the existing ones. There are many ways to do this; one is sketched below.
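A minimal illustration (not from the book): combine existing numeric columns by hand, or let scikit-learn's PolynomialFeatures generate interaction terms automatically. The Capital-net column is a hypothetical example:

from sklearn.preprocessing import PolynomialFeatures

# A hand-crafted feature: net capital change (hypothetical example)
adult["Capital-net"] = adult["Capital-gain"] - adult["Capital-loss"]

# Or generate every pairwise product of the original five columns
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X)
print(X_interactions.shape)  # 5 original columns + 10 pairwise products = 15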
import os
import pandas as pd
import numpy as np

# Load the Internet Advertisements data
ad_data_folder = r"E:\DataMining\Project\dataming_with_python\Adult"
ad_adult_filename = os.path.join(ad_data_folder, "ad.data")

# Convert numeric-looking strings to numbers
def convert_number(x):
    try:
        return float(x)
    except ValueError:
        # Something that marks a missing value (e.g. np.nan) would be better;
        # 6 is used here purely for simplicity
        return 6

# A dictionary mapping every feature column to its converter
# (the dataset has columns 0-1557 as features and column 1558 as the class)
myconverters = {i: convert_number for i in range(1558)}
# Convert the last column, the class, to 0/1; lambda is a concise one-expression function
myconverters[1558] = lambda x: 1 if x.strip() == "ad." else 0

# Read the CSV, passing in the converter functions
ads = pd.read_csv(ad_adult_filename, header=None, converters=myconverters)
ads.iloc[192:195, 0]

Output:

192     6.0
193     6.0
194    60.0
Name: 0, dtype: float64

# Extract the X matrix and y array
X = ads.drop(1558, axis=1).values
y = ads[1558]

The goal of PCA is to find combinations of features that describe the dataset with less information: the variance of these features differs little from the overall variance. They are called principal components. A principal component is usually a complex combination of several original features; the first feature below, for example, is each of the original 1,558 features multiplied by its own weight and then summed.
from sklearn.decomposition import PCA

pca = PCA(n_components=5)  # how many principal components to keep
Xd = pca.fit_transform(X)

# Inspect the proportion of variance explained by each component
np.set_printoptions(precision=3, suppress=True)
pca.explained_variance_ratio_

Output:

array([0.877, 0.121, 0.001, 0.   , 0.   ])

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
scores_reduced = cross_val_score(clf, Xd, y, scoring="accuracy")
# The mean score is a fraction (about 0.9, i.e. roughly 90% accuracy)
print("Mean accuracy after PCA: {0:.1f}".format(np.mean(scores_reduced)))

Output:

Mean accuracy after PCA: 0.9
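To see the weights just described, the fitted PCA object exposes them as rows of pca.components_; a quick sketch:

# Each row of pca.components_ holds the weights of one principal component
first_component = pca.components_[0]
print(first_component.shape)  # (1558,): one weight per original feature
print(first_component[:5])    # the weights applied to the first five features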
PCA can also turn an abstract, hard-to-interpret dataset into something we can draw.

# Plot the first two features returned by PCA
%matplotlib inline
from matplotlib import pyplot as plt

classes = set(y)           # the distinct class values
colors = ["red", "green"]

# zip() pairs each class with a colour
for cur_class, color in zip(classes, colors):
    # Build a mask selecting every sample of the current class
    mask = (y == cur_class).values
    # scatter() plots the samples; x and y here are the first two principal components
    plt.scatter(Xd[mask, 0], Xd[mask, 1], marker='o', color=color, label=int(cur_class))
plt.legend()
plt.show()
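The chapter opening mentions building your own transformer. A minimal sketch of one, in the style of the book's MeanDiscrete example: it binarises each column against that column's mean, and inherits fit_transform from scikit-learn's TransformerMixin.

from sklearn.base import TransformerMixin
from sklearn.utils import as_float_array

class MeanDiscrete(TransformerMixin):
    """Replace each value by whether it lies above its column's mean."""
    def fit(self, X, y=None):
        X = as_float_array(X)
        self.mean = X.mean(axis=0)  # learn the per-column means
        return self
    def transform(self, X):
        X = as_float_array(X)
        assert X.shape[1] == self.mean.shape[0]  # same columns as at fit time
        return X > self.mean  # boolean matrix: above the mean or not

# Usage, just like any built-in transformer:
# Xt_mean = MeanDiscrete().fit_transform(X)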
