Kaggle: 房价预测

xiaoxiao2021-02-28 127

目录：0. 前言 ｜ 1. 导入数据 ｜ 2. 查看房价分布 ｜ 3. 填充缺失数据 ｜ 4. 建模 ｜ 5. 提交结果

0.前言

本文对 Kaggle 房价预测竞赛的训练集和测试集进行分析，采用正则化线性回归（Ridge、Lasso）并结合 XGBoost 对房价进行了预测。本人将思路记录下来，以供参考。如有不足之处，欢迎指正。

1.导入数据

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Load the training and test sets, and remember the number of training rows.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_len = len(train)

# Peek at the training set.
train.head()
# Output (the display elides the middle columns with "…"):
#    Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape LandContour Utilities  …  PoolArea PoolQC Fence MiscFeature  MiscVal  MoSold  YrSold SaleType SaleCondition  SalePrice
# 0   1          60       RL         65.0     8450   Pave   NaN      Reg         Lvl    AllPub  …         0    NaN   NaN         NaN        0       2    2008       WD        Normal     208500
# 1   2          20       RL         80.0     9600   Pave   NaN      Reg         Lvl    AllPub  …         0    NaN   NaN         NaN        0       5    2007       WD        Normal     181500
# 2   3          60       RL         68.0    11250   Pave   NaN      IR1         Lvl    AllPub  …         0    NaN   NaN         NaN        0       9    2008       WD        Normal     223500
# 3   4          70       RL         60.0     9550   Pave   NaN      IR1         Lvl    AllPub  …         0    NaN   NaN         NaN        0       2    2006       WD       Abnorml     140000
# 4   5          60       RL         84.0    14260   Pave   NaN      IR1         Lvl    AllPub  …         0    NaN   NaN         NaN        0      12    2008       WD        Normal     250000

5 rows × 81 columns

# Peek at the test set; it lacks the final SalePrice column.
test.head()
# Output (the display elides the middle columns with "…"):
#      Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape LandContour Utilities  …  ScreenPorch  PoolArea PoolQC  Fence MiscFeature  MiscVal  MoSold  YrSold SaleType SaleCondition
# 0  1461          20       RH         80.0    11622   Pave   NaN      Reg         Lvl    AllPub  …          120         0    NaN  MnPrv         NaN        0       6    2010       WD        Normal
# 1  1462          20       RL         81.0    14267   Pave   NaN      IR1         Lvl    AllPub  …            0         0    NaN    NaN        Gar2    12500       6    2010       WD        Normal
# 2  1463          60       RL         74.0    13830   Pave   NaN      IR1         Lvl    AllPub  …            0         0    NaN  MnPrv         NaN        0       3    2010       WD        Normal
# 3  1464          60       RL         78.0     9978   Pave   NaN      IR1         Lvl    AllPub  …            0         0    NaN    NaN         NaN        0       6    2010       WD        Normal
# 4  1465         120       RL         43.0     5005   Pave   NaN      IR1         HLS    AllPub  …          144         0    NaN    NaN         NaN        0       1    2010       WD        Normal

5 rows × 80 columns

# Stack the training rows on top of the test rows with a fresh 0..n-1 index,
# then remove the target column so only features remain.
combined = pd.concat([train, test], axis=0, ignore_index=True)
all_data = combined.drop(labels=["SalePrice"], axis=1)

2.查看房价分布

由于特征太多，我们在此不查看各特征与房价的关系，只看房价的分布。

# Plot the training-set price distribution: raw prices on the left,
# log(1 + price) on the right, each labelled with its skewness.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

raw_price = train['SalePrice']
g1 = sns.distplot(
    raw_price,
    hist=True,
    label='skewness:{:.2f}'.format(raw_price.skew()),
    ax=ax1,
)
g1.legend()
g1.set(xlabel='Price')

log_price = np.log1p(train['SalePrice'])
g2 = sns.distplot(
    log_price,
    hist=True,
    label='skewness:{:.2f}'.format(log_price.skew()),
    ax=ax2,
)
g2.legend()
g2.set(xlabel='log(Price+1)')

plt.show()

# The sale price is skewed, so replace it with log(1 + price).
train['SalePrice'] = np.log1p(train['SalePrice'])

# Apply the same log(1 + x) transform to every non-object column whose
# skewness (ignoring missing values) exceeds 0.75.
is_numeric = all_data.dtypes != "object"
num_features_list = list(all_data.dtypes[is_numeric].index)
for column in num_features_list:
    skewness = all_data[column].dropna().skew()
    if skewness > 0.75:
        all_data[column] = np.log1p(all_data[column])

# Turn the categorical columns into dummy (indicator) variables.
all_data = pd.get_dummies(all_data)

3.填充缺失数据

由于缺失值很多，我们在此不逐一预测，仅用均值来填充。

# Count the missing values in each column.
all_data.isnull().sum()
# Output:
# 1stFlrSF                   0
# 2ndFlrSF                   0
# 3SsnPorch                  0
# BedroomAbvGr               0
# BsmtFinSF1                 1
# BsmtFinSF2                 1
# BsmtFullBath               2
# BsmtHalfBath               2
# BsmtUnfSF                  1
# EnclosedPorch              0
# Fireplaces                 0
# FullBath                   0
# GarageArea                 1
# GarageCars                 1
# GarageYrBlt              159
# GrLivArea                  0
# HalfBath                   0
# Id                         0
# KitchenAbvGr               0
# LotArea                    0
# LotFrontage              486
# LowQualFinSF               0
# MSSubClass                 0
# MasVnrArea                23
# MiscVal                    0
# MoSold                     0
# OpenPorchSF                0
# OverallCond                0
# OverallQual                0
# PoolArea                   0
#                         ...
# RoofMatl_Metal             0
# RoofMatl_Roll              0
# RoofMatl_Tar&Grv           0
# RoofMatl_WdShake           0
# RoofMatl_WdShngl           0
# RoofStyle_Flat             0
# RoofStyle_Gable            0
# RoofStyle_Gambrel          0
# RoofStyle_Hip              0
# RoofStyle_Mansard          0
# RoofStyle_Shed             0
# SaleCondition_Abnorml      0
# SaleCondition_AdjLand      0
# SaleCondition_Alloca       0
# SaleCondition_Family       0
# SaleCondition_Normal       0
# SaleCondition_Partial      0
# SaleType_COD               0
# SaleType_CWD               0
# SaleType_Con               0
# SaleType_ConLD             0
# SaleType_ConLI             0
# SaleType_ConLw             0
# SaleType_New               0
# SaleType_Oth               0
# SaleType_WD                0
# Street_Grvl                0
# Street_Pave                0
# Utilities_AllPub           0
# Utilities_NoSeWa           0
# Length: 289, dtype: int64

# Fill each missing value with the mean of its column.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

# Split the combined frame back into its training and test parts.
# NOTE(review): the listing above shows `Id` is still a column of all_data,
# so it reaches the models as a feature -- consider dropping it.
X_train = all_data.iloc[:train_len]
X_test = all_data.iloc[train_len:]
Y_train = train['SalePrice']

4.建模

from sklearn.linear_model import Ridge, LassoCV
from sklearn.model_selection import cross_val_score


def rmse_cv(model):
    """Return the per-fold RMSE of *model* under 5-fold cross-validation.

    The model is evaluated on the module-level ``X_train`` / ``Y_train``.
    The scorer yields negated mean squared errors, so the sign is flipped
    before the square root is taken.
    """
    neg_mse = cross_val_score(
        model,
        X_train,
        Y_train,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    return np.sqrt(-neg_mse)


# Ridge model: record the mean cross-validated RMSE for each alpha.
model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = pd.Series(
    [rmse_cv(Ridge(alpha=strength)).mean() for strength in alphas],
    index=alphas,
)
cv_ridge

# Visualise the cross-validation score against alpha.
fig = plt.figure(figsize=(8, 5))
cv_ridge.plot(title='Cross Validation Score with Model Ridge')
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.show()

# The RMSE is lowest when alpha is 10.
cv_ridge.min()
# Output: 0.12699476769354789

# Lasso model: its mean RMSE is smaller, so Lasso is the model finally chosen.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, Y_train)
rmse_cv(model_lasso).mean()
# Output: 0.12296228157910054

# Inspect the coefficients: Lasso performs feature selection by setting the
# coefficients of unimportant features to zero.
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
kept_count = sum(coef != 0)
dropped_count = sum(coef == 0)
print("Lasso picked {} variables and eliminated the other {} variables".format(kept_count, dropped_count))
# Output: Lasso picked 110 variables and eliminated the other 179 variables

# Show the ten most negative and ten most positive coefficients.
# GrLivArea (above-ground living area) is the most important positive feature.
ordered_coef = coef.sort_values()
imp_coef = pd.concat([ordered_coef.head(10), ordered_coef.tail(10)])
fig = plt.figure(figsize=(6, 8))
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

# Inspect the residuals of the Lasso fit on the training set.
plt.rcParams["figure.figsize"] = [6, 6]

fitted = model_lasso.predict(X_train)
est = pd.DataFrame({"est": fitted, "true": Y_train})
est["resi"] = est["true"] - est["est"]  # residual = observed - fitted

est.plot(x="est", y="resi", kind="scatter")
plt.show()

# XGBoost model.
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validate: up to 500 boosting rounds, with early stopping after 100.
params = {"max_depth": 2, "eta": 0.1}
cv_xgb = xgb.cv(
    params,
    dtrain,
    num_boost_round=500,
    early_stopping_rounds=100,
)

# Plot test and train RMSE from row label 30 onwards.
rmse_columns = ["test-rmse-mean", "train-rmse-mean"]
cv_xgb.loc[30:, rmse_columns].plot()
plt.show()

# Train the XGBoost regressor.
model_xgb = xgb.XGBRegressor(
    n_estimators=360,
    max_depth=2,
    learning_rate=0.1,
)
model_xgb.fit(X_train, Y_train)
# Output:
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
#        max_depth=2, min_child_weight=1, missing=None, n_estimators=360,
#        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
#        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
#        silent=True, subsample=1)

# Compare the two models' predictions. The models were fitted on
# log(1 + price), so expm1 maps the predictions back to prices.
xgb_preds = np.expm1(model_xgb.predict(X_test))
lasso_preds = np.expm1(model_lasso.predict(X_test))
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")
plt.show()

5.提交结果

# The final prediction is a weighted average of the two models:
# 70% Lasso and 30% XGBoost.
preds = 0.7 * lasso_preds + 0.3 * xgb_preds

# Write the submission file. The header follows the competition's sample
# submission ("Id,SalePrice"); the original wrote a lower-case "id" column.
# `columns=` pins the column order, which otherwise depends on how the
# installed pandas orders dict keys.
result = pd.DataFrame(
    {"Id": test.Id, "SalePrice": preds},
    columns=["Id", "SalePrice"],
)
result.to_csv('result.csv', index=False)

结果排在前19%, 还有改进的空间, 要继续努力呀.

转载请注明原文地址: https://www.6miu.com/read-2621800.html

技术

最新回复(0)