目录:0.前言 1.导入数据 2.查看房价分布 3.填充缺失数据 4.建模 5.提交结果
0.前言
本文对Kaggle房价预测竞赛的训练集和测试集进行分析,采用正则化线性回归(Ridge、Lasso)并结合XGBoost,对房价进行了预测。本人将思路记录下来,以供参考。如有不足之处,欢迎指正。
1.导入数据
# Standard library
import warnings

# Third-party: numerics, data frames and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning so library notices do not clutter the notebook output.
warnings.filterwarnings('ignore')
# Read the training and test tables from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Number of training rows, recorded so that a combined train+test frame can
# be split back into its two parts later on.
train_len = len(train)

# Preview the first rows of the training table.
train.head()
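|   | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | … | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---|----|------------|----------|-------------|---------|--------|-------|----------|-------------|-----------|---|----------|--------|-------|-------------|---------|--------|--------|----------|---------------|-----------|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | … | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | … | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | … | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | … | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | … | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |

5 rows × 81 columns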
# Preview the first rows of the test table.
test.head()
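|   | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | … | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition |
|---|----|------------|----------|-------------|---------|--------|-------|----------|-------------|-----------|---|-------------|----------|--------|-------|-------------|---------|--------|--------|----------|---------------|
| 0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | … | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
| 1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | … | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
| 2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | … | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
| 3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | … | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
| 4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | … | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |

5 rows × 80 columns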
# Stack the training rows on top of the test rows under a fresh 0..n-1 index
# so both sets go through the same preprocessing, then remove the target
# column from the combined feature table.
all_data = (
    pd.concat([train, test], axis=0, ignore_index=True)
    .drop(labels=["SalePrice"], axis=1)
)
2.查看房价分布
由于特征太多,我们在此不查看各特征与房价的关系,只看房价的分布;随后对房价以及偏度大于0.75的数值特征取log(1+x),并对类别特征做独热编码。
# Side-by-side view of the sale price before and after a log(1 + x) transform.
# Each legend entry reports the sample skewness of the series being plotted.
# NOTE(review): sns.distplot is marked deprecated in recent seaborn releases —
# confirm against the installed version before reusing this cell.
raw_price = train['SalePrice']
log_price = np.log1p(raw_price)

fig = plt.figure(figsize=(12, 5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# Left panel: raw prices.
g1 = sns.distplot(
    raw_price,
    hist=True,
    label='skewness:{:.2f}'.format(raw_price.skew()),
    ax=ax1,
)
g1.legend()
g1.set(xlabel='Price')

# Right panel: log-transformed prices.
g2 = sns.distplot(
    log_price,
    hist=True,
    label='skewness:{:.2f}'.format(log_price.skew()),
    ax=ax2,
)
g2.legend()
g2.set(xlabel='log(Price+1)')

plt.show()
# Work with the target on the log scale from here on: overwrite SalePrice
# with log(1 + SalePrice).
train['SalePrice'] = np.log1p(train['SalePrice'])
# Numeric columns are those whose dtype is not "object".
num_features_list = list(all_data.dtypes[all_data.dtypes != "object"].index)

# Sample skewness of every numeric column. DataFrame.skew skips missing
# values by default, which matches computing the skew after dropna().
skewness = all_data[num_features_list].skew()
skewed_features = skewness[skewness > 0.75].index

# Apply log(1 + x) to each column whose skewness exceeds 0.75.
all_data[skewed_features] = np.log1p(all_data[skewed_features])

# One-hot encode the categorical (object) columns.
all_data = pd.get_dummies(all_data)
3.填充缺失数据
由于缺失值很多,我们在此不逐一预测,仅用均值来填充。
# Count the missing values in every column of the encoded feature table.
all_data.isnull().sum()
1stFlrSF 0
2ndFlrSF 0
3SsnPorch 0
BedroomAbvGr 0
BsmtFinSF1 1
BsmtFinSF2 1
BsmtFullBath 2
BsmtHalfBath 2
BsmtUnfSF 1
EnclosedPorch 0
Fireplaces 0
FullBath 0
GarageArea 1
GarageCars 1
GarageYrBlt 159
GrLivArea 0
HalfBath 0
Id 0
KitchenAbvGr 0
LotArea 0
LotFrontage 486
LowQualFinSF 0
MSSubClass 0
MasVnrArea 23
MiscVal 0
MoSold 0
OpenPorchSF 0
OverallCond 0
OverallQual 0
PoolArea 0
...
RoofMatl_Metal 0
RoofMatl_Roll 0
RoofMatl_Tar&Grv 0
RoofMatl_WdShake 0
RoofMatl_WdShngl 0
RoofStyle_Flat 0
RoofStyle_Gable 0
RoofStyle_Gambrel 0
RoofStyle_Hip 0
RoofStyle_Mansard 0
RoofStyle_Shed 0
SaleCondition_Abnorml 0
SaleCondition_AdjLand 0
SaleCondition_Alloca 0
SaleCondition_Family 0
SaleCondition_Normal 0
SaleCondition_Partial 0
SaleType_COD 0
SaleType_CWD 0
SaleType_Con 0
SaleType_ConLD 0
SaleType_ConLI 0
SaleType_ConLw 0
SaleType_New 0
SaleType_Oth 0
SaleType_WD 0
Street_Grvl 0
Street_Pave 0
Utilities_AllPub 0
Utilities_NoSeWa 0
Length: 289, dtype: int64
# Replace each missing value with the mean of its own column.
column_means = all_data.mean()
all_data = all_data.fillna(column_means)

# The first train_len rows came from train.csv, the remainder from test.csv.
X_train = all_data[:train_len]
X_test = all_data[train_len:]

# Training target (SalePrice was overwritten with its log1p value earlier).
Y_train = train['SalePrice']
4.建模
# Regularised linear models and the cross-validation scorer used below.
from sklearn.linear_model import Ridge, LassoCV
from sklearn.model_selection import cross_val_score
def rmse_cv(model):
    """Return the per-fold RMSE of *model* under 5-fold cross-validation.

    The model is scored on the module-level ``X_train`` / ``Y_train`` using
    the ``neg_mean_squared_error`` scorer. The scores are negated and
    square-rooted, so the result holds one RMSE value per fold.
    """
    neg_mse = cross_val_score(
        model, X_train, Y_train, scoring='neg_mean_squared_error', cv=5
    )
    rmse = np.sqrt(-neg_mse)
    return rmse
# Ridge model with default settings (the sweep below builds its own
# instances and does not use this one).
model_ridge = Ridge()

# Candidate regularisation strengths.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]

# Mean cross-validated RMSE for every candidate alpha, indexed by alpha.
ridge_scores = [rmse_cv(Ridge(alpha=a)).mean() for a in alphas]
cv_ridge = pd.Series(ridge_scores, index=alphas)
cv_ridge
# Plot the mean cross-validated RMSE against alpha.
fig = plt.figure(figsize=(8, 5))
cv_ridge.plot(title='Cross Validation Score with Model Ridge')
plt.xlabel("alpha")
plt.ylabel("rmse")
plt.show()

# Lowest mean RMSE reached over the alpha grid.
cv_ridge.min()
0.12699476769354789
# Fit a cross-validated lasso over a small grid of candidate alphas.
lasso_alphas = [1, 0.1, 0.001, 0.0005]
model_lasso = LassoCV(alphas=lasso_alphas)
model_lasso.fit(X_train, Y_train)

# Mean 5-fold RMSE of the lasso estimator on the training data.
rmse_cv(model_lasso).mean()
0.12296228157910054
# Label each fitted lasso coefficient with its feature name.
coef = pd.Series(model_lasso.coef_, index=X_train.columns)

# Features with a non-zero coefficient were kept; the rest were zeroed out.
n_kept = sum(coef != 0)
n_zeroed = sum(coef == 0)
print(
    "Lasso picked {} variables and eliminated the other {} variables".format(
        n_kept, n_zeroed
    )
)
Lasso picked 110 variables and eliminated the other 179 variables
# Take the ten most negative and the ten most positive lasso coefficients.
sorted_coef = coef.sort_values()
imp_coef = pd.concat([sorted_coef.head(10), sorted_coef.tail(10)])

# Horizontal bar chart of those twenty coefficients.
fig = plt.figure(figsize=(6, 8))
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()
# Default figure size for the plots that follow.
plt.rcParams["figure.figsize"] = [6, 6]

# In-sample lasso predictions next to the true training targets.
est = pd.DataFrame({"est": model_lasso.predict(X_train), "true": Y_train})

# Residual = true value minus prediction; scatter it against the prediction.
est["resi"] = est["true"] - est["est"]
est.plot(x="est", y="resi", kind="scatter")
plt.show()
import xgboost as xgb

# Wrap the feature tables in xgboost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)

# Shallow trees with a 0.1 learning rate.
params = {"max_depth": 2, "eta": 0.1}

# Cross-validate for up to 500 boosting rounds with a 100-round
# early-stopping window.
cv_xgb = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)

# Plot train and test RMSE from round 30 onwards
# (presumably to keep the large early errors from dominating the y-axis).
cv_xgb.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()
plt.show()
# Fit the final gradient-boosted regressor on the full training set.
model_xgb = xgb.XGBRegressor(n_estimators=360, max_depth=2, learning_rate=0.1)
model_xgb.fit(X_train, Y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=None, n_estimators=360,
n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1)
# Predict on the test set and undo the log1p target transform with expm1.
lasso_log_preds = model_lasso.predict(X_test)
xgb_log_preds = model_xgb.predict(X_test)
lasso_preds = np.expm1(lasso_log_preds)
xgb_preds = np.expm1(xgb_log_preds)

# Scatter the two models' predictions against each other.
predictions = pd.DataFrame({"xgb": xgb_preds, "lasso": lasso_preds})
predictions.plot(x="xgb", y="lasso", kind="scatter")
plt.show()
5.提交结果
# Blend the two models: 70% lasso, 30% xgboost.
preds = 0.7 * lasso_preds + 0.3 * xgb_preds

# Build the submission table. The header is written as "Id,SalePrice" to
# match the identifier column name used in test.csv, and the column order is
# pinned explicitly so it does not depend on dict ordering.
result = pd.DataFrame(
    {"Id": test.Id, "SalePrice": preds},
    columns=["Id", "SalePrice"],
)
result.to_csv('result.csv', index=False)
结果排在前19%, 还有改进的空间, 要继续努力呀.