Abstract
I've been competing in various data competitions recently, so here I'm sharing some general model templates that can be reused with only minor modifications.
Environment: Python 3.5.2
XGBoost parameter-tuning guide: http://blog.csdn.net/han_xiaoyang/article/details/52665396
XGBoost official API: http://xgboost.readthedocs.io/en/latest//python/python_api.html
Preprocess
import pandas as pd
import numpy as np
import scipy as sp

def read_csv_file(f, logging=False):
    """Read a CSV file and optionally print a quick summary of it."""
    print("========== Reading data ==========")
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(f, "contains the following columns:")
        print(data.columns.values)
        print(data.describe())
        print(data.info())
    return data
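A quick usage sketch (the file name train.csv is a placeholder, not a path from the original post):

df_train = read_csv_file("train.csv", logging=True)  # hypothetical file name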
Logistic Regression
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

df_train = pd.DataFrame()  # fill with your training data
df_test = pd.DataFrame()   # fill with your test data
y_train = df_train['label'].values

ss = StandardScaler()
enc = OneHotEncoder()

# One-hot encode the categorical features. Fit the encoder on train and test
# together so both sides end up with identical columns.
feats = ["creativeID", "adID", "campaignID"]
for i, feat in enumerate(feats):
    enc.fit(np.vstack((df_train[feat].values.reshape(-1, 1),
                       df_test[feat].values.reshape(-1, 1))))
    x_train = enc.transform(df_train[feat].values.reshape(-1, 1))
    x_test = enc.transform(df_test[feat].values.reshape(-1, 1))
    if i == 0:
        X_train, X_test = x_train, x_test
    else:
        X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))

# Standardize the numerical features, fitting the scaler on the training set only
feats = ["price", "age"]
x_train = ss.fit_transform(df_train[feats].values)
x_test = ss.transform(df_test[feats].values)
X_train, X_test = sparse.hstack((X_train, x_train)), sparse.hstack((X_test, x_test))

lr = LogisticRegression()
lr.fit(X_train, y_train)
proba_test = lr.predict_proba(X_test)[:, 1]
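Competitions of this kind are often scored by AUC, so it is worth holding out part of the training set and checking the score offline. A minimal sketch, assuming the X_train / y_train built above (the 0.2 split size is an arbitrary choice):

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# sparse.hstack returns a COO matrix, which cannot be row-indexed; convert to CSR first
X_tr, X_val, y_tr, y_val = train_test_split(X_train.tocsr(), y_train,
                                            test_size=0.2, random_state=1)
lr = LogisticRegression()
lr.fit(X_tr, y_tr)
print("validation AUC:", roc_auc_score(y_val, lr.predict_proba(X_val)[:, 1]))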
LightGBM
1. Binary Classification
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")
train_x, train_y, test_x = load_data()  # user-provided data loader

# Hold out 5% of the training data as a validation set
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y
)

X_train = X
y_train = y
X_test = val_X
y_test = val_y

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True
}

print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=lgb_eval,
                early_stopping_rounds=500)

print('Start predicting...')
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Turn the predicted probabilities into 0/1 labels
threshold = 0.5
results = [1 if pred > threshold else 0 for pred in preds]

# Dump the feature importances to a file
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for index, im in enumerate(importance):
        string = names[index] + ', ' + str(im) + '\n'
        file.write(string)
2. Multiclass Classification
import lightgbm as lgb
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Loading Data ... ")
train_x, train_y, test_x = load_data()  # user-provided data loader

# Hold out 5% of the training data as a validation set
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.05,
    random_state=1,
    stratify=train_y
)

X_train = X
y_train = y
X_test = val_X
y_test = val_y

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 9,
    'metric': 'multi_error',
    'num_leaves': 300,
    'min_data_in_leaf': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.5,
    'min_gain_to_split': 0.2,
    'verbose': 5,
    'is_unbalance': True
}

print('Start training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10000,
                valid_sets=lgb_eval,
                early_stopping_rounds=500)

print('Start predicting...')
preds = gbm.predict(test_x, num_iteration=gbm.best_iteration)

# Each row of preds is a probability distribution over the 9 classes;
# take the argmax as the predicted label
results = [int(np.argmax(pred)) for pred in preds]

# Dump the feature importances to a file
importance = gbm.feature_importance()
names = gbm.feature_name()
with open('./feature_importance.txt', 'w+') as file:
    for index, im in enumerate(importance):
        string = names[index] + ', ' + str(im) + '\n'
        file.write(string)
XGBoost
1. Binary Classification
import numpy as np
import pandas as pd
import xgboost as xgb
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

train_x, train_y, test_x = load_data()  # user-provided data loader

# Hold out 1% of the training data as a validation set
X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.01,
    random_state=1,
    stratify=train_y
)

xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(X, label=y)
xgb_test = xgb.DMatrix(test_x)

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'gamma': 0.1,
    'max_depth': 8,
    'alpha': 0,
    'lambda': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.5,
    'min_child_weight': 3,
    'silent': 0,
    'eta': 0.03,
    'seed': 1000,
    'nthread': -1,
    'missing': 1,
    # weight positive samples by the negative/positive ratio to offset class imbalance
    'scale_pos_weight': (np.sum(y == 0) / np.sum(y == 1))
}

plst = list(params.items())
num_rounds = 2000
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Optional: cross-validate this parameter set before the final fit
result = xgb.cv(plst, xgb_train, num_boost_round=200, nfold=4,
                early_stopping_rounds=200, verbose_eval=True,
                folds=StratifiedKFold(n_splits=4).split(X, y))

model = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=200)
model.save_model('../data/model/xgb.model')

preds = model.predict(xgb_test)

# Turn the predicted probabilities into 0/1 labels
threshold = 0.5
results = [1 if pred > threshold else 0 for pred in preds]
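The model saved above can later be restored into a fresh Booster for prediction; a minimal sketch:

bst = xgb.Booster()
bst.load_model('../data/model/xgb.model')  # reload the file written by save_model
preds = bst.predict(xgb_test)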
Keras
1. Binary Classification
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical
from model.util import load_data as load_data_1
from model.util_combine_train_test import load_data as load_data_2
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

print("Loading Data ... ")
train_x, train_y, test_x = load_data_1()  # one of the project-specific loaders imported above

X_train = train_x.values
X_test = test_x.values
y = train_y

# Fill missing values with the column mean, then apply the same imputation to the test set
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

# Standardize features using statistics fitted on the training set
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Activation('linear'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

epochs = 100
model.fit(X_train, y, epochs=epochs, batch_size=2000, validation_split=0.1, shuffle=True)

# Threshold each predicted probability into a 0/1 label
threshold = 0.5
predictions = []
for index, case in enumerate(X_test):
    case = np.array([case])
    prediction_prob = model.predict(case)
    predictions.append(1 if prediction_prob[0][0] > threshold else 0)
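Calling predict once per row is slow; Keras can also score the whole test matrix in one batched call, which is equivalent to the loop above:

probs = model.predict(X_test, batch_size=2000)       # shape (n_samples, 1)
predictions = (probs[:, 0] > threshold).astype(int)  # vectorized thresholding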
2. Multiclass Classification
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical
from model.util import load_data as load_data_1
from model.util_combine_train_test import load_data as load_data_2
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

print("Loading Data ... ")
train_x, train_y, test_x = load_data_1()  # one of the project-specific loaders imported above

X_train = train_x.values
X_test = test_x.values
y = train_y

# Standardize features using statistics fitted on the training set
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# One-hot encode the labels for categorical_crossentropy
y = to_categorical(y)

model = Sequential()
model.add(Dense(256, input_shape=(X_train.shape[1],)))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('tanh'))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Activation('linear'))
model.add(Dense(9))
model.add(Activation('softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

epochs = 200
model.fit(X_train, y, epochs=epochs, batch_size=200, validation_split=0.1, shuffle=True)

# Take the argmax over the class probabilities as the predicted label
predictions = []
for index, case in enumerate(X_test):
    case = np.array([case])
    prediction_prob = model.predict(case)
    predictions.append(int(np.argmax(prediction_prob)))
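As in the binary case, the per-row loop can be replaced by one batched call:

probs = model.predict(X_test, batch_size=200)  # shape (n_samples, 9)
predictions = np.argmax(probs, axis=1)         # vectorized argmax over the 9 classes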
Handling Severely Imbalanced Positive/Negative Samples
In some problems the positive and negative sample counts differ enormously, leaving the data severely unbalanced. Here are a few ways to tackle this. First, inspect the imbalance ratio:
positive_num = df_train[df_train['label'] == 1].values.shape[0]
negative_num = df_train[df_train['label'] == 0].values.shape[0]
print(float(positive_num) / float(negative_num))
Main approaches
1. Manually adjust the ratio of positive to negative samples.
2. Over-sampling: over-sample the minority class in the training set, synthesizing new samples to ease the class imbalance, e.g. the SMOTE algorithm (see the sketch at the end of this section).
3. Under-sampling: drop part of the majority class.
4. Combine samples proportionally into multiple training sets, train a weak classifier on each, and ensemble them at the end.
Recommended framework
A framework on GitHub dedicated to exactly this class of problem: https://github.com/scikit-learn-contrib/imbalanced-learn
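A minimal SMOTE sketch with imbalanced-learn; X_train / y_train stand in for your feature matrix and labels, and note that older releases expose fit_sample while newer ones call it fit_resample:

from imblearn.over_sampling import SMOTE

# Synthesize new minority-class samples until the two classes are balanced
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_sample(X_train, y_train)  # fit_resample in newer versions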