序
特征工程之后,我们基本了解了数据集的概貌,通过缺失值处理、异常值处理、归一化、独热编码、特征构造等一系列方法对数据进行了预处理,并根据不同模型的数据要求对数据进行了一定的转化,从而进行下一步模型的学习过程。以下就是对数据进行处理后,训练模型的过程代码。其实可以先使用随机森林等方法先做一步特征筛选的工作,我这里没有做特征的筛选,而且先复现了数据准备,模型构造和调参的过程。若是模型初步表现不错且较稳定,我会后续做特征筛选或特征构造,进一步提高模型的分数。
数据准备
导入第三方库
import pandas as pd
import numpy as np
import lightgbm as lgb
import warnings
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
import datetime
import pickle
import seaborn as sns
''' sns 相关设置 @return: """ # 声明使用 Seaborn 样式 sns.set() # 有五种seaborn的绘图风格,它们分别是:darkgrid, whitegrid, dark, white, ticks。默认的主题是darkgrid。 sns.set_style("whitegrid") # 有四个预置的环境,按大小从小到大排列分别为:paper, notebook, talk, poster。其中,notebook是默认的。 sns.set_context('talk') # 中文字体设置-黑体 plt.rcParams['font.sans-serif'] = ['SimHei'] # 解决保存图像是负号'-'显示为方块的问题 plt.rcParams['axes.unicode_minus'] = False # 解决Seaborn中文显示问题并调整字体大小 sns.set(font='SimHei') '''
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
pd.set_option('display.float_format', lambda x: '%.2f' % x)
读取数据
train = pd.read_csv(r'D:\Users\Felixteng\Documents\Pycharm Files\loanDefaultForecast\data\train.csv')
testA = pd.read_csv(r'D:\Users\Felixteng\Documents\Pycharm Files\loanDefaultForecast\data\testA.csv')
压缩数据
def reduce_mem_usage(df):
''' 遍历DataFrame的所有列并修改它们的数据类型以减少内存使用 :param df: 需要处理的数据集 :return: '''
start_mem = df.memory_usage().sum() / 1024 ** 2 # 记录原数据的内存大小
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtypes
if col_type != object: # 这里只过滤了object格式,如果代码中还包含其他类型,要一并过滤
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int': # 如果是int类型的话,不管是int64还是int32,都加入判断
# 依次尝试转化成in8,in16,in32,in64类型,如果数据大小没溢出,那么转化
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else: # 不是整形的话,那就是浮点型
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else: # 如果不是数值型的话,转化成category类型
df[col] = df[col].astype('category')
end_mem = df.memory_usage().sum() / 1024 ** 2 # 看一下转化后的数据的内存大小
print('Memory usage after optimization is {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem)) # 看一下压缩比例
return df
train = reduce_mem_usage(train)
testA = reduce_mem_usage(testA)
del testA['n2.2']
del testA['n2.3']
简单建模
''' Tips1:金融风控的实际项目多涉及到信用评分,因此需要模型特征具有较好的可解释性,所以目前在实际项目中多还是以逻辑回归作为基础模型。 但是在比赛中以得分高低为准,不需要严谨的可解释性,所以大多基于集成算法进行建模。 Tips2:因为逻辑回归的算法特性,需要提前对异常值、缺失值数据进行处理(参考task3部分) Tips3:基于树模型的算法特性,异常值、缺失值处理可以跳过,但是对于业务较为了解的同学也可以自己对缺失异常值进行处理,效果可能会更优于模型处理的结果。 注:以下建模的源数据参考baseline进行了相应的特征工程,对于异常缺失值未进行相应的处理操作 '''
建模之前的数据处理
为了方便起见,把训练集和测试集合并处理
data = pd.concat([train, testA], axis=0, ignore_index=True)
category特征不能直接训练,需要处理转换
''' ['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine'] 先处理'employmentLength', 'issueDate', 'earliesCreditLine'这三个特征;'grade'和'subGrade'做one-hot编码 '''
‘employmentLength’ - 转换为数值
data.groupby('employmentLength')['id'].count()
'''10年以上算10年,1年一下算0年'''
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace(to_replace='< 1 year', value='0 year', inplace=True)
def employmentLength_to_int(s):
if pd.isnull(s):
return s
else:
return np.int8(s.split()[0])
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
‘earliesCreditLine’ - 分别提取年份和月份做拼接
data['earliesCreditLine_year'] = data['earliesCreditLine'].apply(lambda x: x[-4:])
data['earliesCreditLine_month'] = data['earliesCreditLine'].apply(lambda x: x[0:3])
def month_re(x):
if x == 'Jan':
return '01'
elif x == 'Feb':
return '02'
elif x == 'Mar':
return '03'
elif x == 'Apr':
return '04'
elif x == 'May':
return '05'
elif x == 'Jun':
return '06'
elif x == 'Jul':
return '07'
elif x == 'Aug':
return '08'
elif x == 'Sep':
return '09'
elif x == 'Oct':
return '10'
elif x == 'Nov':
return '11'
else:
return '12'
data['earliesCreditLine_month'] = data['earliesCreditLine_month'].apply(lambda x: month_re(x))
data['earliesCreditLine_date'] = data['earliesCreditLine_year'] + data['earliesCreditLine_month']
data['earliesCreditLine_date'] = data['earliesCreditLine_date'].astype('int')
del data['earliesCreditLine']
del data['earliesCreditLine_year']
del data['earliesCreditLine_month']
‘issueDate’ - 从数据可以看出,issueDate从2017年6月1日开始;数据按照此节点统计天数
data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data['issueDateDt'] = data['issueDate'].apply(lambda x: x - startdate).dt.days
del data['issueDate']
除了本身是category类型的特征,还有一些数值特征表现出的也是类别型的
'''将1类以上且非高维稀疏的特征进行one-hot编码'''
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose',
'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']
for cate in cate_features:
print(cate, '类型数', data[cate].nunique())
''' grade 类型数 7 subGrade 类型数 35 employmentTitle 类型数 298101 homeOwnership 类型数 6 verificationStatus 类型数 3 purpose 类型数 14 postCode 类型数 935 regionCode 类型数 51 applicationType 类型数 2 initialListStatus 类型数 2 title 类型数 6712 policyCode 类型数 1 不适合做one-hot编码的是 employmentTitle 类型数 298101 postCode 类型数 935 title 类型数 6712 regionCode 类型数 51 - 大于50的先不处理了,维度还是比较高的 policyCode 类型数 1 - 无分析价值,可直接删除 '''
del data['policyCode']
one-hot编码
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus',
'purpose', 'applicationType', 'initialListStatus'], drop_first=True)
对于高维类别特征,进行转换,取他们同类型的数量值和排名值
for f in ['employmentTitle', 'postCode', 'regionCode', 'title']:
data[f + '_counts'] = data.groupby([f])['id'].transform('count')
data[f + '_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
del data[f]
准备训练集和测试集
features = [f for f in data.columns if f not in ['id', 'isDefault']]
train = data[data.isDefault.notnull()].reset_index(drop=True)
testA = data[data.isDefault.isnull()].reset_index(drop=True)
x_train = train[features]
y_train = train['isDefault']
x_testA = testA[features]
五折交叉验证准备
folds = 5
seed = 2020
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
建模 - Lightgbm
将训练集分为5份,4份作为训练集,1份作为验证集
X_train_split, X_val, y_train_split, y_val = train_test_split(x_train, y_train, test_size=0.2)
将数据集转化成能用于lgb训练的形式
train_split_matrix = lgb.Dataset(X_train_split, label=y_train_split)
val_matrix = lgb.Dataset(X_val, label=y_val)
初步自定义lgb参数
params = {
'boosting_type': 'gbdt', 'objective': 'binary', 'learning_rate': 0.1, 'metric': 'auc', 'min_child_weight': 1e-3,
'num_leaves': 31, 'max_depth': -1, 'reg_lambda': 0, 'reg_alpha': 0, 'feature_fraction': 1, 'bagging_fraction': 1,
'bagging_freq': 0, 'seed': 2020, 'nthread': 8, 'verbose': -1
}
将训练集丢入lgb模型训练
model = lgb.train(params, train_set=train_split_matrix, valid_sets=val_matrix, num_boost_round=20000,
verbose_eval=1000, early_stopping_rounds=200)
''' Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [419] valid_0's auc: 0.735017 '''
用训练好的模型预测验证集
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
计算roc的相关指标
''' 真正类率(True Positive Rate)TPR: TP / (TP + FN),代表分类器预测的正类中实际正实例占所有正实例的比例 纵轴TPR:TPR越大,预测正类中实际正类越多 负正类率(False Positive Rate)FPR: FP / (FP + TN),代表分类器预测的正类中实际负实例占所有负实例的比例 横轴FPR:FPR越大,预测正类中实际负类越多 理想目标:TPR=1,FPR=0,即roc图中的(0,1)点,故ROC曲线越靠拢(0,1)点,越偏离45度对角线越好,Sensitivity、Specificity越大效果越好 '''
fpr, tpr, threshold = metrics.roc_curve(y_val, val_pre_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('未调参前lgb在验证集上的AUC: {}'.format(roc_auc))
'''未调参前lgb在验证集上的AUC: 0.7350165705811689'''
画出roc曲线
plt.figure(figsize=(8, 8))
plt.title('Val ROC')
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc) # 保留四位小数
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], 'r--') # 对角线
使用5折交叉验证进行模型性能评估
cv_scores = [] # 用于存放每次验证的得分
# ## 五折交叉验证评估模型
for i, (train_index, val_index) in enumerate(kf.split(x_train, y_train)):
print('*** {} ***'.format(str(i+1)))
X_train_split, y_train_split, X_val, y_val = x_train.iloc[train_index], y_train[train_index], \
x_train.iloc[val_index], y_train[val_index]
'''划分训练集和验证集'''
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
val_matrix = lgb.Dataset(X_val, label=y_val)
'''转换成lgb训练的数据形式'''
params = {
'boosting_type': 'gbdt', 'objective': 'binary', 'learning_rate': 0.1, 'metric': 'auc', 'min_child_weight': 1e-3,
'num_leaves': 31, 'max_depth': -1, 'reg_lambda': 0, 'reg_alpha': 0, 'feature_fraction': 1,
'bagging_fraction': 1,
'bagging_freq': 0, 'seed': 2020, 'nthread': 8, 'verbose': -1
}
'''给定lgb参数'''
model = lgb.train(params, train_set=train_matrix, num_boost_round=20000, valid_sets=val_matrix, verbose_eval=1000,
early_stopping_rounds=200)
'''训练模型'''
val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
'''预测验证集结果'''
cv_scores.append(roc_auc_score(y_val, val_pre_lgb))
'''将auc加入列表'''
print(cv_scores)
''' *** 1 *** Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [480] valid_0's auc: 0.735706 [0.7357056028032594] *** 2 *** Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [394] valid_0's auc: 0.732804 [0.7357056028032594, 0.7328044319912592] *** 3 *** Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [469] valid_0's auc: 0.736296 [0.7357056028032594, 0.7328044319912592, 0.736295686606251] *** 4 *** Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [480] valid_0's auc: 0.735153 [0.7357056028032594, 0.7328044319912592, 0.736295686606251, 0.7351530881059898] *** 5 *** Training until validation scores don't improve for 200 rounds Early stopping, best iteration is: [481] valid_0's auc: 0.734523 [0.7357056028032594, 0.7328044319912592, 0.736295686606251, 0.7351530881059898, 0.7345226943030314] '''
调参
贪心调参
先使用当前对模型影响最大的参数进行调优,达到当前参数下的模型最优化,再使用对模型影响次之的参数进行调优,如此下去,直到所有的参数调整完毕。
这个方法的缺点就是可能会调到局部最优而不是全局最优,但是只需要一步一步的进行参数最优化调试即可,容易理解。
需要注意的是在树模型中参数调整的顺序,也就是各个参数对模型的影响程度,列举一下日常调参过程中常用的参数和调参顺序:
max_depth、num_leaves
min_data_in_leaf、min_child_weight
bagging_fraction、 feature_fraction、bagging_freq
reg_lambda、reg_alpha
min_split_gain
objective
best_obj = dict()
objective = ['regression', 'regression_l2', 'regression_l1', 'huber', 'fair', 'poisson',
'binary', 'lambdarank', 'multiclass']
for obj in objective:
model = lgb.LGBMRegressor(objective=obj)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_obj[obj] = score
'''针对每种学习目标参数,分别把5次交叉验证的结果取平均值放入字典'''
''' {'regression': 0.7317571771311902, 'regression_l2': 0.7317571771311902, 'regression_l1': 0.5254673662915372, 'huber': 0.7317930010205694, 'fair': 0.7299013530452948, 'poisson': 0.7276315321558192, 'binary': 0.7325703837580402, 'lambdarank': nan, 'multiclass': nan} 分数最高的objective是'binary': 0.7325703837580402 '''
max_depth
best_depth = dict()
max_depth = [4, 6, 8, 10, 12]
for depth in max_depth:
model = lgb.LGBMRegressor(objective='binary', max_depth=depth)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_depth[depth] = score
''' {4: 0.7289917272476384, 6: 0.7318582290988798, 8: 0.7326689825432566, 10: 0.7327216337284277, 12: 0.7326861296973519} 分数最高的depth是 10: 0.7327216337284277 '''
num_leaves - 为了防止过拟合,num_leaves要小于2max_depth(210=1024)
best_leaves = dict()
num_leaves = [60, 80, 100, 120, 140, 160, 180, 200]
for leaf in num_leaves:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=leaf)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_leaves[leaf] = score
''' {60: 0.7338063124202595, 80: 0.7340086888735147, 100: 0.7340517113255459, 120: 0.7339504337283304, 140: 0.733943621732856, 160: 0.7340382165600425, 180: 0.7335684540056998, 200: 0.7331373764276772} 分数最高的num_leaves是 num_leaves 100: 0.7340517113255459 '''
min_data_in_leaf
best_min_leaves = dict()
min_data_in_leaf = [14, 18, 22, 26, 30, 34]
for min_leaf in min_data_in_leaf:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=min_leaf)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_min_leaves[min_leaf] = score
''' {14: 0.7338644336034048, 18: 0.7340150561766138, 22: 0.7340158598138881, 26: 0.7341871752335695, 30: 0.7340615684229571, 34: 0.7340519101378781} 分数最高的 min_leaf是 26: 0.7341871752335695 '''
min_child_weight
best_weight = dict()
min_child_weight = [0.002, 0.004, 0.006, 0.008, 0.010, 0.012]
for min_weight in min_child_weight:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=26,
min_child_weight=min_weight)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_weight[min_weight] = score
''' {0.002: 0.7341871752335695, 0.004: 0.7341871752335695, 0.006: 0.7341871752335695, 0.008: 0.7341871752335695, 0.01: 0.7341871752335695, 0.012: 0.7341871752335695} 都一样,说明min_data_in_leaf和min_child_weight应该是对应的? '''
bagging_fraction + bagging_freq
''' bagging_fraction+bagging_freq参数必须同时设置,bagging_fraction相当于subsample样本采样,可以使bagging更快的运行,同时也可以降拟合。 bagging_freq默认0,表示bagging的频率,0意味着没有使用bagging,k意味着每k轮迭代进行一次bagging '''
best_bagging_fraction = dict()
bagging_fraction = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
for bagging in bagging_fraction:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=26,
bagging_fraction=bagging)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_bagging_fraction[bagging] = score
''' {0.5: 0.7341871752335695, 0.6: 0.734187175233 5695, 0.7: 0.7341871752335695, 0.8: 0.7341871752335695, 0.9: 0.7341871752335695, 0.95: 0.7341871752335695} 没变化 '''
feature_fraction
best_feature_fraction = dict()
feature_fraction = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
for feature in feature_fraction:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=26,
feature_fraction=feature)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_feature_fraction[feature] = score
''' {0.5: 0.7341332691040499, 0.6: 0.7342461204659492, 0.7: 0.7340950793860553, 0.8: 0.734168394330798, 0.9: 0.7342417001187209, 0.95: 0.7340419425131396} 虽然0.6会高一些,但是出于样本特征的使用率我还是想用0.9 '''
reg_lambda
best_reg_lambda = dict()
reg_lambda = [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
for lam in reg_lambda:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=26,
feature_fraction=0.9, reg_lambda=lam)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_reg_lambda[lam] = score
''' {0: 0.7342417001187209, 0.001: 0.7340521878374329, 0.01: 0.7342087379791171, 0.03: 0.7342072587501143, 0.08: 0.7341178131960189, 0.3: 0.7342923823693306, 0.5: 0.7342815855243002} reg_lambda 最优值为 0.3: 0.7342923823693306 '''
reg_alpha
best_reg_alpha = dict()
reg_alpha = [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
for alp in reg_alpha:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=26,
feature_fraction=0.9, reg_lambda=0.3, reg_alpha=alp)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_reg_alpha[alp] = score
''' {0: 0.7342923823693306, 0.001: 0.7342141300723407, 0.01: 0.7342716599336013, 0.03: 0.7342356374031566, 0.08: 0.7342509380457417, 0.3: 0.7341836259662214, 0.5: 0.7342654379571296} reg_alpha 为0时最高 '''
learning_rate
best_learning_rate = dict()
learning_rate = [0.01, 0.05, 0.08, 0.1, 0.12]
for learn in learning_rate:
model = lgb.LGBMRegressor(objective='binary', max_depth=10, num_leaves=100, min_data_in_leaf=26,
feature_fraction=0.9, reg_lambda=0.3, learning_rate=learn)
score = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc').mean()
best_learning_rate[learn] = score
''' {0.01: 0.719100817422237, 0.05: 0.7315198412233572, 0.08: 0.733713956417723, 0.1: 0.7342923823693306, 0.12: 0.7341688215024998} learning_rate 为0.1时最好 '''
网格调参
网格调参+五折交叉验证非常非常非常耗时,建议开始步长选大一点,我步长较小,导致调参耗时非常可怕
''' sklearn 提供GridSearchCV用于进行网格搜索,只需要把模型的参数输进去,就能给出最优化的结果和参数。 相比起贪心调参,网格搜索的结果会更优,但是网格搜索只适合于小数据集,一旦数据的量级上去了,很难得出结果 '''
def get_best_cv_params(learning_rate=0.1, n_estimators=800, num_leaves=100, max_depth=10, feature_fraction=0.9,
min_data_in_leaf=26, reg_lambda=0.3, reg_alpha=0, objective='binary', param_grid=None):
cv_fold = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
'''设置五折交叉验证'''
model_lgb = lgb.LGBMRegressor(learning_rate=learning_rate, n_estimators=n_estimators, num_leaves=num_leaves,
max_depth=max_depth, feature_fraction=feature_fraction,
min_data_in_leaf=min_data_in_leaf, reg_lambda=reg_lambda, reg_alpha=reg_alpha,
objective=objective, n_jobs=-1)
grid_search = GridSearchCV(estimator=model_lgb, cv=cv_fold, param_grid=param_grid, scoring='roc_auc')
grid_search.fit(x_train, y_train)
print('模型当前最优参数为: {}'.format(grid_search.best_params_))
print('模型当前最优得分为: {}'.format(grid_search.best_score_))
先调 max_depth和 num_leaves
lgb_params = { 'num_leaves': range(80, 120, 5), 'max_depth': range(6, 14, 2)}
get_best_cv_params(param_grid=lgb_params)
''' 模型当前最优参数为: {'max_depth': 6, 'num_leaves': 80} 模型当前最优得分为: 0.7349883936428184 '''
min_data_in_leaf和min_child_weight
lgb_params = { 'min_data_in_leaf': range(20, 60, 5)}
get_best_cv_params(param_grid=lgb_params, max_depth=6, num_leaves=80)
''' 模型当前最优参数为: {'min_data_in_leaf': 45} 模型当前最优得分为: 0.7352238437118113 '''
feature_fraction
lgb_params = { 'feature_fraction': [i / 10 for i in range(5, 10, 1)]}
get_best_cv_params(param_grid=lgb_params, max_depth=6, num_leaves=80, min_data_in_leaf=45)
''' 模型当前最优参数为: {'feature_fraction': 0.5} 模型当前最优得分为: 0.7357516064800039 '''
reg_lambda 和 reg_alpha
lgb_params = { 'reg_alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6], 'reg_lambda': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]}
get_best_cv_params(param_grid=lgb_params, max_depth=6, num_leaves=80, min_data_in_leaf=45, feature_fraction=0.5, )
''' 模型当前最优参数为: {'reg_alpha': 0.5, 'reg_lambda': 0.4} 模型当前最优得分为: 0.7358840540809432 '''
总之,看似每一个参数的选择非常简短快速,实际调参过程非常漫长,建议增大步长缩小范围以后再精细调参。
贝叶斯调参
''' 贝叶斯优化是一种用模型找到函数最小值方法 贝叶斯方法与随机或网格搜索的不同之处在于:它在尝试下一组超参数时,会参考之前的评估结果,因此可以省去很多无用功 贝叶斯调参法使用不断更新的概率模型,通过推断过去的结果来'集中'有希望的超参数 贝叶斯优化问题的四个部分 1.目标函数 - 机器学习模型使用该组超参数在验证集上的损失 它的输入为一组超参数,输出需要最小化的值(交叉验证损失) 2.域空间 - 要搜索的超参数的取值范围 在搜索的每次迭代中,贝叶斯优化算法将从域空间为每个超参数选择一个值 当我们进行随机或网格搜索时,域空间是一个网格 而在贝叶斯优化中,不是按照顺序()网格)或者随机选择一个超参数,而是按照每个超参数的概率分布选择 3.优化算法 - 构造替代函数并选择下一个超参数值进行评估的方法 4.来自目标函数评估的存储结果,包括超参数和验证集上的损失 '''
定义目标函数,我们要这个目标函数输出的值最大
def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq, min_data_in_leaf,
min_child_weight, min_split_gain, reg_lambda, reg_alpha):
val = cross_val_score(
lgb.LGBMRegressor(
boosting_type='gbdt', objective='binary', metrics='auc', learning_rate=0.1, n_estimators=5000,
num_leaves=int(num_leaves), max_depth=int(max_depth), bagging_fraction=round(bagging_fraction, 2),
feature_fraction=round(feature_fraction, 2), bagging_freq=int(bagging_freq),
min_data_in_leaf=int(min_data_in_leaf), min_child_weight=min_child_weight,
min_split_gain=min_split_gain, reg_lambda=reg_lambda, reg_alpha=reg_alpha, n_jobs=-1
), x_train, y_train, cv=5, scoring='roc_auc'
).mean()
return val
定义优化参数(域空间)
rf_bo = BayesianOptimization(
rf_cv_lgb,
{
'num_leaves': (10, 200),
'max_depth': (3, 20),
'bagging_fraction': (0.5, 1.0),
'feature_fraction': (0.5, 1.0),
'bagging_freq': (0, 100),
'min_data_in_leaf': (10, 100),
'min_child_weight': (0, 10),
'min_split_gain': (0.0, 1.0),
'reg_alpha': (0.0, 10),
'reg_lambda': (0.0, 10)
}
)
开始优化,这里我会有15次迭代后的得分,我取了最高的一次贴上来
rf_bo.maximize(n_iter=10)
''' | iter | target | baggin... | baggin... | featur... | max_depth | min_ch... | min_da... | min_sp... | num_le... | reg_alpha | reg_la... | | 14 | 0.7367 | 0.8748 | 21 .07 | 0.9624 | 4.754 | 0.3129 | 21.14 | 0.4187 | 178.2 | 9.991 | 9.528 | '''
根据优化后的参数建立新的模型,降低学习率并寻找最优模型迭代次数
'''调整一个较小的学习率,并通过cv函数确定当前最优的迭代次数'''
base_params_lgb = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 178,
'max_depth': 5,
'min_data_in_leaf': 21,
'min_child_weight': 0.31,
'bagging_fraction': 0.88,
'feature_fraction': 0.96,
'bagging_freq': 21,
'reg_lambda': 9.5,
'reg_alpha': 10,
'min_split_gain': 0.42,
'nthread': 8,
'seed': 2020,
'silent': True,
'verbose': -1
}
train_matrix = lgb.Dataset(x_train, label=y_train)
cv_result_lgb = lgb.cv(
train_set=train_matrix,
early_stopping_rounds=1000,
num_boost_round=20000,
nfold=5,
stratified=True,
shuffle=True,
params=base_params_lgb,
metrics='auc',
seed=2020
)
print('迭代次数 {}'.format(len(cv_result_lgb['auc-mean'])))
print('最终模型的AUC为 {}'.format(max(cv_result_lgb['auc-mean'])))
''' 迭代次数 9364 最终模型的AUC为 0.7378500759884923 '''
模型参数已经确定,建立最终模型并对验证集进行验证
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(x_train, y_train)):
print('*** {} ***'.format(str(i+1)))
x_train_split, y_train_split, x_valid, y_valid = x_train.iloc[train_index], y_train[train_index], \
x_train.iloc[valid_index], y_train[valid_index]
train_matrix = lgb.Dataset(x_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(x_valid, label=y_valid)
params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'learning_rate': 0.01,
'num_leaves': 178,
'max_depth': 5,
'min_data_in_leaf': 21,
'min_child_weight': 0.31,
'bagging_fraction': 0.88,
'feature_fraction': 0.96,
'bagging_freq': 21,
'reg_lambda': 9.5,
'reg_alpha': 10,
'min_split_gain': 0.42,
'nthread': 8,
'seed': 2020,
'silent': True,
'verbose': -1
}
model = lgb.train(params, train_set=train_matrix, num_boost_round=9364, valid_sets=valid_matrix,
verbose_eval=1000, early_stopping_rounds=200)
val_pred = model.predict(x_valid, num_iteration=model.best_iteration)
cv_scores.append(roc_auc_score(y_valid, val_pred))
print(cv_scores)
print('lgb_scotrainre_list: {}'.format(cv_scores))
print('lgb_score_mean: {}'.format(np.mean(cv_scores)))
print('lgb_score_std: {}'.format(np.std(cv_scores)))
''' lgb_scotrainre_list: [0.7386297996035015, 0.7356995636689628, 0.73900352698853, 0.7382979036633256, 0.7369681848895435] lgb_score_mean: 0.7377197957627727 lgb_score_std: 0.0012211910753377566 '''
使用训练集数据进行模型训练
final_model_lgb = lgb.train(base_params_lgb, train_set=train_matrix, valid_sets=valid_matrix, num_boost_round=13000,
verbose_eval=1000, early_stopping_rounds=200)
预测,并计算roc的相关指标
val_pred_lgb = final_model_lgb.predict(x_valid)
fpr, tpr, threshold = metrics.roc_curve(y_valid, val_pred_lgb)
roc_auc = metrics.auc(fpr, tpr)
print('调参后lgb在验证集上的AUC: {}'.format(roc_auc))
'''调参后lgb在验证集上的AUC: 0.7369681848895435'''
plt.figure(figsize=(8, 8))
plt.title('Validation ROC')
plt.plot(fpr, tpr, 'b', label='Val AUC = %0.4f' % roc_auc)
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.legend(loc='best')
plt.title('ROC')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot([0, 1], [0, 1], 'r--')
我的图片没有保存,总之结果和调参前相差不大
保存模型到本地
pickle.dump(final_model_lgb, open('model/model_lgb_1.pkl', 'wb'))
总结
这次调参下来,发现费了很大的功夫,模型的效果提升微乎其微,所以调参的优先级应该排在特征工程之后。选择什么样的模型,以及选择哪些数据作为特征训练,特征应该进行怎样的处理,这些特征工程对于分数的提高应该更大。
下一节在尝试模型融合之前,我会尝试用不同的模型先简单测试,看一下哪些模型适合该场景,另外在训练前,我需要先根据特征的重要性做一个特征的筛选,最后训练2-3个模型后再进行融合。希望分数能有一个大的提高。