Competition page: http://www.scdata.net.cn/kfds/urgent2/pages/index.html . Fraud phone-call identification was one of the problems in the algorithm track.
Note: since I signed a non-disclosure agreement, this article does not include any of the data the code uses.
How I got here: in the preliminary round, a baseline shared by a strong competitor already scored very high, so I only tuned it a little and then dropped out. When the preliminary round ended I unexpectedly received an SMS saying I had advanced to the semifinal, so I signed the NDA, downloaded the data, and only then started competing in earnest. Blending my results with another competitor's could have reached the top 4, but unfortunately that submission was not the one selected.
The baseline this article builds on: https://github.com/biaobiao2/DC_phone
Brief description of the solution:
Feature engineering: mostly statistical features. On top of the baseline features I added sem, skew and similar statistics; in practice sem and skew proved to be excellent features for this dataset.
Modeling: lgb + xgb + cat. The three models' predicted probabilities are blended with weights 0.25 : 0.25 : 0.5; samples whose blended probability of class 1 exceeds 0.6 are labeled 1, the rest 0.
Result: 0.9005 on the semifinal B leaderboard.
Other tricks: the training features are left unfilled, while the test data is filled with a per-column quantile. For each single model, I try candidate quantiles as the fill value, compute the sum over all predictions of the absolute difference between the class-0 and class-1 probabilities, and keep the quantile that maximizes this sum (a sketch follows below). Judging by the B leaderboard, this brings an improvement of a few thousandths.
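The quantile search itself does not appear in the code below (the chosen quantiles 0.39, 0.34 and 0.319 show up as constants), so here is a minimal sketch of the idea; model and test_df are hypothetical stand-ins for a fitted classifier with predict_proba and the unfilled test feature frame:
import numpy as np
def best_fill_quantile(model, test_df, candidates=np.arange(0.05, 0.95, 0.005)):
    # try each candidate quantile as the fill value and keep the one that
    # maximizes the model's total confidence: sum of |P(1) - P(0)|
    best_q, best_score = None, -np.inf
    for q in candidates:
        filled = test_df.fillna(test_df.quantile(q, numeric_only=True))
        proba = model.predict_proba(filled)
        score = np.abs(proba[:, 1] - proba[:, 0]).sum()
        if score > best_score:
            best_q, best_score = q, score
    return best_q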
Code:
Feature engineering:
# coding=utf-8
'''
@author: csdn xuxml
'''
import os
import gc
import time
import psutil
import datetime
import numpy as np
import pandas as pd
import catboost as cat
import lightgbm as lgb
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from scipy import stats
from scipy.stats import entropy, pearsonr
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth",100)
pd.set_option('display.max_rows', None)
pd.set_option('display.width',100)
path = "./0527/"
feat_path = path + "data/"
def get_app_feats(df):
phones_app = df[["phone_no_m"]].copy()
phones_app = phones_app.drop_duplicates(subset=['phone_no_m'], keep='last')
tmp = df.groupby("phone_no_m")["busi_name"].agg(busi_count="nunique")
phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
"""使用的流量统计
"""
tmp = df.groupby("phone_no_m")["flow"].agg(flow_mean="mean",
flow_median = "median",
flow_min = "min",
flow_max = "max",
flow_var = "var",
flow_skew = "skew",
flow_std = "std",
flow_quantile = "quantile",
flow_sem = "sem",
flow_sum = "sum")
phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
tmp = df.groupby("phone_no_m")["month_id"].agg(month_ids ="nunique")
phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    # average monthly flow usage
phones_app["flow_month"] = phones_app["flow_sum"] / phones_app["month_ids"]
return phones_app
def get_voc_feat(df):
df["start_datetime"] = pd.to_datetime(df['start_datetime'] )
df["hour"] = df['start_datetime'].dt.hour
df["day"] = df['start_datetime'].dt.day
phone_no_m = df[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # number of distinct contacts and number of calls
tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(opposite_count="count", opposite_unique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
"""主叫通话
"""
df_call = df[df["calltype_id"]==1].copy()
tmp = df_call.groupby("phone_no_m")["imei_m"].agg(voccalltype1="count", imeis="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
phone_no_m["voc_calltype1"] = phone_no_m["voccalltype1"] / phone_no_m["opposite_count"]
tmp = df_call.groupby("phone_no_m")["city_name"].agg(city_name_call="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
tmp = df_call.groupby("phone_no_m")["county_name"].agg(county_name_call="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
"""和固定通话者的对话统计
"""
tmp = df.groupby(["phone_no_m","opposite_no_m"])["call_dur"].agg(count="count", sum="sum")
phone2opposite = tmp.groupby("phone_no_m")["count"].agg(phone2opposite_mean="mean"
, phone2opposite_median="median"
, phone2opposite_max="max"
, phone2opposite_min="min"
, phone2opposite_var="var"
, phone2opposite_skew="skew"
, phone2opposite_sem="sem"
, phone2opposite_std="std"
, phone2opposite_quantile="quantile"
)
phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
phone2opposite = tmp.groupby("phone_no_m")["sum"].agg(phone2oppo_sum_mean="mean"
, phone2oppo_sum_median="median"
, phone2oppo_sum_max="max"
, phone2oppo_sum_min="min"
, phone2oppo_sum_var="var"
, phone2oppo_sum_skew="skew"
, phone2oppo_sum_sem="sem"
, phone2oppo_sum_std="std"
, phone2oppo_sum_quantile="quantile"
)
phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
"""通话时间长短统计
"""
tmp = df.groupby("phone_no_m")["call_dur"].agg(call_dur_mean="mean"
, call_dur_median="median"
, call_dur_max="max"
, call_dur_min="min"
, call_dur_var="var"
, call_dur_skew="skew"
, call_dur_sem="sem"
, call_dur_std="std"
, call_dur_quantile="quantile"
)
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
tmp = df.groupby("phone_no_m")["city_name"].agg(city_name_nunique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
tmp = df.groupby("phone_no_m")["county_name"].agg(county_name_nunique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
tmp = df.groupby("phone_no_m")["calltype_id"].agg(calltype_id_unique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
"""通话时间点偏好
"""
tmp = df.groupby("phone_no_m")["hour"].agg(voc_hour_mode = lambda x:stats.mode(x)[0][0],
voc_hour_mode_count = lambda x:stats.mode(x)[1][0],
voc_hour_nunique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
tmp = df.groupby("phone_no_m")["day"].agg(voc_day_mode = lambda x:stats.mode(x)[0][0],
voc_day_mode_count = lambda x:stats.mode(x)[1][0],
voc_day_nunique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
return phone_no_m
def get_sms_feats(df):
df['request_datetime'] = pd.to_datetime(df['request_datetime'] )
df["hour"] = df['request_datetime'].dt.hour
df["day"] = df['request_datetime'].dt.day
phone_no_m = df[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # number of distinct contacts and number of messages
tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(sms_count="count", sms_nunique="nunique")
tmp["sms_rate"] = tmp["sms_count"]/tmp["sms_nunique"]
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
"""短信下行比例
"""
calltype2 = df[df["calltype_id"]==2].copy()
calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(calltype_2="count")
phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
phone_no_m["calltype_rate"] = phone_no_m["calltype_2"] / phone_no_m["sms_count"]
"""短信时间
"""
tmp = df.groupby("phone_no_m")["hour"].agg(hour_mode = lambda x:stats.mode(x)[0][0],
hour_mode_count = lambda x:stats.mode(x)[1][0],
hour_nunique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
tmp = df.groupby("phone_no_m")["day"].agg(day_mode = lambda x:stats.mode(x)[0][0],
day_mode_count = lambda x:stats.mode(x)[1][0],
day_nunique="nunique")
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
return phone_no_m
def feats():
test_voc=pd.read_csv(path+'test/test_voc.csv',)
test_voc_feat = get_voc_feat(test_voc)
test_voc_feat.to_csv(feat_path + "test_voc_feat.csv", index=False)
test_app=pd.read_csv(path+'test/test_app.csv',)
test_app_feat = get_app_feats(test_app)
test_app_feat.to_csv(feat_path + "test_app_feat.csv", index=False)
test_sms=pd.read_csv(path+'test/test_sms.csv',)
test_sms_feat = get_sms_feats(test_sms)
test_sms_feat.to_csv(feat_path + "test_sms_feat.csv", index=False)
train_voc=pd.read_csv(path+'train/train_voc.csv',)
train_voc_feat = get_voc_feat(train_voc)
train_voc_feat.to_csv(feat_path + "train_voc_feat.csv", index=False)
train_app=pd.read_csv(path+'train/train_app.csv',)
train_app_feat = get_app_feats(train_app)
train_app_feat.to_csv(feat_path + "train_app_feat.csv", index=False)
train_sms=pd.read_csv(path+'train/train_sms.csv',)
train_sms_feat = get_sms_feats(train_sms)
train_sms_feat.to_csv(feat_path + "train_sms_feat.csv", index=False)
test_vocfs=pd.read_csv(path + 'zpfsdata/test_voc.csv',)
test_voc_featfs = get_voc_feat(test_vocfs)
test_voc_featfs.to_csv(path + "zpfsdata/test_voc_feat.csv", index=False)
test_appfs=pd.read_csv(path + 'zpfsdata/test_app.csv',)
test_app_featfs = get_app_feats(test_appfs)
test_app_featfs.to_csv(path + "zpfsdata/test_app_feat.csv", index=False)
test_smsfs=pd.read_csv(path + 'zpfsdata/test_sms.csv',)
test_sms_featfs = get_sms_feats(test_smsfs)
test_sms_featfs.to_csv(path + "zpfsdata/test_sms_feat.csv", index=False)
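A portability note, assuming you rerun this with a recent SciPy: the stats.mode(x)[0][0] pattern in the hour/day aggregations above relies on the old array-returning behaviour of scipy.stats.mode, whose default changed in SciPy 1.11. With SciPy >= 1.9 (which introduced the keepdims argument) the lambdas can be made version-safe like this:
from scipy import stats
hour_mode = lambda x: stats.mode(x, keepdims=True)[0][0]        # most frequent value
hour_mode_count = lambda x: stats.mode(x, keepdims=True)[1][0]  # its frequency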
Generate the features:
#create and save voc, app and sms features
feats()
Data loading:
#load april features
test_app_feat=pd.read_csv(feat_path+'test_app_feat.csv')
test_voc_feat=pd.read_csv(feat_path+'test_voc_feat.csv')
test_sms_feat=pd.read_csv(feat_path + "test_sms_feat.csv")
test_user=pd.read_csv(path+'test/test_user.csv')
test_user = test_user.merge(test_app_feat, on="phone_no_m", how="left")
test_user = test_user.merge(test_voc_feat, on="phone_no_m", how="left")
test_user = test_user.merge(test_sms_feat, on="phone_no_m", how="left")
test_user["city_name"] = LabelEncoder().fit_transform(test_user["city_name"].astype(np.str))
test_user["county_name"] = LabelEncoder().fit_transform(test_user["county_name"].astype(np.str))
#load april label
test_user_lb1 = pd.read_csv(path + 'zpfsdata/4yuelabel1.csv')
test_user_lb2 = pd.read_csv(path + 'zpfsdata/4yuelabel2.csv')
#concat april label and merge with features
test_user_label = pd.concat([test_user_lb1, test_user_lb2])
test_user = test_user.merge(test_user_label, on="phone_no_m", how="left")
test_user.rename(columns={"arpu_202004":"arpu_202005"},inplace=True)
#load train features and label
train_app_feat = pd.read_csv(feat_path + "train_app_feat.csv")
train_voc_feat = pd.read_csv(feat_path + "train_voc_feat.csv")
train_sms_feat = pd.read_csv(feat_path + "train_sms_feat.csv")
train_user=pd.read_csv(path+'train/train_user.csv')
drop_r = ["arpu_201908","arpu_201909","arpu_201910","arpu_201911","arpu_201912","arpu_202001","arpu_202002"]
train_user.drop(drop_r, axis=1,inplace=True)
train_user.rename(columns={"arpu_202003":"arpu_202005"},inplace=True)
train_user = train_user.merge(train_app_feat, on="phone_no_m", how="left")
train_user = train_user.merge(train_voc_feat, on="phone_no_m", how="left")
train_user = train_user.merge(train_sms_feat, on="phone_no_m", how="left")
train_user["city_name"] = LabelEncoder().fit_transform(train_user["city_name"].astype(np.str))
train_user["county_name"] = LabelEncoder().fit_transform(train_user["county_name"].astype(np.str))
#concat preliminary-round data (train and the labeled april test)
train_user = pd.concat([train_user, test_user])
#final label
train_label = train_user[["label"]].copy()
#drop phone_no_m
test_user.drop(["phone_no_m"], axis=1,inplace=True)
train_user.drop(["phone_no_m", "label"], axis=1,inplace=True)
#load final test features as *fs ("fs" = fusai, i.e. the semifinal)
test_app_featfs=pd.read_csv(path + 'zpfsdata/test_app_feat.csv')
test_voc_featfs=pd.read_csv(path + 'zpfsdata/test_voc_feat.csv')
test_sms_featfs=pd.read_csv(path + 'zpfsdata/test_sms_feat.csv')
test_userfs=pd.read_csv(path + 'zpfsdata/test_user.csv')
test_userfs = test_userfs.merge(test_app_featfs, on="phone_no_m", how="left")
test_userfs = test_userfs.merge(test_voc_featfs, on="phone_no_m", how="left")
test_userfs = test_userfs.merge(test_sms_featfs, on="phone_no_m", how="left")
test_userfs["city_name"] = LabelEncoder().fit_transform(test_userfs["city_name"].astype(np.str))
test_userfs["county_name"] = LabelEncoder().fit_transform(test_userfs["county_name"].astype(np.str))
#create submission dataframe
submission = test_userfs[["phone_no_m"]].copy()
#drop phone_no_m
test_userfs.drop(["phone_no_m"], axis=1,inplace=True)
test_userfs.replace([u'\\N'], np.nan, inplace=True)
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
#keep the unfilled frame so each model can apply its own fill quantile
test_userfs_ori = test_userfs
Model building:
The dataset is small and the server was powerful enough, so the modeling part is simple and brute-force: straight grid search.
depth = 8
cv = 5
#catboost: the grid is fixed to the final tuned parameters
cat_grid = {'depth':[depth]
, 'bootstrap_type':['Bernoulli']
, 'od_type':['Iter']
, 'l2_leaf_reg':[15]
, 'learning_rate': [0.1]
, 'allow_writing_files':[False]
, 'silent':[True]
}
#search and fit
catgrid = GridSearchCV(cat.CatBoostClassifier(), param_grid=cat_grid, cv=cv, scoring='f1_macro', n_jobs=-1, verbose = 10)
catgrid.fit( train_user, train_label['label'] )
#predict probabilities; 0.39 is the fill quantile found by the confidence-sum search
test_userfs = test_userfs_ori.fillna( test_userfs_ori.quantile(0.39) )
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
cat_proba = catgrid.predict_proba( test_userfs )
rslt_prob_cat = pd.DataFrame( cat_proba )
rslt_prob_cat.columns = ['lb0','lb1']
#create lgb model
#final parameters
lgb_grid = {'boosting_type':['gbdt']
, 'num_leaves':[256]
, 'min_child_weight':[4]
, 'feature_fraction':[0.7]
, 'bagging_fraction':[0.8]
, 'bagging_freq': [1]
}
#search and fit
lgbgrid = GridSearchCV(lgb.LGBMClassifier(), param_grid=lgb_grid, cv=cv, scoring='f1_macro', n_jobs=-1, verbose = 10)
lgbgrid.fit( train_user, train_label['label'] )
#predict probabilities; 0.34 is the fill quantile found by the same search
test_userfs = test_userfs_ori.fillna( test_userfs_ori.quantile(0.34) )
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
lgb_proba = lgbgrid.predict_proba( test_userfs )
rslt_prob_lgb = pd.DataFrame(lgb_proba)
rslt_prob_lgb.columns = ['lb0','lb1']
#create xgb model with the final parameters
from xgboost import XGBClassifier
xgbclf=XGBClassifier(base_score=0.5
, booster='gbtree'
, colsample_bytree=0.9
, learning_rate=0.1
, max_depth=8
, min_child_weight=7
, n_estimators=100
, n_jobs=-1
, objective='binary:logistic'
, subsample=0.75
, verbosity=1)
#fit
xgbclf.fit( train_user, train_label['label'] )
#predict probabilities; 0.319 is the fill quantile found by the same search
test_userfs = test_userfs_ori.fillna( test_userfs_ori.quantile(0.319) )
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
xgb_proba = xgbclf.predict_proba( test_userfs )
rslt_prob_xgb = pd.DataFrame(xgb_proba)
rslt_prob_xgb.columns = ['lb0','lb1']
Adjust the probability outputs:
bestnew112 = 0.25*rslt_prob_lgb + 0.25*rslt_prob_xgb + 0.5*rslt_prob_cat
bestnew112["label"]=bestnew112["lb1"]
bestnew112["label"][bestnew112.label>60/100]=1
bestnew112["label"][bestnew112.label<60/100]=0
submission['label'] = bestnew112['label']
print(submission['label'].value_counts())
print(submission['label'].value_counts()/submission.shape[0])
submission.to_csv('lgb25xgb25cat50threshold60.csv',index=None)
The above scores 0.9005 on the semifinal B leaderboard.