1. 一、前期工作 1.1. 导入库 本文用到的库有 numpy、pandas、matplotlib、seaborn、sklearn、lightgbm,以及用于贝叶斯调参的 bayes_opt 和用于保存模型的 joblib。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score, mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb
1.2. 数据读取 采用 pandas 读取训练数据集:

data_train = pd.read_csv('train.csv')
2. 二、数据清洗与整理 原有数据中有很多我们无法使用的信息,如一些非数值信息,或者是无效的信息,缺失的信息,为了后面建模的方便,我们将数据进行清洗。
2.1. 缺失值处理 missing_values = data_train.isnull().sum () missing_values
个人年龄 0 个人性别 2 个人教育程度 0 个人收入 0 个人工作经验 0 个人房屋所有权 0 借记卡余额 0 信用卡使用意向 0 信用卡利率 0 贷款占收入的百分比 0 个人信用历史时长 0 信用评分 0 档案中以往贷款违约情况 0 信用卡状态 0 dtype: int64
去除缺失值 data_train = data_train.dropna()
再次检查数据集,确保所有缺失值已被移除:

missing_values_cleaned = data_train.isnull().sum()
missing_values_cleaned
个人年龄 0 个人性别 0 个人教育程度 0 个人收入 0 个人工作经验 0 个人房屋所有权 0 借记卡余额 0 信用卡使用意向 0 信用卡利率 0 贷款占收入的百分比 0 个人信用历史时长 0 信用评分 0 档案中以往贷款违约情况 0 信用卡状态 0 dtype: int64
2.2. 非数值数据独热编码 独热编码是针对具有明确分类值的数据进行预处理的有效方法,通过将每个分类值转换为独立的二进制向量,确保模型正确理解非数值分类特征,避免数值关系的误判。 独热编码特别适用于处理那些具有明确、有限且通常不带有数值意义的分类值的数据。

data_train = pd.get_dummies(data_train)
3. 三、相关性分析 相关性分析是一种统计学方法,用于研究两个或多个变量之间的关系强度和方向,旨在衡量不同因素之间的相关密切程度,帮助人们理解变量之间的相互作用,以及一个变量的变化如何影响另一个变量。它广泛应用于数据分析、市场研究、科学研究等领域,是一种发现数据中的模式、趋势和关联的重要工具。
3.1. 热力图

# SimHei is needed so the Chinese column names and title render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# SimHei has no glyph for the Unicode minus sign, so fall back to the ASCII
# hyphen; otherwise negative tick labels are drawn as empty boxes.
plt.rcParams['axes.unicode_minus'] = False

plt.figure(figsize=(20, 10))
# Pairwise correlations of every column, annotated with the coefficient.
sns.heatmap(data_train.corr(), cmap='coolwarm', annot=True)
plt.title('相关性热力图')
plt.show()
3.2. 成对关系图 sns.pairplot(data_train, corner = True )
https://raw.githubusercontent.com/tanyunhao/picture/master/宇宙太空-黑白星星太空_爱给网_aigei_com.gif
data_train.corr()['信用卡状态' ].sort_values(ascending=True ).plot(kind='bar' )
3.3. 去除相关性较低的变量

# Columns removed after the correlation analysis above.
low_correlation_columns = [
    '信用评分',
    '个人教育程度_Master',
    '个人性别_male',
    '个人教育程度_Associate',
    '个人教育程度_Bachelor',
    '个人性别_female',
    '个人教育程度_Doctorate',
    '借记卡余额',
    '个人教育程度_High School',
]
data_train = data_train.drop(low_correlation_columns, axis=1)

# NOTE(review): the sections below use X_train / y_train / X_val / y_val, but
# nothing in this document creates them. The split below is a reconstruction:
#   - '信用卡状态' is assumed to be the target, because the correlation bar
#     chart above is computed against that column -- TODO confirm;
#   - test_size=0.2 and random_state=42 are assumptions (42 matches the seed
#     used for every model below) -- TODO confirm against the original run.
from sklearn.model_selection import train_test_split

X = data_train.drop('信用卡状态', axis=1)
y = data_train['信用卡状态']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
4. 四、模型建立及参数优化 4.1. 随机森林 def rf_cv (n_estimators, max_depth, min_samples_split ): model = RandomForestClassifier( n_estimators=int (n_estimators), max_depth=None if max_depth < 1 else int (max_depth), min_samples_split=int (min_samples_split), random_state=42 ) return cross_val_score(model, X_train, y_train, scoring='roc_auc' , cv=3 ).mean() rf_bo = BayesianOptimization( rf_cv, {'n_estimators' : (100 , 200 ), 'max_depth' : (1 , 20 ), 'min_samples_split' : (2 , 5 )} ) rf_bo.maximize(init_points=10 , n_iter=10 )
优化结果
|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| 1         | 0.9511    | 2.858     | 3.179     | 166.5     |
| 2         | 0.9627    | 6.985     | 2.859     | 174.6     |
| 3         | 0.9671    | 8.686     | 3.493     | 105.4     |
| 4         | 0.963     | 6.566     | 4.675     | 101.0     |
| 5         | 0.9708    | 14.54     | 3.967     | 166.8     |
| 6         | 0.9708    | 18.88     | 2.042     | 110.1     |
| 7         | 0.9607    | 5.095     | 4.083     | 164.9     |
| 8         | 0.9708    | 13.2      | 4.364     | 107.0     |
| 9         | 0.971     | 16.8      | 3.127     | 186.8     |
| 10        | 0.9708    | 14.72     | 4.795     | 129.8     |
| 11        | 0.9708    | 15.39     | 4.64      | 111.2     |
| 12        | 0.9707    | 18.96     | 4.923     | 104.8     |
| 13        | 0.9708    | 13.43     | 4.612     | 160.0     |
| 14        | 0.9709    | 19.87     | 2.322     | 162.7     |
| 15        | 0.9711    | 15.84     | 2.986     | 121.6     |
| 16        | 0.967     | 8.499     | 3.989     | 124.1     |
| 17        | 0.971     | 19.98     | 2.443     | 128.4     |
| 18        | 0.971     | 19.7      | 2.395     | 194.4     |
| 19        | 0.9699    | 11.22     | 3.639     | 193.9     |
| 20        | 0.9708    | 19.67     | 3.937     | 137.6     |
=============================================================
4.2. 梯度提升

def gb_cv(n_estimators, learning_rate, max_depth):
    """Return the mean 3-fold cross-validated ROC AUC of a gradient-boosting model.

    ``n_estimators`` and ``max_depth`` arrive as floats from the optimizer
    and are truncated with ``int``; ``learning_rate`` is used as given.

    Reads ``X_train`` and ``y_train`` from module scope.
    """
    model = GradientBoostingClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


# Search space: (lower, upper) bound for each hyperparameter.
gb_bo = BayesianOptimization(
    gb_cv,
    {'n_estimators': (100, 200), 'learning_rate': (0.01, 0.1), 'max_depth': (3, 5)},
    # Seed the optimizer as well as the model so the search is repeatable.
    random_state=42,
)
gb_bo.maximize(init_points=10, n_iter=10)
优化结果

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| 1         | 0.9732    | 0.09128   | 4.884     | 118.4     |
| 2         | 0.9648    | 0.0332    | 3.908     | 103.6     |
| 3         | 0.967     | 0.02648   | 3.823     | 175.7     |
| 4         | 0.9696    | 0.04652   | 3.293     | 163.1     |
| 5         | 0.9698    | 0.04087   | 4.85      | 129.7     |
| 6         | 0.9702    | 0.07502   | 3.998     | 117.6     |
| 7         | 0.9624    | 0.01339   | 4.905     | 119.4     |
| 8         | 0.9729    | 0.09138   | 4.891     | 118.4     |
| 9         | 0.967     | 0.02536   | 4.512     | 118.5     |
| 10        | 0.9684    | 0.03262   | 4.959     | 118.7     |
| 11        | 0.9719    | 0.07358   | 4.894     | 118.1     |
| 12        | 0.9721    | 0.07516   | 4.746     | 117.7     |
| 13        | 0.9694    | 0.03989   | 4.997     | 117.8     |
| 14        | 0.9729    | 0.09002   | 4.53      | 117.9     |
| 15        | 0.9635    | 0.01725   | 3.793     | 172.6     |
| 16        | 0.9671    | 0.03594   | 3.626     | 130.9     |
| 17        | 0.9718    | 0.07058   | 4.554     | 117.5     |
| 18        | 0.9679    | 0.03026   | 4.382     | 117.7     |
| 19        | 0.9712    | 0.06166   | 4.666     | 118.1     |
| 20        | 0.973     | 0.09644   | 4.759     | 117.3     |
| 21        | 0.9691    | 0.03482   | 3.23      | 196.8     |
| 22        | 0.9677    | 0.02879   | 4.576     | 117.1     |
| 23        | 0.9665    | 0.02399   | 4.813     | 117.5     |
| 24        | 0.9709    | 0.07578   | 3.795     | 142.4     |
| 25        | 0.9716    | 0.07247   | 4.299     | 111.6     |
| 26        | 0.9721    | 0.07639   | 4.694     | 117.9     |
| 27        | 0.9678    | 0.02956   | 4.488     | 118.0     |
| 28        | 0.9741    | 0.09322   | 4.169     | 146.5     |
| 29        | 0.973     | 0.07774   | 4.264     | 146.5     |
| 30        | 0.9735    | 0.08407   | 4.012     | 146.4     |
=============================================================
4.3. LightGBM

def lgb_cv(n_estimators, learning_rate, max_depth):
    """Return the mean 3-fold cross-validated ROC AUC of a LightGBM classifier.

    ``n_estimators`` and ``max_depth`` arrive as floats from the optimizer
    and are truncated with ``int``; ``learning_rate`` is used as given.

    Reads ``X_train`` and ``y_train`` from module scope.
    """
    model = lgb.LGBMClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


# Search space: (lower, upper) bound for each hyperparameter.
lgb_bo = BayesianOptimization(
    lgb_cv,
    {'n_estimators': (100, 200), 'learning_rate': (0.01, 0.1), 'max_depth': (3, 10)},
    # Seed the optimizer as well as the model so the search is repeatable.
    random_state=42,
)
lgb_bo.maximize(init_points=10, n_iter=10)
4.4. AdaBoost

def ada_cv(n_estimators, learning_rate):
    """Return the mean 3-fold cross-validated ROC AUC of an AdaBoost classifier.

    ``n_estimators`` arrives as a float from the optimizer and is truncated
    with ``int``; ``learning_rate`` is used as given.

    Reads ``X_train`` and ``y_train`` from module scope.
    """
    model = AdaBoostClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


# Search space: (lower, upper) bound for each hyperparameter.
ada_bo = BayesianOptimization(
    ada_cv,
    {'n_estimators': (50, 200), 'learning_rate': (0.1, 1.0)},
    # Seed the optimizer as well as the model so the search is repeatable.
    random_state=42,
)
ada_bo.maximize(init_points=10, n_iter=10)
4.5. 提取模型的最佳参数

def get_best_params(bo):
    """Return the best parameters found by ``bo`` in a form the models accept.

    Takes ``bo.max['params']`` and converts the entries that must be
    integers, mirroring the conversions done inside the ``*_cv`` objective
    functions:

    - ``n_estimators`` is always truncated to ``int``;
    - ``max_depth``, when present, becomes ``None`` if below 1, else ``int``;
    - ``min_samples_split``, when present, is truncated to ``int``.

    Any other entry (for example ``learning_rate``) is left untouched.
    The dictionary taken from ``bo.max`` is modified in place and returned.
    """
    best = bo.max['params']

    best['n_estimators'] = int(best['n_estimators'])

    if 'max_depth' in best:
        depth = best['max_depth']
        if depth < 1:
            best['max_depth'] = None
        else:
            best['max_depth'] = int(depth)

    if 'min_samples_split' in best:
        best['min_samples_split'] = int(best['min_samples_split'])

    return best


rf_params = get_best_params(rf_bo)
gb_params = get_best_params(gb_bo)
ada_params = get_best_params(ada_bo)
lgb_params = get_best_params(lgb_bo)
4.6. 训练模型

def train_and_evaluate(model, model_name, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training data and score it on the validation data.

    Args:
        model: Unfitted classifier exposing ``fit``, ``predict`` and
            ``predict_proba``.
        model_name: Label used in the printed report.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.

    Returns:
        A ``(model, metrics)`` tuple: the fitted model and a dict with the
        keys ``'Accuracy'``, ``'AUC'``, ``'MAE'`` and ``'MSE'``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # Column 1 of predict_proba is used as the score for the AUC.
    y_proba = model.predict_proba(X_val)[:, 1]

    metrics = {
        'Accuracy': (y_pred == y_val).mean(),
        'AUC': roc_auc_score(y_val, y_proba),
        # MAE and MSE are computed on the predicted labels, not on the
        # probabilities.
        'MAE': mean_absolute_error(y_val, y_pred),
        'MSE': mean_squared_error(y_val, y_pred),
    }

    print(f"\n{model_name} 验证集评估结果:")
    for metric, value in metrics.items():
        # The format spec must be exactly ".4f"; a space inside it
        # (".4 f") raises ValueError: Invalid format specifier.
        print(f"{metric}: {value:.4f}")

    return model, metrics
5. 五、模型评估 5.1. 随机森林 rf_model, rf_metrics = train_and_evaluate( RandomForestClassifier(random_state=42 , **rf_params), "随机森林" , X_train, y_train, X_val, y_val )
5.2. 梯度提升 gb_model, gb_metrics = train_and_evaluate( GradientBoostingClassifier(random_state=42 , **gb_params), "梯度提升" , X_train, y_train, X_val, y_val )
5.3. AdaBoost ada_model, ada_metrics = train_and_evaluate( AdaBoostClassifier(random_state=42 , **ada_params), "AdaBoost" , X_train, y_train, X_val, y_val )
5.4. LightGBM lgb_model, lgb_metrics = train_and_evaluate( lgb.LGBMClassifier(random_state=42 , **lgb_params), "LightGBM" , X_train, y_train, X_val, y_val )
5.5. 模型对比与最佳模型选取

# One column per model, one row per metric.
results = pd.DataFrame({
    'RandomForest': rf_metrics,
    'GradientBoosting': gb_metrics,
    'AdaBoost': ada_metrics,
    'LightGBM': lgb_metrics,
})
print("\n模型性能对比:")
print(results)

# The best model is the column with the highest value in the 'AUC' row.
best_model_name = results.loc['AUC'].idxmax()
best_models = {
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'AdaBoost': ada_model,
    'LightGBM': lgb_model,
}
best_model = best_models[best_model_name]

# The format spec must be exactly ".4f"; ".4 f" raises
# ValueError: Invalid format specifier.
best_auc = results.loc['AUC', best_model_name]
print(f"\n最优模型是: {best_model_name},AUC: {best_auc:.4f}")
模型性能对比:
          RandomForest  GradientBoosting  AdaBoost  LightGBM
Accuracy      0.925694          0.931111  0.914583  0.931528
AUC           0.973731          0.976323  0.966190  0.978273
MAE           0.074306          0.068889  0.085417  0.068472
MSE           0.074306          0.068889  0.085417  0.068472

最优模型是: LightGBM,AUC: 0.9783
选取 LightGBM模型,其AUC: 0.9783
6. 六、预测 6.1. 数据处理 data_test = pd.read_csv('202201108_test.csv' ) data_test = data_test.dropna() data_test = pd.get_dummies(data_test) columns_to_drop = ['信用评分' , '个人教育程度_Master' , '个人性别_male' , '个人教育程度_Associate' , '个人教育程度_Bachelor' , '个人性别_female' , '个人教育程度_Doctorate' , '借记卡余额' , '个人教育程度_High School' ] data_test = data_test.drop(columns_to_drop, axis=1 , errors='ignore' )
6.2. 预测

y_pred = best_model.predict(data_test)
predictions = pd.DataFrame({'预测结果': y_pred})
predictions.to_csv('predictions.csv', index=False)
print("预测结果已保存至 predictions.csv")

# Only `import joblib` is in scope, so dump/load must be called through the
# module; the bare names `dump` and `load` would raise NameError.
model_path = f"{best_model_name}.pkl"
joblib.dump(best_model, model_path)
print(f"{best_model_name} 模型已保存为 {model_path}")

6.3. AUC曲线

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Reload the model from the file written above and score the validation split.
best_model = joblib.load(model_path)
y_pred_proba = best_model.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure()
# The format spec must be exactly ".2f"; ".2 f" raises ValueError.
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
6.4. 特征重要性可视化

# Only models that expose `feature_importances_` can be plotted.
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_

    # Pair each training column with its importance, highest first.
    feature_importance = pd.DataFrame(
        {'feature': X_train.columns, 'importance': importances}
    )
    feature_importance = feature_importance.sort_values(
        by='importance', ascending=False
    )

    plt.figure(figsize=(12, 8))
    sns.barplot(
        x='importance',
        y='feature',
        data=feature_importance.head(20),  # show the 20 largest only
    )
    plt.title(f'{best_model_name} 特征重要性 Top 20')
    plt.xlabel('特征重要性')
    plt.ylabel('特征名')
    plt.tight_layout()
    plt.show()
7. References