1. 一、前期工作

1.1. 导入库

本文用到的库有 numpy、pandas、matplotlib、seaborn、sklearn、lightgbm、bayes_opt(贝叶斯优化)以及 joblib(模型持久化)。

import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score, mean_absolute_error, mean_squared_error, r2_score
import lightgbm as lgb

1.2. 数据读取

采用 pandas 读取训练数据集

# Load the training data from train.csv in the working directory into a DataFrame.
data_train = pd.read_csv('train.csv') 

2. 二、数据清洗与整理

原有数据中有很多我们无法使用的信息,如一些非数值信息,或者是无效的信息,缺失的信息,为了后面建模的方便,我们将数据进行清洗。

2.1. 缺失值处理

# Count the missing values in every column of the training data.
missing_values = data_train.isna().sum()  # isna() flags missing cells, sum() counts them per column
missing_values  # bare expression: shown as the cell output in a notebook
个人年龄           0
个人性别 2
个人教育程度 0
个人收入 0
个人工作经验 0
个人房屋所有权 0
借记卡余额 0
信用卡使用意向 0
信用卡利率 0
贷款占收入的百分比 0
个人信用历史时长 0
信用评分 0
档案中以往贷款违约情况 0
信用卡状态 0
dtype: int64

去除缺失值

# Handle missing values: drop every row that contains at least one missing cell.
data_train = data_train.dropna()

再次检查数据集,确保所有缺失值已被移除
# Re-count missing values per column; every count should now be zero.
missing_values_cleaned = data_train.isna().sum()
missing_values_cleaned  # bare expression: shown as the cell output in a notebook

个人年龄           0
个人性别 0
个人教育程度 0
个人收入 0
个人工作经验 0
个人房屋所有权 0
借记卡余额 0
信用卡使用意向 0
信用卡利率 0
贷款占收入的百分比 0
个人信用历史时长 0
信用评分 0
档案中以往贷款违约情况 0
信用卡状态 0
dtype: int64

2.2. 非数值数据独热编码

独热编码是针对具有明确分类值的数据进行预处理的有效方法,通过将每个分类值转换为独立的二进制向量,确保模型正确理解非数值分类特征,避免数值关系的误判。 独热编码特别适用于处理那些具有明确、有限且通常不带有数值意义的分类值的数据。

# One-hot encode the non-numeric (categorical) columns; numeric columns pass through unchanged.
data_train = pd.get_dummies(data_train)

3. 三、相关性分析

相关性分析是一种统计学方法,用于研究两个或多个变量之间的关系强度和方向,旨在衡量不同因素之间的相关密切程度,帮助人们理解变量之间的相互作用,以及一个变量的变化如何影响另一个变量。它广泛应用于数据分析、市场研究、科学研究等领域,是一种发现数据中的模式、趋势和关联的重要工具。

3.1. 热力图

# Draw the correlation heatmap with seaborn.
# Use a CJK-capable font so the Chinese column names and title render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# SimHei lacks the Unicode minus glyph; without this, negative tick labels
# (e.g. on the colour bar) are drawn as empty boxes.
plt.rcParams['axes.unicode_minus'] = False
plt.figure(figsize=(20, 10))  # wide canvas so the annotated cells stay readable
sns.heatmap(data_train.corr(), cmap='coolwarm', annot=True)  # annot=True prints each coefficient
plt.title('相关性热力图')
plt.show()

output_9_1.png

3.2. 成对关系图

# Pairwise relationship plots for all columns; corner=True draws only the lower triangle.
sns.pairplot(data_train, corner = True)

output_13_1.png

https://raw.githubusercontent.com/tanyunhao/picture/master/宇宙太空-黑白星星太空_爱给网_aigei_com.gif

# Bar chart of each column's correlation with the target column, sorted from lowest to highest.
data_train.corr()['信用卡状态'].sort_values().plot.bar()

output_11_2.png

3.3. 去除相关性较低的变量

# Drop the features that showed little correlation with the target.
data_train = data_train.drop(
    [
        '信用评分', '个人教育程度_Master', '个人性别_male', '个人教育程度_Associate',
        '个人教育程度_Bachelor', '个人性别_female', '个人教育程度_Doctorate',
        '借记卡余额', '个人教育程度_High School',
    ],
    axis=1,
)

# The modelling code below uses X_train, y_train, X_val and y_val, which are
# not defined anywhere else in this document, so build them here.
# NOTE(review): '信用卡状态' is taken as the target because the correlation
# bar chart above is computed against it; the 80/20 stratified split and the
# seed are assumptions — confirm against the original experiment.
from sklearn.model_selection import train_test_split

X = data_train.drop('信用卡状态', axis=1)
y = data_train['信用卡状态']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

4. 四、模型建立及参数优化

4.1. 随机森林

# Model 1: random forest
def rf_cv(n_estimators, max_depth, min_samples_split):
    """Objective for the optimiser: mean 3-fold CV ROC-AUC of a random forest.

    The optimiser proposes continuous values, so the integer
    hyper-parameters are truncated with ``int()`` before the model is built.
    A proposed ``max_depth`` below 1 is mapped to ``None`` (unlimited depth).
    Reads ``X_train`` / ``y_train`` from the enclosing script scope.
    """
    model = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=None if max_depth < 1 else int(max_depth),
        min_samples_split=int(min_samples_split),
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


rf_bo = BayesianOptimization(
    f=rf_cv,
    pbounds={'n_estimators': (100, 200), 'max_depth': (1, 20), 'min_samples_split': (2, 5)},
    # Seed the optimiser as well: the estimators are already seeded, and an
    # unseeded search gives different "best" parameters on every run.
    random_state=42,
)
rf_bo.maximize(init_points=10, n_iter=10)  # 10 random probes, then 10 guided steps

优化结果

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| 1  | 0.9511  | 2.858  | 3.179  | 166.5  |
| 2  | 0.9627  | 6.985  | 2.859  | 174.6  |
| 3  | 0.9671  | 8.686  | 3.493  | 105.4  |
| 4  | 0.963  | 6.566  | 4.675  | 101.0  |
| 5  | 0.9708  | 14.54  | 3.967  | 166.8  |
| 6  | 0.9708  | 18.88  | 2.042  | 110.1  |
| 7  | 0.9607  | 5.095  | 4.083  | 164.9  |
| 8  | 0.9708  | 13.2  | 4.364  | 107.0  |
| 9  | 0.971  | 16.8  | 3.127  | 186.8  |
| 10  | 0.9708  | 14.72  | 4.795  | 129.8  |
| 11  | 0.9708  | 15.39  | 4.64  | 111.2  |
| 12  | 0.9707  | 18.96  | 4.923  | 104.8  |
| 13  | 0.9708  | 13.43  | 4.612  | 160.0  |
| 14  | 0.9709  | 19.87  | 2.322  | 162.7  |
| 15  | 0.9711  | 15.84  | 2.986  | 121.6  |
| 16  | 0.967  | 8.499  | 3.989  | 124.1  |
| 17  | 0.971  | 19.98  | 2.443  | 128.4  |
| 18  | 0.971  | 19.7  | 2.395  | 194.4  |
| 19  | 0.9699  | 11.22  | 3.639  | 193.9  |
| 20  | 0.9708  | 19.67  | 3.937  | 137.6  |
=============================================================

4.2. 梯度提升

# Model 2: gradient boosting
def gb_cv(n_estimators, learning_rate, max_depth):
    """Objective for the optimiser: mean 3-fold CV ROC-AUC of gradient boosting.

    ``n_estimators`` and ``max_depth`` arrive as floats from the optimiser and
    are truncated with ``int()``; ``learning_rate`` is used as proposed.
    Reads ``X_train`` / ``y_train`` from the enclosing script scope.
    """
    model = GradientBoostingClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


gb_bo = BayesianOptimization(
    f=gb_cv,
    pbounds={'n_estimators': (100, 200), 'learning_rate': (0.01, 0.1), 'max_depth': (3, 5)},
    # Seed the optimiser as well: the estimators are already seeded, and an
    # unseeded search gives different "best" parameters on every run.
    random_state=42,
)
gb_bo.maximize(init_points=10, n_iter=10)  # 10 random probes, then 10 guided steps

优化结果

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| 1  | 0.9732  | 0.09128  | 4.884  | 118.4  |
| 2  | 0.9648  | 0.0332  | 3.908  | 103.6  |
| 3  | 0.967  | 0.02648  | 3.823  | 175.7  |
| 4  | 0.9696  | 0.04652  | 3.293  | 163.1  |
| 5  | 0.9698  | 0.04087  | 4.85  | 129.7  |
| 6  | 0.9702  | 0.07502  | 3.998  | 117.6  |
| 7  | 0.9624  | 0.01339  | 4.905  | 119.4  |
| 8  | 0.9729  | 0.09138  | 4.891  | 118.4  |
| 9  | 0.967  | 0.02536  | 4.512  | 118.5  |
| 10  | 0.9684  | 0.03262  | 4.959  | 118.7  |
| 11  | 0.9719  | 0.07358  | 4.894  | 118.1  |
| 12  | 0.9721  | 0.07516  | 4.746  | 117.7  |
| 13  | 0.9694  | 0.03989  | 4.997  | 117.8  |
| 14  | 0.9729  | 0.09002  | 4.53  | 117.9  |
| 15  | 0.9635  | 0.01725  | 3.793  | 172.6  |
| 16  | 0.9671  | 0.03594  | 3.626  | 130.9  |
| 17  | 0.9718  | 0.07058  | 4.554  | 117.5  |
| 18  | 0.9679  | 0.03026  | 4.382  | 117.7  |
| 19  | 0.9712  | 0.06166  | 4.666  | 118.1  |
| 20  | 0.973  | 0.09644  | 4.759  | 117.3  |
| 21  | 0.9691  | 0.03482  | 3.23  | 196.8  |
| 22  | 0.9677  | 0.02879  | 4.576  | 117.1  |
| 23  | 0.9665  | 0.02399  | 4.813  | 117.5  |
| 24  | 0.9709  | 0.07578  | 3.795  | 142.4  |
| 25  | 0.9716  | 0.07247  | 4.299  | 111.6  |
| 26  | 0.9721  | 0.07639  | 4.694  | 117.9  |
| 27  | 0.9678  | 0.02956  | 4.488  | 118.0  |
| 28  | 0.9741  | 0.09322  | 4.169  | 146.5  |
| 29  | 0.973  | 0.07774  | 4.264  | 146.5  |
| 30  | 0.9735  | 0.08407  | 4.012  | 146.4  |
=============================================================

4.3. LightGBM

 # 定义模型4: LightGBM  
def lgb_cv(n_estimators, learning_rate, max_depth):
    """Objective for the optimiser: mean 3-fold CV ROC-AUC of a LightGBM classifier.

    ``n_estimators`` and ``max_depth`` arrive as floats from the optimiser and
    are truncated with ``int()``; ``learning_rate`` is used as proposed.
    Reads ``X_train`` / ``y_train`` from the enclosing script scope.
    """
    model = lgb.LGBMClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        max_depth=int(max_depth),
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


lgb_bo = BayesianOptimization(
    f=lgb_cv,
    pbounds={'n_estimators': (100, 200), 'learning_rate': (0.01, 0.1), 'max_depth': (3, 10)},
    # Seed the optimiser as well: the estimators are already seeded, and an
    # unseeded search gives different "best" parameters on every run.
    random_state=42,
)
lgb_bo.maximize(init_points=10, n_iter=10)  # 10 random probes, then 10 guided steps

4.4. AdaBoost

 # 定义模型3: AdaBoost  
def ada_cv(n_estimators, learning_rate):
    """Objective for the optimiser: mean 3-fold CV ROC-AUC of an AdaBoost classifier.

    ``n_estimators`` arrives as a float from the optimiser and is truncated
    with ``int()``; ``learning_rate`` is used as proposed.
    Reads ``X_train`` / ``y_train`` from the enclosing script scope.
    """
    model = AdaBoostClassifier(
        n_estimators=int(n_estimators),
        learning_rate=learning_rate,
        random_state=42,
    )
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=3).mean()


ada_bo = BayesianOptimization(
    f=ada_cv,
    pbounds={'n_estimators': (50, 200), 'learning_rate': (0.1, 1.0)},
    # Seed the optimiser as well: the estimators are already seeded, and an
    # unseeded search gives different "best" parameters on every run.
    random_state=42,
)
ada_bo.maximize(init_points=10, n_iter=10)  # 10 random probes, then 10 guided steps

4.5. 提取模型的最佳参数

 # 获取最佳参数  
def get_best_params(bo):
    """Return the optimiser's best hyper-parameters, ready to pass to an estimator.

    Args:
        bo: an object whose ``max`` attribute is a mapping with a ``'params'``
            entry (hyper-parameter name -> value), as exposed by a finished
            ``BayesianOptimization`` run.

    Returns:
        A new dict in which ``n_estimators`` and ``min_samples_split`` are
        truncated to ``int`` and ``max_depth`` is ``None`` when below 1,
        otherwise truncated to ``int``. Keys that are absent are skipped and
        all other entries are passed through unchanged.
    """
    # Work on a copy so the mapping held by the optimiser is never mutated.
    params = dict(bo.max['params'])
    # Not every search space contains every key, so guard each conversion.
    if 'n_estimators' in params:
        params['n_estimators'] = int(params['n_estimators'])
    if 'max_depth' in params:
        depth = params['max_depth']
        params['max_depth'] = None if depth < 1 else int(depth)
    if 'min_samples_split' in params:
        params['min_samples_split'] = int(params['min_samples_split'])
    return params

# Extract the best hyper-parameters of each finished search, in the same
# order as the original assignments (random forest, gradient boosting,
# AdaBoost, LightGBM).
rf_params, gb_params, ada_params, lgb_params = [
    get_best_params(optimizer)
    for optimizer in (rf_bo, gb_bo, ada_bo, lgb_bo)
]

4.6. 训练模型

# Model training and validation
def train_and_evaluate(model, model_name, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and report its validation metrics.

    Args:
        model: unfitted classifier exposing ``fit``, ``predict`` and
            ``predict_proba``.
        model_name: label used in the printed report.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.

    Returns:
        ``(model, metrics)``: the fitted model and a dict with the keys
        ``'Accuracy'``, ``'AUC'``, ``'MAE'`` and ``'MSE'``.
    """
    model.fit(X_train, y_train)

    predicted_labels = model.predict(X_val)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = model.predict_proba(X_val)[:, 1]

    accuracy = (predicted_labels == y_val).mean()
    metrics = {
        'Accuracy': accuracy,
        'AUC': roc_auc_score(y_val, positive_proba),
        'MAE': mean_absolute_error(y_val, predicted_labels),
        'MSE': mean_squared_error(y_val, predicted_labels),
    }

    print(f"\n{model_name} 验证集评估结果:")
    for name, score in metrics.items():
        print(f"{name}: {score:.4f}")

    return model, metrics

5. 五、模型评估

5.1. 随机森林

# Train the tuned random forest and score it on the validation split.
rf_model, rf_metrics = train_and_evaluate(
    model=RandomForestClassifier(**rf_params, random_state=42),
    model_name="随机森林",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

5.2. 梯度提升

# Train the tuned gradient-boosting model and score it on the validation split.
gb_model, gb_metrics = train_and_evaluate(
    model=GradientBoostingClassifier(**gb_params, random_state=42),
    model_name="梯度提升",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

5.3. AdaBoost

# Train the tuned AdaBoost model and score it on the validation split.
ada_model, ada_metrics = train_and_evaluate(
    model=AdaBoostClassifier(**ada_params, random_state=42),
    model_name="AdaBoost",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

5.4. LightGBM

# Train the tuned LightGBM model and score it on the validation split.
lgb_model, lgb_metrics = train_and_evaluate(
    model=lgb.LGBMClassifier(**lgb_params, random_state=42),
    model_name="LightGBM",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

5.5. 模型对比与最佳模型选取

# Gather every model's validation metrics into one table
# (rows are metrics, columns are models).
results = pd.DataFrame({
    'RandomForest': rf_metrics,
    'GradientBoosting': gb_metrics,
    'AdaBoost': ada_metrics,
    'LightGBM': lgb_metrics,
})

print("\n模型性能对比:")
print(results)

# Pick the model whose validation AUC is highest.
best_models = {
    'RandomForest': rf_model,
    'GradientBoosting': gb_model,
    'AdaBoost': ada_model,
    'LightGBM': lgb_model,
}
best_model_name = results.loc['AUC'].idxmax()
best_model = best_models[best_model_name]

print(f"\n最优模型是: {best_model_name},AUC: {results.loc['AUC', best_model_name]:.4f}")
模型性能对比:
RandomForest GradientBoosting AdaBoost LightGBM
Accuracy 0.925694 0.931111 0.914583 0.931528
AUC 0.973731 0.976323 0.966190 0.978273
MAE 0.074306 0.068889 0.085417 0.068472
MSE 0.074306 0.068889 0.085417 0.068472

最优模型是: LightGBM,AUC: 0.9783

选取 LightGBM模型,其AUC: 0.9783

6. 六、预测

6.1. 数据处理

# Load the test set and apply the same preprocessing as the training data.
data_test = pd.read_csv('202201108_test.csv')
# NOTE: rows with missing values are dropped, so predictions.csv can contain
# fewer rows than the raw test file.
data_test = data_test.dropna()
data_test = pd.get_dummies(data_test)  # one-hot encoding

# Remove the features that were dropped from the training data.
columns_to_drop = ['信用评分', '个人教育程度_Master', '个人性别_male', '个人教育程度_Associate',
                   '个人教育程度_Bachelor', '个人性别_female', '个人教育程度_Doctorate',
                   '借记卡余额', '个人教育程度_High School']
data_test = data_test.drop(columns_to_drop, axis=1, errors='ignore')

# One-hot encoding the test set on its own can yield different columns (a
# category missing from, or new in, the test file), a different column order,
# or an extra target column. Align to the exact feature set the models were
# fitted on: missing dummy columns become 0 and unknown columns are discarded.
data_test = data_test.reindex(columns=X_train.columns, fill_value=0)

6.2. 预测

# Predict on the prepared test set with the selected model.
y_pred = best_model.predict(data_test)

# Save the predictions.
predictions = pd.DataFrame({'预测结果': y_pred})
predictions.to_csv('predictions.csv', index=False)
print("预测结果已保存至 predictions.csv")

# Persist the best model for later reuse. Only the `joblib` module is imported
# at the top of the file, so a bare `dump(...)` raises NameError; call it
# through the module instead.
joblib.dump(best_model, f"{best_model_name}.pkl")
print(f"{best_model_name} 模型已保存为 {best_model_name}.pkl")

6.3. ROC曲线(AUC)

 # 绘制AUC曲线
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Reload the saved best model. Only the `joblib` module is imported at the top
# of the file, so a bare `load(...)` raises NameError; call it through the
# module instead. joblib files are pickle-based: only load files you trust.
best_model = joblib.load(f"{best_model_name}.pkl")

# Predicted probability of the positive class on the validation split.
y_pred_proba = best_model.predict_proba(X_val)[:, 1]

# ROC curve points and the area under the curve.
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve against the diagonal chance line.
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

image-20250110020654976

6.4. 特征重要性可视化

# Feature-importance chart, drawn only when the selected model exposes
# `feature_importances_`.
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': importances,
    })
    # Most important features first.
    feature_importance = feature_importance.sort_values(by='importance', ascending=False)

    plt.figure(figsize=(12, 8))
    # Plot at most the 20 highest-ranked features.
    sns.barplot(x='importance', y='feature', data=feature_importance.iloc[:20])
    plt.title(f'{best_model_name} 特征重要性 Top 20')
    plt.xlabel('特征重要性')
    plt.ylabel('特征名')
    plt.tight_layout()
    plt.show()

image.png

7. References