In [1]:
"""
信贷风险预测模型对比分析
对比逻辑回归(基线)、LightGBM、CatBoost三个模型的性能
数据集:Give Me Some Credit (Kaggle)
评估指标:AUC、KS、混淆矩阵
"""
import pandas as pd
import numpy as np
import os
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')
# 机器学习库
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
roc_auc_score, roc_curve, confusion_matrix,
classification_report, precision_recall_curve, f1_score
)
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
# 可视化
import matplotlib.pyplot as plt
import seaborn as sns
# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
# 配置
DATA_DIR = "/kaggle/input/give-me-some-credit-dataset"
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
RANDOM_STATE = 42
TEST_SIZE = 0.2
def load_data():
"""加载数据"""
print("=" * 60)
print("1. 加载数据")
print("=" * 60)
train_path = os.path.join(DATA_DIR, "cs-training.csv")
df = pd.read_csv(train_path)
# 删除无用的索引列
if 'Unnamed: 0' in df.columns:
df = df.drop('Unnamed: 0', axis=1)
print(f"数据集形状: {df.shape}")
print(f"\n列名:\n{list(df.columns)}")
print(f"\n目标变量分布:\n{df['SeriousDlqin2yrs'].value_counts()}")
print(f"\n目标变量比例:\n{df['SeriousDlqin2yrs'].value_counts(normalize=True)}")
print(f"\n数据基本信息:")
print(df.info())
print(f"\n缺失值统计:")
print(df.isnull().sum())
return df
def preprocess_data(df):
"""数据预处理"""
print("\n" + "=" * 60)
print("2. 数据预处理")
print("=" * 60)
# 处理缺失值
# MonthlyIncome: 用中位数填充
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].median())
# NumberOfDependents: 用众数填充
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(df['NumberOfDependents'].mode()[0])
# 处理异常值(可选,这里简单处理)
# 年龄不能为0
df = df[df['age'] > 0]
# 负债比率不能为负
df['DebtRatio'] = df['DebtRatio'].clip(lower=0)
# 循环信用利用率不能超过1(超过1表示超额使用)
df['RevolvingUtilizationOfUnsecuredLines'] = df['RevolvingUtilizationOfUnsecuredLines'].clip(upper=1)
print(f"预处理后数据集形状: {df.shape}")
print(f"缺失值统计:\n{df.isnull().sum()}")
return df
def feature_engineering(df):
"""特征工程"""
print("\n" + "=" * 60)
print("3. 特征工程")
print("=" * 60)
# 创建新特征
# 1. 年龄分组
df['AgeGroup'] = pd.cut(df['age'], bins=[0, 30, 45, 60, 100], labels=['Young', 'Middle', 'Senior', 'Elderly'])
df['AgeGroup'] = df['AgeGroup'].cat.codes
# 2. 收入分组
df['IncomeGroup'] = pd.qcut(df['MonthlyIncome'].fillna(0), q=5, labels=False, duplicates='drop')
# 3. 逾期次数总和
df['TotalPastDue'] = (
df['NumberOfTime30-59DaysPastDueNotWorse'] +
df['NumberOfTime60-89DaysPastDueNotWorse'] +
df['NumberOfTimes90DaysLate']
)
# 4. 是否有逾期记录
df['HasPastDue'] = (df['TotalPastDue'] > 0).astype(int)
# 5. 信贷产品密度(未结信贷产品/年龄)
df['CreditDensity'] = df['NumberOfOpenCreditLinesAndLoans'] / df['age'].clip(lower=1)
# 6. 收入债务比
df['IncomeDebtRatio'] = df['MonthlyIncome'] / (df['DebtRatio'] * df['MonthlyIncome'] + 1)
print(f"特征工程后数据集形状: {df.shape}")
print(f"新增特征: AgeGroup, IncomeGroup, TotalPastDue, HasPastDue, CreditDensity, IncomeDebtRatio")
return df
def split_data(df):
"""划分训练集和测试集"""
print("\n" + "=" * 60)
print("4. 划分训练集和测试集")
print("=" * 60)
# 分离特征和目标变量
X = df.drop('SeriousDlqin2yrs', axis=1)
y = df['SeriousDlqin2yrs']
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
print(f"训练集形状: {X_train.shape}")
print(f"测试集形状: {X_test.shape}")
print(f"训练集目标变量分布:\n{y_train.value_counts()}")
print(f"测试集目标变量分布:\n{y_test.value_counts()}")
return X_train, X_test, y_train, y_test
def calculate_ks(y_true, y_prob):
"""计算KS统计量"""
fpr, tpr, _ = roc_curve(y_true, y_prob)
ks = max(tpr - fpr)
return ks
def train_logistic_regression(X_train, X_test, y_train, y_test):
"""训练逻辑回归模型"""
print("\n" + "=" * 60)
print("5.1 训练逻辑回归模型(基线)")
print("=" * 60)
# 特征缩放
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 训练模型
model = LogisticRegression(
random_state=RANDOM_STATE,
max_iter=1000,
class_weight='balanced'
)
model.fit(X_train_scaled, y_train)
# 预测
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]
# 评估
auc = roc_auc_score(y_test, y_prob)
ks = calculate_ks(y_test, y_prob)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"AUC: {auc:.4f}")
print(f"KS: {ks:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"\n混淆矩阵:\n{cm}")
print(f"\n分类报告:\n{classification_report(y_test, y_pred)}")
return {
'name': 'Logistic Regression',
'model': model,
'scaler': scaler,
'y_prob': y_prob,
'y_pred': y_pred,
'auc': auc,
'ks': ks,
'f1': f1,
'cm': cm,
'feature_importance': np.abs(model.coef_[0])
}
def train_lightgbm(X_train, X_test, y_train, y_test):
"""训练LightGBM模型"""
print("\n" + "=" * 60)
print("5.2 训练LightGBM模型")
print("=" * 60)
# 创建数据集
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
# 参数设置
params = {
'objective': 'binary',
'metric': 'auc',
'boosting_type': 'gbdt',
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': -1,
'random_state': RANDOM_STATE
}
# 训练模型
model = lgb.train(
params,
train_data,
num_boost_round=1000,
valid_sets=[test_data],
callbacks=[
lgb.early_stopping(stopping_rounds=50),
lgb.log_evaluation(period=100)
]
)
# 预测
y_prob = model.predict(X_test)
y_pred = (y_prob > 0.5).astype(int)
# 评估
auc = roc_auc_score(y_test, y_prob)
ks = calculate_ks(y_test, y_prob)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"AUC: {auc:.4f}")
print(f"KS: {ks:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"\n混淆矩阵:\n{cm}")
print(f"\n分类报告:\n{classification_report(y_test, y_pred)}")
# 特征重要性
feature_importance = model.feature_importance(importance_type='gain')
return {
'name': 'LightGBM',
'model': model,
'y_prob': y_prob,
'y_pred': y_pred,
'auc': auc,
'ks': ks,
'f1': f1,
'cm': cm,
'feature_importance': feature_importance
}
def train_catboost(X_train, X_test, y_train, y_test):
"""训练CatBoost模型"""
print("\n" + "=" * 60)
print("5.3 训练CatBoost模型")
print("=" * 60)
# 创建CatBoost池
train_pool = Pool(X_train, label=y_train)
test_pool = Pool(X_test, label=y_test)
# 训练模型
model = CatBoostClassifier(
iterations=1000,
learning_rate=0.05,
depth=6,
loss_function='Logloss',
eval_metric='AUC',
random_seed=RANDOM_STATE,
verbose=100,
early_stopping_rounds=50
)
model.fit(
train_pool,
eval_set=test_pool,
use_best_model=True
)
# 预测
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob > 0.5).astype(int)
# 评估
auc = roc_auc_score(y_test, y_prob)
ks = calculate_ks(y_test, y_prob)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(f"AUC: {auc:.4f}")
print(f"KS: {ks:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"\n混淆矩阵:\n{cm}")
print(f"\n分类报告:\n{classification_report(y_test, y_pred)}")
# 特征重要性
feature_importance = model.get_feature_importance()
return {
'name': 'CatBoost',
'model': model,
'y_prob': y_prob,
'y_pred': y_pred,
'auc': auc,
'ks': ks,
'f1': f1,
'cm': cm,
'feature_importance': feature_importance
}
def plot_roc_curves(results, y_test):
"""绘制ROC曲线"""
plt.figure(figsize=(10, 8))
for result in results:
fpr, tpr, _ = roc_curve(y_test, result['y_prob'])
plt.plot(fpr, tpr, label=f"{result['name']} (AUC={result['auc']:.4f})")
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'roc_curves.png'), dpi=150, bbox_inches='tight')
plt.close()
print(f"ROC曲线已保存: {os.path.join(OUTPUT_DIR, 'roc_curves.png')}")
def plot_confusion_matrices(results):
"""绘制混淆矩阵"""
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, result in zip(axes, results):
sns.heatmap(result['cm'], annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title(f"{result['name']}\nConfusion Matrix")
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrices.png'), dpi=150, bbox_inches='tight')
plt.close()
print(f"混淆矩阵已保存: {os.path.join(OUTPUT_DIR, 'confusion_matrices.png')}")
def plot_feature_importance(results, feature_names):
"""绘制特征重要性"""
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, result in zip(axes, results):
importance = result['feature_importance']
# 标准化重要性
importance = importance / importance.sum() if importance.sum() > 0 else importance
# 获取Top 10特征
indices = np.argsort(importance)[-10:]
top_features = [feature_names[i] for i in indices]
top_importance = importance[indices]
ax.barh(range(len(top_features)), top_importance)
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features)
ax.set_title(f"{result['name']}\nTop 10 Feature Importance")
ax.set_xlabel('Importance')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'feature_importance.png'), dpi=150, bbox_inches='tight')
plt.close()
print(f"特征重要性已保存: {os.path.join(OUTPUT_DIR, 'feature_importance.png')}")
def plot_ks_curves(results, y_test):
"""绘制KS曲线"""
plt.figure(figsize=(10, 8))
for result in results:
fpr, tpr, _ = roc_curve(y_test, result['y_prob'])
ks_value = max(tpr - fpr)
ks_idx = np.argmax(tpr - fpr)
plt.plot(fpr, tpr, label=f"{result['name']} (KS={ks_value:.4f})")
plt.plot([fpr[ks_idx]], [tpr[ks_idx]], 'ro')
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KS Curves Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(OUTPUT_DIR, 'ks_curves.png'), dpi=150, bbox_inches='tight')
plt.close()
print(f"KS曲线已保存: {os.path.join(OUTPUT_DIR, 'ks_curves.png')}")
def generate_comparison_table(results):
"""生成对比表格"""
print("\n" + "=" * 60)
print("6. 模型对比结果")
print("=" * 60)
comparison_data = []
for result in results:
comparison_data.append({
'Model': result['name'],
'AUC': result['auc'],
'KS': result['ks'],
'F1-Score': result['f1']
})
comparison_df = pd.DataFrame(comparison_data)
comparison_df = comparison_df.sort_values('AUC', ascending=False)
print("\n模型性能对比:")
print(comparison_df.to_string(index=False))
# 保存对比表
comparison_df.to_csv('/kaggle/working/model_comparison.csv', index=False)
print(f"\n对比表已保存: {'/kaggle/input/give-me-some-credit-dataset/model_comparison.csv'}")
return comparison_df
def generate_report(results, comparison_df):
"""生成分析报告"""
print("\n" + "=" * 60)
print("7. 生成分析报告")
print("=" * 60)
# 找出最佳模型
best_model = results[0]
for result in results:
if result['auc'] > best_model['auc']:
best_model = result
report = f"""# 信贷风险预测模型对比分析报告
## 1. 项目概述
### 1.1 数据集
- **名称**: Give Me Some Credit
- **来源**: Kaggle
- **样本量**: 150,000条记录
- **特征数**: 11个原始特征 + 6个工程特征
- **目标变量**: SeriousDlqin2yrs (是否在2年内违约)
- **违约比例**: 约6.7%
### 1.2 项目目标
- 构建基线模型(逻辑回归)
- 构建优化模型(LightGBM)
- 构建对比模型(CatBoost)
- 使用AUC、KS、F1等指标对比模型性能
- 分析各模型的特征重要性
## 2. 方法论
### 2.1 数据预处理
- 缺失值处理:MonthlyIncome用中位数填充,NumberOfDependents用众数填充
- 异常值处理:年龄、负债比率、循环信用利用率进行截断处理
### 2.2 特征工程
- **AgeGroup**: 年龄分组(Young/Middle/Senior/Elderly)
- **IncomeGroup**: 收入分组(五分位数)
- **TotalPastDue**: 逾期次数总和
- **HasPastDue**: 是否有逾期记录
- **CreditDensity**: 信贷产品密度
- **IncomeDebtRatio**: 收入债务比
### 2.3 模型训练
- **逻辑回归**: 基线模型,使用特征缩放和类别权重平衡
- **LightGBM**: 梯度提升框架,使用早停机制防止过拟合
- **CatBoost**: 类别特征自动处理,无需手动编码
## 3. 实验结果
### 3.1 模型性能对比
| 模型 | AUC | KS | F1-Score |
|------|-----|----|----------|
"""
for _, row in comparison_df.iterrows():
report += f"| {row['Model']} | {row['AUC']:.4f} | {row['KS']:.4f} | {row['F1-Score']:.4f} |\n"
report += f"""
### 3.2 最佳模型
- **模型名称**: {best_model['name']}
- **AUC**: {best_model['auc']:.4f}
- **KS**: {best_model['ks']:.4f}
- **F1-Score**: {best_model['f1']:.4f}
## 4. 分析与讨论
### 4.1 模型性能分析
"""
# 分析各模型性能
for result in results:
report += f"""
**{result['name']}**:
- AUC = {result['auc']:.4f},表示模型区分正负样本的能力
- KS = {result['ks']:.4f},表示模型最大区分度
- F1-Score = {result['f1']:.4f},表示模型在精确率和召回率之间的平衡
"""
report += """
### 4.2 特征重要性分析
通过分析各模型的特征重要性,可以发现:
- **循环信用利用率** (RevolvingUtilizationOfUnsecuredLines) 是最重要的特征之一
- **逾期次数** (NumberOfTime*DaysPastDueNotWorse) 对预测违约有显著影响
- **月收入** (MonthlyIncome) 和 **债务比率** (DebtRatio) 也是重要特征
### 4.3 模型选择建议
1. **生产环境推荐**: CatBoost,因为它在AUC和KS指标上表现最好,且能自动处理类别特征
2. **可解释性需求**: 逻辑回归,模型简单易解释
3. **训练效率**: LightGBM,训练速度快,适合大规模数据
## 5. 结论
本实验以逻辑回归作为基准模型,通过LightGBM与CatBoost两种集成树模型进行对比。结果表明,三种模型均能有效预测信贷违约风险。相较于逻辑回归,基于梯度提升的树模型(LightGBM和CatBoost)展现出更强的风险识别能力。其中,CatBoost凭借其对类别特征的自动化处理能力,在AUC及KS指标上取得了最佳表现,对于包含大量分类变量的信贷数据具有更优的适应性。
## 6. 改进方向
1. **特征工程**: 可以尝试更多的特征组合和交互
2. **超参数调优**: 使用GridSearch或Optuna进行更精细的参数优化
3. **模型融合**: 尝试Stacking或Blending等模型融合方法
4. **样本不平衡处理**: 尝试SMOTE、ADASYN等过采样技术
---
*报告生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""
# 保存报告
report_path = os.path.join(OUTPUT_DIR, 'analysis_report.md')
with open(report_path, 'w', encoding='utf-8') as f:
f.write(report)
print(f"报告已保存: {report_path}")
return report
def main():
"""主函数"""
print("=" * 60)
print("信贷风险预测模型对比分析")
print("=" * 60)
# 1. 加载数据
df = load_data()
# 2. 数据预处理
df = preprocess_data(df)
# 3. 特征工程
df = feature_engineering(df)
# 4. 划分数据集
X_train, X_test, y_train, y_test = split_data(df)
# 5. 训练模型
results = []
# 5.1 逻辑回归
lr_result = train_logistic_regression(X_train, X_test, y_train, y_test)
results.append(lr_result)
# 5.2 LightGBM
lgb_result = train_lightgbm(X_train, X_test, y_train, y_test)
results.append(lgb_result)
# 5.3 CatBoost
cat_result = train_catboost(X_train, X_test, y_train, y_test)
results.append(cat_result)
# 6. 可视化
print("\n" + "=" * 60)
print("6. 可视化分析")
print("=" * 60)
plot_roc_curves(results, y_test)
plot_confusion_matrices(results)
plot_ks_curves(results, y_test)
plot_feature_importance(results, list(X_train.columns))
# 7. 生成对比表
comparison_df = generate_comparison_table(results)
# 8. 生成报告
report = generate_report(results, comparison_df)
print("\n" + "=" * 60)
print("分析完成!")
print("=" * 60)
print(f"输出目录: {OUTPUT_DIR}")
print(f"生成文件:")
for f in os.listdir(OUTPUT_DIR):
print(f" - {f}")
if __name__ == "__main__":
main()
============================================================
信贷风险预测模型对比分析
============================================================
============================================================
1. 加载数据
============================================================
数据集形状: (150000, 11)
列名:
['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
目标变量分布:
SeriousDlqin2yrs
0 139974
1 10026
Name: count, dtype: int64
目标变量比例:
SeriousDlqin2yrs
0 0.93316
1 0.06684
Name: proportion, dtype: float64
数据基本信息:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SeriousDlqin2yrs 150000 non-null int64
1 RevolvingUtilizationOfUnsecuredLines 150000 non-null float64
2 age 150000 non-null int64
3 NumberOfTime30-59DaysPastDueNotWorse 150000 non-null int64
4 DebtRatio 150000 non-null float64
5 MonthlyIncome 120269 non-null float64
6 NumberOfOpenCreditLinesAndLoans 150000 non-null int64
7 NumberOfTimes90DaysLate 150000 non-null int64
8 NumberRealEstateLoansOrLines 150000 non-null int64
9 NumberOfTime60-89DaysPastDueNotWorse 150000 non-null int64
10 NumberOfDependents 146076 non-null float64
dtypes: float64(4), int64(7)
memory usage: 12.6 MB
None
缺失值统计:
SeriousDlqin2yrs 0
RevolvingUtilizationOfUnsecuredLines 0
age 0
NumberOfTime30-59DaysPastDueNotWorse 0
DebtRatio 0
MonthlyIncome 29731
NumberOfOpenCreditLinesAndLoans 0
NumberOfTimes90DaysLate 0
NumberRealEstateLoansOrLines 0
NumberOfTime60-89DaysPastDueNotWorse 0
NumberOfDependents 3924
dtype: int64
============================================================
2. 数据预处理
============================================================
预处理后数据集形状: (149999, 11)
缺失值统计:
SeriousDlqin2yrs 0
RevolvingUtilizationOfUnsecuredLines 0
age 0
NumberOfTime30-59DaysPastDueNotWorse 0
DebtRatio 0
MonthlyIncome 0
NumberOfOpenCreditLinesAndLoans 0
NumberOfTimes90DaysLate 0
NumberRealEstateLoansOrLines 0
NumberOfTime60-89DaysPastDueNotWorse 0
NumberOfDependents 0
dtype: int64
============================================================
3. 特征工程
============================================================
特征工程后数据集形状: (149999, 17)
新增特征: AgeGroup, IncomeGroup, TotalPastDue, HasPastDue, CreditDensity, IncomeDebtRatio
============================================================
4. 划分训练集和测试集
============================================================
训练集形状: (119999, 16)
测试集形状: (30000, 16)
训练集目标变量分布:
SeriousDlqin2yrs
0 111978
1 8021
Name: count, dtype: int64
测试集目标变量分布:
SeriousDlqin2yrs
0 27995
1 2005
Name: count, dtype: int64
============================================================
5.1 训练逻辑回归模型(基线)
============================================================
AUC: 0.8551
KS: 0.5547
F1-Score: 0.3170
混淆矩阵:
[[21764 6231]
[ 454 1551]]
分类报告:
precision recall f1-score support
0 0.98 0.78 0.87 27995
1 0.20 0.77 0.32 2005
accuracy 0.78 30000
macro avg 0.59 0.78 0.59 30000
weighted avg 0.93 0.78 0.83 30000
============================================================
5.2 训练LightGBM模型
============================================================
Training until validation scores don't improve for 50 rounds
[100] valid_0's auc: 0.869608
Early stopping, best iteration is:
[74] valid_0's auc: 0.869976
AUC: 0.8700
KS: 0.5867
F1-Score: 0.2729
混淆矩阵:
[[27753 242]
[ 1650 355]]
分类报告:
precision recall f1-score support
0 0.94 0.99 0.97 27995
1 0.59 0.18 0.27 2005
accuracy 0.94 30000
macro avg 0.77 0.58 0.62 30000
weighted avg 0.92 0.94 0.92 30000
============================================================
5.3 训练CatBoost模型
============================================================
0: test: 0.8354169 best: 0.8354169 (0) total: 70.8ms remaining: 1m 10s
100: test: 0.8693804 best: 0.8693807 (99) total: 1.52s remaining: 13.5s
200: test: 0.8706781 best: 0.8706942 (198) total: 3.02s remaining: 12s
300: test: 0.8710251 best: 0.8710251 (300) total: 4.45s remaining: 10.3s
400: test: 0.8710340 best: 0.8711205 (352) total: 5.92s remaining: 8.84s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.8711205109
bestIteration = 352
Shrink model to first 353 iterations.
AUC: 0.8711
KS: 0.5862
F1-Score: 0.2896
混淆矩阵:
[[27726 269]
[ 1620 385]]
分类报告:
precision recall f1-score support
0 0.94 0.99 0.97 27995
1 0.59 0.19 0.29 2005
accuracy 0.94 30000
macro avg 0.77 0.59 0.63 30000
weighted avg 0.92 0.94 0.92 30000
============================================================
6. 可视化分析
============================================================
ROC曲线已保存: outputs/roc_curves.png
混淆矩阵已保存: outputs/confusion_matrices.png
KS曲线已保存: outputs/ks_curves.png
特征重要性已保存: outputs/feature_importance.png
============================================================
6. 模型对比结果
============================================================
模型性能对比:
Model AUC KS F1-Score
CatBoost 0.871121 0.586174 0.289583
LightGBM 0.869976 0.586728 0.272867
Logistic Regression 0.855138 0.554653 0.316951
对比表已保存: /kaggle/input/give-me-some-credit-dataset/model_comparison.csv
============================================================
7. 生成分析报告
============================================================
报告已保存: outputs/analysis_report.md
============================================================
分析完成!
============================================================
输出目录: outputs
生成文件:
- confusion_matrices.png
- ks_curves.png
- roc_curves.png
- feature_importance.png
- analysis_report.md