In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, roc_curve)
from sklearn.inspection import PartialDependenceDisplay
import time
import warnings
warnings.filterwarnings('ignore')
# 设置中文字体(跨平台兼容)
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
# ========== 1. 读取数据 ==========
print("=" * 60)
print("开始运行:房企违约风险识别分析")
print("=" * 60)
# 相对路径读取数据(请确保 .csv 在同一目录下)
df = pd.read_csv('/kaggle/input/datasets/songsammy/final-data-27223125/.csv')
print(f"\n数据加载成功,样本总数:{df.shape[0]},变量数:{df.shape[1]}")
print(f"违约组(Default=1)样本数:{df['Default'].sum()}")
print(f"非违约组(Default=0)样本数:{df.shape[0] - df['Default'].sum()}")
# ========== 2. 定义核心变量 ==========
core_vars = ['LERATE', 'DBMORT', 'RST', 'TORATE']
target = 'Default'
# 所有特征(排除目标列)
features = [col for col in df.columns if col != target]
# 检查是否存在完美分离的特征(数据泄露风险)
print("\n--- 数据分离性检查 ---")
leakage_warning = False
for col in features:
min_def = df[df[target]==1][col].min()
max_non = df[df[target]==0][col].max()
if min_def > max_non:
print(f"警告:特征 {col} 违约组最小值 {min_def:.3f} > 非违约组最大值 {max_non:.3f},存在完全分离!")
leakage_warning = True
if not leakage_warning:
print("未发现完全分离的特征,数据合理。")
else:
print("注意:完全分离会导致模型过拟合(AUC=1),请使用更真实的伪数据。")
# ========== 3. 图表1:违约组与非违约组核心变量箱线图 ==========
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()
for i, var in enumerate(core_vars):
sns.boxplot(data=df, x=target, y=var, ax=axes[i], palette=['#1f77b4', '#ff7f0e'])
axes[i].set_title(f'{var} 分组对比', fontsize=12)
axes[i].set_xlabel('违约状态 (0=非违约, 1=违约)')
plt.suptitle('图1 重储地、快扩张、强冲击指标在违约/非违约组的分布', fontsize=14)
plt.tight_layout()
plt.savefig('图1_核心变量箱线图.png', dpi=300)
plt.show()
# ========== 4. 图表2:变量间相关性热图 ==========
important_fin = ['GRNP', 'GROP', 'NPM', 'JROA', 'NCF_L', 'GROR']
plot_vars = core_vars + important_fin
corr = df[plot_vars].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='RdBu_r', center=0, square=True)
plt.title('图2 核心变量与重要财务指标的相关性热图', fontsize=14)
plt.tight_layout()
plt.savefig('图2_相关性热图.png', dpi=300)
plt.show()
# ========== 5. 构建随机森林模型(合成RF模型) ==========
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=2024, stratify=y
)
print(f"\n训练集样本数:{len(X_train)},测试集样本数:{len(X_test)}")
rf = RandomForestClassifier(n_estimators=100, max_features=3,
random_state=2024, oob_score=True)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
oob = rf.oob_score_
print("\n" + "=" * 60)
print("模型评价结果(测试集)")
print("=" * 60)
print(f"准确率: {acc:.4f}")
print(f"精确率: {prec:.4f}")
print(f"召回率: {rec:.4f}")
print(f"F1分数: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print(f"袋外误差(OOB Score): {oob:.4f}")
# ========== 6. 图表3:变量重要性条形图 ==========
importance = pd.DataFrame({
'variable': features,
'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)
importance['type'] = importance['variable'].apply(
lambda x: '核心变量' if x in core_vars else '财务指标'
)
plt.figure(figsize=(12, 8))
top15 = importance.head(15)
colors = ['#d62728' if v in core_vars else '#1f77b4' for v in top15['variable']]
sns.barplot(data=top15, x='importance', y='variable', palette=colors)
plt.title('图3 随机森林变量重要性排序(前15名)', fontsize=14)
plt.xlabel('平均基尼减少量(重要性)')
plt.ylabel('变量')
for i, row in enumerate(top15.itertuples()):
if row.variable in core_vars:
plt.text(row.importance + 0.002, i, ' ★', va='center', color='red', fontsize=12)
plt.tight_layout()
plt.savefig('图3_变量重要性.png', dpi=300)
plt.show()
print("\n核心变量重要性排名及贡献:")
for var in core_vars:
rank = importance[importance['variable'] == var].index[0] + 1
imp = importance[importance['variable'] == var]['importance'].values[0]
print(f"{var}: 排名第{rank}, 重要性 = {imp:.4f}")
# ========== 7. 图表4:部分依赖图 ==========
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()
for i, var in enumerate(core_vars):
PartialDependenceDisplay.from_estimator(rf, X_train, [var], ax=axes[i])
axes[i].set_title(f'部分依赖图:{var}', fontsize=12)
axes[i].set_xlabel(var)
axes[i].set_ylabel('预测违约概率')
plt.suptitle('图4 核心变量对违约概率的部分依赖效应', fontsize=14)
plt.tight_layout()
plt.savefig('图4_部分依赖图.png', dpi=300)
plt.show()
# ========== 8. 图表5:ROC曲线对比(有/无核心变量) ==========
finance_features = [f for f in features if f not in core_vars]
X_finance = df[finance_features]
X_fin_train, X_fin_test, y_fin_train, y_fin_test = train_test_split(
X_finance, y, test_size=0.3, random_state=2024, stratify=y
)
rf_fin = RandomForestClassifier(n_estimators=100, max_features=3, random_state=2024)
rf_fin.fit(X_fin_train, y_fin_train)
y_proba_fin = rf_fin.predict_proba(X_fin_test)[:, 1]
auc_fin = roc_auc_score(y_fin_test, y_proba_fin)
fpr_full, tpr_full, _ = roc_curve(y_test, y_proba)
fpr_fin, tpr_fin, _ = roc_curve(y_fin_test, y_proba_fin)
plt.figure(figsize=(8, 6))
plt.plot(fpr_full, tpr_full, label=f'全部变量模型 (AUC={auc:.3f})', linewidth=2)
plt.plot(fpr_fin, tpr_fin, label=f'仅财务指标模型 (AUC={auc_fin:.3f})', linewidth=2, linestyle='--')
plt.plot([0, 1], [0, 1], 'k--', label='随机猜测')
plt.xlabel('假阳性率 (1-特异性)')
plt.ylabel('真阳性率 (灵敏度)')
plt.title('图5 ROC曲线对比:加入核心变量前后的模型性能')
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig('图5_ROC曲线对比.png', dpi=300)
plt.show()
# ========== 9. 分析三者关系 ==========
core_imp_sum = importance[importance['variable'].isin(core_vars)]['importance'].sum()
total_imp = importance['importance'].sum()
print("\n" + "=" * 60)
print("重储地、快扩张、强冲击对违约风险的总体贡献")
print("=" * 60)
print(f"核心变量总重要性占比: {core_imp_sum/total_imp:.2%}")
for var in core_vars:
imp = importance[importance['variable'] == var]['importance'].values[0]
print(f" {var}: {imp/total_imp:.2%}")
corr_core = df[core_vars].corr()
print("\n核心变量之间的相关系数矩阵:")
print(corr_core)
auc_dim = {}
for dim, vars_list in zip(['重储地', '快扩张', '强冲击'],
[['LERATE'], ['DBMORT', 'RST'], ['TORATE']]):
X_dim = df[vars_list]
X_dim_train, X_dim_test, y_dim_train, y_dim_test = train_test_split(
X_dim, y, test_size=0.3, random_state=2024, stratify=y
)
rf_dim = RandomForestClassifier(
n_estimators=100,
max_features=min(2, len(vars_list)),
random_state=2024
)
rf_dim.fit(X_dim_train, y_dim_train)
y_proba_dim = rf_dim.predict_proba(X_dim_test)[:, 1]
auc_dim[dim] = roc_auc_score(y_dim_test, y_proba_dim)
print("\n各维度单独模型的AUC值:")
for dim, a in auc_dim.items():
print(f"{dim}: AUC = {a:.4f}")
print("说明:强冲击(TORATE)单独预测能力最强,快扩张次之,重储地单独效果较弱;三者结合才能充分识别违约风险。")
# ========== 10. 保存运行日志 ==========
log_filename = f"running_log_{time.strftime('%Y%m%d_%H%M%S')}.txt"
with open(log_filename, 'w', encoding='utf-8') as f:
f.write("=" * 60 + "\n")
f.write("房企违约风险识别分析运行日志\n")
f.write("=" * 60 + "\n")
f.write(f"运行时间:{time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"样本总数:{len(df)}\n")
f.write(f"违约组:{df['Default'].sum()},非违约组:{len(df)-df['Default'].sum()}\n")
f.write(f"训练集比例:70%,测试集比例:30%\n")
f.write(f"随机森林参数:n_estimators=100, max_features=3, oob_score=True\n\n")
f.write("模型评价指标(测试集):\n")
f.write(f"准确率:{acc:.4f}\n")
f.write(f"精确率:{prec:.4f}\n")
f.write(f"召回率:{rec:.4f}\n")
f.write(f"F1分数:{f1:.4f}\n")
f.write(f"AUC:{auc:.4f}\n")
f.write(f"OOB Score:{oob:.4f}\n\n")
f.write("核心变量重要性及排名:\n")
for var in core_vars:
rank = importance[importance['variable'] == var].index[0] + 1
imp = importance[importance['variable'] == var]['importance'].values[0]
f.write(f"{var}: 排名第{rank}, 重要性={imp:.4f}\n")
f.write(f"\n核心变量总重要性占比:{core_imp_sum/total_imp:.2%}\n")
f.write("\n核心变量相关系数矩阵:\n")
f.write(corr_core.to_string())
f.write("\n\n各维度单独模型AUC:\n")
for dim, a in auc_dim.items():
f.write(f"{dim}: AUC={a:.4f}\n")
f.write("\n" + "=" * 60 + "\n")
f.write("日志结束\n")
print(f"\n运行日志已保存为:{log_filename}")
print("所有图表已保存为PNG文件,请查看当前文件夹。")
print("分析完成。")
============================================================ 开始运行:房企违约风险识别分析 ============================================================ 数据加载成功,样本总数:10000,变量数:30 违约组(Default=1)样本数:6000 非违约组(Default=0)样本数:4000 --- 数据分离性检查 --- 未发现完全分离的特征,数据合理。
训练集样本数:7000,测试集样本数:3000 ============================================================ 模型评价结果(测试集) ============================================================ 准确率: 0.9850 精确率: 0.9861 召回率: 0.9889 F1分数: 0.9875 AUC: 0.9989 袋外误差(OOB Score): 0.9821
核心变量重要性排名及贡献: LERATE: 排名第29, 重要性 = 0.2336 DBMORT: 排名第28, 重要性 = 0.1084 RST: 排名第27, 重要性 = 0.0022 TORATE: 排名第26, 重要性 = 0.1456
============================================================
重储地、快扩张、强冲击对违约风险的总体贡献
============================================================
核心变量总重要性占比: 48.98%
LERATE: 23.36%
DBMORT: 10.84%
RST: 0.22%
TORATE: 14.56%
核心变量之间的相关系数矩阵:
LERATE DBMORT RST TORATE
LERATE 1.000000 0.409740 0.002421 -0.491255
DBMORT 0.409740 1.000000 0.002167 -0.332017
RST 0.002421 0.002167 1.000000 0.004013
TORATE -0.491255 -0.332017 0.004013 1.000000
各维度单独模型的AUC值:
重储地: AUC = 0.9073
快扩张: AUC = 0.7981
强冲击: AUC = 0.8267
说明:强冲击(TORATE)单独预测能力最强,快扩张次之,重储地单独效果较弱;三者结合才能充分识别违约风险。
运行日志已保存为:running_log_20260627_051803.txt
所有图表已保存为PNG文件,请查看当前文件夹。
分析完成。