In [1]:
import sys
pkgs = ["pandas", "numpy", "scikit-learn", "imbalanced-learn",
        "lightgbm", "xgboost", "matplotlib", "seaborn"]
!{sys.executable} -m pip install {" ".join(pkgs)}
Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.3.3)

Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.4.6)

Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)

Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.12/dist-packages (0.14.1)

Requirement already satisfied: lightgbm in /usr/local/lib/python3.12/dist-packages (4.6.0)

Requirement already satisfied: xgboost in /usr/local/lib/python3.12/dist-packages (3.2.0)

Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0)

Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2)

Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0)

Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2)

Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1)

Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)

Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.3)

Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)

Requirement already satisfied: sklearn-compat<0.2,>=0.1.5 in /usr/local/lib/python3.12/dist-packages (from imbalanced-learn) (0.1.5)

Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.29.7)

Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3)

Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1)

Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1)

Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0)

Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0)

Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0)

Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2)

Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)

In [2]:
# 导入时间模块,用于计算程序运行时间import time# 导入日期时间模块,用于生成时间戳import datetime# 导入 pandas,用于读取 .dta 文件和处理数据import pandas as pd# 导入 numpy,用于数组运算import numpy as np# 导入 matplotlib 绘图库import matplotlib.pyplot as plt# 导入 seaborn,用于绘制混淆矩阵热力图import seaborn as sns# 从 sklearn 导入数据划分函数from sklearn.model_selection import train_test_split# 导入分层 K 折交叉验证from sklearn.model_selection import StratifiedKFold# 导入交叉验证预测函数from sklearn.model_selection import cross_val_predict# 导入逻辑回归模型from sklearn.linear_model import LogisticRegression# 导入随机森林模型from sklearn.ensemble import RandomForestClassifier# 导入多层感知机(神经网络)模型from sklearn.neural_network import MLPClassifier# 导入评估指标from sklearn.metrics import roc_auc_scorefrom sklearn.metrics import accuracy_scorefrom sklearn.metrics import precision_scorefrom sklearn.metrics import recall_scorefrom sklearn.metrics import f1_scorefrom sklearn.metrics import roc_curve      # 计算 ROC 曲线数据from sklearn.metrics import confusion_matrix  # 混淆矩阵# 导入 SMOTE 过采样处理不平衡数据from imblearn.over_sampling import SMOTE# 导入 LightGBM 模型import lightgbm as lgb# 导入 XGBoost 模型(额外增加)import xgboost as xgb# 忽略警告信息,保持输出整洁import warningswarnings.filterwarnings('ignore')# 设置 matplotlib 中文字体,防止图表中文乱码plt.rcParams['font.sans-serif'] = ['SimHei']# 解决负号显示问题plt.rcParams['axes.unicode_minus'] = False# 获取当前时间并格式化为字符串,作为开始时间戳# 打印开始时间start_timestamp = datetime.datetime.now()print(f"代码开始运行时间: {start_timestamp}")# 记录开始运行的秒数(用于计算耗时)start_seconds = time.time()# ========== 1. 读取数据 ==========print("\n[1] 读取家庭经济数据 (famecon)...")# 读取 CFPS 2022 家庭经济数据文件(Stata 格式)df_econ = pd.read_stata(    "cfps2022famecon_202410.dta",  # 文件名    convert_categoricals=False     # 不自动转换分类变量)# 打印数据维度(行数, 列数)print(f"    famecon 形状: {df_econ.shape}")print("读取家庭关系数据 (famconf)...")# 读取家庭关系数据文件df_fam = pd.read_stata(    "cfps2022famconf_202410.dta",    convert_categoricals=False)print(f"    famconf 形状: {df_fam.shape}")# ========== 2. 合并特征 ==========# 定义需要从 famconf 中合并的额外特征列名extra_cols = ['familysize22', 'fid_urban22']# 检查这些列是否真实存在于 famconf 中existing_extra = [c for c in extra_cols if c in df_fam.columns]# 如果存在至少一个额外特征,则进行合并if existing_extra:    # 按家庭ID (fid22) 左连接,将 famconf 的特征加到 famecon 中    df = df_econ.merge(        df_fam[['fid22'] + existing_extra],  # 选择需要合并的列        on='fid22',                          # 连接键        how='left'                           # 左连接,保留所有 famecon 样本    )    print(f"合并后新增特征: {existing_extra}")else:    # 如果没有额外特征,直接复制 famecon    df = df_econ.copy()# 打印合并后的总样本数print(f"合并后总样本数: {df.shape[0]}")# ========== 3. 构造目标变量 risk ==========# 构造 risk 所需的四个核心字段required = ['mortage', 'fincome1', 'house_debts', 'total_asset']# 检查这些字段是否都存在missing = [c for c in required if c not in df.columns]if missing:    # 如果缺少字段,抛出错误    raise ValueError(f"缺少构造 risk 的字段: {missing}")# 计算月收入(年收入除以12)monthly_income = df['fincome1'] / 12# 计算偿债收入比 DSR = 月供 / 月收入,加极小值避免除零dsr = df['mortage'] / (monthly_income + 1e-6)# 计算资产负债率 DTA = 住房负债 / 总资产dta = df['house_debts'] / (df['total_asset'] + 1e-6)# 若 DSR>0.5 或 DTA>0.8 则标记为风险(1),否则为无风险(0)df['risk'] = ((dsr > 0.5) | (dta > 0.8)).astype(int)# 打印风险标签的分布比例print(f"风险标签分布:\n{df['risk'].value_counts(normalize=True)}")# ========== 4. 特征集(剔除泄露变量) ==========# 这些变量直接用于计算 risk,不能作为特征(防止数据泄露)leaked = ['mortage', 'fincome1', 'house_debts', 'total_asset']# 候选特征字典:论文变量名 -> 数据中的实际列名candidates = {    'savings': 'savings',                      # 流动资产    'financial_product': 'financial_product',  # 金融资产    'land_asset': 'land_asset',                # 土地资产    'pce': 'pce',                              # 人均消费    'resivalue': 'resivalue',                  # 自住房价值    'durables_asset': 'durables_asset',        # 耐用资产    'fixed_asset': 'fixed_asset',              # 固定资产    'nonhousing_debts': 'nonhousing_debts',    # 非住房负债}# 如果之前合并了额外特征,也加入候选if existing_extra:    for col in existing_extra:        candidates[col] = col# 筛选出实际存在于数据中且不在泄露列表中的列名feature_cols = []for col in candidates.values():    if col in df.columns and col not in leaked:        feature_cols.append(col)# 打印最终使用的特征列及数量print(f"使用的特征列(共 {len(feature_cols)} 个): {feature_cols}")# 提取特征矩阵 XX = df[feature_cols].copy()# 提取目标变量 yy = df['risk'].copy()# ========== 5. 数据清洗 ==========# 设置阈值:如果某行缺失的特征数超过一半,则删除该行th = len(feature_cols) * 0.5X = X.dropna(thresh=th)# 同步删除对应的 y 值y = y.loc[X.index]# 对剩余的缺失值用中位数填充X = X.fillna(X.median())# 对每个数值特征进行 3σ 异常值截尾for col in X.columns:    # 只处理数值列    if np.issubdtype(X[col].dtype, np.number):        m = X[col].mean()     # 均值        s = X[col].std()      # 标准差        # 将超出 [m-3s, m+3s] 的值截断到边界        X[col] = X[col].clip(m - 3*s, m + 3*s)# 打印清洗后的样本数量print(f"清洗后样本数: {X.shape[0]}")# ========== 6. 划分训练/测试集 ==========# 分层抽样,按 y 的比例划分,保证训练/测试集中风险比例一致X_train, X_test, y_train, y_test = train_test_split(    X, y, test_size=0.2, random_state=42, stratify=y)# ========== 7. SMOTE 过采样 ==========# 初始化 SMOTE,固定随机种子smote = SMOTE(random_state=42)# 仅在训练集上进行过采样,使正负样本数量平衡X_train_res, y_train_res = smote.fit_resample(X_train, y_train)# 打印过采样后的训练集大小和风险比例print(f"SMOTE 后训练集: {X_train_res.shape[0]} 样本"      f", 风险比例: {y_train_res.mean():.3f}")# ========== 8. 定义评估函数 ==========def train_and_predict(model, X_tr, y_tr, X_te, name):    """    训练模型,预测测试集,并打印评估指标    """    # 训练模型    model.fit(X_tr, y_tr)    # 预测类别    y_pred = model.predict(X_te)    # 预测为正类的概率    y_proba = model.predict_proba(X_te)[:, 1]    # 计算 AUC    auc = roc_auc_score(y_test, y_proba)    # 准确率    acc = accuracy_score(y_test, y_pred)    # 精确率(防止除零警告)    prec = precision_score(y_test, y_pred, zero_division=0)    # 召回率    rec = recall_score(y_test, y_pred, zero_division=0)    # F1 分数    f1 = f1_score(y_test, y_pred, zero_division=0)    # 打印指标,名称固定宽度12字符    print(f"{name:12} | AUC:{auc:.4f} | ACC:{acc:.4f} "          f"| PRE:{prec:.4f} | REC:{rec:.4f} | F1:{f1:.4f}")    # 返回训练好的模型和预测概率    return model, y_proba# ========== 9. 单模型 ==========# 定义模型字典:模型名称 -> 模型实例models = {    "Logit": LogisticRegression(max_iter=1000, random_state=42),    "RandomForest": RandomForestClassifier(        n_estimators=200, max_depth=10, random_state=42    ),    "LightGBM": lgb.LGBMClassifier(        n_estimators=200, learning_rate=0.05,        random_state=42, verbose=-1    ),    "XGBoost": xgb.XGBClassifier(        n_estimators=200, learning_rate=0.05, random_state=42,        use_label_encoder=False, eval_metric='logloss'    ),    "MLP": MLPClassifier(        hidden_layer_sizes=(64,32), max_iter=500,        random_state=42, early_stopping=True,        validation_fraction=0.1    )}print("\n单模型测试集表现:")# 存储训练好的模型对象trained = {}# 存储各模型在测试集上的预测概率(用于画ROC曲线)probas = {}# 遍历每个模型,训练并评估for name, model in models.items():    trained[name], probas[name] = train_and_predict(        model, X_train_res, y_train_res, X_test, name    )# ========== 10. Stacking 融合 ==========# 第一层基学习器:随机森林、LightGBM、MLPbase = [trained["RandomForest"], trained["LightGBM"], trained["MLP"]]# 5 折分层交叉验证skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)# 初始化新特征矩阵(训练集和测试集)new_train = np.zeros((X_train_res.shape[0], len(base)))new_test = np.zeros((X_test.shape[0], len(base)))# 对每个基模型,生成预测概率作为新特征for i, m in enumerate(base):    # 在训练集上使用交叉验证得到预测概率(避免过拟合)    tp = cross_val_predict(        m, X_train_res, y_train_res,        cv=skf, method='predict_proba'    )[:, 1]    new_train[:, i] = tp    # 用全部训练数据重新训练模型,用于测试集    m.fit(X_train_res, y_train_res)    tp_test = m.predict_proba(X_test)[:, 1]    new_test[:, i] = tp_test# 第二层元模型:逻辑回归meta = LogisticRegression(max_iter=1000)# 在训练集的新特征上训练元模型meta.fit(new_train, y_train_res)# 预测测试集的类别和概率y_pred_stack = meta.predict(new_test)y_proba_stack = meta.predict_proba(new_test)[:, 1]# 计算 Stacking 模型的各项指标auc_stack = roc_auc_score(y_test, y_proba_stack)acc_stack = accuracy_score(y_test, y_pred_stack)prec_stack = precision_score(y_test, y_pred_stack, zero_division=0)rec_stack = recall_score(y_test, y_pred_stack, zero_division=0)f1_stack = f1_score(y_test, y_pred_stack, zero_division=0)print("\n=== Stacking 融合模型 (RF+LGB+MLP → Logit) ===")print(f"AUC: {auc_stack:.4f} | ACC: {acc_stack:.4f} "      f"| PRE: {prec_stack:.4f} | REC: {rec_stack:.4f} "      f"| F1: {f1_stack:.4f}")# 将 Stacking 的预测概率加入字典,用于绘制 ROC 曲线probas["Stacking"] = y_proba_stack# ========== 11. 绘图部分 ==========# 生成时间戳用于文件名和标题timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")# 11.1 ROC 曲线plt.figure(figsize=(8, 6))                # 设置画布大小# 遍历所有模型(包括 Stacking)绘制 ROC 曲线for name, y_prob in probas.items():    fpr, tpr, _ = roc_curve(y_test, y_prob)  # 计算假正率、真正率    auc = roc_auc_score(y_test, y_prob)      # 计算 AUC    plt.plot(fpr, tpr, lw=2,                # 画曲线,线宽2             label=f'{name} (AUC = {auc:.4f})')# 画对角线(随机分类器)plt.plot([0, 1], [0, 1], 'k--', lw=1)plt.xlim([0.0, 1.0])                        # X轴范围plt.ylim([0.0, 1.05])                       # Y轴范围plt.xlabel('False Positive Rate')           # X轴标签plt.ylabel('True Positive Rate')            # Y轴标签plt.title(f'ROC Curves (Run at {start_timestamp})')  # 标题包含开始时间plt.legend(loc="lower right")               # 图例位置plt.grid(alpha=0.3)                         # 添加半透明网格plt.tight_layout()                          # 自动调整布局# 保存图片,文件名包含时间戳plt.savefig(f'roc_curves_{timestamp_str}.png', dpi=300)plt.show()                                  # 显示图片# 11.2 特征重要性图(基于随机森林)rf_model = trained["RandomForest"]          # 获取训练好的随机森林模型importances = rf_model.feature_importances_  # 特征重要性(基尼不纯度减少)indices = np.argsort(importances)[::-1]     # 降序排序的索引plt.figure(figsize=(10, 6))plt.title(f"Feature Importance - Random Forest (Run at {start_timestamp})")# 水平条形图plt.barh(range(len(indices)), importances[indices], align='center')# Y轴刻度标签为特征名plt.yticks(range(len(indices)), [feature_cols[i] for i in indices])plt.xlabel("Importance")plt.gca().invert_yaxis()                    # 反转Y轴,最重要的在上方plt.tight_layout()plt.savefig(f'feature_importance_{timestamp_str}.png', dpi=300)plt.show()# 11.3 混淆矩阵热力图(Stacking 模型)cm = confusion_matrix(y_test, y_pred_stack)  # 计算混淆矩阵plt.figure(figsize=(6, 5))# 绘制热力图,annot=True 显示数字,fmt='d' 整数格式sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',            xticklabels=['No Risk', 'Risk'],            yticklabels=['No Risk', 'Risk'])plt.xlabel('Predicted')plt.ylabel('Actual')plt.title(f'Confusion Matrix - Stacking (Run at {start_timestamp})')plt.tight_layout()plt.savefig(f'confusion_matrix_{timestamp_str}.png', dpi=300)plt.show()print(f"\n图片已保存,文件名包含时间戳: {timestamp_str}")# ========== 12. 输出结束时间戳和总耗时 ==========end_seconds = time.time()elapsed = end_seconds - start_seconds      # 计算总耗时(秒)end_timestamp = datetime.datetime.now()print(f"\n代码结束运行时间: {end_timestamp}")print(f"程序总运行时间: {elapsed:.2f} 秒 ({elapsed/60:.2f} 分钟)")print("复现及绘图完成!")