In [1]:
import sys
pkgs = ["pandas", "numpy", "scikit-learn", "imbalanced-learn",
"lightgbm", "xgboost", "matplotlib", "seaborn"]
!{sys.executable} -m pip install {" ".join(pkgs)}
Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (2.3.3) Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (2.4.6) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1) Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.12/dist-packages (0.14.1) Requirement already satisfied: lightgbm in /usr/local/lib/python3.12/dist-packages (4.6.0) Requirement already satisfied: xgboost in /usr/local/lib/python3.12/dist-packages (3.2.0) Requirement already satisfied: matplotlib in /usr/local/lib/python3.12/dist-packages (3.10.0) Requirement already satisfied: seaborn in /usr/local/lib/python3.12/dist-packages (0.13.2) Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas) (2026.1) Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3) Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.5.3) Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0) Requirement already satisfied: sklearn-compat<0.2,>=0.1.5 in /usr/local/lib/python3.12/dist-packages (from imbalanced-learn) (0.1.5) Requirement already satisfied: nvidia-nccl-cu12 in /usr/local/lib/python3.12/dist-packages (from xgboost) (2.29.7) Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.3.3) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (0.12.1) Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (4.62.1) Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (1.5.0) Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (26.0) Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (11.3.0) Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.12/dist-packages (from matplotlib) (3.3.2) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
In [2]:
# 导入时间模块,用于计算程序运行时间import time# 导入日期时间模块,用于生成时间戳import datetime# 导入 pandas,用于读取 .dta 文件和处理数据import pandas as pd# 导入 numpy,用于数组运算import numpy as np# 导入 matplotlib 绘图库import matplotlib.pyplot as plt# 导入 seaborn,用于绘制混淆矩阵热力图import seaborn as sns# 从 sklearn 导入数据划分函数from sklearn.model_selection import train_test_split# 导入分层 K 折交叉验证from sklearn.model_selection import StratifiedKFold# 导入交叉验证预测函数from sklearn.model_selection import cross_val_predict# 导入逻辑回归模型from sklearn.linear_model import LogisticRegression# 导入随机森林模型from sklearn.ensemble import RandomForestClassifier# 导入多层感知机(神经网络)模型from sklearn.neural_network import MLPClassifier# 导入评估指标from sklearn.metrics import roc_auc_scorefrom sklearn.metrics import accuracy_scorefrom sklearn.metrics import precision_scorefrom sklearn.metrics import recall_scorefrom sklearn.metrics import f1_scorefrom sklearn.metrics import roc_curve # 计算 ROC 曲线数据from sklearn.metrics import confusion_matrix # 混淆矩阵# 导入 SMOTE 过采样处理不平衡数据from imblearn.over_sampling import SMOTE# 导入 LightGBM 模型import lightgbm as lgb# 导入 XGBoost 模型(额外增加)import xgboost as xgb# 忽略警告信息,保持输出整洁import warningswarnings.filterwarnings('ignore')# 设置 matplotlib 中文字体,防止图表中文乱码plt.rcParams['font.sans-serif'] = ['SimHei']# 解决负号显示问题plt.rcParams['axes.unicode_minus'] = False# 获取当前时间并格式化为字符串,作为开始时间戳# 打印开始时间start_timestamp = datetime.datetime.now()print(f"代码开始运行时间: {start_timestamp}")# 记录开始运行的秒数(用于计算耗时)start_seconds = time.time()# ========== 1. 读取数据 ==========print("\n[1] 读取家庭经济数据 (famecon)...")# 读取 CFPS 2022 家庭经济数据文件(Stata 格式)df_econ = pd.read_stata( "cfps2022famecon_202410.dta", # 文件名 convert_categoricals=False # 不自动转换分类变量)# 打印数据维度(行数, 列数)print(f" famecon 形状: {df_econ.shape}")print("读取家庭关系数据 (famconf)...")# 读取家庭关系数据文件df_fam = pd.read_stata( "cfps2022famconf_202410.dta", convert_categoricals=False)print(f" famconf 形状: {df_fam.shape}")# ========== 2. 合并特征 ==========# 定义需要从 famconf 中合并的额外特征列名extra_cols = ['familysize22', 'fid_urban22']# 检查这些列是否真实存在于 famconf 中existing_extra = [c for c in extra_cols if c in df_fam.columns]# 如果存在至少一个额外特征,则进行合并if existing_extra: # 按家庭ID (fid22) 左连接,将 famconf 的特征加到 famecon 中 df = df_econ.merge( df_fam[['fid22'] + existing_extra], # 选择需要合并的列 on='fid22', # 连接键 how='left' # 左连接,保留所有 famecon 样本 ) print(f"合并后新增特征: {existing_extra}")else: # 如果没有额外特征,直接复制 famecon df = df_econ.copy()# 打印合并后的总样本数print(f"合并后总样本数: {df.shape[0]}")# ========== 3. 构造目标变量 risk ==========# 构造 risk 所需的四个核心字段required = ['mortage', 'fincome1', 'house_debts', 'total_asset']# 检查这些字段是否都存在missing = [c for c in required if c not in df.columns]if missing: # 如果缺少字段,抛出错误 raise ValueError(f"缺少构造 risk 的字段: {missing}")# 计算月收入(年收入除以12)monthly_income = df['fincome1'] / 12# 计算偿债收入比 DSR = 月供 / 月收入,加极小值避免除零dsr = df['mortage'] / (monthly_income + 1e-6)# 计算资产负债率 DTA = 住房负债 / 总资产dta = df['house_debts'] / (df['total_asset'] + 1e-6)# 若 DSR>0.5 或 DTA>0.8 则标记为风险(1),否则为无风险(0)df['risk'] = ((dsr > 0.5) | (dta > 0.8)).astype(int)# 打印风险标签的分布比例print(f"风险标签分布:\n{df['risk'].value_counts(normalize=True)}")# ========== 4. 特征集(剔除泄露变量) ==========# 这些变量直接用于计算 risk,不能作为特征(防止数据泄露)leaked = ['mortage', 'fincome1', 'house_debts', 'total_asset']# 候选特征字典:论文变量名 -> 数据中的实际列名candidates = { 'savings': 'savings', # 流动资产 'financial_product': 'financial_product', # 金融资产 'land_asset': 'land_asset', # 土地资产 'pce': 'pce', # 人均消费 'resivalue': 'resivalue', # 自住房价值 'durables_asset': 'durables_asset', # 耐用资产 'fixed_asset': 'fixed_asset', # 固定资产 'nonhousing_debts': 'nonhousing_debts', # 非住房负债}# 如果之前合并了额外特征,也加入候选if existing_extra: for col in existing_extra: candidates[col] = col# 筛选出实际存在于数据中且不在泄露列表中的列名feature_cols = []for col in candidates.values(): if col in df.columns and col not in leaked: feature_cols.append(col)# 打印最终使用的特征列及数量print(f"使用的特征列(共 {len(feature_cols)} 个): {feature_cols}")# 提取特征矩阵 XX = df[feature_cols].copy()# 提取目标变量 yy = df['risk'].copy()# ========== 5. 数据清洗 ==========# 设置阈值:如果某行缺失的特征数超过一半,则删除该行th = len(feature_cols) * 0.5X = X.dropna(thresh=th)# 同步删除对应的 y 值y = y.loc[X.index]# 对剩余的缺失值用中位数填充X = X.fillna(X.median())# 对每个数值特征进行 3σ 异常值截尾for col in X.columns: # 只处理数值列 if np.issubdtype(X[col].dtype, np.number): m = X[col].mean() # 均值 s = X[col].std() # 标准差 # 将超出 [m-3s, m+3s] 的值截断到边界 X[col] = X[col].clip(m - 3*s, m + 3*s)# 打印清洗后的样本数量print(f"清洗后样本数: {X.shape[0]}")# ========== 6. 划分训练/测试集 ==========# 分层抽样,按 y 的比例划分,保证训练/测试集中风险比例一致X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y)# ========== 7. SMOTE 过采样 ==========# 初始化 SMOTE,固定随机种子smote = SMOTE(random_state=42)# 仅在训练集上进行过采样,使正负样本数量平衡X_train_res, y_train_res = smote.fit_resample(X_train, y_train)# 打印过采样后的训练集大小和风险比例print(f"SMOTE 后训练集: {X_train_res.shape[0]} 样本" f", 风险比例: {y_train_res.mean():.3f}")# ========== 8. 定义评估函数 ==========def train_and_predict(model, X_tr, y_tr, X_te, name): """ 训练模型,预测测试集,并打印评估指标 """ # 训练模型 model.fit(X_tr, y_tr) # 预测类别 y_pred = model.predict(X_te) # 预测为正类的概率 y_proba = model.predict_proba(X_te)[:, 1] # 计算 AUC auc = roc_auc_score(y_test, y_proba) # 准确率 acc = accuracy_score(y_test, y_pred) # 精确率(防止除零警告) prec = precision_score(y_test, y_pred, zero_division=0) # 召回率 rec = recall_score(y_test, y_pred, zero_division=0) # F1 分数 f1 = f1_score(y_test, y_pred, zero_division=0) # 打印指标,名称固定宽度12字符 print(f"{name:12} | AUC:{auc:.4f} | ACC:{acc:.4f} " f"| PRE:{prec:.4f} | REC:{rec:.4f} | F1:{f1:.4f}") # 返回训练好的模型和预测概率 return model, y_proba# ========== 9. 单模型 ==========# 定义模型字典:模型名称 -> 模型实例models = { "Logit": LogisticRegression(max_iter=1000, random_state=42), "RandomForest": RandomForestClassifier( n_estimators=200, max_depth=10, random_state=42 ), "LightGBM": lgb.LGBMClassifier( n_estimators=200, learning_rate=0.05, random_state=42, verbose=-1 ), "XGBoost": xgb.XGBClassifier( n_estimators=200, learning_rate=0.05, random_state=42, use_label_encoder=False, eval_metric='logloss' ), "MLP": MLPClassifier( hidden_layer_sizes=(64,32), max_iter=500, random_state=42, early_stopping=True, validation_fraction=0.1 )}print("\n单模型测试集表现:")# 存储训练好的模型对象trained = {}# 存储各模型在测试集上的预测概率(用于画ROC曲线)probas = {}# 遍历每个模型,训练并评估for name, model in models.items(): trained[name], probas[name] = train_and_predict( model, X_train_res, y_train_res, X_test, name )# ========== 10. Stacking 融合 ==========# 第一层基学习器:随机森林、LightGBM、MLPbase = [trained["RandomForest"], trained["LightGBM"], trained["MLP"]]# 5 折分层交叉验证skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)# 初始化新特征矩阵(训练集和测试集)new_train = np.zeros((X_train_res.shape[0], len(base)))new_test = np.zeros((X_test.shape[0], len(base)))# 对每个基模型,生成预测概率作为新特征for i, m in enumerate(base): # 在训练集上使用交叉验证得到预测概率(避免过拟合) tp = cross_val_predict( m, X_train_res, y_train_res, cv=skf, method='predict_proba' )[:, 1] new_train[:, i] = tp # 用全部训练数据重新训练模型,用于测试集 m.fit(X_train_res, y_train_res) tp_test = m.predict_proba(X_test)[:, 1] new_test[:, i] = tp_test# 第二层元模型:逻辑回归meta = LogisticRegression(max_iter=1000)# 在训练集的新特征上训练元模型meta.fit(new_train, y_train_res)# 预测测试集的类别和概率y_pred_stack = meta.predict(new_test)y_proba_stack = meta.predict_proba(new_test)[:, 1]# 计算 Stacking 模型的各项指标auc_stack = roc_auc_score(y_test, y_proba_stack)acc_stack = accuracy_score(y_test, y_pred_stack)prec_stack = precision_score(y_test, y_pred_stack, zero_division=0)rec_stack = recall_score(y_test, y_pred_stack, zero_division=0)f1_stack = f1_score(y_test, y_pred_stack, zero_division=0)print("\n=== Stacking 融合模型 (RF+LGB+MLP → Logit) ===")print(f"AUC: {auc_stack:.4f} | ACC: {acc_stack:.4f} " f"| PRE: {prec_stack:.4f} | REC: {rec_stack:.4f} " f"| F1: {f1_stack:.4f}")# 将 Stacking 的预测概率加入字典,用于绘制 ROC 曲线probas["Stacking"] = y_proba_stack# ========== 11. 绘图部分 ==========# 生成时间戳用于文件名和标题timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")# 11.1 ROC 曲线plt.figure(figsize=(8, 6)) # 设置画布大小# 遍历所有模型(包括 Stacking)绘制 ROC 曲线for name, y_prob in probas.items(): fpr, tpr, _ = roc_curve(y_test, y_prob) # 计算假正率、真正率 auc = roc_auc_score(y_test, y_prob) # 计算 AUC plt.plot(fpr, tpr, lw=2, # 画曲线,线宽2 label=f'{name} (AUC = {auc:.4f})')# 画对角线(随机分类器)plt.plot([0, 1], [0, 1], 'k--', lw=1)plt.xlim([0.0, 1.0]) # X轴范围plt.ylim([0.0, 1.05]) # Y轴范围plt.xlabel('False Positive Rate') # X轴标签plt.ylabel('True Positive Rate') # Y轴标签plt.title(f'ROC Curves (Run at {start_timestamp})') # 标题包含开始时间plt.legend(loc="lower right") # 图例位置plt.grid(alpha=0.3) # 添加半透明网格plt.tight_layout() # 自动调整布局# 保存图片,文件名包含时间戳plt.savefig(f'roc_curves_{timestamp_str}.png', dpi=300)plt.show() # 显示图片# 11.2 特征重要性图(基于随机森林)rf_model = trained["RandomForest"] # 获取训练好的随机森林模型importances = rf_model.feature_importances_ # 特征重要性(基尼不纯度减少)indices = np.argsort(importances)[::-1] # 降序排序的索引plt.figure(figsize=(10, 6))plt.title(f"Feature Importance - Random Forest (Run at {start_timestamp})")# 水平条形图plt.barh(range(len(indices)), importances[indices], align='center')# Y轴刻度标签为特征名plt.yticks(range(len(indices)), [feature_cols[i] for i in indices])plt.xlabel("Importance")plt.gca().invert_yaxis() # 反转Y轴,最重要的在上方plt.tight_layout()plt.savefig(f'feature_importance_{timestamp_str}.png', dpi=300)plt.show()# 11.3 混淆矩阵热力图(Stacking 模型)cm = confusion_matrix(y_test, y_pred_stack) # 计算混淆矩阵plt.figure(figsize=(6, 5))# 绘制热力图,annot=True 显示数字,fmt='d' 整数格式sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No Risk', 'Risk'], yticklabels=['No Risk', 'Risk'])plt.xlabel('Predicted')plt.ylabel('Actual')plt.title(f'Confusion Matrix - Stacking (Run at {start_timestamp})')plt.tight_layout()plt.savefig(f'confusion_matrix_{timestamp_str}.png', dpi=300)plt.show()print(f"\n图片已保存,文件名包含时间戳: {timestamp_str}")# ========== 12. 输出结束时间戳和总耗时 ==========end_seconds = time.time()elapsed = end_seconds - start_seconds # 计算总耗时(秒)end_timestamp = datetime.datetime.now()print(f"\n代码结束运行时间: {end_timestamp}")print(f"程序总运行时间: {elapsed:.2f} 秒 ({elapsed/60:.2f} 分钟)")print("复现及绘图完成!")