In [1]:
# -*- coding: utf-8 -*-
"""
日期: 2026/05/31
作者: lzh
描述: 随机森林复现工具模块 —— 数据加载、预处理、模型评估等核心函数。
复现论文《机器学习与会计舞弊治理——基于非遴选因子的预测视角》(周玮)。
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc as prc_auc_func
def norm(series: pd.Series) -> pd.Series:
    """Scale a series to unit L2 norm.

    A series without any non-zero entry is handed back untouched (the very
    same object), which avoids dividing by a zero norm.

    Args:
        series: Values to normalise.

    Returns:
        ``series`` divided by its Euclidean norm, or ``series`` itself when
        it has no non-zero entry.
    """
    has_nonzero = (series != 0).any()
    if not has_nonzero:
        return series
    l2_length = np.linalg.norm(series.values)
    return series / l2_length
def get_data(df, testyear, nor, test_length=15):
    """Split ``df`` into a rolling training window and a single test year.

    Rows whose ``year`` lies in ``[testyear - test_length, testyear)`` form the
    training set and rows with ``year == testyear`` form the test set. Rows
    containing any missing value are dropped from both sets. Every column
    other than ``stock``, ``year`` and ``y`` is treated as a feature.

    Args:
        df: Table with a ``year`` column, a label column ``y`` and the
            feature columns. It is not modified.
        testyear: Year used as the test set.
        nor: When equal to 1, each feature column is L2-normalised with
            ``norm``; any other value leaves the features as they are.
        test_length: Number of years before ``testyear`` used for training.

    Returns:
        Tuple ``(X_trainval, y_trainval, X_test, y_test)``.
    """
    train_years = list(range(testyear - test_length, testyear))
    df_trainval = df[df.year.isin(train_years)].dropna(axis=0, how="any")
    df_test = df[df.year.isin([testyear])].dropna(axis=0, how="any")

    non_feature_cols = ["stock", "year", "y"]
    X_columns = [col for col in df.columns if col not in non_feature_cols]

    # Take explicit copies: the feature frames are written to column by column
    # below, and assigning into a frame that pandas still tracks as a slice of
    # another frame is chained assignment (SettingWithCopyWarning).
    X_trainval = df_trainval[X_columns].copy()
    X_test = df_test[X_columns].copy()
    y_trainval = df_trainval["y"]
    y_test = df_test["y"]

    if nor == 1:
        # NOTE(review): the training and test sets are each scaled by their own
        # column norm, so the scale depends on the number of rows in each set.
        # Kept as-is to preserve the existing results - confirm this is the
        # intended procedure.
        for col in X_columns:
            X_trainval[col] = norm(X_trainval[col])
            X_test[col] = norm(X_test[col])
    return X_trainval, y_trainval, X_test, y_test
def get_ndcg(model, X_test, y_test, k=0.06):
    """Compute NDCG over the top ``k`` fraction of the test samples.

    Samples are ranked by the model's predicted probability for the class in
    column 1 of ``predict_proba``; the DCG of the first ``int(k * n)`` samples
    is divided by the DCG of the ideal ordering (labels sorted descending).

    Args:
        model: Object exposing ``predict_proba``.
        X_test: Feature table; its ``.values`` are passed to the model.
        y_test: Labels aligned with ``X_test``.
        k: Fraction of the test set kept for the ranking.

    Returns:
        ``DCG / IDCG``, or ``0.0`` when the ideal DCG is zero.
    """
    scores = model.predict_proba(X_test.values)[:, 1]
    labels = y_test.values.flatten()
    cutoff = int(k * len(y_test))

    def _dcg_at_cutoff(frame, sort_key):
        # Sort descending by the given column, keep the head, apply log2 discounts.
        head = frame.sort_values(by=sort_key, ascending=False).iloc[:cutoff]
        discounts = np.log2(np.arange(1, len(head) + 1) + 1)
        gains = (2 ** head["y_true"] - 1) / discounts
        return np.sum(gains)

    dcg = _dcg_at_cutoff(pd.DataFrame({"y_proba": scores, "y_true": labels}), "y_proba")
    idcg = _dcg_at_cutoff(pd.DataFrame({"y_true": labels}), "y_true")

    if idcg == 0:
        return 0.0
    return dcg / idcg
def model_evaluate(model, X_test, y_test):
    """Return the ROC-AUC and the area under the precision-recall curve.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the score.
        X_test: Feature table; its ``.values`` are passed to the model.
        y_test: Labels aligned with ``X_test``.

    Returns:
        Tuple ``(auc, prc)``.
    """
    positive_scores = model.predict_proba(X_test.values)[:, 1]
    labels = y_test.values.flatten()

    roc_auc = roc_auc_score(labels, positive_scores)

    # Area under the precision-recall curve, integrated over recall.
    precision, recall, _thresholds = precision_recall_curve(labels, positive_scores)
    pr_auc = prc_auc_func(recall, precision)

    return roc_auc, pr_auc
def train_and_evaluate(X_trainval, y_trainval, X_test, y_test, best_params, k=0.06):
    """Fit a random forest and score it on the test set.

    Args:
        X_trainval: Training features.
        y_trainval: Training labels.
        X_test: Test features (must expose ``.values``, as required by the
            evaluation helpers).
        y_test: Test labels.
        best_params: Mapping with the keys ``"max_depth"`` and
            ``"n_estimators"``.
        k: Fraction of the test set used by ``get_ndcg``.

    Returns:
        Tuple ``(model, auc, prc, ndcg)``. The model is fitted on a plain
        array, so it does not record feature names.

    Raises:
        KeyError: If ``best_params`` lacks one of the required keys.
    """
    model = RF(
        random_state=0,
        max_depth=best_params["max_depth"],
        n_estimators=best_params["n_estimators"],
    )
    # model_evaluate and get_ndcg predict on ``X_test.values`` (an ndarray).
    # Fitting on an ndarray as well keeps fit and predict inputs consistent;
    # fitting on a DataFrame instead makes scikit-learn emit a feature-name
    # mismatch UserWarning on every prediction.
    model.fit(np.asarray(X_trainval), y_trainval)
    auc, prc = model_evaluate(model, X_test, y_test)
    ndcg = get_ndcg(model, X_test, y_test, k)
    return model, auc, prc, ndcg
def compute_feature_importance(df, testyear, best_params, nor=0):
    """Fit a random forest on the training window and rank its features.

    Args:
        df: Full data table, split by ``get_data``.
        testyear: Test year that defines the training window.
        best_params: Mapping with the keys ``"max_depth"`` and
            ``"n_estimators"``.
        nor: Normalisation flag forwarded to ``get_data``.

    Returns:
        DataFrame sorted by descending importance with the columns
        ``feature``, ``importance``, ``cumulative`` (running sum of
        importance) and ``rank`` (1-based).
    """
    # Only the training split is needed; the test split is discarded.
    X_trainval, y_trainval, _, _ = get_data(df, testyear, nor)

    model = RF(
        random_state=0,
        max_depth=best_params["max_depth"],
        n_estimators=best_params["n_estimators"],
    )
    model.fit(X_trainval, y_trainval)

    # Take the feature names from the frame that was actually fitted rather
    # than re-deriving them from ``df``, so the labels always line up with
    # ``feature_importances_``.
    importance = pd.DataFrame({
        "feature": list(X_trainval.columns),
        "importance": model.feature_importances_,
    })
    importance = importance.sort_values(by="importance", ascending=False).reset_index(drop=True)
    importance["cumulative"] = importance["importance"].cumsum()
    importance["rank"] = range(1, len(importance) + 1)
    return importance
In [2]:
# -*- coding: utf-8 -*-
"""
日期: 2026/05/31
作者: lzh
描述: 随机森林算法复现 —— 机器学习在会计风险(财务错报)预测中的应用。
复现论文《机器学习与会计舞弊治理——基于非遴选因子的预测视角》(周玮)。
主入口:数据加载、滚动窗口划分、模型训练、AUC/PRC/NDCG 评估、特征重要性分析及日志输出。
"""
import os
import sys
import ast
import logging
from datetime import datetime
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas / scikit-learn warnings
# that may point at real problems - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# ======================== Paths and logging configuration ========================
DATA_DIR = "/kaggle/input/datasets/songsammy/final-data-27123229"
LOG_DIR = "../3.运行日志"
os.makedirs(LOG_DIR, exist_ok=True)
log_filename = os.path.join(LOG_DIR, f"random_forest_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")

# logging.basicConfig() does nothing when the root logger already has handlers
# (for example when this cell is re-run, or when the kernel pre-configured
# logging). In that case the new ``log_filename`` would never be written to, so
# drop and close any existing root handlers before configuring.
_root_logger = logging.getLogger()
for _handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_handler)
    _handler.close()

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.FileHandler(log_filename, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)

# Run banner.
logger.info("=" * 60)
logger.info("随机森林算法复现 —— 会计风险(财务错报)预测")
logger.info(f"运行开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
logger.info("=" * 60)
# ======================== 主流程 ========================
def _load_inputs():
    """Read the three feature tables and the random-forest hyper-parameter rows.

    All files are read from ``DATA_DIR``.

    Returns:
        Tuple ``(datasets, df_params)``: ``datasets`` maps the dataset label
        ("14", "33", "331") to its DataFrame; ``df_params`` holds only the
        rows of the hyper-parameter table whose ``model`` equals "RF".
    """
    datasets = {}
    for label in ("14", "33", "331"):
        datasets[label] = pd.read_csv(os.path.join(DATA_DIR, f"data_{label}.csv"))
    for label, frame in datasets.items():
        logger.info(f" data_{label} 形状: {frame.shape}")

    # Load the hyper-parameter table and keep the random-forest rows only.
    params_path = os.path.join(DATA_DIR, "params_table3_table7panelA_tableC_part1.csv")
    df_params = pd.read_csv(params_path)
    df_params = df_params[df_params["model"] == "RF"].reset_index(drop=True)
    logger.info(f" 随机森林实验配置数: {len(df_params)}")
    return datasets, df_params


def _run_experiments(datasets, df_params):
    """Train and evaluate one random forest per row of ``df_params``.

    Returns:
        List of dicts with the keys ``data``, ``testyear``, ``model``,
        ``auc``, ``prc`` and ``ndcg`` (metrics formatted to four decimals).

    Raises:
        ValueError: If a row names a dataset that was not loaded.
    """
    total = len(df_params)
    results = []
    for position, record in enumerate(df_params.to_dict("records"), start=1):
        data_type = str(record["data"])
        testyear = int(record["testyear"])
        nor = int(record["normalize"])
        best_params = ast.literal_eval(record["best_params"])

        if data_type not in datasets:
            raise ValueError(f"未知数据类型: {data_type}")

        logger.info(f" [{position}/{total}] 数据类型={data_type}, 测试年={testyear}, "
                    f"归一化={'是' if nor else '否'}, 参数={best_params}")

        # get_data filters into new frames and never writes to its input, so
        # the loaded table is passed directly instead of copying it each time.
        X_trainval, y_trainval, X_test, y_test = get_data(datasets[data_type], testyear, nor)
        _model, auc, prc, ndcg = train_and_evaluate(
            X_trainval, y_trainval, X_test, y_test, best_params
        )
        logger.info(f" -> AUC={auc:.4f}, PRC={prc:.4f}, NDCG={ndcg:.4f}")

        results.append({
            "data": data_type,
            "testyear": testyear,
            "model": "RF",
            "auc": "{:.4f}".format(auc),
            "prc": "{:.4f}".format(prc),
            "ndcg": "{:.4f}".format(ndcg),
        })
    return results


def _summarize_results(results):
    """Log the result table and per-dataset AUC statistics, then save a CSV in ``LOG_DIR``."""
    if not results:
        # Without any experiment there are no metric columns to summarise.
        logger.warning(" 没有可汇总的随机森林实验结果,跳过汇总")
        return

    df_result = pd.DataFrame(results)
    logger.info("\n" + "=" * 60)
    logger.info("随机森林复现结果汇总")
    logger.info("=" * 60)
    logger.info(f"{'数据集':<12}{'测试年':<8}{'AUC':<10}{'PRC':<10}{'NDCG':<10}")
    logger.info("-" * 50)
    for _, row in df_result.iterrows():
        logger.info(f"{row['data']:<12}{row['testyear']:<8}{row['auc']:<10}{row['prc']:<10}{row['ndcg']:<10}")

    logger.info("\n--- 按数据集分组的 AUC 描述统计 ---")
    # The metrics are stored as formatted strings; convert back for statistics.
    df_result["auc_float"] = df_result["auc"].astype(float)
    for dtype in ["14", "33", "331"]:
        sub = df_result[df_result["data"] == dtype]
        logger.info(f" 数据集 {dtype}: 平均 AUC={sub['auc_float'].mean():.4f}, "
                    f"最高={sub['auc_float'].max():.4f}, 最低={sub['auc_float'].min():.4f}")

    # Persist the table.
    result_csv = os.path.join(LOG_DIR, f"random_forest_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    df_result.to_csv(result_csv, index=False, encoding="utf-8-sig")
    logger.info(f"\n结果已保存至: {result_csv}")


def _analyze_feature_importance(df_14, df_params):
    """Rank the features of the 14-ratio dataset for test year 2018 and save the ranking."""
    target_year = 2018
    rf_params_row = df_params[
        (df_params["data"].astype(str) == "14") & (df_params["testyear"] == target_year)
    ]
    if len(rf_params_row) == 0:
        # Make the skip visible instead of silently producing no output.
        logger.warning(f" 未找到数据集 14、测试年 {target_year} 的随机森林参数,跳过特征重要性分析")
        return

    best_params = ast.literal_eval(rf_params_row.iloc[0]["best_params"])
    logger.info(f" 使用参数: {best_params}")
    importance_df = compute_feature_importance(df_14, target_year, best_params, nor=0)

    # Attach the Chinese display name of each feature code.
    df_var14 = pd.read_csv(os.path.join(DATA_DIR, "variable_14.csv"))
    var_name_map = dict(zip(df_var14["var"], df_var14["chinese"]))
    importance_df["chinese_name"] = importance_df["feature"].map(var_name_map)

    logger.info("\n--- 特征重要性排序(14 会计比率,测试年=2018)---")
    logger.info(f"{'排名':<6}{'特征代码':<16}{'中文名称':<20}{'重要性':<12}{'累计贡献':<12}")
    logger.info("-" * 60)
    for _, row in importance_df.head(15).iterrows():
        logger.info(
            f"{row['rank']:<6}"
            f"{row['feature']:<16}"
            f"{str(row['chinese_name']):<20}"
            f"{row['importance']:<12.4f}"
            f"{row['cumulative']:<12.4f}"
        )

    imp_csv = os.path.join(LOG_DIR, f"feature_importance_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    importance_df.to_csv(imp_csv, index=False, encoding="utf-8-sig")
    logger.info(f"\n特征重要性结果已保存至: {imp_csv}")


def main():
    """Run the pipeline: load data, run every experiment, summarise, rank features."""
    # 1. Load data
    logger.info("[步骤 1/4] 加载数据文件...")
    datasets, df_params = _load_inputs()

    # 2. Train and evaluate each configured experiment
    logger.info("[步骤 2/4] 开始逐实验训练与评估...")
    results = _run_experiments(datasets, df_params)

    # 3. Summarise and save the metrics
    logger.info("[步骤 3/4] 汇总结果...")
    _summarize_results(results)

    # 4. Feature importance analysis
    logger.info("[步骤 4/4] 特征重要性分析...")
    _analyze_feature_importance(datasets["14"], df_params)

    logger.info(f"\n运行结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info(f"日志文件: {log_filename}")
# Entry point: run the full pipeline when this code is executed directly.
if __name__ == "__main__":
    main()
2026-06-27 13:00:02 [INFO] ============================================================
2026-06-27 13:00:02 [INFO] 随机森林算法复现 —— 会计风险(财务错报)预测
2026-06-27 13:00:02 [INFO] 运行开始时间: 2026-06-27 13:00:02
2026-06-27 13:00:02 [INFO] ============================================================
2026-06-27 13:00:02 [INFO] [步骤 1/4] 加载数据文件...
2026-06-27 13:00:04 [INFO] data_14 形状: (40574, 17)
2026-06-27 13:00:04 [INFO] data_33 形状: (40574, 36)
2026-06-27 13:00:04 [INFO] data_331 形状: (40574, 334)
2026-06-27 13:00:04 [INFO] 随机森林实验配置数: 18
2026-06-27 13:00:04 [INFO] [步骤 2/4] 开始逐实验训练与评估...
2026-06-27 13:00:04 [INFO] [1/18] 数据类型=14, 测试年=2013, 归一化=否, 参数={'max_depth': 5, 'n_estimators': 300}
2026-06-27 13:00:17 [INFO] -> AUC=0.6594, PRC=0.0472, NDCG=0.1059
2026-06-27 13:00:17 [INFO] [2/18] 数据类型=14, 测试年=2014, 归一化=否, 参数={'max_depth': 6, 'n_estimators': 840}
2026-06-27 13:01:05 [INFO] -> AUC=0.6215, PRC=0.0592, NDCG=0.1207
2026-06-27 13:01:05 [INFO] [3/18] 数据类型=14, 测试年=2015, 归一化=否, 参数={'max_depth': 8, 'n_estimators': 860}
2026-06-27 13:02:14 [INFO] -> AUC=0.6241, PRC=0.0720, NDCG=0.0814
2026-06-27 13:02:14 [INFO] [4/18] 数据类型=14, 测试年=2016, 归一化=否, 参数={'max_depth': 11, 'n_estimators': 110}
2026-06-27 13:02:27 [INFO] -> AUC=0.6104, PRC=0.0943, NDCG=0.1343
2026-06-27 13:02:27 [INFO] [5/18] 数据类型=14, 测试年=2017, 归一化=否, 参数={'max_depth': 8, 'n_estimators': 100}
2026-06-27 13:02:36 [INFO] -> AUC=0.6486, PRC=0.1180, NDCG=0.1672
2026-06-27 13:02:36 [INFO] [6/18] 数据类型=14, 测试年=2018, 归一化=否, 参数={'max_depth': 13, 'n_estimators': 480}
2026-06-27 13:03:48 [INFO] -> AUC=0.7392, PRC=0.1393, NDCG=0.1716
2026-06-27 13:03:48 [INFO] [7/18] 数据类型=33, 测试年=2013, 归一化=否, 参数={'max_depth': 17, 'n_estimators': 70}
2026-06-27 13:03:59 [INFO] -> AUC=0.7324, PRC=0.1954, NDCG=0.4161
2026-06-27 13:03:59 [INFO] [8/18] 数据类型=33, 测试年=2014, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 190}
2026-06-27 13:04:35 [INFO] -> AUC=0.7623, PRC=0.1324, NDCG=0.2708
2026-06-27 13:04:35 [INFO] [9/18] 数据类型=33, 测试年=2015, 归一化=否, 参数={'max_depth': 20, 'n_estimators': 1000}
2026-06-27 13:08:03 [INFO] -> AUC=0.7165, PRC=0.1303, NDCG=0.2293
2026-06-27 13:08:03 [INFO] [10/18] 数据类型=33, 测试年=2016, 归一化=否, 参数={'max_depth': 13, 'n_estimators': 610}
2026-06-27 13:09:58 [INFO] -> AUC=0.7278, PRC=0.1514, NDCG=0.2297
2026-06-27 13:09:58 [INFO] [11/18] 数据类型=33, 测试年=2017, 归一化=否, 参数={'max_depth': 17, 'n_estimators': 600}
2026-06-27 13:12:21 [INFO] -> AUC=0.7929, PRC=0.2402, NDCG=0.3164
2026-06-27 13:12:21 [INFO] [12/18] 数据类型=33, 测试年=2018, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 600}
2026-06-27 13:15:00 [INFO] -> AUC=0.8214, PRC=0.2081, NDCG=0.3023
2026-06-27 13:15:00 [INFO] [13/18] 数据类型=331, 测试年=2013, 归一化=否, 参数={'max_depth': 16, 'n_estimators': 300}
2026-06-27 13:16:13 [INFO] -> AUC=0.7911, PRC=0.2304, NDCG=0.4714
2026-06-27 13:16:13 [INFO] [14/18] 数据类型=331, 测试年=2014, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 1210}
2026-06-27 13:22:09 [INFO] -> AUC=0.7941, PRC=0.1563, NDCG=0.2890
2026-06-27 13:22:09 [INFO] [15/18] 数据类型=331, 测试年=2015, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 560}
2026-06-27 13:25:16 [INFO] -> AUC=0.7439, PRC=0.1413, NDCG=0.2630
2026-06-27 13:25:16 [INFO] [16/18] 数据类型=331, 测试年=2016, 归一化=否, 参数={'max_depth': 12, 'n_estimators': 510}
2026-06-27 13:27:35 [INFO] -> AUC=0.7280, PRC=0.1217, NDCG=0.1676
2026-06-27 13:27:35 [INFO] [17/18] 数据类型=331, 测试年=2017, 归一化=否, 参数={'max_depth': 17, 'n_estimators': 1300}
2026-06-27 13:35:59 [INFO] -> AUC=0.8066, PRC=0.2778, NDCG=0.3875
2026-06-27 13:35:59 [INFO] [18/18] 数据类型=331, 测试年=2018, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 1390}
2026-06-27 13:46:32 [INFO] -> AUC=0.8292, PRC=0.2147, NDCG=0.3120
2026-06-27 13:46:32 [INFO] [步骤 3/4] 汇总结果...
2026-06-27 13:46:32 [INFO]
============================================================
2026-06-27 13:46:32 [INFO] 随机森林复现结果汇总
2026-06-27 13:46:32 [INFO] ============================================================
2026-06-27 13:46:32 [INFO] 数据集 测试年 AUC PRC NDCG
2026-06-27 13:46:32 [INFO] --------------------------------------------------
2026-06-27 13:46:32 [INFO] 14 2013 0.6594 0.0472 0.1059
2026-06-27 13:46:32 [INFO] 14 2014 0.6215 0.0592 0.1207
2026-06-27 13:46:32 [INFO] 14 2015 0.6241 0.0720 0.0814
2026-06-27 13:46:32 [INFO] 14 2016 0.6104 0.0943 0.1343
2026-06-27 13:46:32 [INFO] 14 2017 0.6486 0.1180 0.1672
2026-06-27 13:46:32 [INFO] 14 2018 0.7392 0.1393 0.1716
2026-06-27 13:46:32 [INFO] 33 2013 0.7324 0.1954 0.4161
2026-06-27 13:46:32 [INFO] 33 2014 0.7623 0.1324 0.2708
2026-06-27 13:46:32 [INFO] 33 2015 0.7165 0.1303 0.2293
2026-06-27 13:46:32 [INFO] 33 2016 0.7278 0.1514 0.2297
2026-06-27 13:46:32 [INFO] 33 2017 0.7929 0.2402 0.3164
2026-06-27 13:46:32 [INFO] 33 2018 0.8214 0.2081 0.3023
2026-06-27 13:46:32 [INFO] 331 2013 0.7911 0.2304 0.4714
2026-06-27 13:46:32 [INFO] 331 2014 0.7941 0.1563 0.2890
2026-06-27 13:46:32 [INFO] 331 2015 0.7439 0.1413 0.2630
2026-06-27 13:46:32 [INFO] 331 2016 0.7280 0.1217 0.1676
2026-06-27 13:46:32 [INFO] 331 2017 0.8066 0.2778 0.3875
2026-06-27 13:46:32 [INFO] 331 2018 0.8292 0.2147 0.3120
2026-06-27 13:46:32 [INFO]
--- 按数据集分组的 AUC 描述统计 ---
2026-06-27 13:46:32 [INFO] 数据集 14: 平均 AUC=0.6505, 最高=0.7392, 最低=0.6104
2026-06-27 13:46:32 [INFO] 数据集 33: 平均 AUC=0.7589, 最高=0.8214, 最低=0.7165
2026-06-27 13:46:32 [INFO] 数据集 331: 平均 AUC=0.7822, 最高=0.8292, 最低=0.7280
2026-06-27 13:46:32 [INFO]
结果已保存至: ../3.运行日志/random_forest_result_20260627_134632.csv
2026-06-27 13:46:32 [INFO] [步骤 4/4] 特征重要性分析...
2026-06-27 13:46:32 [INFO] 使用参数: {'max_depth': 13, 'n_estimators': 480}
2026-06-27 13:47:43 [INFO]
--- 特征重要性排序(14 会计比率,测试年=2018)---
2026-06-27 13:47:43 [INFO] 排名 特征代码 中文名称 重要性 累计贡献
2026-06-27 13:47:43 [INFO] ------------------------------------------------------------
2026-06-27 13:47:43 [INFO] 1 soft_assets 软资产比例 0.0960 0.0960
2026-06-27 13:47:43 [INFO] 2 dch_rec 应收账款变动率 0.0870 0.1830
2026-06-27 13:47:43 [INFO] 3 ch_cs 现金销售率变动率 0.0846 0.2677
2026-06-27 13:47:43 [INFO] 4 reoa 留存收益比例 0.0832 0.3508
2026-06-27 13:47:43 [INFO] 5 EBIT 息税前利润比例 0.0790 0.4298
2026-06-27 13:47:43 [INFO] 6 bm 账面市值比 0.0756 0.5055
2026-06-27 13:47:43 [INFO] 7 ch_cm 现金保证金变动率 0.0746 0.5801
2026-06-27 13:47:43 [INFO] 8 dpi 折旧率指数 0.0746 0.6546
2026-06-27 13:47:43 [INFO] 9 dch_wc 经营资本应计项变动率 0.0711 0.7257
2026-06-27 13:47:43 [INFO] 10 dch_invt 存货变动率 0.0696 0.7953
2026-06-27 13:47:43 [INFO] 11 ch_roa ROA增长率 0.0685 0.8638
2026-06-27 13:47:43 [INFO] 12 ch_rsst RSST应计项变动率 0.0656 0.9294
2026-06-27 13:47:43 [INFO] 13 ch_fcf 自由现金流变动率 0.0647 0.9941
2026-06-27 13:47:43 [INFO] 14 issue 是否再融资 0.0059 1.0000
2026-06-27 13:47:43 [INFO]
特征重要性结果已保存至: ../3.运行日志/feature_importance_20260627_134743.csv
2026-06-27 13:47:43 [INFO]
运行结束时间: 2026-06-27 13:47:43
2026-06-27 13:47:43 [INFO] 日志文件: ../3.运行日志/random_forest_log_20260627_130002.txt