In [1]:
# -*- coding: utf-8 -*-
"""
日期: 2026/05/31
作者: lzh
描述: 随机森林复现工具模块 —— 数据加载、预处理、模型评估等核心函数。
      复现论文《机器学习与会计舞弊治理——基于非遴选因子的预测视角》(周玮)。
"""

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc as prc_auc_func


def norm(series: pd.Series) -> pd.Series:
    """Scale a series to unit Euclidean (L2) length.

    Args:
        series: Numeric series to rescale.

    Returns:
        ``series`` divided by its L2 norm. When the series has no non-zero
        entry (including the empty series) the very same object is returned
        untouched, which avoids dividing by a zero norm.
    """
    has_nonzero = (series != 0).any()
    if not has_nonzero:
        return series
    length = np.linalg.norm(series.values)
    return series / length


def get_data(df, testyear, nor, test_length=15):
    """Split a panel into a rolling training window and a single test year.

    Rows whose ``year`` lies in ``[testyear - test_length, testyear)`` form the
    training/validation set; rows whose ``year`` equals ``testyear`` form the
    test set. Rows containing any missing value are dropped from both sets.

    Args:
        df: Input frame. Must contain the columns ``stock``, ``year`` and
            ``y``; every other column is treated as a feature.
        testyear: Year used as the test set.
        nor: When equal to 1, every feature column is L2-normalised with
            ``norm``. The training and test sets are normalised separately,
            each column using the norm of its own set.
        test_length: Number of years preceding ``testyear`` that make up the
            training window.

    Returns:
        Tuple ``(X_trainval, y_trainval, X_test, y_test)``. The feature frames
        are independent copies, so they can be modified without touching
        ``df`` and without triggering pandas' SettingWithCopyWarning.
    """
    non_feature_cols = ["stock", "year", "y"]
    feature_cols = [c for c in df.columns if c not in non_feature_cols]

    train_years = list(range(testyear - test_length, testyear))
    df_trainval = df[df["year"].isin(train_years)].dropna(axis=0, how="any")
    df_test = df[df["year"].isin([testyear])].dropna(axis=0, how="any")

    # Explicit copies: the columns are overwritten below when nor == 1, and
    # assigning into a frame obtained by slicing another frame is the classic
    # chained-assignment pattern that pandas warns about.
    X_trainval = df_trainval[feature_cols].copy()
    X_test = df_test[feature_cols].copy()
    y_trainval, y_test = df_trainval["y"], df_test["y"]

    if nor == 1:
        for col in feature_cols:
            X_trainval[col] = norm(X_trainval[col])
            X_test[col] = norm(X_test[col])

    return X_trainval, y_trainval, X_test, y_test


def get_ndcg(model, X_test, y_test, k=0.06):
    """Compute NDCG over the top ``k`` fraction of the test set.

    Test rows are ranked by column 1 of ``model.predict_proba`` (presumably
    the probability of the positive class for a 0/1 target) and the first
    ``int(k * n)`` rows are scored, where ``n`` is the number of test rows.
    The gain of a row is ``2 ** y_true - 1`` and the discount at 1-based
    position ``i`` is ``log2(i + 1)``. The result is divided by the same
    quantity computed for an ideal ordering (labels sorted descending).

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Test features (DataFrame); passed to the model as ``.values``.
        y_test: Test labels (Series).
        k: Fraction of the test set that is kept for scoring.

    Returns:
        ``DCG / IDCG``, or ``0.0`` when the ideal DCG is zero (for example
        when the cut-off is empty or the test set has no positive label).
    """

    def _discounted_gain(labels):
        # Exponential gain discounted by log2(position + 1), positions from 1.
        positions = np.arange(1, len(labels) + 1)
        return np.sum((2 ** labels - 1) / np.log2(positions + 1))

    scores = model.predict_proba(X_test.values)[:, 1]
    labels = y_test.values.flatten()
    cutoff = int(k * len(y_test))

    ranked = (
        pd.DataFrame({"y_proba": scores, "y_true": labels})
        .sort_values(by="y_proba", ascending=False)
        .iloc[:cutoff]
    )
    ideal = (
        pd.DataFrame({"y_true": labels})
        .sort_values(by="y_true", ascending=False)
        .iloc[:cutoff]
    )

    dcg = _discounted_gain(ranked["y_true"])
    idcg = _discounted_gain(ideal["y_true"])

    if idcg == 0:
        return 0.0
    return dcg / idcg


def model_evaluate(model, X_test, y_test):
    """Score a fitted classifier with ROC AUC and precision-recall AUC.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the score.
        X_test: Test features (DataFrame); passed to the model as ``.values``.
        y_test: Test labels (Series).

    Returns:
        Tuple ``(auc, prc)``: the ROC AUC from ``roc_auc_score`` and the area
        under the precision-recall curve, obtained by integrating precision
        over recall with ``sklearn.metrics.auc``.
    """
    scores = model.predict_proba(X_test.values)[:, 1]
    labels = y_test.values.flatten()

    roc_area = roc_auc_score(labels, scores)

    pr_precision, pr_recall, _thresholds = precision_recall_curve(labels, scores)
    pr_area = prc_auc_func(pr_recall, pr_precision)

    return roc_area, pr_area


def train_and_evaluate(X_trainval, y_trainval, X_test, y_test, best_params, k=0.06):
    """Fit a random forest and score it on the test set.

    Args:
        X_trainval: Training features.
        y_trainval: Training labels.
        X_test: Test features.
        y_test: Test labels.
        best_params: Mapping that provides ``max_depth`` and ``n_estimators``.
        k: Fraction of the test set used for the NDCG cut-off.

    Returns:
        Tuple ``(model, auc, prc, ndcg)`` with the fitted forest followed by
        its ROC AUC, precision-recall AUC and NDCG@k.
    """
    depth = best_params["max_depth"]
    n_trees = best_params["n_estimators"]

    # The seed is fixed at 0 so repeated runs give the same forest.
    forest = RF(n_estimators=n_trees, max_depth=depth, random_state=0)
    forest.fit(X_trainval, y_trainval)

    roc_area, pr_area = model_evaluate(forest, X_test, y_test)
    ndcg_at_k = get_ndcg(forest, X_test, y_test, k)

    return forest, roc_area, pr_area, ndcg_at_k


def compute_feature_importance(df, testyear, best_params, nor=0):
    """Fit a random forest and rank its features by importance.

    The forest is trained on the training window that ``get_data`` builds for
    ``testyear``; the test split is not used here.

    Args:
        df: Input frame with ``stock``, ``year``, ``y`` and feature columns.
        testyear: Test year that defines the training window.
        best_params: Mapping that provides ``max_depth`` and ``n_estimators``.
        nor: Normalisation flag forwarded to ``get_data``.

    Returns:
        DataFrame sorted by descending importance with the columns
        ``feature``, ``importance``, ``cumulative`` (running sum of the
        importances) and ``rank`` (1-based position).
    """
    X_trainval, y_trainval, _X_test, _y_test = get_data(df, testyear, nor)

    excluded = {"stock", "year", "y"}
    feature_names = [name for name in df.columns if name not in excluded]

    depth = best_params["max_depth"]
    n_trees = best_params["n_estimators"]
    forest = RF(n_estimators=n_trees, max_depth=depth, random_state=0)
    forest.fit(X_trainval, y_trainval)

    ranking = pd.DataFrame(
        {"feature": feature_names, "importance": forest.feature_importances_}
    )
    ranking = ranking.sort_values(by="importance", ascending=False)
    ranking = ranking.reset_index(drop=True)

    ranking["cumulative"] = ranking["importance"].cumsum()
    ranking["rank"] = range(1, len(ranking) + 1)

    return ranking
In [2]:
# -*- coding: utf-8 -*-
"""
日期: 2026/05/31
作者: lzh
描述: 随机森林算法复现 —— 机器学习在会计风险(财务错报)预测中的应用。
      复现论文《机器学习与会计舞弊治理——基于非遴选因子的预测视角》(周玮)。
      主入口:数据加载、滚动窗口划分、模型训练、AUC/PRC/NDCG 评估、特征重要性分析及日志输出。
"""

import os
import sys
import ast
import logging
from datetime import datetime

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")


# ======================== Paths and logging configuration ========================

# Directory holding the input CSV files.
DATA_DIR = "/kaggle/input/datasets/songsammy/final-data-27123229"
# Directory that receives the run log and the result CSV files.
LOG_DIR = "../3.运行日志"
os.makedirs(LOG_DIR, exist_ok=True)

# One log file per run, named after the start timestamp.
log_filename = os.path.join(LOG_DIR, f"random_forest_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[
        logging.FileHandler(log_filename, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    # Without force=True, basicConfig does nothing when the root logger
    # already has handlers (e.g. when this cell is re-run in the same kernel),
    # so the freshly computed log_filename would never be written to.
    force=True,
)
logger = logging.getLogger(__name__)

logger.info("=" * 60)
logger.info("随机森林算法复现 —— 会计风险(财务错报)预测")
logger.info(f"运行开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
logger.info("=" * 60)


# ======================== 主流程 ========================

def main():
    """Run the random-forest replication end to end.

    Steps:
        1. Load the three datasets and the hyper-parameter table from
           ``DATA_DIR``, keeping only the rows whose ``model`` is ``"RF"``.
        2. For every parameter row, build the rolling split with ``get_data``,
           train with ``train_and_evaluate`` and log AUC / PRC / NDCG.
        3. Log a summary table plus per-dataset AUC statistics and write the
           results to a CSV file in ``LOG_DIR``.
        4. Compute, log and save the feature importances for dataset ``14``
           with test year ``target_year``.

    Raises:
        ValueError: If a parameter row names a dataset other than
            ``14``, ``33`` or ``331``.
    """

    def _load_csv(filename):
        # All inputs live in DATA_DIR; build the path once instead of
        # repeating the absolute directory at every call site.
        return pd.read_csv(os.path.join(DATA_DIR, filename))

    # 1. Load data
    logger.info("[步骤 1/4] 加载数据文件...")
    datasets = {
        "14": _load_csv("data_14.csv"),
        "33": _load_csv("data_33.csv"),
        "331": _load_csv("data_331.csv"),
    }
    for name, frame in datasets.items():
        logger.info(f"  data_{name} 形状: {frame.shape}")

    # Load the hyper-parameters and keep only the random-forest rows.
    df_params = _load_csv("params_table3_table7panelA_tableC_part1.csv")
    df_params = df_params[df_params["model"] == "RF"].reset_index(drop=True)
    n_runs = len(df_params)
    logger.info(f"  随机森林实验配置数: {n_runs}")

    # 2. Run every experiment
    logger.info("[步骤 2/4] 开始逐实验训练与评估...")
    results = []

    for run_no, cfg in enumerate(df_params.itertuples(index=False), start=1):
        data_type = str(cfg.data)
        testyear = int(cfg.testyear)
        nor = int(cfg.normalize)
        # best_params is stored as the text of a Python dict literal.
        best_params = ast.literal_eval(cfg.best_params)

        if data_type not in datasets:
            raise ValueError(f"未知数据类型: {data_type}")
        # get_data only filters rows and selects columns into new frames, so
        # the loaded frame can be shared between runs without copying it.
        df = datasets[data_type]

        logger.info(f"  [{run_no}/{n_runs}] 数据类型={data_type}, 测试年={testyear}, "
                    f"归一化={'是' if nor else '否'}, 参数={best_params}")

        X_trainval, y_trainval, X_test, y_test = get_data(df, testyear, nor)
        model, auc, prc, ndcg = train_and_evaluate(
            X_trainval, y_trainval, X_test, y_test, best_params
        )

        logger.info(f"    -> AUC={auc:.4f}, PRC={prc:.4f}, NDCG={ndcg:.4f}")

        # Metrics are stored as 4-decimal strings, which is how they appear
        # in the summary table and in the result CSV.
        results.append({
            "data": data_type,
            "testyear": testyear,
            "model": "RF",
            "auc": "{:.4f}".format(auc),
            "prc": "{:.4f}".format(prc),
            "ndcg": "{:.4f}".format(ndcg),
        })

    # 3. Summarise results
    logger.info("[步骤 3/4] 汇总结果...")
    df_result = pd.DataFrame(results)

    logger.info("\n" + "=" * 60)
    logger.info("随机森林复现结果汇总")
    logger.info("=" * 60)
    logger.info(f"{'数据集':<12}{'测试年':<8}{'AUC':<10}{'PRC':<10}{'NDCG':<10}")
    logger.info("-" * 50)
    for _, row in df_result.iterrows():
        logger.info(f"{row['data']:<12}{row['testyear']:<8}{row['auc']:<10}{row['prc']:<10}{row['ndcg']:<10}")

    logger.info("\n--- 按数据集分组的 AUC 描述统计 ---")
    # Numeric view of the formatted AUC strings (i.e. already rounded to four
    # decimals). The column is kept in the frame and therefore also ends up
    # in the saved CSV.
    df_result["auc_float"] = df_result["auc"].astype(float)
    for dtype in datasets:
        sub = df_result[df_result["data"] == dtype]
        logger.info(f"  数据集 {dtype}: 平均 AUC={sub['auc_float'].mean():.4f}, "
                    f"最高={sub['auc_float'].max():.4f}, 最低={sub['auc_float'].min():.4f}")

    # Save the results
    result_csv = os.path.join(LOG_DIR, f"random_forest_result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
    df_result.to_csv(result_csv, index=False, encoding="utf-8-sig")
    logger.info(f"\n结果已保存至: {result_csv}")

    # 4. Feature-importance analysis
    logger.info("[步骤 4/4] 特征重要性分析...")
    target_year = 2018
    rf_params_row = df_params[
        (df_params["data"].astype(str) == "14") & (df_params["testyear"] == target_year)
    ]
    if rf_params_row.empty:
        # Make the skip visible instead of silently producing no output.
        logger.warning(f"  未找到数据集 14、测试年={target_year} 的随机森林参数,跳过特征重要性分析")
    else:
        best_params = ast.literal_eval(rf_params_row.iloc[0]["best_params"])
        logger.info(f"  使用参数: {best_params}")

        importance_df = compute_feature_importance(datasets["14"], target_year, best_params, nor=0)

        # Map feature codes to their Chinese display names.
        df_var14 = _load_csv("variable_14.csv")
        var_name_map = dict(zip(df_var14["var"], df_var14["chinese"]))
        importance_df["chinese_name"] = importance_df["feature"].map(var_name_map)

        logger.info(f"\n--- 特征重要性排序(14 会计比率,测试年={target_year})---")
        logger.info(f"{'排名':<6}{'特征代码':<16}{'中文名称':<20}{'重要性':<12}{'累计贡献':<12}")
        logger.info("-" * 60)
        for _, row in importance_df.head(15).iterrows():
            logger.info(
                f"{row['rank']:<6}"
                f"{row['feature']:<16}"
                f"{str(row['chinese_name']):<20}"
                f"{row['importance']:<12.4f}"
                f"{row['cumulative']:<12.4f}"
            )

        imp_csv = os.path.join(LOG_DIR, f"feature_importance_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv")
        importance_df.to_csv(imp_csv, index=False, encoding="utf-8-sig")
        logger.info(f"\n特征重要性结果已保存至: {imp_csv}")

    logger.info(f"\n运行结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    logger.info(f"日志文件: {log_filename}")


if __name__ == "__main__":
    main()
2026-06-27 13:00:02 [INFO] ============================================================
2026-06-27 13:00:02 [INFO] 随机森林算法复现 —— 会计风险(财务错报)预测
2026-06-27 13:00:02 [INFO] 运行开始时间: 2026-06-27 13:00:02
2026-06-27 13:00:02 [INFO] ============================================================
2026-06-27 13:00:02 [INFO] [步骤 1/4] 加载数据文件...
2026-06-27 13:00:04 [INFO]   data_14 形状: (40574, 17)
2026-06-27 13:00:04 [INFO]   data_33 形状: (40574, 36)
2026-06-27 13:00:04 [INFO]   data_331 形状: (40574, 334)
2026-06-27 13:00:04 [INFO]   随机森林实验配置数: 18
2026-06-27 13:00:04 [INFO] [步骤 2/4] 开始逐实验训练与评估...
2026-06-27 13:00:04 [INFO]   [1/18] 数据类型=14, 测试年=2013, 归一化=否, 参数={'max_depth': 5, 'n_estimators': 300}
2026-06-27 13:00:17 [INFO]     -> AUC=0.6594, PRC=0.0472, NDCG=0.1059
2026-06-27 13:00:17 [INFO]   [2/18] 数据类型=14, 测试年=2014, 归一化=否, 参数={'max_depth': 6, 'n_estimators': 840}
2026-06-27 13:01:05 [INFO]     -> AUC=0.6215, PRC=0.0592, NDCG=0.1207
2026-06-27 13:01:05 [INFO]   [3/18] 数据类型=14, 测试年=2015, 归一化=否, 参数={'max_depth': 8, 'n_estimators': 860}
2026-06-27 13:02:14 [INFO]     -> AUC=0.6241, PRC=0.0720, NDCG=0.0814
2026-06-27 13:02:14 [INFO]   [4/18] 数据类型=14, 测试年=2016, 归一化=否, 参数={'max_depth': 11, 'n_estimators': 110}
2026-06-27 13:02:27 [INFO]     -> AUC=0.6104, PRC=0.0943, NDCG=0.1343
2026-06-27 13:02:27 [INFO]   [5/18] 数据类型=14, 测试年=2017, 归一化=否, 参数={'max_depth': 8, 'n_estimators': 100}
2026-06-27 13:02:36 [INFO]     -> AUC=0.6486, PRC=0.1180, NDCG=0.1672
2026-06-27 13:02:36 [INFO]   [6/18] 数据类型=14, 测试年=2018, 归一化=否, 参数={'max_depth': 13, 'n_estimators': 480}
2026-06-27 13:03:48 [INFO]     -> AUC=0.7392, PRC=0.1393, NDCG=0.1716
2026-06-27 13:03:48 [INFO]   [7/18] 数据类型=33, 测试年=2013, 归一化=否, 参数={'max_depth': 17, 'n_estimators': 70}
2026-06-27 13:03:59 [INFO]     -> AUC=0.7324, PRC=0.1954, NDCG=0.4161
2026-06-27 13:03:59 [INFO]   [8/18] 数据类型=33, 测试年=2014, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 190}
2026-06-27 13:04:35 [INFO]     -> AUC=0.7623, PRC=0.1324, NDCG=0.2708
2026-06-27 13:04:35 [INFO]   [9/18] 数据类型=33, 测试年=2015, 归一化=否, 参数={'max_depth': 20, 'n_estimators': 1000}
2026-06-27 13:08:03 [INFO]     -> AUC=0.7165, PRC=0.1303, NDCG=0.2293
2026-06-27 13:08:03 [INFO]   [10/18] 数据类型=33, 测试年=2016, 归一化=否, 参数={'max_depth': 13, 'n_estimators': 610}
2026-06-27 13:09:58 [INFO]     -> AUC=0.7278, PRC=0.1514, NDCG=0.2297
2026-06-27 13:09:58 [INFO]   [11/18] 数据类型=33, 测试年=2017, 归一化=否, 参数={'max_depth': 17, 'n_estimators': 600}
2026-06-27 13:12:21 [INFO]     -> AUC=0.7929, PRC=0.2402, NDCG=0.3164
2026-06-27 13:12:21 [INFO]   [12/18] 数据类型=33, 测试年=2018, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 600}
2026-06-27 13:15:00 [INFO]     -> AUC=0.8214, PRC=0.2081, NDCG=0.3023
2026-06-27 13:15:00 [INFO]   [13/18] 数据类型=331, 测试年=2013, 归一化=否, 参数={'max_depth': 16, 'n_estimators': 300}
2026-06-27 13:16:13 [INFO]     -> AUC=0.7911, PRC=0.2304, NDCG=0.4714
2026-06-27 13:16:13 [INFO]   [14/18] 数据类型=331, 测试年=2014, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 1210}
2026-06-27 13:22:09 [INFO]     -> AUC=0.7941, PRC=0.1563, NDCG=0.2890
2026-06-27 13:22:09 [INFO]   [15/18] 数据类型=331, 测试年=2015, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 560}
2026-06-27 13:25:16 [INFO]     -> AUC=0.7439, PRC=0.1413, NDCG=0.2630
2026-06-27 13:25:16 [INFO]   [16/18] 数据类型=331, 测试年=2016, 归一化=否, 参数={'max_depth': 12, 'n_estimators': 510}
2026-06-27 13:27:35 [INFO]     -> AUC=0.7280, PRC=0.1217, NDCG=0.1676
2026-06-27 13:27:35 [INFO]   [17/18] 数据类型=331, 测试年=2017, 归一化=否, 参数={'max_depth': 17, 'n_estimators': 1300}
2026-06-27 13:35:59 [INFO]     -> AUC=0.8066, PRC=0.2778, NDCG=0.3875
2026-06-27 13:35:59 [INFO]   [18/18] 数据类型=331, 测试年=2018, 归一化=否, 参数={'max_depth': 19, 'n_estimators': 1390}
2026-06-27 13:46:32 [INFO]     -> AUC=0.8292, PRC=0.2147, NDCG=0.3120
2026-06-27 13:46:32 [INFO] [步骤 3/4] 汇总结果...
2026-06-27 13:46:32 [INFO] 
============================================================
2026-06-27 13:46:32 [INFO] 随机森林复现结果汇总
2026-06-27 13:46:32 [INFO] ============================================================
2026-06-27 13:46:32 [INFO] 数据集         测试年     AUC       PRC       NDCG      
2026-06-27 13:46:32 [INFO] --------------------------------------------------
2026-06-27 13:46:32 [INFO] 14          2013    0.6594    0.0472    0.1059    
2026-06-27 13:46:32 [INFO] 14          2014    0.6215    0.0592    0.1207    
2026-06-27 13:46:32 [INFO] 14          2015    0.6241    0.0720    0.0814    
2026-06-27 13:46:32 [INFO] 14          2016    0.6104    0.0943    0.1343    
2026-06-27 13:46:32 [INFO] 14          2017    0.6486    0.1180    0.1672    
2026-06-27 13:46:32 [INFO] 14          2018    0.7392    0.1393    0.1716    
2026-06-27 13:46:32 [INFO] 33          2013    0.7324    0.1954    0.4161    
2026-06-27 13:46:32 [INFO] 33          2014    0.7623    0.1324    0.2708    
2026-06-27 13:46:32 [INFO] 33          2015    0.7165    0.1303    0.2293    
2026-06-27 13:46:32 [INFO] 33          2016    0.7278    0.1514    0.2297    
2026-06-27 13:46:32 [INFO] 33          2017    0.7929    0.2402    0.3164    
2026-06-27 13:46:32 [INFO] 33          2018    0.8214    0.2081    0.3023    
2026-06-27 13:46:32 [INFO] 331         2013    0.7911    0.2304    0.4714    
2026-06-27 13:46:32 [INFO] 331         2014    0.7941    0.1563    0.2890    
2026-06-27 13:46:32 [INFO] 331         2015    0.7439    0.1413    0.2630    
2026-06-27 13:46:32 [INFO] 331         2016    0.7280    0.1217    0.1676    
2026-06-27 13:46:32 [INFO] 331         2017    0.8066    0.2778    0.3875    
2026-06-27 13:46:32 [INFO] 331         2018    0.8292    0.2147    0.3120    
2026-06-27 13:46:32 [INFO] 
--- 按数据集分组的 AUC 描述统计 ---
2026-06-27 13:46:32 [INFO]   数据集 14: 平均 AUC=0.6505, 最高=0.7392, 最低=0.6104
2026-06-27 13:46:32 [INFO]   数据集 33: 平均 AUC=0.7589, 最高=0.8214, 最低=0.7165
2026-06-27 13:46:32 [INFO]   数据集 331: 平均 AUC=0.7822, 最高=0.8292, 最低=0.7280
2026-06-27 13:46:32 [INFO] 
结果已保存至: ../3.运行日志/random_forest_result_20260627_134632.csv
2026-06-27 13:46:32 [INFO] [步骤 4/4] 特征重要性分析...
2026-06-27 13:46:32 [INFO]   使用参数: {'max_depth': 13, 'n_estimators': 480}
2026-06-27 13:47:43 [INFO] 
--- 特征重要性排序(14 会计比率,测试年=2018)---
2026-06-27 13:47:43 [INFO] 排名    特征代码            中文名称                重要性         累计贡献        
2026-06-27 13:47:43 [INFO] ------------------------------------------------------------
2026-06-27 13:47:43 [INFO] 1     soft_assets     软资产比例               0.0960      0.0960      
2026-06-27 13:47:43 [INFO] 2     dch_rec         应收账款变动率             0.0870      0.1830      
2026-06-27 13:47:43 [INFO] 3     ch_cs           现金销售率变动率            0.0846      0.2677      
2026-06-27 13:47:43 [INFO] 4     reoa            留存收益比例              0.0832      0.3508      
2026-06-27 13:47:43 [INFO] 5     EBIT            息税前利润比例             0.0790      0.4298      
2026-06-27 13:47:43 [INFO] 6     bm              账面市值比               0.0756      0.5055      
2026-06-27 13:47:43 [INFO] 7     ch_cm           现金保证金变动率            0.0746      0.5801      
2026-06-27 13:47:43 [INFO] 8     dpi             折旧率指数               0.0746      0.6546      
2026-06-27 13:47:43 [INFO] 9     dch_wc          经营资本应计项变动率          0.0711      0.7257      
2026-06-27 13:47:43 [INFO] 10    dch_invt        存货变动率               0.0696      0.7953      
2026-06-27 13:47:43 [INFO] 11    ch_roa          ROA增长率              0.0685      0.8638      
2026-06-27 13:47:43 [INFO] 12    ch_rsst         RSST应计项变动率          0.0656      0.9294      
2026-06-27 13:47:43 [INFO] 13    ch_fcf          自由现金流变动率            0.0647      0.9941      
2026-06-27 13:47:43 [INFO] 14    issue           是否再融资               0.0059      1.0000      
2026-06-27 13:47:43 [INFO] 
特征重要性结果已保存至: ../3.运行日志/feature_importance_20260627_134743.csv
2026-06-27 13:47:43 [INFO] 
运行结束时间: 2026-06-27 13:47:43
2026-06-27 13:47:43 [INFO] 日志文件: ../3.运行日志/random_forest_log_20260627_130002.txt