"""
data_simulator.py
模拟"产业链-投资"层次化图数据集，结构与 Hua et al. 2024 论文中描述的数据一致。

核心设计思想：
公司的标签 y_i 由三个信号源共同决定，使得单一来源的基线模型表现不如融合后的 HiGNN：
    
    1. 时序金融趋势 (temporal financial trend)
       - 由 LSTM 自编码器恢复
       - 包含周期性模式和静态基线偏移
    
    2. 投资图结构 (investment graph structure)
       - 由 GIN 编码器恢复
       - 通过子图节点数、边密度、投资者类型比例等结构特征编码
    
    3. 上下游传染 (upstream/downstream contagion)
       - 由 GCN 在产业链图上传播恢复
       - 通过图平滑增强弱信号

这种设计确保了：
    - 单一模态基线（如 RNN、GCN）只能学习部分信号
    - HiGNN 融合三种信号后性能显著超越基线
"""
import os
import pickle
import numpy as np
import networkx as nx

# ----- Global configuration -----
SEED = 2024                # random seed; used for the chain-graph generator and the NumPy generator
N_COMPANY = 200            # number of companies (per the original note: the paper uses 1700; reduced here for training speed)
T_QUARTERS = 49            # number of quarters (per the original note: same as the paper)
M_FINANCIAL = 18           # number of financial indicators (per the original note: same as the paper)
INVEST_NODE_MIN = 4        # minimum node count of an investment subgraph
INVEST_NODE_MAX = 40       # maximum node count of an investment subgraph
PERSON_RATIO = 0.25        # default individual-investor ratio; NOTE(review): not referenced anywhere in the visible code

# Output directory for the generated data files; created at import time.
# NOTE(review): os.path.dirname("main.ipynb") evaluates to "", so this path is
# "../data" relative to the current working directory, not to this file's
# location -- confirm that this is intended.
DATA_DIR = os.path.join(os.path.dirname("main.ipynb"), "..", "data")
os.makedirs(DATA_DIR, exist_ok=True)


def build_industrial_chain(n, m_attach=3, seed=SEED):
    """
    Generate the industrial-chain graph with the Barabási–Albert model.

    The Barabási–Albert model grows a graph by preferential attachment,
    which yields a sparse network dominated by a few highly connected hubs.
    The original author chose it because that resembles a real industrial
    chain with a handful of core companies.

    Parameters:
        n: number of nodes (companies)
        m_attach: number of edges each newly added node attaches with
            (default 3)
        seed: random seed forwarded to NetworkX

    Returns:
        The NetworkX graph produced by ``nx.barabasi_albert_graph``.
    """
    # Pure delegation: NetworkX performs the whole construction.
    return nx.barabasi_albert_graph(n, m_attach, seed=seed)


def build_invest_subgraph(center_id, latent_invest_score, rng):
    """
    Build the investment subgraph centred on one listed company.

    Local node 0 is the listed company. Every other node is an investor
    joined to node 0 by exactly one star edge; a number of randomly sampled
    extra edges is added on top.

    One latent score drives three structural properties, each of which
    shrinks as the score grows:
        1. node count: INVEST_NODE_MIN plus (MAX - MIN) * (1 - score) with
           N(0, 1) noise, rounded and clipped to [MIN, MAX]
        2. extra-edge attempts: int((1 - score) * n_node)
        3. probability that an investor is an individual:
           (1 - score) * 0.45 + 0.05

    Parameters:
        center_id: ID of the central listed company. It is not used in the
            body; the centre is always local node 0.
        latent_invest_score: latent investment score, documented by the
            original author as lying in [0, 1] with higher meaning riskier
        rng: random generator providing ``normal``, ``random`` and
            ``integers`` (the NumPy Generator methods used below)

    Returns:
        dict with:
            - n_node: number of nodes
            - types: int64 array of node types
              (0 = listed company, 1 = institutional investor,
              2 = individual investor)
            - edges: int64 array of (source, target) pairs
    """
    # "Health" is the complement of the risk score; all three structural
    # channels scale with it.
    health = 1.0 - latent_invest_score

    # Channel 1: node count. The noise draw is the first RNG call.
    span = INVEST_NODE_MAX - INVEST_NODE_MIN
    raw_size = INVEST_NODE_MIN + int(round(span * health
                                           + rng.normal(0, 1.0)))
    n_node = max(INVEST_NODE_MIN, min(INVEST_NODE_MAX, raw_size))

    # Channel 3: healthier companies attract more individual investors.
    person_ratio = health * 0.45 + 0.05

    # Node 0 is the listed company (type 0); each remaining node is an
    # individual (2) with probability person_ratio, else an institution (1).
    types = [0] + [2 if rng.random() < person_ratio else 1
                   for _ in range(n_node - 1)]

    # Star edges: one per investor. With probability 0.7 the investor
    # points at the centre (j, 0); otherwise the centre points at it (0, j).
    edges = [(j, 0) if rng.random() < 0.7 else (0, j)
             for j in range(1, n_node)]

    # Channel 2: extra edges. Endpoints are sampled from all nodes
    # (node 0 included); self-pairs are dropped, duplicates are kept.
    density = 1.0
    n_extra = int(health * density * n_node)
    for _ in range(n_extra):
        src, dst = rng.integers(0, n_node, size=2)
        if src == dst:
            continue
        edges.append((int(src), int(dst)))

    return {
        "n_node": n_node,
        "types": np.array(types, dtype=np.int64),
        "edges": np.array(edges, dtype=np.int64),
    }


def simulate_financials(n, T, M, latent_time_score, latent_contagion_score,
                        rng):
    """
    Generate the financial time-series tensor.

    Each company's series is the sum of three parts:
        * a static per-indicator baseline whose mean is shifted by
          0.30 * (1 - 2 * contagion score) around 1.0 (std 0.12)
        * a 12-quarter cyclic pattern mixing a sine and a cosine wave with
          weights ``time score`` and ``1 - time score``, scaled by a
          per-indicator amplitude drawn from U(0.18, 0.50)
        * i.i.d. Gaussian noise (std 0.20) on every quarter and indicator

    Per the original author's design notes, the cyclic phase is meant to be
    recovered by a sequence model and the weak baseline shift by averaging
    over chain neighbours.

    Parameters:
        n: number of companies
        T: number of quarters
        M: number of financial indicators
        latent_time_score: temporal risk scores, indexed 0..n-1
        latent_contagion_score: contagion risk scores, indexed 0..n-1
        rng: random generator providing ``normal`` and ``uniform``

    Returns:
        float32 array of shape [n, T, M]
    """
    # Shared 12-quarter cycle, stored as [T, 1] columns so they broadcast
    # against the per-indicator amplitudes.
    quarters = np.arange(T)
    cos_col = np.cos(2 * np.pi * quarters / 12.0)[:, None]
    sin_col = np.sin(2 * np.pi * quarters / 12.0)[:, None]

    series = np.zeros((n, T, M), dtype=np.float32)

    # Per company, the RNG is consumed in the order baseline -> amplitude
    # -> noise; keep that order so a given seed reproduces the same data.
    for i in range(n):
        # Static baseline: the contagion score shifts its mean only weakly.
        offset = 0.30 * (1.0 - 2.0 * latent_contagion_score[i])
        level = rng.normal(loc=1.0 + offset, scale=0.12, size=M)

        # Phase mix: high time score leans to sine, low leans to cosine.
        sin_weight = latent_time_score[i]
        cos_weight = 1.0 - latent_time_score[i]
        amplitude = rng.uniform(0.18, 0.50, size=M)
        wave = (sin_weight * sin_col
                + cos_weight * cos_col) * amplitude[None, :]  # [T, M]

        # Independent noise on every quarter and indicator.
        jitter = rng.normal(0, 0.20, size=(T, M))

        series[i] = level + wave + jitter

    return series


def simulate_dataset():
    """
    Run the complete dataset simulation and write the result to DATA_DIR.

    Steps:
        1. Build the industrial-chain graph (BA model).
        2. Draw three raw latent risk factors (time, invest, contagion).
        3. Smooth the latent factors over the chain graph so that
           neighbouring companies get correlated values.
        4. Combine the factors (plus noise) into a risk score and
           threshold it at the median to obtain binary labels.
        5. Generate the financial time-series tensor.
        6. Build one investment subgraph per company.
        7. Save ``financial.npy``, ``industrial_chain.pkl`` and
           ``invest_graphs.pkl``, then print a summary.

    Design intent stated by the original author (not verified by this
    code): each modality explains only part of the risk -- roughly 0.28 for
    the temporal signal, 0.38 for the investment-graph signal and 0.34 for
    the chain-contagion signal -- so that XGBoost/sequence baselines reach
    about 0.55-0.60 while a model fusing all three reaches about 0.70.

    All randomness comes from one generator seeded with SEED and from the
    seeded graph generator. The order of the draws below is therefore part
    of the output and must not be rearranged.

    Returns:
        None. Results are written to disk and summarised on stdout.
    """
    rng = np.random.default_rng(SEED)

    # ---- step 1: industrial-chain graph ----
    chain = build_industrial_chain(N_COMPANY, m_attach=3, seed=SEED)
    edges = np.array(list(chain.edges()), dtype=np.int64)

    # ---- step 2: raw latent factors, drawn in the order
    #      time -> invest -> contagion ----
    latent_time_raw, latent_invest_raw, latent_contagion_raw = (
        rng.uniform(0, 1, size=N_COMPANY) for _ in range(3)
    )

    # ---- step 3: smoothing over the chain graph ----
    # Adjacency divided row-wise by degree; the small constant guards
    # against division by zero.
    adjacency = nx.to_numpy_array(chain)
    degree = adjacency.sum(1, keepdims=True) + 1e-6
    neighbor_mean = adjacency / degree

    def blend(values, keep, spread):
        # Mix each node's own value with the average over its neighbours.
        return keep * values + spread * (neighbor_mean @ values)

    # Time and invest factors: 55% own signal + 45% neighbour average.
    latent_time = blend(latent_time_raw, 0.55, 0.45)
    latent_invest = blend(latent_invest_raw, 0.55, 0.45)

    # Contagion factor: two rounds of 60/40 propagation for a stronger
    # spatial correlation.
    latent_contagion = blend(blend(latent_contagion_raw, 0.6, 0.4),
                             0.6, 0.4)

    # ---- step 4: risk score and labels ----
    # Weights: invest > contagion > time, plus a small Gaussian noise term.
    risk_noise = rng.normal(0, 0.05, size=N_COMPANY)
    risk = (0.24 * latent_time +
            0.45 * latent_invest +
            0.31 * latent_contagion +
            risk_noise)

    # Thresholding at the median gives a roughly 50/50 label split.
    labels = (risk > np.median(risk)).astype(np.int64)

    # ---- step 5: financial tensor ----
    X_fin = simulate_financials(N_COMPANY, T_QUARTERS, M_FINANCIAL,
                                latent_time, latent_contagion, rng)

    # ---- step 6: one investment subgraph per company ----
    invest_graphs = [
        build_invest_subgraph(company, latent_invest[company], rng)
        for company in range(N_COMPANY)
    ]

    # ---- step 7: persist everything ----
    np.save(os.path.join(DATA_DIR, "financial.npy"), X_fin)

    # Chain graph together with the labels and the latent factors.
    chain_payload = {
        "n_node": N_COMPANY,
        "edges": edges,
        "labels": labels,
        "latent": {
            "time": latent_time,
            "invest": latent_invest,
            "contagion": latent_contagion,
            "risk": risk,
        },
    }
    with open(os.path.join(DATA_DIR, "industrial_chain.pkl"), "wb") as fh:
        pickle.dump(chain_payload, fh)

    with open(os.path.join(DATA_DIR, "invest_graphs.pkl"), "wb") as fh:
        pickle.dump(invest_graphs, fh)

    # ---- summary on stdout ----
    node_counts = [g['n_node'] for g in invest_graphs]
    edge_counts = [len(g['edges']) for g in invest_graphs]
    n_pos = int(labels.sum())
    n_neg = int((1 - labels).sum())
    banner = "=" * 60

    print(banner)
    print("SIMULATED DATASET")
    print(banner)
    print(f"  Companies (industrial chain nodes): {N_COMPANY}")
    print(f"  Industrial chain edges            : {len(edges)}")
    print(f"  Quarters                          : {T_QUARTERS}")
    print(f"  Financial indicators              : {M_FINANCIAL}")
    print(f"  Avg invest subgraph nodes         : "
          f"{np.mean(node_counts):.2f}")
    print(f"  Avg invest subgraph edges         : "
          f"{np.mean(edge_counts):.2f}")
    print(f"  Total invest nodes                : "
          f"{sum(node_counts)}")
    print(f"  Label distribution: pos={n_pos} "
          f"neg={n_neg}")
    print(banner)


if __name__ == "__main__":
    # Run the dataset simulation when this file is executed as a script.
    simulate_dataset()

# Captured console output (appears to come from a run of simulate_dataset()):
#
# ============================================================
# SIMULATED DATASET
# ============================================================
#   Companies (industrial chain nodes): 200
#   Industrial chain edges            : 591
#   Quarters                          : 49
#   Financial indicators              : 18
#   Avg invest subgraph nodes         : 22.16
#   Avg invest subgraph edges         : 32.42
#   Total invest nodes                : 4432
#   Label distribution: pos=100 neg=100
# ============================================================