In [1]:
!pip install ucimlrepo
Collecting ucimlrepo

  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)

Requirement already satisfied: pandas>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from ucimlrepo) (2.3.3)

Requirement already satisfied: certifi>=2020.12.5 in /usr/local/lib/python3.12/dist-packages (from ucimlrepo) (2026.2.25)

Requirement already satisfied: numpy>=1.26.0 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2.4.6)

Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2.9.0.post0)

Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2025.2)

Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2026.1)

Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas>=1.0.0->ucimlrepo) (1.17.0)

Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)

Installing collected packages: ucimlrepo

Successfully installed ucimlrepo-0.0.7

In [2]:
"""
==========================================================================
GP_SSVM: Group Penalty Semi-Supervised Support Vector Machine
==========================================================================
复现论文: 基于半监督支持向量机的信用评分模型
作者: 陈耸
期刊: 中国管理科学, 2024, Vol.32, No.3
DOI: 10.16381/j.cnki.issn1003-207x.2021.2434

算法实现:
  1. S3VM (半监督支持向量机) - 基于对偶QP求解
  2. Group LASSO 特征选择 - 基于近端梯度下降
  3. GP_SSVM (Group Penalty S3VM) - S3VM + Group LASSO联合优化
  4. MAR缺失数据机制处理

数据集: UCI Statlog (German Credit Data)
来源: https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data
==========================================================================
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix)
from sklearn.impute import SimpleImputer
from ucimlrepo import fetch_ucirepo
import time
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)


# ================================================================
# 1. 数据加载
# ================================================================
def load_german_credit():
    """
    加载UCI Statlog (German Credit Data)数据集。

    数据来源: UCI Machine Learning Repository
    https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data
    原始来源: Hofmann, H. (1994). Statlog (German Credit Data).

    数据集说明:
    - 1000个样本, 20个特征 (7个数值型, 13个分类型)
    - 二分类: 好客户(0) vs 坏客户(1)
    - 类别分布: 好客户700(70%), 坏客户300(30%)
    """
    dataset = fetch_ucirepo(id=144)
    X = dataset.data.features.copy()
    y = dataset.data.targets.copy()

    # 分类特征和数值特征的索引
    cat_indices = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
    num_indices = [1, 4, 7, 10, 12, 15, 17]

    y = (y.iloc[:, 0] - 1).values.astype(float)

    X_encoded = X.copy()
    for col_idx in cat_indices:
        col_name = X.columns[col_idx]
        le = LabelEncoder()
        X_encoded[col_name] = le.fit_transform(X[col_name].astype(str))

    feature_names = X_encoded.columns.tolist()
    return X_encoded.values.astype(float), y, feature_names, cat_indices, num_indices


# ================================================================
# 2. 数据预处理
# ================================================================
def preprocess_data(X, cat_indices, num_indices):
    """
    数据预处理: 独热编码 + 标准化。
    返回特征矩阵和Group LASSO分组信息。
    """
    n_samples = X.shape[0]

    X_num = StandardScaler().fit_transform(X[:, num_indices].copy())

    X_cat_parts = []
    groups = []
    group_names = []
    idx = 0

    for feat_idx in num_indices:
        groups.append([idx])
        group_names.append(f"num_{feat_idx}")
        idx += 1

    for feat_idx in cat_indices:
        col = X[:, feat_idx].astype(int)
        n_cat = int(col.max()) + 1
        one_hot = np.zeros((n_samples, n_cat))
        for k in range(n_cat):
            one_hot[:, k] = (col == k).astype(float)
        X_cat_parts.append(one_hot)
        groups.append(list(range(idx, idx + n_cat)))
        group_names.append(f"cat_{feat_idx}")
        idx += n_cat

    return np.hstack([X_num] + X_cat_parts), groups, group_names


# ================================================================
# 3. MAR缺失数据
# ================================================================
def simulate_mar(X, missing_rate=0.2, seed=42):
    """模拟MAR缺失数据: 缺失概率依赖于其他观测变量。"""
    rng = np.random.RandomState(seed)
    X_missing = X.copy().astype(float)
    n, d = X.shape
    missing_mask = np.zeros_like(X_missing, dtype=bool)

    n_miss_feat = max(2, d // 5)
    miss_feats = rng.choice(d, n_miss_feat, replace=False)

    for feat in miss_feats:
        dep_feat = rng.choice([f for f in range(d) if f != feat])
        vals = X[:, dep_feat]
        prob = ((vals - vals.min()) / (vals.max() - vals.min() + 1e-10)) * missing_rate * 2
        mask = rng.random(n) < prob
        X_missing[mask, feat] = np.nan
        missing_mask[mask, feat] = True

    return X_missing, missing_mask


def impute_mean(X_missing):
    """均值填补。"""
    return SimpleImputer(strategy='mean').fit_transform(X_missing)


# ================================================================
# 4. S3VM 实现 (对偶QP求解)
# ================================================================
class S3VM:
    """
    半监督支持向量机 (Semi-Supervised SVM)

    论文公式(2):
    min 1/2 ||w||² + C_l Σ ξ_i + C_u Σ (ξ_j⁺ + ξ_j⁻)
    s.t. y_i(w'x_i+b) ≥ 1-ξ_i, w'x_j+b ≥ 1-ξ_j⁺, -(w'x_j+b) ≥ 1-ξ_j⁻

    对偶等价: 将无标签数据扩展为正/负两份, 用标准SVM对偶QP求解。
    """

    def __init__(self, C_l=1.0, C_u=0.5, kernel='rbf', gamma=0.02):
        self.C_l = C_l
        self.C_u = C_u
        self.kernel = kernel
        self.gamma = gamma
        self.alpha = None
        self.X_train = None
        self.y_train = None
        self.b = 0.0

    def fit(self, X_l, y_l, X_u):
        """
        训练S3VM。
        将无标签数据扩展为正/负两类, 构造标准SVM对偶QP。
        """
        from cvxopt import matrix, solvers
        solvers.options['show_progress'] = False

        n_l, n_u = X_l.shape[0], X_u.shape[0]
        y_l = np.where(y_l.ravel() == 0, -1.0, 1.0)

        # 扩展数据集
        X_ext = np.vstack([X_l, X_u, X_u])
        y_ext = np.concatenate([y_l, np.ones(n_u), -np.ones(n_u)])
        n = len(y_ext)

        C_vec = np.concatenate([np.full(n_l, self.C_l),
                                np.full(n_u, self.C_u),
                                np.full(n_u, self.C_u)])

        # 核矩阵
        if self.kernel == 'linear':
            K = X_ext @ X_ext.T
        else:
            from scipy.spatial.distance import cdist
            K = np.exp(-self.gamma * cdist(X_ext, X_ext, 'sqeuclidean'))

        # QP: min 1/2 α'Pα - 1'α, s.t. y'α=0, 0≤α≤C
        P = matrix(np.outer(y_ext, y_ext) * K)
        q = matrix(-np.ones(n))
        G = matrix(np.vstack([-np.eye(n), np.eye(n)]))
        h = matrix(np.concatenate([np.zeros(n), C_vec]))
        A = matrix(y_ext.reshape(1, -1))
        b = matrix(0.0)

        try:
            sol = solvers.qp(P, q, G, h, A, b)
            alpha = np.array(sol['x']).flatten()
        except Exception:
            self.alpha = np.zeros(n)
            self.X_train = X_ext
            self.y_train = y_ext
            return self

        alpha[alpha < 1e-8] = 0
        self.alpha = alpha
        self.X_train = X_ext
        self.y_train = y_ext

        # 计算b: 使用自由支持向量
        free_sv = (alpha > 1e-8) & (alpha < C_vec - 1e-8)
        if free_sv.sum() > 0:
            K_sv = K[free_sv]  # (n_free, n)
            decision = K_sv @ (alpha * y_ext)  # (n_free,)
            self.b = np.mean(y_ext[free_sv] - decision)
        else:
            nz = alpha > 1e-8
            if nz.sum() > 0:
                K_nz = K[nz]
                decision = K_nz @ (alpha * y_ext)
                self.b = np.mean(y_ext[nz] - decision)

        return self

    def decision_function(self, X):
        if self.alpha is None or self.alpha.sum() == 0:
            return np.zeros(X.shape[0])
        if self.kernel == 'linear':
            K = X @ self.X_train.T
        else:
            from scipy.spatial.distance import cdist
            K = np.exp(-self.gamma * cdist(X, self.X_train, 'sqeuclidean'))
        return K @ (self.alpha * self.y_train) + self.b

    def predict(self, X):
        return (self.decision_function(X) >= 0).astype(int)


# ================================================================
# 5. Group LASSO
# ================================================================
def group_lasso_proximal(w, groups, lam, lr):
    """Group LASSO近端算子 (分组软阈值)。"""
    w_new = w.copy()
    for group in groups:
        w_g = w[group]
        norm_g = np.linalg.norm(w_g)
        if norm_g > 1e-10:
            scale = max(0, 1 - lr * lam / norm_g)
            w_new[group] = scale * w_g
        else:
            w_new[group] = 0.0
    return w_new


# ================================================================
# 6. GP_SSVM (Group Penalty S3VM) - 近端梯度下降求解
# ================================================================
class GP_SSVM:
    """
    GP_SSVM: 带Group LASSO惩罚的半监督SVM

    优化问题 (论文公式8):
    min 1/2||w||² + λΣ_g||w_g||₂ + C_lΣmax(0,1-y_i·f_i) + C_uΣ[max(0,1-f_j)+max(0,1+f_j)]

    求解: 近端梯度下降法
    - 梯度下降处理SVM铰链损失
    - 近端步处理Group LASSO非光滑惩罚
    """

    def __init__(self, C_l=1.0, C_u=0.5, lambda_g=0.01,
                 max_iter=500, lr=0.0005):
        self.C_l = C_l
        self.C_u = C_u
        self.lambda_g = lambda_g
        self.max_iter = max_iter
        self.lr = lr
        self.w = None
        self.b = 0.0
        self.groups = None
        self.loss_history = []

    def fit(self, X_l, y_l, X_u):
        """
        训练GP_SSVM。

        使用近端梯度下降直接优化公式(8)的目标函数,
        无需迭代伪标签——无标签数据通过S3VM损失项直接参与优化。
        """
        n_features = X_l.shape[1]
        y_l = y_l.ravel()
        # 转换为 -1/+1 标签
        y_svm = np.where(y_l == 0, -1.0, 1.0)

        if self.groups is None:
            self.groups = [[i] for i in range(n_features)]

        # 初始化: 用有标签数据训练
        init = LinearSVC(C=self.C_l, max_iter=5000, random_state=42)
        init.fit(X_l, y_l)
        self.w = init.coef_.flatten()
        self.b = init.intercept_[0]

        # 合并数据
        X_all = np.vstack([X_l, X_u])
        n_l = X_l.shape[0]
        n_u = X_u.shape[0]

        # 有标签/无标签掩码
        labeled_mask = np.zeros(len(X_all), dtype=bool)
        labeled_mask[:n_l] = True
        unlabeled_mask = ~labeled_mask

        # 近端梯度下降
        for it in range(self.max_iter):
            f = X_all @ self.w + self.b

            # --- 有标签数据梯度 (铰链损失) ---
            f_l = f[:n_l]
            margins = y_svm * f_l
            violated = margins < 1.0
            loss_l = self.C_l * np.sum(np.maximum(0, 1 - margins))
            gw_l = np.zeros(n_features)
            gb_l = 0.0
            if violated.any():
                gw_l = -self.C_l * (X_l[violated].T @ y_svm[violated])
                gb_l = -self.C_l * np.sum(y_svm[violated])

            # --- 无标签数据梯度 (S3VM损失) ---
            f_u = f[n_l:]
            loss_u = self.C_u * np.sum(np.maximum(0, 1-f_u) + np.maximum(0, 1+f_u))
            grad_pos = -np.where(f_u < 1, 1.0, 0.0)
            grad_neg = np.where(f_u > -1, 1.0, 0.0)
            gw_u = self.C_u * (X_u.T @ (grad_pos + grad_neg))
            gb_u = self.C_u * np.sum(grad_pos + grad_neg)

            # 总损失
            loss = 0.5 * np.dot(self.w, self.w) + loss_l + loss_u
            self.loss_history.append(loss)

            # 梯度步
            self.w -= self.lr * (self.w + gw_l + gw_u)
            self.b -= self.lr * (gb_l + gb_u)

            # 近端步 (Group LASSO)
            self.w = group_lasso_proximal(self.w, self.groups,
                                          self.lambda_g, self.lr)

        return self

    def decision_function(self, X):
        return X @ self.w + self.b

    def predict(self, X):
        return (self.decision_function(X) >= 0).astype(int)

    def get_selected_features(self, group_names=None):
        """获取被Group LASSO选中的特征组。"""
        selected = []
        for i, group in enumerate(self.groups):
            norm = np.linalg.norm(self.w[group])
            if norm > 1e-6:
                name = group_names[i] if group_names else f"group_{i}"
                selected.append((name, norm))
        return selected


# ================================================================
# 7. 评估函数
# ================================================================
def evaluate_model(y_true, y_pred, y_score=None):
    """计算分类性能指标。"""
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape == (2, 2):
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    else:
        specificity = 0
    auc = 0.0
    if y_score is not None:
        try:
            auc = roc_auc_score(y_true, y_score)
        except ValueError:
            pass
    return {'Accuracy': acc, 'Precision': prec, 'Recall': rec,
            'Specificity': specificity, 'F1': f1, 'AUC': auc}


# ================================================================
# 8. 主实验
# ================================================================
def run_experiments():
    """运行完整实验, 复现论文主要结果。"""
    print("=" * 72)
    print("  GP_SSVM 信用评分模型实验")
    print("  复现论文: 基于半监督支持向量机的信用评分模型 (陈耸, 2024)")
    print("=" * 72)

    # ---- 数据加载 ----
    print("\n[1] 加载UCI German Credit Data...")
    X, y, feature_names, cat_indices, num_indices = load_german_credit()
    print(f"    数据集: {X.shape[0]} 样本, {X.shape[1]} 特征")
    print(f"    类别: 好客户={int(np.sum(y==0))}, 坏客户={int(np.sum(y==1))}")
    print(f"    来源: UCI ML Repository - Statlog (German Credit Data)")
    print(f"    网址: https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data")

    # ---- 预处理 ----
    print("\n[2] 数据预处理 (独热编码+标准化)...")
    X_proc, groups, group_names = preprocess_data(X, cat_indices, num_indices)
    print(f"    编码后特征数: {X_proc.shape[1]}, 分组数: {len(groups)}")

    # ---- MAR缺失 ----
    print("\n[3] 模拟MAR缺失数据 (缺失率~20%)...")
    X_missing, mask = simulate_mar(X_proc, 0.2)
    print(f"    实际缺失率: {mask.sum()/X_proc.size:.2%}")
    X_imputed = impute_mean(X_missing)

    # ---- 交叉验证 ----
    n_folds = 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    labeled_ratio = 0.1

    results = {
        'SVM_full': [], 'S3VM_full': [], 'GP_SSVM_full': [],
        'SVM_mar': [], 'S3VM_mar': [], 'GP_SSVM_mar': [],
    }

    print(f"\n[4] {n_folds}折交叉验证 (有标签比例={labeled_ratio:.0%})")
    print(f"    S3VM: C_l=1.0, C_u=0.5, kernel=rbf, gamma=0.02")
    print(f"    GP_SSVM: C_l=1.0, C_u=0.5, λ=0.01, 近端梯度下降")
    print("-" * 72)

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_proc, y)):
        print(f"\n  === 第 {fold+1}/{n_folds} 折 ===")

        X_tr = X_proc[train_idx]
        y_tr = y[train_idx]
        X_te = X_proc[test_idx]
        y_te = y[test_idx]

        # 分层采样有标签子集
        rng = np.random.RandomState(42 + fold)
        labeled_idx = []
        for cls in [0, 1]:
            cls_idx = np.where(y_tr == cls)[0]
            n_cls = max(int(len(cls_idx) * labeled_ratio), 3)
            labeled_idx.extend(rng.choice(cls_idx, min(n_cls, len(cls_idx)), replace=False))
        labeled_idx = np.array(labeled_idx)
        unlabeled_idx = np.array([i for i in range(len(y_tr)) if i not in labeled_idx])

        X_l, y_l = X_tr[labeled_idx], y_tr[labeled_idx]
        X_u = X_tr[unlabeled_idx]

        print(f"    训练:{len(y_tr)} (标签:{len(labeled_idx)}, 无标签:{len(unlabeled_idx)})")

        # ---- [A] 完整数据 ----
        print("    [A] 完整数据:")

        # (1) 标准SVM
        svm = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
        svm.fit(X_l, y_l.ravel())
        r = evaluate_model(y_te, svm.predict(X_te), svm.predict_proba(X_te)[:, 1])
        results['SVM_full'].append(r)
        print(f"      SVM:     Acc={r['Accuracy']:.4f}  F1={r['F1']:.4f}  AUC={r['AUC']:.4f}")

        # (2) S3VM
        s3vm = S3VM(C_l=1.0, C_u=0.5, kernel='rbf', gamma=0.02)
        s3vm.fit(X_l, y_l, X_u)
        r = evaluate_model(y_te, s3vm.predict(X_te), s3vm.decision_function(X_te))
        results['S3VM_full'].append(r)
        print(f"      S3VM:    Acc={r['Accuracy']:.4f}  F1={r['F1']:.4f}  AUC={r['AUC']:.4f}")

        # (3) GP_SSVM
        gp = GP_SSVM(C_l=1.0, C_u=0.5, lambda_g=0.01, max_iter=500, lr=0.0005)
        gp.groups = groups
        gp.fit(X_l, y_l, X_u)
        r = evaluate_model(y_te, gp.predict(X_te), gp.decision_function(X_te))
        results['GP_SSVM_full'].append(r)
        print(f"      GP_SSVM: Acc={r['Accuracy']:.4f}  F1={r['F1']:.4f}  AUC={r['AUC']:.4f}")

        # ---- [B] MAR缺失数据 ----
        print("    [B] MAR缺失数据:")
        X_tr_m, _ = simulate_mar(X_tr, 0.2, seed=42+fold)
        X_tr_imp = impute_mean(X_tr_m)
        X_te_m, _ = simulate_mar(X_te, 0.2, seed=42+fold)
        X_te_imp = impute_mean(X_te_m)

        X_l_m, X_u_m = X_tr_imp[labeled_idx], X_tr_imp[unlabeled_idx]

        svm_m = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
        svm_m.fit(X_l_m, y_l.ravel())
        r = evaluate_model(y_te, svm_m.predict(X_te_imp), svm_m.predict_proba(X_te_imp)[:, 1])
        results['SVM_mar'].append(r)
        print(f"      SVM:     Acc={r['Accuracy']:.4f}  F1={r['F1']:.4f}  AUC={r['AUC']:.4f}")

        s3vm_m = S3VM(C_l=1.0, C_u=0.5, kernel='rbf', gamma=0.02)
        s3vm_m.fit(X_l_m, y_l, X_u_m)
        r = evaluate_model(y_te, s3vm_m.predict(X_te_imp), s3vm_m.decision_function(X_te_imp))
        results['S3VM_mar'].append(r)
        print(f"      S3VM:    Acc={r['Accuracy']:.4f}  F1={r['F1']:.4f}  AUC={r['AUC']:.4f}")

        gp_m = GP_SSVM(C_l=1.0, C_u=0.5, lambda_g=0.01, max_iter=500, lr=0.0005)
        gp_m.groups = groups
        gp_m.fit(X_l_m, y_l, X_u_m)
        r = evaluate_model(y_te, gp_m.predict(X_te_imp), gp_m.decision_function(X_te_imp))
        results['GP_SSVM_mar'].append(r)
        print(f"      GP_SSVM: Acc={r['Accuracy']:.4f}  F1={r['F1']:.4f}  AUC={r['AUC']:.4f}")

    # ========== 结果汇总 ==========
    print("\n" + "=" * 72)
    print("  实验结果汇总 (5折交叉验证平均值)")
    print("=" * 72)

    metrics = ['Accuracy', 'Precision', 'Recall', 'Specificity', 'F1', 'AUC']
    names = {
        'SVM_full': '标准SVM (完整)', 'S3VM_full': 'S3VM (完整)',
        'GP_SSVM_full': 'GP_SSVM (完整)',
        'SVM_mar': '标准SVM (MAR缺失)', 'S3VM_mar': 'S3VM (MAR缺失)',
        'GP_SSVM_mar': 'GP_SSVM (MAR缺失)',
    }

    header = f"{'模型':<22}" + "".join(f"{m:>10}" for m in metrics)
    print(header)
    print("-" * 82)

    summary = {}
    for key, rlist in results.items():
        if rlist:
            avg = {m: np.mean([r[m] for r in rlist]) for m in metrics}
            std = {m: np.std([r[m] for r in rlist]) for m in metrics}
            summary[key] = (avg, std)
            line = f"{names[key]:<22}" + "".join(f"{avg[m]:>10.4f}" for m in metrics)
            print(line)

    print("-" * 82)

    # 详细结果
    print("\n详细结果 (均值 ± 标准差):")
    for key, (avg, std) in summary.items():
        print(f"\n  {names[key]}:")
        for m in metrics:
            print(f"    {m:>12}: {avg[m]:.4f} ± {std[m]:.4f}")

    # ========== Group LASSO 分析 ==========
    print("\n" + "=" * 72)
    print("  Group LASSO 特征选择分析")
    print("=" * 72)

    X_tr, X_te, y_tr, y_te = train_test_split(
        X_proc, y, test_size=0.2, random_state=42, stratify=y)
    rng = np.random.RandomState(42)
    n_l = int(len(y_tr) * 0.1)
    l_idx = rng.choice(len(y_tr), n_l, replace=False)
    u_idx = np.array([i for i in range(len(y_tr)) if i not in l_idx])

    gp_final = GP_SSVM(C_l=1.0, C_u=0.5, lambda_g=0.01, max_iter=500, lr=0.0005)
    gp_final.groups = groups
    gp_final.fit(X_tr[l_idx], y_tr[l_idx], X_tr[u_idx])

    selected = gp_final.get_selected_features(group_names)
    print(f"\n  总特征组数: {len(groups)}")
    print(f"  选中特征组数: {len(selected)} ({len(selected)/len(groups):.1%})")

    # 特征重要性
    print(f"\n  特征重要性 (权重L2范数排序):")
    for i, (name, norm) in enumerate(sorted(selected, key=lambda x: -x[1])):
        # 映射回原始特征名
        if name.startswith("cat_") or name.startswith("num_"):
            idx = int(name.split("_")[1])
            orig = feature_names[idx] if idx < len(feature_names) else name
        else:
            orig = name
        print(f"    {i+1:2d}. {orig:<30} L2范数: {norm:.4f}")

    # 权重统计
    print(f"\n  权重统计:")
    print(f"    非零权重: {int(np.sum(np.abs(gp_final.w) > 1e-6))}/{len(gp_final.w)}")
    print(f"    L1范数: {np.sum(np.abs(gp_final.w)):.4f}")
    print(f"    L2范数: {np.linalg.norm(gp_final.w):.4f}")

    # 收敛曲线
    print(f"\n  优化收敛曲线 (前10次/每50次):")
    for i in range(0, len(gp_final.loss_history), 50):
        print(f"    iter {i:4d}: loss = {gp_final.loss_history[i]:.2f}")
    print(f"    iter {len(gp_final.loss_history)-1:4d}: loss = {gp_final.loss_history[-1]:.2f}")

    # 最终测试集性能
    y_pred = gp_final.predict(X_te)
    y_score = gp_final.decision_function(X_te)
    final = evaluate_model(y_te, y_pred, y_score)
    print(f"\n  最终GP_SSVM测试集性能:")
    for m, v in final.items():
        print(f"    {m:>12}: {v:.4f}")

    print("\n" + "=" * 72)
    print("  实验完成!")
    print("=" * 72)

    return results, summary


if __name__ == '__main__':
    t0 = time.time()
    results, summary = run_experiments()
========================================================================
  GP_SSVM 信用评分模型实验
  复现论文: 基于半监督支持向量机的信用评分模型 (陈耸, 2024)
========================================================================

[1] 加载UCI German Credit Data...
    数据集: 1000 样本, 20 特征
    类别: 好客户=700, 坏客户=300
    来源: UCI ML Repository - Statlog (German Credit Data)
    网址: https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data

[2] 数据预处理 (独热编码+标准化)...
    编码后特征数: 61, 分组数: 20

[3] 模拟MAR缺失数据 (缺失率~20%)...
    实际缺失率: 1.90%

[4] 5折交叉验证 (有标签比例=10%)
    S3VM: C_l=1.0, C_u=0.5, kernel=rbf, gamma=0.02
    GP_SSVM: C_l=1.0, C_u=0.5, λ=0.01, 近端梯度下降
------------------------------------------------------------------------

  === 第 1/5 折 ===
    训练:800 (标签:80, 无标签:720)
    [A] 完整数据:
      SVM:     Acc=0.6900  F1=0.0606  AUC=0.6802
      S3VM:    Acc=0.6500  F1=0.4531  AUC=0.6776
      GP_SSVM: Acc=0.6900  F1=0.4151  AUC=0.6577
    [B] MAR缺失数据:
      SVM:     Acc=0.6950  F1=0.0896  AUC=0.6651
      S3VM:    Acc=0.7000  F1=0.0000  AUC=0.6590
      GP_SSVM: Acc=0.6700  F1=0.3529  AUC=0.6429

  === 第 2/5 折 ===
    训练:800 (标签:80, 无标签:720)
    [A] 完整数据:
      SVM:     Acc=0.6850  F1=0.3226  AUC=0.6654
      S3VM:    Acc=0.6900  F1=0.0882  AUC=0.6292
      GP_SSVM: Acc=0.7050  F1=0.3371  AUC=0.6321
    [B] MAR缺失数据:
      SVM:     Acc=0.6850  F1=0.3226  AUC=0.6744
      S3VM:    Acc=0.6600  F1=0.4848  AUC=0.6408
      GP_SSVM: Acc=0.7050  F1=0.3059  AUC=0.6500

  === 第 3/5 折 ===
    训练:800 (标签:80, 无标签:720)
    [A] 完整数据:
      SVM:     Acc=0.6900  F1=0.0000  AUC=0.6636
      S3VM:    Acc=0.6350  F1=0.4252  AUC=0.6479
      GP_SSVM: Acc=0.6900  F1=0.2619  AUC=0.6318
    [B] MAR缺失数据:
      SVM:     Acc=0.6850  F1=0.0000  AUC=0.6640
      S3VM:    Acc=0.6550  F1=0.4298  AUC=0.6440
      GP_SSVM: Acc=0.6850  F1=0.3505  AUC=0.6110

  === 第 4/5 折 ===
    训练:800 (标签:80, 无标签:720)
    [A] 完整数据:
      SVM:     Acc=0.7300  F1=0.1818  AUC=0.2706
      S3VM:    Acc=0.7000  F1=0.0000  AUC=0.7279
      GP_SSVM: Acc=0.7200  F1=0.2222  AUC=0.6739
    [B] MAR缺失数据:
      SVM:     Acc=0.7300  F1=0.1818  AUC=0.2514
      S3VM:    Acc=0.7000  F1=0.0000  AUC=0.7444
      GP_SSVM: Acc=0.7050  F1=0.2532  AUC=0.6318

  === 第 5/5 折 ===
    训练:800 (标签:80, 无标签:720)
    [A] 完整数据:
      SVM:     Acc=0.7050  F1=0.0923  AUC=0.7555
      S3VM:    Acc=0.7000  F1=0.0000  AUC=0.7590
      GP_SSVM: Acc=0.7150  F1=0.2785  AUC=0.7619
    [B] MAR缺失数据:
      SVM:     Acc=0.7050  F1=0.0923  AUC=0.7682
      S3VM:    Acc=0.7150  F1=0.5366  AUC=0.7656
      GP_SSVM: Acc=0.7100  F1=0.2564  AUC=0.7294

========================================================================
  实验结果汇总 (5折交叉验证平均值)
========================================================================
模型                      Accuracy Precision    RecallSpecificity        F1       AUC
----------------------------------------------------------------------------------
标准SVM (完整)                0.7000    0.4776    0.0867    0.9629    0.1315    0.6071
S3VM (完整)                 0.6750    0.2409    0.1967    0.8800    0.1933    0.6883
GP_SSVM (完整)              0.7040    0.5399    0.2233    0.9100    0.3030    0.6715
标准SVM (MAR缺失)             0.7000    0.4966    0.0900    0.9614    0.1373    0.6046
S3VM (MAR缺失)              0.6860    0.2789    0.3033    0.8500    0.2902    0.6908
GP_SSVM (MAR缺失)           0.6950    0.4980    0.2267    0.8957    0.3038    0.6530
----------------------------------------------------------------------------------

详细结果 (均值 ± 标准差):

  标准SVM (完整):
        Accuracy: 0.7000 ± 0.0164
       Precision: 0.4776 ± 0.3278
          Recall: 0.0867 ± 0.0878
     Specificity: 0.9629 ± 0.0466
              F1: 0.1315 ± 0.1122
             AUC: 0.6071 ± 0.1716

  S3VM (完整):
        Accuracy: 0.6750 ± 0.0272
       Precision: 0.2409 ± 0.1974
          Recall: 0.1967 ± 0.2215
     Specificity: 0.8800 ± 0.1330
              F1: 0.1933 ± 0.2035
             AUC: 0.6883 ± 0.0486

  GP_SSVM (完整):
        Accuracy: 0.7040 ± 0.0124
       Precision: 0.5399 ± 0.0756
          Recall: 0.2233 ± 0.0807
     Specificity: 0.9100 ± 0.0481
              F1: 0.3030 ± 0.0671
             AUC: 0.6715 ± 0.0479

  标准SVM (MAR缺失):
        Accuracy: 0.7000 ± 0.0167
       Precision: 0.4966 ± 0.3216
          Recall: 0.0900 ± 0.0860
     Specificity: 0.9614 ± 0.0460
              F1: 0.1373 ± 0.1091
             AUC: 0.6046 ± 0.1809

  S3VM (MAR缺失):
        Accuracy: 0.6860 ± 0.0240
       Precision: 0.2789 ± 0.2301
          Recall: 0.3033 ± 0.2509
     Specificity: 0.8500 ± 0.1245
              F1: 0.2902 ± 0.2394
             AUC: 0.6908 ± 0.0532

  GP_SSVM (MAR缺失):
        Accuracy: 0.6950 ± 0.0152
       Precision: 0.4980 ± 0.0467
          Recall: 0.2267 ± 0.0564
     Specificity: 0.8957 ± 0.0451
              F1: 0.3038 ± 0.0434
             AUC: 0.6530 ± 0.0404

========================================================================
  Group LASSO 特征选择分析
========================================================================

  总特征组数: 20
  选中特征组数: 20 (100.0%)

  特征重要性 (权重L2范数排序):
     1. Attribute4                     L2范数: 0.6232
     2. Attribute1                     L2范数: 0.5616
     3. Attribute7                     L2范数: 0.5394
     4. Attribute6                     L2范数: 0.4730
     5. Attribute14                    L2范数: 0.4212
     6. Attribute9                     L2范数: 0.4052
     7. Attribute3                     L2范数: 0.3461
     8. Attribute10                    L2范数: 0.3142
     9. Attribute12                    L2范数: 0.2074
    10. Attribute17                    L2范数: 0.1771
    11. Attribute15                    L2范数: 0.1569
    12. Attribute8                     L2范数: 0.1401
    13. Attribute5                     L2范数: 0.1398
    14. Attribute20                    L2范数: 0.1393
    15. Attribute18                    L2范数: 0.1152
    16. Attribute2                     L2范数: 0.0821
    17. Attribute19                    L2范数: 0.0750
    18. Attribute16                    L2范数: 0.0495
    19. Attribute11                    L2范数: 0.0226
    20. Attribute13                    L2范数: 0.0079

  权重统计:
    非零权重: 61/61
    L1范数: 9.1578
    L2范数: 1.4017

  优化收敛曲线 (前10次/每50次):
    iter    0: loss = 963.91
    iter   50: loss = 773.96
    iter  100: loss = 766.89
    iter  150: loss = 764.92
    iter  200: loss = 763.65
    iter  250: loss = 762.84
    iter  300: loss = 762.33
    iter  350: loss = 761.89
    iter  400: loss = 761.74
    iter  450: loss = 761.59
    iter  499: loss = 761.34

  最终GP_SSVM测试集性能:
        Accuracy: 0.6750
       Precision: 0.4675
          Recall: 0.6000
     Specificity: 0.7071
              F1: 0.5255
             AUC: 0.7052

========================================================================
  实验完成!
========================================================================