In [1]:
!pip install ucimlrepo
Collecting ucimlrepo Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB) Requirement already satisfied: pandas>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from ucimlrepo) (2.3.3) Requirement already satisfied: certifi>=2020.12.5 in /usr/local/lib/python3.12/dist-packages (from ucimlrepo) (2026.2.25) Requirement already satisfied: numpy>=1.26.0 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2.4.6) Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2025.2) Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas>=1.0.0->ucimlrepo) (2026.1) Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas>=1.0.0->ucimlrepo) (1.17.0) Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB) Installing collected packages: ucimlrepo Successfully installed ucimlrepo-0.0.7
In [2]:
"""
==========================================================================
GP_SSVM: Group Penalty Semi-Supervised Support Vector Machine
==========================================================================
复现论文: 基于半监督支持向量机的信用评分模型
作者: 陈耸
期刊: 中国管理科学, 2024, Vol.32, No.3
DOI: 10.16381/j.cnki.issn1003-207x.2021.2434
算法实现:
1. S3VM (半监督支持向量机) - 基于对偶QP求解
2. Group LASSO 特征选择 - 基于近端梯度下降
3. GP_SSVM (Group Penalty S3VM) - S3VM + Group LASSO联合优化
4. MAR缺失数据机制处理
数据集: UCI Statlog (German Credit Data)
来源: https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data
==========================================================================
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, roc_auc_score, confusion_matrix)
from sklearn.impute import SimpleImputer
from ucimlrepo import fetch_ucirepo
import time
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
# ================================================================
# 1. 数据加载
# ================================================================
def load_german_credit():
"""
加载UCI Statlog (German Credit Data)数据集。
数据来源: UCI Machine Learning Repository
https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data
原始来源: Hofmann, H. (1994). Statlog (German Credit Data).
数据集说明:
- 1000个样本, 20个特征 (7个数值型, 13个分类型)
- 二分类: 好客户(0) vs 坏客户(1)
- 类别分布: 好客户700(70%), 坏客户300(30%)
"""
dataset = fetch_ucirepo(id=144)
X = dataset.data.features.copy()
y = dataset.data.targets.copy()
# 分类特征和数值特征的索引
cat_indices = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
num_indices = [1, 4, 7, 10, 12, 15, 17]
y = (y.iloc[:, 0] - 1).values.astype(float)
X_encoded = X.copy()
for col_idx in cat_indices:
col_name = X.columns[col_idx]
le = LabelEncoder()
X_encoded[col_name] = le.fit_transform(X[col_name].astype(str))
feature_names = X_encoded.columns.tolist()
return X_encoded.values.astype(float), y, feature_names, cat_indices, num_indices
# ================================================================
# 2. 数据预处理
# ================================================================
def preprocess_data(X, cat_indices, num_indices):
"""
数据预处理: 独热编码 + 标准化。
返回特征矩阵和Group LASSO分组信息。
"""
n_samples = X.shape[0]
X_num = StandardScaler().fit_transform(X[:, num_indices].copy())
X_cat_parts = []
groups = []
group_names = []
idx = 0
for feat_idx in num_indices:
groups.append([idx])
group_names.append(f"num_{feat_idx}")
idx += 1
for feat_idx in cat_indices:
col = X[:, feat_idx].astype(int)
n_cat = int(col.max()) + 1
one_hot = np.zeros((n_samples, n_cat))
for k in range(n_cat):
one_hot[:, k] = (col == k).astype(float)
X_cat_parts.append(one_hot)
groups.append(list(range(idx, idx + n_cat)))
group_names.append(f"cat_{feat_idx}")
idx += n_cat
return np.hstack([X_num] + X_cat_parts), groups, group_names
# ================================================================
# 3. MAR缺失数据
# ================================================================
def simulate_mar(X, missing_rate=0.2, seed=42):
"""模拟MAR缺失数据: 缺失概率依赖于其他观测变量。"""
rng = np.random.RandomState(seed)
X_missing = X.copy().astype(float)
n, d = X.shape
missing_mask = np.zeros_like(X_missing, dtype=bool)
n_miss_feat = max(2, d // 5)
miss_feats = rng.choice(d, n_miss_feat, replace=False)
for feat in miss_feats:
dep_feat = rng.choice([f for f in range(d) if f != feat])
vals = X[:, dep_feat]
prob = ((vals - vals.min()) / (vals.max() - vals.min() + 1e-10)) * missing_rate * 2
mask = rng.random(n) < prob
X_missing[mask, feat] = np.nan
missing_mask[mask, feat] = True
return X_missing, missing_mask
def impute_mean(X_missing):
"""均值填补。"""
return SimpleImputer(strategy='mean').fit_transform(X_missing)
# ================================================================
# 4. S3VM 实现 (对偶QP求解)
# ================================================================
class S3VM:
"""
半监督支持向量机 (Semi-Supervised SVM)
论文公式(2):
min 1/2 ||w||² + C_l Σ ξ_i + C_u Σ (ξ_j⁺ + ξ_j⁻)
s.t. y_i(w'x_i+b) ≥ 1-ξ_i, w'x_j+b ≥ 1-ξ_j⁺, -(w'x_j+b) ≥ 1-ξ_j⁻
对偶等价: 将无标签数据扩展为正/负两份, 用标准SVM对偶QP求解。
"""
def __init__(self, C_l=1.0, C_u=0.5, kernel='rbf', gamma=0.02):
self.C_l = C_l
self.C_u = C_u
self.kernel = kernel
self.gamma = gamma
self.alpha = None
self.X_train = None
self.y_train = None
self.b = 0.0
def fit(self, X_l, y_l, X_u):
"""
训练S3VM。
将无标签数据扩展为正/负两类, 构造标准SVM对偶QP。
"""
from cvxopt import matrix, solvers
solvers.options['show_progress'] = False
n_l, n_u = X_l.shape[0], X_u.shape[0]
y_l = np.where(y_l.ravel() == 0, -1.0, 1.0)
# 扩展数据集
X_ext = np.vstack([X_l, X_u, X_u])
y_ext = np.concatenate([y_l, np.ones(n_u), -np.ones(n_u)])
n = len(y_ext)
C_vec = np.concatenate([np.full(n_l, self.C_l),
np.full(n_u, self.C_u),
np.full(n_u, self.C_u)])
# 核矩阵
if self.kernel == 'linear':
K = X_ext @ X_ext.T
else:
from scipy.spatial.distance import cdist
K = np.exp(-self.gamma * cdist(X_ext, X_ext, 'sqeuclidean'))
# QP: min 1/2 α'Pα - 1'α, s.t. y'α=0, 0≤α≤C
P = matrix(np.outer(y_ext, y_ext) * K)
q = matrix(-np.ones(n))
G = matrix(np.vstack([-np.eye(n), np.eye(n)]))
h = matrix(np.concatenate([np.zeros(n), C_vec]))
A = matrix(y_ext.reshape(1, -1))
b = matrix(0.0)
try:
sol = solvers.qp(P, q, G, h, A, b)
alpha = np.array(sol['x']).flatten()
except Exception:
self.alpha = np.zeros(n)
self.X_train = X_ext
self.y_train = y_ext
return self
alpha[alpha < 1e-8] = 0
self.alpha = alpha
self.X_train = X_ext
self.y_train = y_ext
# 计算b: 使用自由支持向量
free_sv = (alpha > 1e-8) & (alpha < C_vec - 1e-8)
if free_sv.sum() > 0:
K_sv = K[free_sv] # (n_free, n)
decision = K_sv @ (alpha * y_ext) # (n_free,)
self.b = np.mean(y_ext[free_sv] - decision)
else:
nz = alpha > 1e-8
if nz.sum() > 0:
K_nz = K[nz]
decision = K_nz @ (alpha * y_ext)
self.b = np.mean(y_ext[nz] - decision)
return self
def decision_function(self, X):
if self.alpha is None or self.alpha.sum() == 0:
return np.zeros(X.shape[0])
if self.kernel == 'linear':
K = X @ self.X_train.T
else:
from scipy.spatial.distance import cdist
K = np.exp(-self.gamma * cdist(X, self.X_train, 'sqeuclidean'))
return K @ (self.alpha * self.y_train) + self.b
def predict(self, X):
return (self.decision_function(X) >= 0).astype(int)
# ================================================================
# 5. Group LASSO
# ================================================================
def group_lasso_proximal(w, groups, lam, lr):
"""Group LASSO近端算子 (分组软阈值)。"""
w_new = w.copy()
for group in groups:
w_g = w[group]
norm_g = np.linalg.norm(w_g)
if norm_g > 1e-10:
scale = max(0, 1 - lr * lam / norm_g)
w_new[group] = scale * w_g
else:
w_new[group] = 0.0
return w_new
# ================================================================
# 6. GP_SSVM (Group Penalty S3VM) - 近端梯度下降求解
# ================================================================
class GP_SSVM:
"""
GP_SSVM: 带Group LASSO惩罚的半监督SVM
优化问题 (论文公式8):
min 1/2||w||² + λΣ_g||w_g||₂ + C_lΣmax(0,1-y_i·f_i) + C_uΣ[max(0,1-f_j)+max(0,1+f_j)]
求解: 近端梯度下降法
- 梯度下降处理SVM铰链损失
- 近端步处理Group LASSO非光滑惩罚
"""
def __init__(self, C_l=1.0, C_u=0.5, lambda_g=0.01,
max_iter=500, lr=0.0005):
self.C_l = C_l
self.C_u = C_u
self.lambda_g = lambda_g
self.max_iter = max_iter
self.lr = lr
self.w = None
self.b = 0.0
self.groups = None
self.loss_history = []
def fit(self, X_l, y_l, X_u):
"""
训练GP_SSVM。
使用近端梯度下降直接优化公式(8)的目标函数,
无需迭代伪标签——无标签数据通过S3VM损失项直接参与优化。
"""
n_features = X_l.shape[1]
y_l = y_l.ravel()
# 转换为 -1/+1 标签
y_svm = np.where(y_l == 0, -1.0, 1.0)
if self.groups is None:
self.groups = [[i] for i in range(n_features)]
# 初始化: 用有标签数据训练
init = LinearSVC(C=self.C_l, max_iter=5000, random_state=42)
init.fit(X_l, y_l)
self.w = init.coef_.flatten()
self.b = init.intercept_[0]
# 合并数据
X_all = np.vstack([X_l, X_u])
n_l = X_l.shape[0]
n_u = X_u.shape[0]
# 有标签/无标签掩码
labeled_mask = np.zeros(len(X_all), dtype=bool)
labeled_mask[:n_l] = True
unlabeled_mask = ~labeled_mask
# 近端梯度下降
for it in range(self.max_iter):
f = X_all @ self.w + self.b
# --- 有标签数据梯度 (铰链损失) ---
f_l = f[:n_l]
margins = y_svm * f_l
violated = margins < 1.0
loss_l = self.C_l * np.sum(np.maximum(0, 1 - margins))
gw_l = np.zeros(n_features)
gb_l = 0.0
if violated.any():
gw_l = -self.C_l * (X_l[violated].T @ y_svm[violated])
gb_l = -self.C_l * np.sum(y_svm[violated])
# --- 无标签数据梯度 (S3VM损失) ---
f_u = f[n_l:]
loss_u = self.C_u * np.sum(np.maximum(0, 1-f_u) + np.maximum(0, 1+f_u))
grad_pos = -np.where(f_u < 1, 1.0, 0.0)
grad_neg = np.where(f_u > -1, 1.0, 0.0)
gw_u = self.C_u * (X_u.T @ (grad_pos + grad_neg))
gb_u = self.C_u * np.sum(grad_pos + grad_neg)
# 总损失
loss = 0.5 * np.dot(self.w, self.w) + loss_l + loss_u
self.loss_history.append(loss)
# 梯度步
self.w -= self.lr * (self.w + gw_l + gw_u)
self.b -= self.lr * (gb_l + gb_u)
# 近端步 (Group LASSO)
self.w = group_lasso_proximal(self.w, self.groups,
self.lambda_g, self.lr)
return self
def decision_function(self, X):
return X @ self.w + self.b
def predict(self, X):
return (self.decision_function(X) >= 0).astype(int)
def get_selected_features(self, group_names=None):
"""获取被Group LASSO选中的特征组。"""
selected = []
for i, group in enumerate(self.groups):
norm = np.linalg.norm(self.w[group])
if norm > 1e-6:
name = group_names[i] if group_names else f"group_{i}"
selected.append((name, norm))
return selected
# ================================================================
# 7. 评估函数
# ================================================================
def evaluate_model(y_true, y_pred, y_score=None):
"""计算分类性能指标。"""
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
rec = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
cm = confusion_matrix(y_true, y_pred)
if cm.shape == (2, 2):
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
else:
specificity = 0
auc = 0.0
if y_score is not None:
try:
auc = roc_auc_score(y_true, y_score)
except ValueError:
pass
return {'Accuracy': acc, 'Precision': prec, 'Recall': rec,
'Specificity': specificity, 'F1': f1, 'AUC': auc}
# ================================================================
# 8. 主实验
# ================================================================
def run_experiments():
"""运行完整实验, 复现论文主要结果。"""
print("=" * 72)
print(" GP_SSVM 信用评分模型实验")
print(" 复现论文: 基于半监督支持向量机的信用评分模型 (陈耸, 2024)")
print("=" * 72)
# ---- 数据加载 ----
print("\n[1] 加载UCI German Credit Data...")
X, y, feature_names, cat_indices, num_indices = load_german_credit()
print(f" 数据集: {X.shape[0]} 样本, {X.shape[1]} 特征")
print(f" 类别: 好客户={int(np.sum(y==0))}, 坏客户={int(np.sum(y==1))}")
print(f" 来源: UCI ML Repository - Statlog (German Credit Data)")
print(f" 网址: https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data")
# ---- 预处理 ----
print("\n[2] 数据预处理 (独热编码+标准化)...")
X_proc, groups, group_names = preprocess_data(X, cat_indices, num_indices)
print(f" 编码后特征数: {X_proc.shape[1]}, 分组数: {len(groups)}")
# ---- MAR缺失 ----
print("\n[3] 模拟MAR缺失数据 (缺失率~20%)...")
X_missing, mask = simulate_mar(X_proc, 0.2)
print(f" 实际缺失率: {mask.sum()/X_proc.size:.2%}")
X_imputed = impute_mean(X_missing)
# ---- 交叉验证 ----
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
labeled_ratio = 0.1
results = {
'SVM_full': [], 'S3VM_full': [], 'GP_SSVM_full': [],
'SVM_mar': [], 'S3VM_mar': [], 'GP_SSVM_mar': [],
}
print(f"\n[4] {n_folds}折交叉验证 (有标签比例={labeled_ratio:.0%})")
print(f" S3VM: C_l=1.0, C_u=0.5, kernel=rbf, gamma=0.02")
print(f" GP_SSVM: C_l=1.0, C_u=0.5, λ=0.01, 近端梯度下降")
print("-" * 72)
for fold, (train_idx, test_idx) in enumerate(skf.split(X_proc, y)):
print(f"\n === 第 {fold+1}/{n_folds} 折 ===")
X_tr = X_proc[train_idx]
y_tr = y[train_idx]
X_te = X_proc[test_idx]
y_te = y[test_idx]
# 分层采样有标签子集
rng = np.random.RandomState(42 + fold)
labeled_idx = []
for cls in [0, 1]:
cls_idx = np.where(y_tr == cls)[0]
n_cls = max(int(len(cls_idx) * labeled_ratio), 3)
labeled_idx.extend(rng.choice(cls_idx, min(n_cls, len(cls_idx)), replace=False))
labeled_idx = np.array(labeled_idx)
unlabeled_idx = np.array([i for i in range(len(y_tr)) if i not in labeled_idx])
X_l, y_l = X_tr[labeled_idx], y_tr[labeled_idx]
X_u = X_tr[unlabeled_idx]
print(f" 训练:{len(y_tr)} (标签:{len(labeled_idx)}, 无标签:{len(unlabeled_idx)})")
# ---- [A] 完整数据 ----
print(" [A] 完整数据:")
# (1) 标准SVM
svm = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm.fit(X_l, y_l.ravel())
r = evaluate_model(y_te, svm.predict(X_te), svm.predict_proba(X_te)[:, 1])
results['SVM_full'].append(r)
print(f" SVM: Acc={r['Accuracy']:.4f} F1={r['F1']:.4f} AUC={r['AUC']:.4f}")
# (2) S3VM
s3vm = S3VM(C_l=1.0, C_u=0.5, kernel='rbf', gamma=0.02)
s3vm.fit(X_l, y_l, X_u)
r = evaluate_model(y_te, s3vm.predict(X_te), s3vm.decision_function(X_te))
results['S3VM_full'].append(r)
print(f" S3VM: Acc={r['Accuracy']:.4f} F1={r['F1']:.4f} AUC={r['AUC']:.4f}")
# (3) GP_SSVM
gp = GP_SSVM(C_l=1.0, C_u=0.5, lambda_g=0.01, max_iter=500, lr=0.0005)
gp.groups = groups
gp.fit(X_l, y_l, X_u)
r = evaluate_model(y_te, gp.predict(X_te), gp.decision_function(X_te))
results['GP_SSVM_full'].append(r)
print(f" GP_SSVM: Acc={r['Accuracy']:.4f} F1={r['F1']:.4f} AUC={r['AUC']:.4f}")
# ---- [B] MAR缺失数据 ----
print(" [B] MAR缺失数据:")
X_tr_m, _ = simulate_mar(X_tr, 0.2, seed=42+fold)
X_tr_imp = impute_mean(X_tr_m)
X_te_m, _ = simulate_mar(X_te, 0.2, seed=42+fold)
X_te_imp = impute_mean(X_te_m)
X_l_m, X_u_m = X_tr_imp[labeled_idx], X_tr_imp[unlabeled_idx]
svm_m = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)
svm_m.fit(X_l_m, y_l.ravel())
r = evaluate_model(y_te, svm_m.predict(X_te_imp), svm_m.predict_proba(X_te_imp)[:, 1])
results['SVM_mar'].append(r)
print(f" SVM: Acc={r['Accuracy']:.4f} F1={r['F1']:.4f} AUC={r['AUC']:.4f}")
s3vm_m = S3VM(C_l=1.0, C_u=0.5, kernel='rbf', gamma=0.02)
s3vm_m.fit(X_l_m, y_l, X_u_m)
r = evaluate_model(y_te, s3vm_m.predict(X_te_imp), s3vm_m.decision_function(X_te_imp))
results['S3VM_mar'].append(r)
print(f" S3VM: Acc={r['Accuracy']:.4f} F1={r['F1']:.4f} AUC={r['AUC']:.4f}")
gp_m = GP_SSVM(C_l=1.0, C_u=0.5, lambda_g=0.01, max_iter=500, lr=0.0005)
gp_m.groups = groups
gp_m.fit(X_l_m, y_l, X_u_m)
r = evaluate_model(y_te, gp_m.predict(X_te_imp), gp_m.decision_function(X_te_imp))
results['GP_SSVM_mar'].append(r)
print(f" GP_SSVM: Acc={r['Accuracy']:.4f} F1={r['F1']:.4f} AUC={r['AUC']:.4f}")
# ========== 结果汇总 ==========
print("\n" + "=" * 72)
print(" 实验结果汇总 (5折交叉验证平均值)")
print("=" * 72)
metrics = ['Accuracy', 'Precision', 'Recall', 'Specificity', 'F1', 'AUC']
names = {
'SVM_full': '标准SVM (完整)', 'S3VM_full': 'S3VM (完整)',
'GP_SSVM_full': 'GP_SSVM (完整)',
'SVM_mar': '标准SVM (MAR缺失)', 'S3VM_mar': 'S3VM (MAR缺失)',
'GP_SSVM_mar': 'GP_SSVM (MAR缺失)',
}
header = f"{'模型':<22}" + "".join(f"{m:>10}" for m in metrics)
print(header)
print("-" * 82)
summary = {}
for key, rlist in results.items():
if rlist:
avg = {m: np.mean([r[m] for r in rlist]) for m in metrics}
std = {m: np.std([r[m] for r in rlist]) for m in metrics}
summary[key] = (avg, std)
line = f"{names[key]:<22}" + "".join(f"{avg[m]:>10.4f}" for m in metrics)
print(line)
print("-" * 82)
# 详细结果
print("\n详细结果 (均值 ± 标准差):")
for key, (avg, std) in summary.items():
print(f"\n {names[key]}:")
for m in metrics:
print(f" {m:>12}: {avg[m]:.4f} ± {std[m]:.4f}")
# ========== Group LASSO 分析 ==========
print("\n" + "=" * 72)
print(" Group LASSO 特征选择分析")
print("=" * 72)
X_tr, X_te, y_tr, y_te = train_test_split(
X_proc, y, test_size=0.2, random_state=42, stratify=y)
rng = np.random.RandomState(42)
n_l = int(len(y_tr) * 0.1)
l_idx = rng.choice(len(y_tr), n_l, replace=False)
u_idx = np.array([i for i in range(len(y_tr)) if i not in l_idx])
gp_final = GP_SSVM(C_l=1.0, C_u=0.5, lambda_g=0.01, max_iter=500, lr=0.0005)
gp_final.groups = groups
gp_final.fit(X_tr[l_idx], y_tr[l_idx], X_tr[u_idx])
selected = gp_final.get_selected_features(group_names)
print(f"\n 总特征组数: {len(groups)}")
print(f" 选中特征组数: {len(selected)} ({len(selected)/len(groups):.1%})")
# 特征重要性
print(f"\n 特征重要性 (权重L2范数排序):")
for i, (name, norm) in enumerate(sorted(selected, key=lambda x: -x[1])):
# 映射回原始特征名
if name.startswith("cat_") or name.startswith("num_"):
idx = int(name.split("_")[1])
orig = feature_names[idx] if idx < len(feature_names) else name
else:
orig = name
print(f" {i+1:2d}. {orig:<30} L2范数: {norm:.4f}")
# 权重统计
print(f"\n 权重统计:")
print(f" 非零权重: {int(np.sum(np.abs(gp_final.w) > 1e-6))}/{len(gp_final.w)}")
print(f" L1范数: {np.sum(np.abs(gp_final.w)):.4f}")
print(f" L2范数: {np.linalg.norm(gp_final.w):.4f}")
# 收敛曲线
print(f"\n 优化收敛曲线 (前10次/每50次):")
for i in range(0, len(gp_final.loss_history), 50):
print(f" iter {i:4d}: loss = {gp_final.loss_history[i]:.2f}")
print(f" iter {len(gp_final.loss_history)-1:4d}: loss = {gp_final.loss_history[-1]:.2f}")
# 最终测试集性能
y_pred = gp_final.predict(X_te)
y_score = gp_final.decision_function(X_te)
final = evaluate_model(y_te, y_pred, y_score)
print(f"\n 最终GP_SSVM测试集性能:")
for m, v in final.items():
print(f" {m:>12}: {v:.4f}")
print("\n" + "=" * 72)
print(" 实验完成!")
print("=" * 72)
return results, summary
if __name__ == '__main__':
t0 = time.time()
results, summary = run_experiments()
========================================================================
GP_SSVM 信用评分模型实验
复现论文: 基于半监督支持向量机的信用评分模型 (陈耸, 2024)
========================================================================
[1] 加载UCI German Credit Data...
数据集: 1000 样本, 20 特征
类别: 好客户=700, 坏客户=300
来源: UCI ML Repository - Statlog (German Credit Data)
网址: https://archive.ics.uci.edu/datasets/144/statlog+german+credit+data
[2] 数据预处理 (独热编码+标准化)...
编码后特征数: 61, 分组数: 20
[3] 模拟MAR缺失数据 (缺失率~20%)...
实际缺失率: 1.90%
[4] 5折交叉验证 (有标签比例=10%)
S3VM: C_l=1.0, C_u=0.5, kernel=rbf, gamma=0.02
GP_SSVM: C_l=1.0, C_u=0.5, λ=0.01, 近端梯度下降
------------------------------------------------------------------------
=== 第 1/5 折 ===
训练:800 (标签:80, 无标签:720)
[A] 完整数据:
SVM: Acc=0.6900 F1=0.0606 AUC=0.6802
S3VM: Acc=0.6500 F1=0.4531 AUC=0.6776
GP_SSVM: Acc=0.6900 F1=0.4151 AUC=0.6577
[B] MAR缺失数据:
SVM: Acc=0.6950 F1=0.0896 AUC=0.6651
S3VM: Acc=0.7000 F1=0.0000 AUC=0.6590
GP_SSVM: Acc=0.6700 F1=0.3529 AUC=0.6429
=== 第 2/5 折 ===
训练:800 (标签:80, 无标签:720)
[A] 完整数据:
SVM: Acc=0.6850 F1=0.3226 AUC=0.6654
S3VM: Acc=0.6900 F1=0.0882 AUC=0.6292
GP_SSVM: Acc=0.7050 F1=0.3371 AUC=0.6321
[B] MAR缺失数据:
SVM: Acc=0.6850 F1=0.3226 AUC=0.6744
S3VM: Acc=0.6600 F1=0.4848 AUC=0.6408
GP_SSVM: Acc=0.7050 F1=0.3059 AUC=0.6500
=== 第 3/5 折 ===
训练:800 (标签:80, 无标签:720)
[A] 完整数据:
SVM: Acc=0.6900 F1=0.0000 AUC=0.6636
S3VM: Acc=0.6350 F1=0.4252 AUC=0.6479
GP_SSVM: Acc=0.6900 F1=0.2619 AUC=0.6318
[B] MAR缺失数据:
SVM: Acc=0.6850 F1=0.0000 AUC=0.6640
S3VM: Acc=0.6550 F1=0.4298 AUC=0.6440
GP_SSVM: Acc=0.6850 F1=0.3505 AUC=0.6110
=== 第 4/5 折 ===
训练:800 (标签:80, 无标签:720)
[A] 完整数据:
SVM: Acc=0.7300 F1=0.1818 AUC=0.2706
S3VM: Acc=0.7000 F1=0.0000 AUC=0.7279
GP_SSVM: Acc=0.7200 F1=0.2222 AUC=0.6739
[B] MAR缺失数据:
SVM: Acc=0.7300 F1=0.1818 AUC=0.2514
S3VM: Acc=0.7000 F1=0.0000 AUC=0.7444
GP_SSVM: Acc=0.7050 F1=0.2532 AUC=0.6318
=== 第 5/5 折 ===
训练:800 (标签:80, 无标签:720)
[A] 完整数据:
SVM: Acc=0.7050 F1=0.0923 AUC=0.7555
S3VM: Acc=0.7000 F1=0.0000 AUC=0.7590
GP_SSVM: Acc=0.7150 F1=0.2785 AUC=0.7619
[B] MAR缺失数据:
SVM: Acc=0.7050 F1=0.0923 AUC=0.7682
S3VM: Acc=0.7150 F1=0.5366 AUC=0.7656
GP_SSVM: Acc=0.7100 F1=0.2564 AUC=0.7294
========================================================================
实验结果汇总 (5折交叉验证平均值)
========================================================================
模型 Accuracy Precision RecallSpecificity F1 AUC
----------------------------------------------------------------------------------
标准SVM (完整) 0.7000 0.4776 0.0867 0.9629 0.1315 0.6071
S3VM (完整) 0.6750 0.2409 0.1967 0.8800 0.1933 0.6883
GP_SSVM (完整) 0.7040 0.5399 0.2233 0.9100 0.3030 0.6715
标准SVM (MAR缺失) 0.7000 0.4966 0.0900 0.9614 0.1373 0.6046
S3VM (MAR缺失) 0.6860 0.2789 0.3033 0.8500 0.2902 0.6908
GP_SSVM (MAR缺失) 0.6950 0.4980 0.2267 0.8957 0.3038 0.6530
----------------------------------------------------------------------------------
详细结果 (均值 ± 标准差):
标准SVM (完整):
Accuracy: 0.7000 ± 0.0164
Precision: 0.4776 ± 0.3278
Recall: 0.0867 ± 0.0878
Specificity: 0.9629 ± 0.0466
F1: 0.1315 ± 0.1122
AUC: 0.6071 ± 0.1716
S3VM (完整):
Accuracy: 0.6750 ± 0.0272
Precision: 0.2409 ± 0.1974
Recall: 0.1967 ± 0.2215
Specificity: 0.8800 ± 0.1330
F1: 0.1933 ± 0.2035
AUC: 0.6883 ± 0.0486
GP_SSVM (完整):
Accuracy: 0.7040 ± 0.0124
Precision: 0.5399 ± 0.0756
Recall: 0.2233 ± 0.0807
Specificity: 0.9100 ± 0.0481
F1: 0.3030 ± 0.0671
AUC: 0.6715 ± 0.0479
标准SVM (MAR缺失):
Accuracy: 0.7000 ± 0.0167
Precision: 0.4966 ± 0.3216
Recall: 0.0900 ± 0.0860
Specificity: 0.9614 ± 0.0460
F1: 0.1373 ± 0.1091
AUC: 0.6046 ± 0.1809
S3VM (MAR缺失):
Accuracy: 0.6860 ± 0.0240
Precision: 0.2789 ± 0.2301
Recall: 0.3033 ± 0.2509
Specificity: 0.8500 ± 0.1245
F1: 0.2902 ± 0.2394
AUC: 0.6908 ± 0.0532
GP_SSVM (MAR缺失):
Accuracy: 0.6950 ± 0.0152
Precision: 0.4980 ± 0.0467
Recall: 0.2267 ± 0.0564
Specificity: 0.8957 ± 0.0451
F1: 0.3038 ± 0.0434
AUC: 0.6530 ± 0.0404
========================================================================
Group LASSO 特征选择分析
========================================================================
总特征组数: 20
选中特征组数: 20 (100.0%)
特征重要性 (权重L2范数排序):
1. Attribute4 L2范数: 0.6232
2. Attribute1 L2范数: 0.5616
3. Attribute7 L2范数: 0.5394
4. Attribute6 L2范数: 0.4730
5. Attribute14 L2范数: 0.4212
6. Attribute9 L2范数: 0.4052
7. Attribute3 L2范数: 0.3461
8. Attribute10 L2范数: 0.3142
9. Attribute12 L2范数: 0.2074
10. Attribute17 L2范数: 0.1771
11. Attribute15 L2范数: 0.1569
12. Attribute8 L2范数: 0.1401
13. Attribute5 L2范数: 0.1398
14. Attribute20 L2范数: 0.1393
15. Attribute18 L2范数: 0.1152
16. Attribute2 L2范数: 0.0821
17. Attribute19 L2范数: 0.0750
18. Attribute16 L2范数: 0.0495
19. Attribute11 L2范数: 0.0226
20. Attribute13 L2范数: 0.0079
权重统计:
非零权重: 61/61
L1范数: 9.1578
L2范数: 1.4017
优化收敛曲线 (前10次/每50次):
iter 0: loss = 963.91
iter 50: loss = 773.96
iter 100: loss = 766.89
iter 150: loss = 764.92
iter 200: loss = 763.65
iter 250: loss = 762.84
iter 300: loss = 762.33
iter 350: loss = 761.89
iter 400: loss = 761.74
iter 450: loss = 761.59
iter 499: loss = 761.34
最终GP_SSVM测试集性能:
Accuracy: 0.6750
Precision: 0.4675
Recall: 0.6000
Specificity: 0.7071
F1: 0.5255
AUC: 0.7052
========================================================================
实验完成!
========================================================================