import matplotlib
matplotlib.use('Agg')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.use('Agg') # headless环境用Agg后端
# Agg renders off-screen only (no window pops up), so figures must be written to disk with savefig().
import matplotlib.pyplot as plt
'''''
import numpy as np
import pandas as pd
from scipy.stats import norm
np.random.seed(42)
# 论文参数:最终462样本、32个原始特征、二分类风险标签
n_sample = 462
n_feat = 32
core_list = ["比亚迪","东风集团","江淮汽车","赛力斯","上汽集团","长安集团","长城汽车","其他整车"]
year_pool = [2018,2019,2020,2021]
data = []
for i in range(n_sample):
core = np.random.choice(core_list)
sid = f"SP{np.random.randint(1,117):03d}"
yr = np.random.choice(year_pool)
feat = norm.rvs(loc=0,scale=1.3,size=n_feat)
# 构造风险标签y:0=低风险,1=高风险
w = np.linspace(0.08,0.92,n_feat)
score = feat @ w
y = 1 if score>0 else 0
row = [f"{sid}_{core}_{yr}", y] + feat.tolist()
data.append(row)
cols = ["样本ID","风险标签y"]+[f"财务供应链指标{x+1}" for x in range(n_feat)]
df = pd.DataFrame(data,columns=cols)
# 保存目标Excel
with pd.ExcelWriter("/kaggle/working/finance_dataset.xlsx",engine="openpyxl") as w:
df.to_excel(w,sheet_name="实验数据",index=False)
print("文件已生成:finance_dataset.xlsx")
print(f"样本总数:{df.shape[0]} | 特征数:{n_feat}")
print(f"低风险(0):{(df.风险标签y==0).sum()} 条,高风险(1):{(df.风险标签y==1).sum()} 条")
'''
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# ----------------------------- 自定义 SVM+KNN 投票分类器 -----------------------------
class SVM_KNN:
    """Pair of an SVC and a k-nearest-neighbours classifier combined by hard vote.

    Both estimators are trained on the same data. At prediction time the two
    label vectors are compared element-wise; where they agree the shared label
    is used, and where they disagree the SVC label is used.

    NOTE(review): because both outcomes of that rule select the SVC label, the
    returned labels are always equal to the SVC prediction and the KNN vote
    currently has no influence on the output. Change the tie-break in
    ``predict`` if a real ensemble effect is wanted.
    """

    def __init__(self, C, gamma, n_neighbors):
        """Create the two (unfitted) base estimators.

        Args:
            C: Forwarded to ``SVC(C=...)``.
            gamma: Forwarded to ``SVC(gamma=...)``.
            n_neighbors: Forwarded to ``KNeighborsClassifier(n_neighbors=...)``.
        """
        self.knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        # probability=False: only hard labels are needed for the vote.
        self.svm = SVC(C=C, gamma=gamma, probability=False)

    def fit(self, X, y):
        """Fit the SVC and then the KNN on ``(X, y)``; return ``self``."""
        for estimator in (self.svm, self.knn):
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the voted labels for ``X`` (see the class note on tie-breaks)."""
        svm_labels = self.svm.predict(X)
        knn_labels = self.knn.predict(X)
        votes_agree = svm_labels == knn_labels
        # Agreement -> shared label; disagreement -> SVC label takes priority.
        return np.where(votes_agree, svm_labels, svm_labels)
# ----------------------------- KNN 实验(寻找最佳 k)-----------------------------
def knn_exper(X, y, SEED, N_COMPONENTS):
    """Score KNN for k in (3, 5, 7) with shuffled 5-fold cross-validation.

    Inside every fold a ``StandardScaler`` and then a ``PCA`` are fitted on the
    training part only and applied to both parts, so no statistics of the
    held-out part enter the preprocessing.

    Args:
        X: Feature matrix; rows are selected with the integer index arrays
            produced by ``KFold.split`` (assumes a NumPy array - TODO confirm).
        y: Label vector, indexed the same way as ``X``.
        SEED: ``random_state`` for both ``KFold`` and ``PCA``.
        N_COMPONENTS: ``n_components`` passed to ``PCA``.

    Returns:
        Array of shape ``(5, 3, 4)``: fold x k x
        ``[accuracy, weighted F1, weighted precision, weighted recall]``.
        The fold-averaged table is also printed.
    """
    neighbor_grid = [3, 5, 7]
    knn_metric = np.zeros((5, 3, 4))
    splitter = KFold(n_splits=5, random_state=SEED, shuffle=True)

    for fold, (train_idx, test_idx) in enumerate(splitter.split(X, y)):
        train_y, test_y = y[train_idx], y[test_idx]

        # Standardise using training-fold statistics only.
        scaler = StandardScaler().fit(X[train_idx])
        train_std = scaler.transform(X[train_idx])
        test_std = scaler.transform(X[test_idx])

        # Project onto the principal components of the training fold.
        pca = PCA(n_components=N_COMPONENTS, random_state=SEED).fit(train_std)
        train_Z = pca.transform(train_std)
        test_Z = pca.transform(test_std)

        for slot, k in enumerate(neighbor_grid):
            classifier = KNeighborsClassifier(n_neighbors=k)
            classifier.fit(train_Z, train_y)
            pred_y = classifier.predict(test_Z)
            knn_metric[fold, slot, 0] = accuracy_score(test_y, pred_y)
            knn_metric[fold, slot, 1] = f1_score(test_y, pred_y, average='weighted')
            knn_metric[fold, slot, 2] = precision_score(test_y, pred_y, average='weighted')
            knn_metric[fold, slot, 3] = recall_score(test_y, pred_y, average='weighted')

    fold_average = knn_metric.mean(axis=0)
    print("=== KNN 交叉验证结果(平均)===")
    print("k=3,5,7 对应的 [Acc, F1, Prec, Rec]:")
    print(np.round(fold_average, 4))
    return knn_metric
# ----------------------------- SVM 实验(网格搜索 C, gamma)-----------------------------
def svm_exper(X, y, SEED, N_COMPONENTS, heatmap_path='/kaggle/working/heatmap.pdf'):
    """Grid-search SVC ``C`` / ``gamma`` with shuffled 5-fold CV and plot the result.

    The grid is ``C = 10**-5 .. 10**5`` and ``gamma = 2**-5 .. 2**5`` (11 points
    each). Inside every fold a ``StandardScaler`` and then a ``PCA`` are fitted
    on the training part only and applied to both parts.

    Args:
        X: Feature matrix; rows are selected with the integer index arrays
            produced by ``KFold.split`` (assumes a NumPy array - TODO confirm).
        y: Label vector, indexed the same way as ``X``.
        SEED: ``random_state`` for both ``KFold`` and ``PCA``.
        N_COMPONENTS: ``n_components`` passed to ``PCA``.
        heatmap_path: Where the mean-accuracy heatmap is saved. Defaults to the
            previously hard-coded location.

    Returns:
        Accuracy array of shape ``(5, 11, 11)`` indexed ``[fold, C, gamma]``.
        The best fold-averaged accuracy is also printed.
    """
    C_list = np.logspace(-5, 5, 11, base=10)
    gamma_list = np.logspace(-5, 5, 11, base=2)
    svm_metric = np.zeros((5, len(C_list), len(gamma_list)))  # fold, C, gamma
    kf = KFold(n_splits=5, random_state=SEED, shuffle=True)

    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        train_X, test_X = X[train_idx], X[test_idx]
        train_y, test_y = y[train_idx], y[test_idx]

        # Standardise using training-fold statistics only.
        scaler = StandardScaler().fit(train_X)
        train_X = scaler.transform(train_X)
        test_X = scaler.transform(test_X)

        # Project onto the principal components of the training fold.
        pca = PCA(n_components=N_COMPONENTS, random_state=SEED).fit(train_X)
        train_Z = pca.transform(train_X)
        test_Z = pca.transform(test_X)

        for j, C in enumerate(C_list):
            for k, gamma in enumerate(gamma_list):
                svc = SVC(C=C, gamma=gamma)
                svc.fit(train_Z, train_y)
                pred_y = svc.predict(test_Z)
                svm_metric[i, j, k] = accuracy_score(test_y, pred_y)

    # Plot the fold-averaged accuracy.
    mean_acc = svm_metric.mean(axis=0)  # rows: C, columns: gamma
    fig = plt.figure(figsize=(10, 8))
    # heatmap() draws rows along y and columns along x, so transpose to put C
    # on the x axis and gamma on the y axis, matching the axis labels below.
    # ':g' keeps the small grid values distinct (rounding to 2 decimals turned
    # 1e-5, 1e-4 and 1e-3 all into 0.0).
    sns.heatmap(mean_acc.T,
                xticklabels=[f"{c:g}" for c in C_list],
                yticklabels=[f"{g:g}" for g in gamma_list],
                cmap='viridis', annot=False)
    plt.xlabel('C (log10 scale)')
    plt.ylabel('gamma (log2 scale)')
    plt.title('SVM 5-fold CV Accuracy')
    plt.savefig(heatmap_path, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    print("=== SVM 网格搜索完成 ===")
    print(f"最佳平均准确率: {mean_acc.max():.4f}")
    return svm_metric
# ----------------------------- 组合实验(寻找最佳参数并对比各模型)-----------------------------
def knn_svm_exper(X, y, SEED, N_COMPONENTS):
    """Select KNN / SVC hyper-parameters, then compare four models with 5-fold CV.

    Steps:
      1. ``knn_exper`` picks the ``k`` in (3, 5, 7) whose fold-averaged metrics
         have the highest mean.
      2. ``svm_exper`` picks the first (row-major) ``(C, gamma)`` grid cell with
         the highest fold-averaged accuracy.
      3. Logistic regression, KNN, SVC and the ``SVM_KNN`` ensemble are scored
         on shuffled 5-fold CV with per-fold standardisation and PCA.
      4. A fold-averaged comparison table is printed.

    NOTE(review): the search in steps 1-2 builds its folds with the same
    ``KFold(n_splits=5, random_state=SEED, shuffle=True)`` settings as step 3,
    so the reported scores are probably optimistic - confirm whether a held-out
    or nested evaluation is wanted.

    Args:
        X: Feature matrix; rows are selected with the integer index arrays
            produced by ``KFold.split`` (assumes a NumPy array - TODO confirm).
        y: Label vector, indexed the same way as ``X``.
        SEED: ``random_state`` for both ``KFold`` and ``PCA``.
        N_COMPONENTS: ``n_components`` passed to ``PCA``.

    Returns:
        Tuple ``(metric, lr_metric, svm_single_metric)``; each is a ``(5, 4)``
        array of per-fold ``[accuracy, weighted F1, weighted precision,
        weighted recall]`` for the ensemble, logistic regression and SVC
        respectively. The stand-alone KNN scores are printed but not returned.
    """

    def _scores(truth, predicted):
        # [accuracy, weighted F1, weighted precision, weighted recall]
        return [
            accuracy_score(truth, predicted),
            f1_score(truth, predicted, average='weighted'),
            precision_score(truth, predicted, average='weighted'),
            recall_score(truth, predicted, average='weighted'),
        ]

    # 1. Best k for KNN: average over folds, then over the four metrics.
    knn_cv_mean = knn_exper(X, y, SEED, N_COMPONENTS).mean(axis=0)  # (3, 4)
    n_neigh = [3, 5, 7][knn_cv_mean.mean(axis=1).argmax()]

    # 2. Best (C, gamma) for the SVC: first grid cell reaching the top accuracy.
    svm_cv_mean = svm_exper(X, y, SEED, N_COMPONENTS).mean(axis=0)  # (11, 11)
    best_row, best_col = np.argwhere(svm_cv_mean == svm_cv_mean.max())[0]
    C = np.logspace(-5, 5, 11, base=10)[best_row]
    gamma = np.logspace(-5, 5, 11, base=2)[best_col]

    print(f"\n=== 最佳参数 ===")
    print(f"KNN: n_neighbors = {n_neigh}")
    print(f"SVM: C = {C:.4f}, gamma = {gamma:.4f}")

    # 3. Score the four models with the selected parameters.
    metric = np.zeros((5, 4))             # SVM_KNN ensemble
    lr_metric = np.zeros((5, 4))          # logistic regression
    svm_single_metric = np.zeros((5, 4))  # stand-alone SVC
    knn_single_metric = np.zeros((5, 4))  # stand-alone KNN with the best k

    folds = KFold(n_splits=5, random_state=SEED, shuffle=True).split(X, y)
    for fold, (train_idx, test_idx) in enumerate(folds):
        train_y, test_y = y[train_idx], y[test_idx]

        # Standardise using training-fold statistics only.
        scaler = StandardScaler().fit(X[train_idx])
        train_std = scaler.transform(X[train_idx])
        test_std = scaler.transform(X[test_idx])

        # Project onto the principal components of the training fold.
        pca = PCA(n_components=N_COMPONENTS, random_state=SEED).fit(train_std)
        train_Z = pca.transform(train_std)
        test_Z = pca.transform(test_std)

        # Same fit / predict / score routine for every contender, in the order
        # ensemble, logistic regression, SVC, KNN.
        contenders = (
            (metric, SVM_KNN(C=C, gamma=gamma, n_neighbors=n_neigh)),
            (lr_metric, LogisticRegression(max_iter=1000)),
            (svm_single_metric, SVC(C=C, gamma=gamma)),
            (knn_single_metric, KNeighborsClassifier(n_neighbors=n_neigh)),
        )
        for table, model in contenders:
            model.fit(train_Z, train_y)
            table[fold] = _scores(test_y, model.predict(test_Z))

    # 4. Report fold averages.
    lr_avg = lr_metric.mean(0)
    knn_avg = knn_single_metric.mean(0)
    svm_avg = svm_single_metric.mean(0)
    ens_avg = metric.mean(0)

    print("\n=== 最终模型性能对比(5折平均)===")
    print("模型 Accuracy F1 Precision Recall")
    print(f"逻辑回归: {lr_avg[0]:.4f} {lr_avg[1]:.4f} {lr_avg[2]:.4f} {lr_avg[3]:.4f}")
    print(f"KNN (k={n_neigh}): {knn_avg[0]:.4f} {knn_avg[1]:.4f} {knn_avg[2]:.4f} {knn_avg[3]:.4f}")
    print(f"SVM: {svm_avg[0]:.4f} {svm_avg[1]:.4f} {svm_avg[2]:.4f} {svm_avg[3]:.4f}")
    print(f"SVM+KNN 集成: {ens_avg[0]:.4f} {ens_avg[1]:.4f} {ens_avg[2]:.4f} {ens_avg[3]:.4f}")
    return metric, lr_metric, svm_single_metric
# ----------------------------- 主程序 -----------------------------
if __name__ == '__main__':
    # 1. Load the data: column 0 is skipped, column 1 is used as the label and
    #    every remaining column as a feature.
    dataset = pd.read_excel('/kaggle/input/datasets/songsammy/nipeiye-credit-data/1..xlsx', sheet_name='实验数据')
    # Take an explicit writable float copy: the imputation below assigns into X
    # in place, and the array behind `.values` can be a read-only view of the
    # DataFrame (pandas Copy-on-Write) or a non-float dtype that np.isnan rejects.
    X = dataset.iloc[:, 2:].to_numpy(dtype=float, copy=True)
    y = dataset.iloc[:, 1].values

    # 2. Fill missing values with the mean of their column.
    #    NOTE(review): the means are computed over all rows before the CV split.
    col_mean = np.nanmean(X, axis=0)
    nan_rows, nan_cols = np.where(np.isnan(X))
    X[nan_rows, nan_cols] = col_mean[nan_cols]

    # 3. Fixed experiment parameters.
    N_COMPONENTS = 16
    SEED = 42

    # 4. Run the full experiment.
    knn_svm_exper(X, y, SEED, N_COMPONENTS)