#!/usr/bin/env python
# coding: utf-8

# 1. 导入所需库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# 设置中文、符号显示，防止图表出现乱码
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 2. 加载数据（API下载绕过mount问题）
import subprocess as _sp, glob as _gl, os as _os
_os.makedirs('/tmp/_data_dl', exist_ok=True)
_sp.run(['kaggle', 'datasets', 'download', '-d', 'songsammy/qiuzitong-credit-data', '-p', '/tmp/_data_dl', '--unzip'], check=True)
file_path = _gl.glob('/tmp/_data_dl/*.csv')[0]
df = pd.read_csv(file_path)
print(f"原始数据形状: {df.shape}")
df.head()

Warning: Looks like you're using an outdated `kaggle` version (installed: 2.0.1), please consider upgrading to the latest version (2.2.2)
Dataset URL: https://www.kaggle.com/datasets/songsammy/qiuzitong-credit-data
License(s): CC0-1.0
Downloading qiuzitong-credit-data.zip to /tmp/_data_dl

100%|██████████| 1.80M/1.80M [00:00<00:00, 125MB/s]

原始数据形状: (10000, 25)

# 3. 数据预处理
target = 'loan_default'
# 删除原有标签，便于后续聚类赋予新标签
if 'cluster_id' in df.columns:
    df = df.drop(columns=['cluster_id'])

# 分类变量编码
categorical_cols = ['income_bracket', 'education_level', 'employment_type']
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# 特征与标签
X = df.drop(columns=[target])
y = df[target].values

print(f"特征维度: {X.shape}")
print(f"目标变量分布:\n{pd.Series(y).value_counts()}")

特征维度: (10000, 23)
目标变量分布:
0    7021
1    2979
Name: count, dtype: int64

#第二部分、模仿参考论文定义特征组

# 1. 定义特征组（参考论文中的五个参与变量）
# 消费能力特征
cons_capa_features = ['avg_txn_amount', 'monthly_data_usage_gb', 'savings_worth',
                      'recent_app_installs', 'avg_daily_screen_time_hrs']
# 消费结构特征
cons_struct_features = ['online_txn_count_last_30d', 'bank_sms_count', 
                        'financial_apps_installed', 'finance_app_time_pct']
# 消费理念特征
cons_idea_features = ['social_media_pct', 'avg_call_duration_mins', 
                      'avg_distance_travelled_km', 'places_visited_weekly']
# 金融需求特征
fin_demand_features = ['existing_debt_amount', 'loan_amount_requested', 
                       'savings_worth', 'income_bracket']

# 用于整体聚类的所有特征（去重）
cluster_features = list(set(cons_capa_features + cons_struct_features + 
                            cons_idea_features + fin_demand_features))
print(f"用于聚类的特征数: {len(cluster_features)}")
print(cluster_features)

# 提取聚类特征并标准化
X_cluster = X[cluster_features].copy()
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_cluster)

用于聚类的特征数: 16
['online_txn_count_last_30d', 'recent_app_installs', 'income_bracket', 'finance_app_time_pct', 'financial_apps_installed', 'places_visited_weekly', 'existing_debt_amount', 'bank_sms_count', 'avg_txn_amount', 'savings_worth', 'social_media_pct', 'avg_distance_travelled_km', 'avg_daily_screen_time_hrs', 'monthly_data_usage_gb', 'avg_call_duration_mins', 'loan_amount_requested']

#第三部分、进行初步整体聚类，得出聚类数目，并进行可视化

# 1. 确定最佳聚类数（通过Gap统计量计算得出）
def gap_statistic_simple(X, max_k=10, n_refs=3):
    """简化版Gap统计量，返回最佳K和Gap值列表"""
    gaps = []
    for k in range(1, max_k+1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X)
        log_wcss = np.log(kmeans.inertia_)
        ref_log_wcss = []
        for _ in range(n_refs):
            X_ref = np.random.uniform(low=X.min(axis=0), high=X.max(axis=0), size=X.shape)
            kmeans_ref = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans_ref.fit(X_ref)
            ref_log_wcss.append(np.log(kmeans_ref.inertia_))
        gap = np.mean(ref_log_wcss) - log_wcss
        gaps.append(gap)
    best_k = np.argmax(gaps) + 1
    return best_k, gaps

best_k, gaps = gap_statistic_simple(X_scaled, max_k=10)
print(f"自动选择的最佳聚类数 K = {best_k}")

# 绘制Gap曲线
plt.figure(figsize=(8,5))
plt.plot(range(1,11), gaps, marker='o')
plt.xlabel('聚类数 K')
plt.ylabel('Gap统计量')
plt.title('Gap统计量选择最佳K值')
plt.axvline(x=best_k, color='r', linestyle='--', label=f'最佳K={best_k}')
plt.legend()
plt.grid(True)
plt.show()

自动选择的最佳聚类数 K = 1

# 2. 整体K-means聚类
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels_all = kmeans.fit_predict(X_scaled)
print("整体聚类分布:")
print(pd.Series(cluster_labels_all).value_counts().sort_index())

# 可视化（PCA降维到2维）
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_scaled)
plt.figure(figsize=(10,7))
scatter = plt.scatter(X_pca[:,0], X_pca[:,1], c=cluster_labels_all, cmap='viridis', alpha=0.6)
plt.colorbar(scatter, label='Cluster')
plt.title('K-means整体聚类结果（PCA降维）')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

整体聚类分布:
0    10000
Name: count, dtype: int64

#第四部分，模仿参考论文构建模型，并输出最后结果

# 1. 三个关键维度独立聚类并赋分（参考论文，固定分为5簇，赋分30/45/60/75/90）
dimension_features = {
    '消费能力': cons_capa_features,
    '金融需求': fin_demand_features,
    '消费理念': cons_idea_features
}

dimension_scores = {}   # 存储每个维度的得分（10000个样本）
dimension_labels = {}   # 存储每个维度的聚类标签（0-4）

for dim_name, feat_list in dimension_features.items():
    feat_list = [f for f in feat_list if f in X.columns]
    if len(feat_list) == 0:
        print(f"警告：维度 {dim_name} 无可用特征，跳过")
        continue
    X_dim = X[feat_list].copy()
    scaler_dim = RobustScaler()
    X_dim_scaled = scaler_dim.fit_transform(X_dim)

    k_dim = 5  # 固定分为5簇
    kmeans_dim = KMeans(n_clusters=k_dim, random_state=42, n_init=10)
    labels_dim = kmeans_dim.fit_predict(X_dim_scaled)
    dimension_labels[dim_name] = labels_dim

    # 按簇中心均值排序，映射分数（均值越小 => 分数越低）
    center_means = kmeans_dim.cluster_centers_.mean(axis=1)
    sorted_idx = np.argsort(center_means)
    score_map = {old_label: 30 + 15*rank for rank, old_label in enumerate(sorted_idx)}
    scores_dim = np.array([score_map[label] for label in labels_dim])
    dimension_scores[dim_name] = scores_dim
    print(f"{dim_name} 赋分完成，分数分布: {np.unique(scores_dim)}")

消费能力 赋分完成，分数分布: [30 45 60 75 90]
金融需求 赋分完成，分数分布: [30 45 60 75 90]
消费理念 赋分完成，分数分布: [30 45 60 75 90]

# 2. PCA确定权重 + 线性加权初始评分
score_df = pd.DataFrame({
    '消费能力': dimension_scores['消费能力'],
    '金融需求': dimension_scores['金融需求'],
    '消费理念': dimension_scores['消费理念']
})

pca_weight = PCA(n_components=3)
pca_weight.fit(score_df)
variance_ratio = pca_weight.explained_variance_ratio_
print("三个主成分方差贡献率:", variance_ratio)
weights = variance_ratio / variance_ratio.sum()
print(f"归一化权重: 消费能力={weights[0]:.3f}, 金融需求={weights[1]:.3f}, 消费理念={weights[2]:.3f}")

initial_scores = (weights[0] * score_df['消费能力'] + 
                  weights[1] * score_df['金融需求'] + 
                  weights[2] * score_df['消费理念'])
print(f"初始评分范围: [{initial_scores.min():.2f}, {initial_scores.max():.2f}]")

三个主成分方差贡献率: [0.4404133  0.35600279 0.20358391]
归一化权重: 消费能力=0.440, 金融需求=0.356, 消费理念=0.204
初始评分范围: [30.00, 90.00]

# 3. 构建组合特征（三个维度的聚类标签 + 初始评分）
cluster_labels_array = np.column_stack([dimension_labels['消费能力'],
                                        dimension_labels['金融需求'],
                                        dimension_labels['消费理念']])
combined_features = np.column_stack([cluster_labels_array, initial_scores])
print(f"组合特征形状: {combined_features.shape}")

组合特征形状: (10000, 4)

# 4. 划分训练集和测试集（70%训练，30%测试）
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.3, random_state=42, stratify=y)
print(f"训练集样本数: {X_train.shape[0]}, 测试集样本数: {X_test.shape[0]}")

# 训练随机森林模型
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# 在测试集上预测概率
y_proba_test = rf.predict_proba(X_test)[:, 1]   # 违约概率

# 使用测试集上的ROC确定最优阈值（约登指数）
fpr, tpr, thresholds = roc_curve(y_test, y_proba_test)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"基于测试集的最优阈值: {optimal_threshold:.4f}")

# 在测试集上进行二分类预测
y_pred_test = (y_proba_test >= optimal_threshold).astype(int)

# 计算测试集上的评估指标
test_accuracy = accuracy_score(y_test, y_pred_test)
test_auc = auc(fpr, tpr)
print(f"\n测试集准确率: {test_accuracy:.4f}")
print(f"测试集AUC值: {test_auc:.4f}")

# 混淆矩阵和分类报告
print("\n测试集混淆矩阵:")
print(confusion_matrix(y_test, y_pred_test))
print("\n测试集分类报告:")
print(classification_report(y_test, y_pred_test, target_names=['正常', '违约']))

# 绘制ROC曲线
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'随机森林 (测试集 AUC = {test_auc:.3f})')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('假阳性率 (FPR)')
plt.ylabel('真阳性率 (TPR)')
plt.title('ROC曲线 - 测试集')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

训练集样本数: 7000, 测试集样本数: 3000
基于测试集的最优阈值: 0.2241

测试集准确率: 0.9633
测试集AUC值: 0.9773

测试集混淆矩阵:
[[2043   63]
 [  47  847]]

测试集分类报告:
              precision    recall  f1-score   support

          正常       0.98      0.97      0.97      2106
          违约       0.93      0.95      0.94       894

    accuracy                           0.96      3000
   macro avg       0.95      0.96      0.96      3000
weighted avg       0.96      0.96      0.96      3000

# 计算百分制信用评分（分数越高，表示放贷资质越好）
# 注：y_proba_test 是模型预测的违约概率（取值0~1）
credit_scores = (1 - y_proba_test) * 100   # 转化为0-100分，高分代表低违约风险

# 根据之前计算的最优阈值（违约概率阈值），转换为信用评分阈值
# 最优阈值 optimal_threshold 是违约概率的切割点（如0.3）
# 对应的信用评分阈值为：credit_threshold = (1 - optimal_threshold) * 100
credit_threshold = (1 - optimal_threshold) * 100

print("="*60)
print("百分制放贷资质评分结果（测试集）")
print("="*60)
print(f"信用评分范围: [{credit_scores.min():.2f}, {credit_scores.max():.2f}]")
print(f"根据ROC曲线确定的最优违约概率阈值: {optimal_threshold:.4f}")
print(f"对应的信用评分阈值: {credit_threshold:.2f}分")
print("\n业务解释：")
print(f"  - 信用评分 >= {credit_threshold:.2f} 分 → 预测为【适合放贷】农户（低违约风险）")
print(f"  - 信用评分 <  {credit_threshold:.2f} 分 → 预测为【不适合放贷】农户（高违约风险）")
print("="*60)

# 展示前20个样本的评分和决策
print("\n测试集前20个样本的评分及放贷建议：")
print("-"*50)
sample_df = pd.DataFrame({
    '真实标签': y_test[:20],
    '违约概率': y_proba_test[:20],
    '信用评分': credit_scores[:20],
    '放贷建议': ['适合放贷' if score >= credit_threshold else '不适合放贷' for score in credit_scores[:20]]
})
print(sample_df.to_string(index=False))

============================================================
百分制放贷资质评分结果（测试集）
============================================================
信用评分范围: [0.00, 100.00]
根据ROC曲线确定的最优违约概率阈值: 0.2241
对应的信用评分阈值: 77.59分

业务解释：
  - 信用评分 >= 77.59 分 → 预测为【适合放贷】农户（低违约风险）
  - 信用评分 <  77.59 分 → 预测为【不适合放贷】农户（高违约风险）
============================================================

测试集前20个样本的评分及放贷建议：
--------------------------------------------------
 真实标签     违约概率       信用评分  放贷建议
    1 0.961171   3.882867 不适合放贷
    0 0.070154  92.984630  适合放贷
    1 0.932509   6.749129 不适合放贷
    0 0.055805  94.419519  适合放贷
    1 0.904281   9.571883 不适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.015013  98.498671  适合放贷
    0 0.025211  97.478912  适合放贷
    1 0.928012   7.198837 不适合放贷
    0 0.006134  99.386580  适合放贷
    0 0.029975  97.002459  适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.055805  94.419519  适合放贷
    0 0.000000 100.000000  适合放贷
    1 0.859161  14.083933 不适合放贷
    1 0.845999  15.400104 不适合放贷
    0 0.025627  97.437299  适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.000000 100.000000  适合放贷

print("\n所有代码运行完毕。")

所有代码运行完毕。

	unique_apps_per_day	avg_daily_screen_time_hrs	financial_apps_installed	online_txn_count_last_30d	avg_txn_amount	bank_sms_count	avg_distance_travelled_km	places_visited_weekly	calls_per_day	distinct_contacts_weekly	...	monthly_data_usage_gb	recent_app_installs	income_bracket	existing_debt_amount	education_level	cluster_id	loan_amount_requested	savings_worth	employment_type	loan_default
0	48.220031	1.279052	2.509566	49.670324	3195.030093	32.971468	33.306665	3.974932	45.907747	51.829608	...	14.051215	7.462507	Low	23240.568017	Secondary	CL015	21811.471418	110605.245043	Daily Wage Worker	0
1	34.599007	1.243591	1.511769	47.999115	2849.833738	27.244845	29.207240	2.812507	31.086742	48.298890	...	16.145868	10.937982	High	26055.293669	Secondary	CL013	21245.210949	162480.806219	Delivery Boy	1
2	23.106321	2.689413	1.585904	48.301919	1401.151656	37.077180	30.409018	5.796278	44.156377	29.329085	...	22.666397	5.067663	Medium	38030.218765	Primary	CL039	23204.956348	188224.519306	Unemployed	1
3	30.551406	1.657311	1.833775	41.487291	2313.202179	67.462893	21.293282	10.170776	44.093604	24.126325	...	25.086062	9.358852	Low	13133.900208	NaN	CL017	20043.188568	97028.514863	Daily Wage Worker	0
4	46.256325	1.784115	2.024602	39.105087	2011.318679	40.120510	21.702138	2.958433	53.833026	54.125794	...	15.925233	9.204566	High	16018.682557	Secondary	CL021	18932.382536	132731.853815	Shop Owner	0