In [1]:
#!/usr/bin/env python
# coding: utf-8
In [ ]:

In [2]:
# 1. 导入所需库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# 设置中文、符号显示,防止图表出现乱码
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
In [3]:
# 2. 加载数据(API下载绕过mount问题)
import subprocess as _sp, glob as _gl, os as _os
_os.makedirs('/tmp/_data_dl', exist_ok=True)
_sp.run(['kaggle', 'datasets', 'download', '-d', 'songsammy/qiuzitong-credit-data', '-p', '/tmp/_data_dl', '--unzip'], check=True)
file_path = _gl.glob('/tmp/_data_dl/*.csv')[0]
df = pd.read_csv(file_path)
print(f"原始数据形状: {df.shape}")
df.head()
Warning: Looks like you're using an outdated `kaggle` version (installed: 2.0.1), please consider upgrading to the latest version (2.2.2)
Dataset URL: https://www.kaggle.com/datasets/songsammy/qiuzitong-credit-data
License(s): CC0-1.0
Downloading qiuzitong-credit-data.zip to /tmp/_data_dl

100%|██████████| 1.80M/1.80M [00:00<00:00, 125MB/s]
原始数据形状: (10000, 25)
Out[3]:
unique_apps_per_day avg_daily_screen_time_hrs financial_apps_installed online_txn_count_last_30d avg_txn_amount bank_sms_count avg_distance_travelled_km places_visited_weekly calls_per_day distinct_contacts_weekly ... monthly_data_usage_gb recent_app_installs income_bracket existing_debt_amount education_level cluster_id loan_amount_requested savings_worth employment_type loan_default
0 48.220031 1.279052 2.509566 49.670324 3195.030093 32.971468 33.306665 3.974932 45.907747 51.829608 ... 14.051215 7.462507 Low 23240.568017 Secondary CL015 21811.471418 110605.245043 Daily Wage Worker 0
1 34.599007 1.243591 1.511769 47.999115 2849.833738 27.244845 29.207240 2.812507 31.086742 48.298890 ... 16.145868 10.937982 High 26055.293669 Secondary CL013 21245.210949 162480.806219 Delivery Boy 1
2 23.106321 2.689413 1.585904 48.301919 1401.151656 37.077180 30.409018 5.796278 44.156377 29.329085 ... 22.666397 5.067663 Medium 38030.218765 Primary CL039 23204.956348 188224.519306 Unemployed 1
3 30.551406 1.657311 1.833775 41.487291 2313.202179 67.462893 21.293282 10.170776 44.093604 24.126325 ... 25.086062 9.358852 Low 13133.900208 NaN CL017 20043.188568 97028.514863 Daily Wage Worker 0
4 46.256325 1.784115 2.024602 39.105087 2011.318679 40.120510 21.702138 2.958433 53.833026 54.125794 ... 15.925233 9.204566 High 16018.682557 Secondary CL021 18932.382536 132731.853815 Shop Owner 0

5 rows × 25 columns

In [4]:
# 3. 数据预处理
target = 'loan_default'
# 删除原有标签,便于后续聚类赋予新标签
if 'cluster_id' in df.columns:
    df = df.drop(columns=['cluster_id'])

# 分类变量编码
categorical_cols = ['income_bracket', 'education_level', 'employment_type']
for col in categorical_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))

# 特征与标签
X = df.drop(columns=[target])
y = df[target].values

print(f"特征维度: {X.shape}")
print(f"目标变量分布:\n{pd.Series(y).value_counts()}")
特征维度: (10000, 23)
目标变量分布:
0    7021
1    2979
Name: count, dtype: int64
In [5]:
#第二部分、模仿参考论文定义特征组
In [6]:
# 1. 定义特征组(参考论文中的五个参与变量)
# 消费能力特征
cons_capa_features = ['avg_txn_amount', 'monthly_data_usage_gb', 'savings_worth',
                      'recent_app_installs', 'avg_daily_screen_time_hrs']
# 消费结构特征
cons_struct_features = ['online_txn_count_last_30d', 'bank_sms_count', 
                        'financial_apps_installed', 'finance_app_time_pct']
# 消费理念特征
cons_idea_features = ['social_media_pct', 'avg_call_duration_mins', 
                      'avg_distance_travelled_km', 'places_visited_weekly']
# 金融需求特征
fin_demand_features = ['existing_debt_amount', 'loan_amount_requested', 
                       'savings_worth', 'income_bracket']

# 用于整体聚类的所有特征(去重)
cluster_features = list(set(cons_capa_features + cons_struct_features + 
                            cons_idea_features + fin_demand_features))
print(f"用于聚类的特征数: {len(cluster_features)}")
print(cluster_features)

# 提取聚类特征并标准化
X_cluster = X[cluster_features].copy()
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_cluster)
用于聚类的特征数: 16
['online_txn_count_last_30d', 'recent_app_installs', 'income_bracket', 'finance_app_time_pct', 'financial_apps_installed', 'places_visited_weekly', 'existing_debt_amount', 'bank_sms_count', 'avg_txn_amount', 'savings_worth', 'social_media_pct', 'avg_distance_travelled_km', 'avg_daily_screen_time_hrs', 'monthly_data_usage_gb', 'avg_call_duration_mins', 'loan_amount_requested']
In [7]:
#第三部分、进行初步整体聚类,得出聚类数目,并进行可视化
In [8]:
# 1. 确定最佳聚类数(通过Gap统计量计算得出)
def gap_statistic_simple(X, max_k=10, n_refs=3):
    """简化版Gap统计量,返回最佳K和Gap值列表"""
    gaps = []
    for k in range(1, max_k+1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X)
        log_wcss = np.log(kmeans.inertia_)
        ref_log_wcss = []
        for _ in range(n_refs):
            X_ref = np.random.uniform(low=X.min(axis=0), high=X.max(axis=0), size=X.shape)
            kmeans_ref = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans_ref.fit(X_ref)
            ref_log_wcss.append(np.log(kmeans_ref.inertia_))
        gap = np.mean(ref_log_wcss) - log_wcss
        gaps.append(gap)
    best_k = np.argmax(gaps) + 1
    return best_k, gaps

best_k, gaps = gap_statistic_simple(X_scaled, max_k=10)
print(f"自动选择的最佳聚类数 K = {best_k}")

# 绘制Gap曲线
plt.figure(figsize=(8,5))
plt.plot(range(1,11), gaps, marker='o')
plt.xlabel('聚类数 K')
plt.ylabel('Gap统计量')
plt.title('Gap统计量选择最佳K值')
plt.axvline(x=best_k, color='r', linestyle='--', label=f'最佳K={best_k}')
plt.legend()
plt.grid(True)
plt.show()
自动选择的最佳聚类数 K = 1
No description has been provided for this image
In [9]:
# 2. 整体K-means聚类
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels_all = kmeans.fit_predict(X_scaled)
print("整体聚类分布:")
print(pd.Series(cluster_labels_all).value_counts().sort_index())

# 可视化(PCA降维到2维)
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(X_scaled)
plt.figure(figsize=(10,7))
scatter = plt.scatter(X_pca[:,0], X_pca[:,1], c=cluster_labels_all, cmap='viridis', alpha=0.6)
plt.colorbar(scatter, label='Cluster')
plt.title('K-means整体聚类结果(PCA降维)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()
整体聚类分布:
0    10000
Name: count, dtype: int64
No description has been provided for this image
In [10]:
#第四部分,模仿参考论文构建模型,并输出最后结果
In [11]:
# 1. 三个关键维度独立聚类并赋分(参考论文,固定分为5簇,赋分30/45/60/75/90)
dimension_features = {
    '消费能力': cons_capa_features,
    '金融需求': fin_demand_features,
    '消费理念': cons_idea_features
}

dimension_scores = {}   # 存储每个维度的得分(10000个样本)
dimension_labels = {}   # 存储每个维度的聚类标签(0-4)

for dim_name, feat_list in dimension_features.items():
    feat_list = [f for f in feat_list if f in X.columns]
    if len(feat_list) == 0:
        print(f"警告:维度 {dim_name} 无可用特征,跳过")
        continue
    X_dim = X[feat_list].copy()
    scaler_dim = RobustScaler()
    X_dim_scaled = scaler_dim.fit_transform(X_dim)

    k_dim = 5  # 固定分为5簇
    kmeans_dim = KMeans(n_clusters=k_dim, random_state=42, n_init=10)
    labels_dim = kmeans_dim.fit_predict(X_dim_scaled)
    dimension_labels[dim_name] = labels_dim

    # 按簇中心均值排序,映射分数(均值越小 => 分数越低)
    center_means = kmeans_dim.cluster_centers_.mean(axis=1)
    sorted_idx = np.argsort(center_means)
    score_map = {old_label: 30 + 15*rank for rank, old_label in enumerate(sorted_idx)}
    scores_dim = np.array([score_map[label] for label in labels_dim])
    dimension_scores[dim_name] = scores_dim
    print(f"{dim_name} 赋分完成,分数分布: {np.unique(scores_dim)}")
消费能力 赋分完成,分数分布: [30 45 60 75 90]
金融需求 赋分完成,分数分布: [30 45 60 75 90]
消费理念 赋分完成,分数分布: [30 45 60 75 90]
In [12]:
# 2. PCA确定权重 + 线性加权初始评分
score_df = pd.DataFrame({
    '消费能力': dimension_scores['消费能力'],
    '金融需求': dimension_scores['金融需求'],
    '消费理念': dimension_scores['消费理念']
})

pca_weight = PCA(n_components=3)
pca_weight.fit(score_df)
variance_ratio = pca_weight.explained_variance_ratio_
print("三个主成分方差贡献率:", variance_ratio)
weights = variance_ratio / variance_ratio.sum()
print(f"归一化权重: 消费能力={weights[0]:.3f}, 金融需求={weights[1]:.3f}, 消费理念={weights[2]:.3f}")

initial_scores = (weights[0] * score_df['消费能力'] + 
                  weights[1] * score_df['金融需求'] + 
                  weights[2] * score_df['消费理念'])
print(f"初始评分范围: [{initial_scores.min():.2f}, {initial_scores.max():.2f}]")
三个主成分方差贡献率: [0.4404133  0.35600279 0.20358391]
归一化权重: 消费能力=0.440, 金融需求=0.356, 消费理念=0.204
初始评分范围: [30.00, 90.00]
In [13]:
# 3. 构建组合特征(三个维度的聚类标签 + 初始评分)
cluster_labels_array = np.column_stack([dimension_labels['消费能力'],
                                        dimension_labels['金融需求'],
                                        dimension_labels['消费理念']])
combined_features = np.column_stack([cluster_labels_array, initial_scores])
print(f"组合特征形状: {combined_features.shape}")
组合特征形状: (10000, 4)
In [14]:
# 4. 划分训练集和测试集(70%训练,30%测试)
X_train, X_test, y_train, y_test = train_test_split(combined_features, y, test_size=0.3, random_state=42, stratify=y)
print(f"训练集样本数: {X_train.shape[0]}, 测试集样本数: {X_test.shape[0]}")

# 训练随机森林模型
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# 在测试集上预测概率
y_proba_test = rf.predict_proba(X_test)[:, 1]   # 违约概率

# 使用测试集上的ROC确定最优阈值(约登指数)
fpr, tpr, thresholds = roc_curve(y_test, y_proba_test)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"基于测试集的最优阈值: {optimal_threshold:.4f}")

# 在测试集上进行二分类预测
y_pred_test = (y_proba_test >= optimal_threshold).astype(int)

# 计算测试集上的评估指标
test_accuracy = accuracy_score(y_test, y_pred_test)
test_auc = auc(fpr, tpr)
print(f"\n测试集准确率: {test_accuracy:.4f}")
print(f"测试集AUC值: {test_auc:.4f}")

# 混淆矩阵和分类报告
print("\n测试集混淆矩阵:")
print(confusion_matrix(y_test, y_pred_test))
print("\n测试集分类报告:")
print(classification_report(y_test, y_pred_test, target_names=['正常', '违约']))

# 绘制ROC曲线
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'随机森林 (测试集 AUC = {test_auc:.3f})')
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('假阳性率 (FPR)')
plt.ylabel('真阳性率 (TPR)')
plt.title('ROC曲线 - 测试集')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
训练集样本数: 7000, 测试集样本数: 3000
基于测试集的最优阈值: 0.2241

测试集准确率: 0.9633
测试集AUC值: 0.9773

测试集混淆矩阵:
[[2043   63]
 [  47  847]]

测试集分类报告:
              precision    recall  f1-score   support

          正常       0.98      0.97      0.97      2106
          违约       0.93      0.95      0.94       894

    accuracy                           0.96      3000
   macro avg       0.95      0.96      0.96      3000
weighted avg       0.96      0.96      0.96      3000

No description has been provided for this image
In [15]:
# 计算百分制信用评分(分数越高,表示放贷资质越好)
# 注:y_proba_test 是模型预测的违约概率(取值0~1)
credit_scores = (1 - y_proba_test) * 100   # 转化为0-100分,高分代表低违约风险

# 根据之前计算的最优阈值(违约概率阈值),转换为信用评分阈值
# 最优阈值 optimal_threshold 是违约概率的切割点(如0.3)
# 对应的信用评分阈值为:credit_threshold = (1 - optimal_threshold) * 100
credit_threshold = (1 - optimal_threshold) * 100

print("="*60)
print("百分制放贷资质评分结果(测试集)")
print("="*60)
print(f"信用评分范围: [{credit_scores.min():.2f}, {credit_scores.max():.2f}]")
print(f"根据ROC曲线确定的最优违约概率阈值: {optimal_threshold:.4f}")
print(f"对应的信用评分阈值: {credit_threshold:.2f}分")
print("\n业务解释:")
print(f"  - 信用评分 >= {credit_threshold:.2f} 分 → 预测为【适合放贷】农户(低违约风险)")
print(f"  - 信用评分 <  {credit_threshold:.2f} 分 → 预测为【不适合放贷】农户(高违约风险)")
print("="*60)

# 展示前20个样本的评分和决策
print("\n测试集前20个样本的评分及放贷建议:")
print("-"*50)
sample_df = pd.DataFrame({
    '真实标签': y_test[:20],
    '违约概率': y_proba_test[:20],
    '信用评分': credit_scores[:20],
    '放贷建议': ['适合放贷' if score >= credit_threshold else '不适合放贷' for score in credit_scores[:20]]
})
print(sample_df.to_string(index=False))
============================================================
百分制放贷资质评分结果(测试集)
============================================================
信用评分范围: [0.00, 100.00]
根据ROC曲线确定的最优违约概率阈值: 0.2241
对应的信用评分阈值: 77.59分

业务解释:
  - 信用评分 >= 77.59 分 → 预测为【适合放贷】农户(低违约风险)
  - 信用评分 <  77.59 分 → 预测为【不适合放贷】农户(高违约风险)
============================================================

测试集前20个样本的评分及放贷建议:
--------------------------------------------------
 真实标签     违约概率       信用评分  放贷建议
    1 0.961171   3.882867 不适合放贷
    0 0.070154  92.984630  适合放贷
    1 0.932509   6.749129 不适合放贷
    0 0.055805  94.419519  适合放贷
    1 0.904281   9.571883 不适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.015013  98.498671  适合放贷
    0 0.025211  97.478912  适合放贷
    1 0.928012   7.198837 不适合放贷
    0 0.006134  99.386580  适合放贷
    0 0.029975  97.002459  适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.055805  94.419519  适合放贷
    0 0.000000 100.000000  适合放贷
    1 0.859161  14.083933 不适合放贷
    1 0.845999  15.400104 不适合放贷
    0 0.025627  97.437299  适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.000000 100.000000  适合放贷
    0 0.000000 100.000000  适合放贷
In [16]:
print("\n所有代码运行完毕。")
所有代码运行完毕。
In [ ]: