python信⽤卡违约预测分析_Python数据分析及可视化实例之
银⾏信⽤卡违约预测(24)...
1.项⽬背景:
银行体系对于信用卡违约进行预测,原始数据集如下:
2.分析步骤:
(1)数据清洗(Data Cleaning)
(2) 探索性可视化(Exploratory Visualization)
(3) 特征⼯程(Feature Engineering)
(4)基本建模&评估(Basic Modeling& Evaluation)
3.源码:
数据集下载:易⼀⽹络科技 - 付费⽂章w ww.intumu
加载数据
import pandas as pd
df = pd.read_excel('LRGWFB.xls')
df.head()
   年龄  教育  工龄  地址   收入   负债率      信用卡负债      其他负债  违约
0  41   3  17  12  176   9.3  11.359392  5.008608   1
1  27   1  10   6   31  17.3   1.362202  4.000798   0
2  40   1  15  14   55   5.5   0.856075  2.168925   0
3  41   1  15  14  120   2.9   2.658720  0.821280   0
4  24   2   2   0   28  17.3   1.787436  3.056564   1
是否有空值
df.isnull().any()
年龄 False
教育 False
⼯龄 False
地址 False
收⼊ False
负债率 False
信⽤卡负债 False
其他负债 False
违约 False
dtype: bool
⽬标集分类
训练集、⽬标集分割
X, y = df.iloc[:,1:-1],df.iloc[:,-1]
特征相关性
classes = list(X.columns)
classes
['教育', '⼯龄', '地址', '收⼊', '负债率', '信⽤卡负债', '其他负债']
from yellowbrick.features import Rank2D
visualizer = Rank2D(algorithm='pearson',size=(800, 600),title="7特征向量的⽪尔森相关系数")
visualizer.fit(X, y)
visualizer.poof()
E:\Anaconda3\lib\site-packages\yellowbrick\features\rankd.py:262: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
X = X.as_matrix()
特征重要性
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.features.importances import FeatureImportances
model = RandomForestClassifier(n_estimators=10)
viz = FeatureImportances(model,size=(800, 600),title="随机森林算法分类训练特征重要性",xlabel='重要性评分')
viz.fit(X, y)
viz.poof()
分类报告
训练集、测试集分割
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size =0.2, random_state=10)
分类结果报告
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import ClassificationReport
model = RandomForestClassifier(n_estimators=10)
visualizer = ClassificationReport(model, support=True,size=(800, 600),title="随机森林算法分类报告")
visualizer.fit(X_train.values, y_train)
print('得分:',visualizer.score(X_test.values, y_test))
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)
model.fit(X_train.values, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
from sklearn.externals import joblib  # 注:新版 scikit-learn 已移除 sklearn.externals.joblib,请直接 import joblib
joblib.dump(model,'model.pickle') #保存
['model.pickle']
载⼊训练模型
model = joblib.load('model.pickle') #载⼊
model.predict(X_test) # 输出每组数据的预测结果的标签值
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 1, 1, 0, 0, 0, 0], dtype=int64)
model.predict_proba(X_test) # 输出的是二维矩阵,第i行第j列表示第i条测试数据属于第j个label的概率
array([[1. , 0. ],
[0.9, 0.1],
[0.8, 0.2],
[1. , 0. ],
[0.9, 0.1],
[1. , 0. ],
[0.5, 0.5],
[0.6, 0.4],
[0.3, 0.7],
[1. , 0. ], [0.6, 0.4], [0.9, 0.1],
[0.7, 0.3],
[1. , 0. ], [0.9, 0.1], [0.4, 0.6], [0.4, 0.6],
[0.5, 0.5],
[1. , 0. ],
[0.8, 0.2],
[1. , 0. ], [0.9, 0.1], [0.5, 0.5], [0.1, 0.9], [0.9, 0.1], [0.8, 0.2], [0.6, 0.4], [0.8, 0.2], [0.9, 0.1],
[0.7, 0.3],
[1. , 0. ], [0.2, 0.8],
[0.9, 0.1],
[1. , 0. ], [1. , 0. ], [1. , 0. ],
[0.5, 0.5],
[0.1, 0.9],
[1. , 0. ], [1. , 0. ], [0.8, 0.2],
[0.7, 0.3],
[1. , 0. ], [0.5, 0.5], [0.8, 0.2], [0.7, 0.3], [0.9, 0.1], [0.8, 0.2], [0.3, 0.7],
[0.9, 0.1],
[1. , 0. ], [0.9, 0.1], [0.9, 0.1], [0.9, 0.1], [0.8, 0.2],
[0.9, 0.1],
[1. , 0. ], [0.9, 0.1], [0.4, 0.6], [0.5, 0.5], [0.9, 0.1], [0.8, 0.2], [0.6, 0.4],
[0.8, 0.2],
[1. , 0. ],
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论