干貨 | 基于 Python 的信用評分模型實戰!


# Load the raw training data.
# NOTE(review): later steps address columns by position (column 5 is treated
# as MonthlyIncome) — confirm the CSV layout matches before running.
data = pd.read_csv('cs-training.csv')
# Export summary statistics (count, mean, quartiles, ...) so missing values
# and the distribution of each column can be inspected.
data.describe().to_csv('DataDescribe.csv')

3.1 缺失值處理
1. 直接刪除含有缺失值的樣本。
2. 根據樣本之間的相似性填補缺失值。
3. 根據變量之間的相關關系填補缺失值。
變量 MonthlyIncome 缺失率比較大,所以我們根據變量之間的相關關系填補缺失值,我們采用隨機森林法:
# Fill missing MonthlyIncome values with a random-forest regression.
def set_missing(df):
    """Impute missing ``MonthlyIncome`` values with a random forest.

    Rows whose ``MonthlyIncome`` is known are used to fit a
    ``RandomForestRegressor`` on the other selected columns; the rounded
    predictions then overwrite the missing entries.

    Args:
        df: DataFrame with a ``MonthlyIncome`` column. Columns are selected
            by position; this assumes column 5 is ``MonthlyIncome`` and
            columns 0-4 and 6-9 are numeric without missing values —
            TODO confirm against the loaded CSV.

    Returns:
        The same ``df`` object, modified in place.
    """
    missing = df.MonthlyIncome.isnull()
    # Nothing to impute: return early instead of calling predict() on an
    # empty array, which raises.
    if not missing.any():
        return df
    # Put the column to predict first, followed by the feature columns.
    process_df = df.iloc[:, [5, 0, 1, 2, 3, 4, 6, 7, 8, 9]]
    # Split into rows where the target is known and rows where it is missing.
    known = process_df[process_df.MonthlyIncome.notnull()].values
    unknown = process_df[process_df.MonthlyIncome.isnull()].values
    # X holds the features, y the known MonthlyIncome values.
    X = known[:, 1:]
    y = known[:, 0]
    rfr = RandomForestRegressor(random_state=0,
                                n_estimators=200, max_depth=3, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the missing values and round them to whole numbers.
    predicted = rfr.predict(unknown[:, 1:]).round(0)
    print(predicted)
    # Write the predictions back into the originally missing cells.
    df.loc[missing, 'MonthlyIncome'] = predicted
    return df
# Impute the frequently missing MonthlyIncome with the random forest.
data = set_missing(data)
# Drop the remaining rows that still contain missing values.
data = data.dropna()
# Drop duplicated rows.
data = data.drop_duplicates()
data.to_csv('MissingData.csv', index=False)
3.2異常值處理
# Drop records whose age is 0 (treated as invalid).
data = data[data['age'] > 0]

# Drop records whose 30-59-day past-due count is 90 or more (treated as
# outliers). The comparison operator was lost in the published listing;
# "< 90" is the reconstruction.
# .copy() makes the filtered frame independent, so the column assignment
# below does not write to a view of the original frame.
data = data[data['NumberOfTime30-59DaysPastDueNotWorse'] < 90].copy()
# Flip the target: the stored value becomes 1 - SeriousDlqin2yrs.
data['SeriousDlqin2yrs'] = 1 - data['SeriousDlqin2yrs']
3.3 數據切分
# sklearn.cross_validation was removed from scikit-learn; train_test_split
# is provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Target column.
Y = data['SeriousDlqin2yrs']
# Every column from position 1 onward is used as a feature.
# NOTE(review): presumably column 0 is the target — confirm the column order.
X = data.iloc[:, 1:]
# Hold out 30% of the rows as the test set.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Re-attach the target as the first column of each split.
train = pd.concat([Y_train, X_train], axis=1)
test = pd.concat([Y_test, X_test], axis=1)
# Number of test rows per target class.
clasTest = test.groupby('SeriousDlqin2yrs')['SeriousDlqin2yrs'].count()
train.to_csv('TrainData.csv', index=False)
test.to_csv('TestData.csv', index=False)
客戶年齡分布如圖4-1所示,可以看到年齡變量大致呈正態分布,符合統計分析的假設。


在本文中,我們采用信用評分模型的變量選擇方法,通過 WOE 分析方法,即是通過比較指標分箱和對應分箱的違約概率來確定指標是否符合經濟意義。首先我們對變量進行離散化(分箱)處理。
5.1分箱處理
我們首先選擇對連續變量進行最優分段,在連續變量的分布不滿足最優分段的要求時,再考慮對連續變量進行等距分段。最優分箱的代碼如下:
# Automatic (monotonic) binning function.
def mono_bin(Y, X, n=20):
    """Bin ``X`` into quantile buckets whose mean ``Y`` is monotonic in ``X``.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one
    until the Spearman correlation between the per-bucket means of ``X`` and
    ``Y`` reaches +/-1 (or becomes NaN).

    Args:
        Y: pandas Series of 0/1 labels; the sum of ``Y`` is counted as "good".
        X: pandas Series of the variable to bin, aligned with ``Y``.
        n: initial number of quantile buckets.

    Returns:
        DataFrame with one row per bucket, sorted by ``min``, with columns
        ``min``, ``max``, ``sum``, ``total``, ``rate`` and ``woe``.

    Raises:
        ValueError: raised by ``pd.qcut`` when the quantile edges of ``X``
            are not unique.
    """
    good = Y.sum()
    bad = Y.count() - good
    while True:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True, observed=True)
        r, _ = stats.spearmanr(d2['X'].mean(), d2['Y'].mean())
        n = n - 1
        # Compare with a tolerance: a perfectly monotone relation can come
        # back as 0.9999999999999999, which an exact "< 1" test would treat
        # as non-monotone. NaN also ends the loop, and so does running out
        # of buckets.
        if not (np.abs(r) < 1 - 1e-9) or n < 1:
            break
    d3 = pd.DataFrame({
        'min': d2['X'].min(),
        'max': d2['X'].max(),
        'sum': d2['Y'].sum(),
        'total': d2['Y'].count(),
        'rate': d2['Y'].mean(),
    })
    # WOE per bucket: log of the bucket's good/bad odds over the overall odds.
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    # DataFrame.sort_index(by=...) was removed from pandas; sort_values is
    # the replacement.
    d4 = d3.sort_values(by='min').reset_index(drop=True)
    print("=" * 60)
    print(d4)
    return d4




# Manual cut points for the variables that are binned by hand.
# ninf / pinf are the open lower and upper bounds of the outermost bins.
ninf = float('-inf')
pinf = float('inf')
cutx3 = [ninf, 0, 1, 3, 5, pinf]
cutx6 = [ninf, 1, 2, 3, 5, pinf]
cutx7 = [ninf, 0, 1, 3, 5, pinf]
cutx8 = [ninf, 0, 1, 2, 3, pinf]
cutx9 = [ninf, 0, 1, 3, pinf]
cutx10 = [ninf, 0, 1, 2, 3, 5, pinf]
5.2WOE
woe=ln(goodattribute/badattribute)
在進行分析時,我們需要對各指標從小到大排列,并計算出相應分檔的 WOE 值。其中正向指標越大,WOE 值越小;反向指標越大,WOE 值越大。正向指標的 WOE 值負斜率越大,反向指標的正斜率越大,則說明指標區分能力好。WOE 值趨近于直線,則意味指標判斷能力較弱。若正向指標和 WOE 正相關趨勢、反向指標同 WOE 出現負相關趨勢,則說明此指標不符合經濟意義,則應當予以去除。
WOE 函數實現在上一節的 mono_bin() 函數里面已經包含,這里不再重復。
5.3 相關性分析和 IV 篩選
相關性圖我們通過 Python 里面的 seaborn 包,調用 heatmap() 繪圖函數進行繪制,實現代碼如下:
# Pairwise correlation coefficients between the columns of data.
corr = data.corr()
# x-axis tick labels.
xticks = ['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
# y-axis tick labels.
yticks = list(corr.index)
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
# Draw the correlation heatmap with the coefficient written in each cell.
sns.heatmap(corr, annot=True, cmap='rainbow', ax=ax1,
            annot_kws={'size': 9, 'weight': 'bold', 'color': 'blue'})
ax1.set_xticklabels(xticks, rotation=0, fontsize=10)
ax1.set_yticklabels(yticks, rotation=0, fontsize=10)
plt.show()

# Automatic (monotonic) binning function, also returning IV, cut points and WOE.
def mono_bin(Y, X, n=20):
    """Bin ``X`` into monotonic quantile buckets and compute WOE and IV.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one
    until the Spearman correlation between the per-bucket means of ``X`` and
    ``Y`` reaches +/-1 (or becomes NaN).

    Args:
        Y: pandas Series of 0/1 labels; the sum of ``Y`` is counted as "good".
        X: pandas Series of the variable to bin, aligned with ``Y``.
        n: initial number of quantile buckets.

    Returns:
        A tuple ``(d4, iv, cut, woe)``:
        ``d4`` is a DataFrame with one row per bucket, sorted by ``min``;
        ``iv`` is the information value summed over the buckets;
        ``cut`` is the list of cut points, starting with ``-inf`` and ending
        with ``inf``;
        ``woe`` is the list of per-bucket WOE values rounded to 3 decimals.

    Raises:
        ValueError: raised by ``pd.qcut`` when the quantile edges of ``X``
            are not unique.
    """
    good = Y.sum()
    bad = Y.count() - good
    while True:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True, observed=True)
        r, _ = stats.spearmanr(d2['X'].mean(), d2['Y'].mean())
        n = n - 1
        # Compare with a tolerance: a perfectly monotone relation can come
        # back as 0.9999999999999999, which an exact "< 1" test would treat
        # as non-monotone. NaN also ends the loop, and so does running out
        # of buckets.
        if not (np.abs(r) < 1 - 1e-9) or n < 1:
            break
    d3 = pd.DataFrame({
        'min': d2['X'].min(),
        'max': d2['X'].max(),
        'sum': d2['Y'].sum(),
        'total': d2['Y'].count(),
        'rate': d2['Y'].mean(),
    })
    # WOE per bucket: log of the bucket's good/bad odds over the overall odds.
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    # Share of all goods / all bads that fall into each bucket.
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    # DataFrame.sort_index(by=...) was removed from pandas; sort_values is
    # the replacement.
    d4 = d3.sort_values(by='min').reset_index(drop=True)
    print("=" * 60)
    print(d4)
    # After the loop n is one less than the number of buckets used, so the
    # interior cut points are the i/(n+1) quantiles for i = 1..n.
    cut = [float('-inf')]
    for i in range(1, n + 1):
        qua = X.quantile(float(i) / (n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
# Information value of each variable.
ivlist = [ivx1, ivx2, ivx3, ivx4, ivx5, ivx6, ivx7, ivx8, ivx9, ivx10]
# x-axis labels, one per variable.
IVindex = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
fig1 = plt.figure(1)
ax1 = fig1.add_subplot(1, 1, 1)
# Bar positions 1..len(IVindex). The published listing read an undefined
# name `index` here; IVindex is the label list defined above.
x = np.arange(len(IVindex)) + 1
# Draw the bar chart.
ax1.bar(x, ivlist, width=0.4)
ax1.set_xticks(x)
ax1.set_xticklabels(IVindex, rotation=0, fontsize=12)
ax1.set_ylabel('IV(Information Value)', fontsize=14)
# Write each IV value above its bar.
for a, b in zip(x, ivlist):
    plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=10)
plt.show()

6.1 WOE 轉換
# Replace raw values with the WOE of the bin they fall into.
def replace_woe(series, cut, woe):
    """Map every value in ``series`` to the WOE of its bin.

    A value belongs to bin ``j`` when ``cut[j] <= value < cut[j + 1]``; the
    last bin (index ``len(cut) - 2``) also takes everything at or above its
    lower edge.

    Args:
        series: iterable of numeric values (e.g. a pandas Series). Values are
            read in order, so the index labels of a Series do not matter.
        cut: ascending list of bin edges, normally starting with ``-inf``
            and ending with ``inf``.
        woe: WOE value per bin; ``woe[j]`` belongs to the bin starting at
            ``cut[j]``.

    Returns:
        A list with one WOE value per input value, in input order.
    """
    from bisect import bisect_right

    last_bin = len(cut) - 2
    result = []
    for value in series:
        # bisect_right - 1 is the largest j with cut[j] <= value; clamp it so
        # values at or above the final edge stay in the last bin. A value
        # below cut[0] yields -1 and therefore woe[-1].
        bin_index = min(bisect_right(cut, value) - 1, last_bin)
        result.append(woe[bin_index])
    return result
# Replace every variable with its WOE encoding.
# Each entry: (column name, cut points, per-bin WOE values).
_woe_bins = [
    ('RevolvingUtilizationOfUnsecuredLines', cutx1, woex1),
    ('age', cutx2, woex2),
    ('NumberOfTime30-59DaysPastDueNotWorse', cutx3, woex3),
    ('DebtRatio', cutx4, woex4),
    ('MonthlyIncome', cutx5, woex5),
    ('NumberOfOpenCreditLinesAndLoans', cutx6, woex6),
    ('NumberOfTimes90DaysLate', cutx7, woex7),
    ('NumberRealEstateLoansOrLines', cutx8, woex8),
    ('NumberOfTime60-89DaysPastDueNotWorse', cutx9, woex9),
    ('NumberOfDependents', cutx10, woex10),
]
for _column, _cut, _woe in _woe_bins:
    # Assign the list directly so values are written by position; wrapping it
    # in a new Series would align on index labels and leave NaN wherever the
    # index of data is not 0..n-1.
    data[_column] = replace_woe(data[_column], _cut, _woe)
data.to_csv('WoeData.csv', index=False)
6.2 Logistic 模型建立
# Load the WOE-encoded data.
data = pd.read_csv('WoeData.csv')
# Dependent variable.
Y = data['SeriousDlqin2yrs']
# Independent variables; the dropped columns are the ones the article judges
# to have no clear effect on the dependent variable.
X = data.drop(['SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
               'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
               'NumberOfDependents'], axis=1)
# Add the intercept column, then fit the logistic regression.
X1 = sm.add_constant(X)
logit = sm.Logit(Y, X1)
result = logit.fit()
print(result.summary())

6.3 模型檢驗
# Dependent variable of the test set.
# NOTE(review): `test` is presumably the WOE-encoded test set, since the
# model was fitted on WOE values — confirm before relying on the AUC.
Y_test = test['SeriousDlqin2yrs']
# Independent variables: drop the same columns that were dropped for the model.
X_test = test.drop(['SeriousDlqin2yrs', 'DebtRatio', 'MonthlyIncome',
                    'NumberOfOpenCreditLinesAndLoans', 'NumberRealEstateLoansOrLines',
                    'NumberOfDependents'], axis=1)
X3 = sm.add_constant(X_test)
# Predict with the fitted model.
resu = result.predict(X3)
fpr, tpr, threshold = roc_curve(Y_test, resu)
# Compute the AUC.
rocauc = auc(fpr, tpr)
# Draw the ROC curve together with the diagonal reference line.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('真正率')
plt.xlabel('假正率')
plt.show()

7.1 評分標準

# Base score is 600, PDO is 20 (the good/bad odds double every 20 points)
# and the base odds are 20.
# Scaling factor: points per unit of log-odds. The listing stored it in `z`
# while the following code reads `p`; define `p` and keep `z` as an alias.
p = 20 / math.log(2)
z = p
# Offset so that odds of 20 map to 600 points.
q = 600 - 20 * math.log(20) / math.log(2)
# NOTE(review): coe is presumably the fitted coefficient vector with the
# intercept at index 0 — confirm where it is assigned.
baseScore = round(q + p * coe[0], 0)
7.2 部分評分
# Score calculation function.
def get_score(coe, woe, factor):
    """Convert the WOE values of one variable into scorecard points.

    Args:
        coe: coefficient applied to this variable.
        woe: iterable of WOE values, one per bin.
        factor: scaling factor applied to every product.

    Returns:
        A list with ``round(coe * w * factor, 0)`` for each ``w`` in ``woe``,
        in the same order.
    """
    return [round(coe * w * factor, 0) for w in woe]
# Partial scores per bin for each variable kept in the model.
# coe[1]..coe[5] are paired with variables x1, x2, x3, x7 and x9.
x1 = get_score(coe[1], woex1, p)
x2 = get_score(coe[2], woex2, p)
x3 = get_score(coe[3], woex3, p)
x7 = get_score(coe[4], woex7, p)
x9 = get_score(coe[5], woex9, p)

# Look up the partial score for every value of a variable.
def compute_score(series, cut, score):
    """Map every value in ``series`` to the score of its bin.

    A value belongs to bin ``j`` when ``cut[j] <= value < cut[j + 1]``; the
    last bin (index ``len(cut) - 2``) also takes everything at or above its
    lower edge.

    Args:
        series: iterable of numeric values (e.g. a pandas Series). Values are
            read in order, so the index labels of a Series do not matter.
        cut: ascending list of bin edges, normally starting with ``-inf``
            and ending with ``inf``.
        score: score per bin; ``score[j]`` belongs to the bin starting at
            ``cut[j]``.

    Returns:
        A list with one score per input value, in input order.
    """
    from bisect import bisect_right

    last_bin = len(cut) - 2
    result = []
    for value in series:
        # bisect_right - 1 is the largest j with cut[j] <= value; clamp it so
        # values at or above the final edge stay in the last bin. A value
        # below cut[0] yields -1 and therefore score[-1].
        bin_index = min(bisect_right(cut, value) - 1, last_bin)
        result.append(score[bin_index])
    return result
# Score the held-out test set.
test1 = pd.read_csv('TestData.csv')
# Every row starts from the same base score.
test1['BaseScore'] = Series(np.zeros(len(test1))) + baseScore
# Partial score of each model variable, looked up by bin.
test1['x1'] = Series(compute_score(test1['RevolvingUtilizationOfUnsecuredLines'], cutx1, x1))
test1['x2'] = Series(compute_score(test1['age'], cutx2, x2))
test1['x3'] = Series(compute_score(test1['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3))
# The published listing was missing the closing parenthesis on this line.
test1['x7'] = Series(compute_score(test1['NumberOfTimes90DaysLate'], cutx7, x7))
test1['x9'] = Series(compute_score(test1['NumberOfTime60-89DaysPastDueNotWorse'], cutx9, x9))
# Total score = base score + all partial scores.
test1['Score'] = test1['x1'] + test1['x2'] + test1['x3'] + test1['x7'] + test1['x9'] + baseScore
test1.to_csv('ScoreData.csv', index=False)

基于 AI 的機器學習評分卡系統可通過把舊數據(某個時間點后,例如2年)剔除掉后再進行自動建模、模型評估、并不斷優化特征變量,可以使系統更加強大。
評(píng)論
圖片
表情
