数据分析中的变量分箱——德国信贷数据集(--688IT编程网

数据分析中的变量分箱——德国信贷数据集

（Variable binning in data analysis: the German credit dataset）

最近读了《Python金融大数据风控建模实战：基于机器学习》（机械工业出版社）这本书的第6章“变量分箱方法”，总结了其中的主要内容，并对代码做了详解，分享给大家。

一、主要知识点：

1. 变量分箱是一种特征工程方法，意在增强变量的可解释性与预测能力。变量分箱方法主要用于连续变量，对于取值较稀疏的离散变量也应该进行分箱处理。

2. 变量分箱的好处：

（1）降低异常值的影响，增加模型的稳定性。

（2）缺失值作为特殊变量参与分箱，减少缺失值填补的不确定性。

（3）增加变量的可解释性。

（4）增加变量的非线性。

（5）增加模型的预测效果。

3. 变量分箱的局限性：

（1）同一箱内的样本具有同质性。（2）需要专家经验支持。

4. 变量分箱的注意事项：

（1）分箱结果不宜过多。（2）分箱结果不宜过少。（3）分箱后单调性的要求。

5. 变量分箱流程

二、代码实现

本文仍然使用德国信贷数据集，具体的数据集介绍和获取方法请看原文所附链接。

1# -*- coding: utf-8 -*-

2"""

3第6章：变量分箱⽅法

4 1: Chi-merge(卡⽅分箱)

5 2: IV(最优IV值分箱)

6 3: 信息熵(基于树的分箱)

7"""

import os
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

13 warnings.filterwarnings("ignore") # 忽略警告

def data_read(data_path, file_name):
    """Load the German credit data file and split it into train/test sets.

    Args:
        data_path: Directory that contains the data file.
        file_name: Name of the whitespace-delimited data file.

    Returns:
        A ``(data_train, data_test)`` tuple of DataFrames produced by a
        stratified 80/20 split on ``target`` with ``random_state=0``.
        The training part is meant for fitting the encoding rules, the
        test part for applying the already-fitted rules.
    """
    df = pd.read_csv(
        os.path.join(data_path, file_name),
        # Equivalent to delim_whitespace=True, which newer pandas deprecates.
        sep=r"\s+",
        # NOTE(review): assumes the raw file has no header row (the column
        # names are assigned right below) -- confirm against the data file.
        header=None,
    )
    # Rename the 21 positional columns.
    df.columns = [
        'status_account', 'duration', 'credit_history', 'purpose', 'amount',
        'svaing_account', 'present_emp', 'income_rate', 'personal_status',
        'other_debtors', 'residence_info', 'property', 'age', 'inst_plans',
        'housing', 'num_credits', 'job', 'dependents', 'telephone',
        'foreign_worker', 'target',
    ]
    # Map the label from {1, 2} to {0, 1}: 0 = good customer, 1 = bad customer.
    df['target'] = df['target'] - 1
    data_train, data_test = train_test_split(
        df,
        test_size=0.2,
        random_state=0,
        stratify=df['target'],
    )
    return data_train, data_test

def cal_advantage(temp, piont, method, flag='sel'):
    """Compute the binning metric for a candidate split or a finished binning.

    Args:
        temp: Binning result of the previous step (pandas DataFrame) with
            the count columns ``good``, ``bad`` and ``total``, plus
            ``bin_raw`` (read when ``flag='sel'``) or ``bin`` (read when
            ``flag='gain'``).
        piont: Cut point. With ``flag='sel'`` the rows with
            ``bin_raw <= piont`` form the first group and the rows with
            ``bin_raw > piont`` the second. Not used with ``flag='gain'``.
        method: 1 = Chi-merge (chi-square statistic), 2 = IV,
            3 = information entropy (normalised information gain).
        flag: ``'sel'`` evaluates a binary split at ``piont``; ``'gain'``
            evaluates the whole binning, one group per value
            ``1 .. temp['bin'].max()`` of the ``bin`` column.

    Returns:
        The metric value as a float. For ``method == 2`` the result is NaN
        when the overall good or bad count is zero or null.

    Raises:
        ValueError: If ``flag`` or ``method`` is not a supported value.
    """
    if flag == 'sel':
        # Binary split only: left of / right of the cut point.
        groups = [
            temp[temp['bin_raw'] <= piont],
            temp[temp['bin_raw'] > piont],
        ]
    elif flag == 'gain':
        # Re-evaluate the metric over every bin currently present.
        highest_bin = int(temp['bin'].max())
        groups = [temp[temp['bin'] == k] for k in range(1, highest_bin + 1)]
    else:
        raise ValueError("flag must be 'sel' or 'gain', got %r" % (flag,))

    # One row per group: [good, bad, total] sample counts.
    good_bad_matrix = np.zeros((len(groups), 3))
    for row, group in enumerate(groups):
        good_bad_matrix[row] = [
            group['good'].sum(),
            group['bad'].sum(),
            group['total'].sum(),
        ]

    # Overall [good, bad, total] counts of the frame that was passed in.
    total_matrix = np.array(
        [temp['good'].sum(), temp['bad'].sum(), temp['total'].sum()],
        dtype=float,
    )

    if method == 1:
        # Chi-square: expected count = class share * group size,
        # accumulated as (observed - expected)^2 / expected.
        expected = np.outer(
            good_bad_matrix[:, 2], total_matrix[:2] / total_matrix[2]
        )
        observed = good_bad_matrix[:, :2]
        M_value = ((observed - expected) ** 2 / expected).sum()
    elif method == 2:
        total_good, total_bad = total_matrix[0], total_matrix[1]
        if (pd.isnull(total_good) or pd.isnull(total_bad)
                or total_good == 0 or total_bad == 0):
            M_value = np.nan
        else:
            good = good_bad_matrix[:, 0]
            bad = good_bad_matrix[:, 1]
            # (bad share - good share) * ln(bad share / good share)
            weight = bad / total_bad - good / total_good
            M_value = (
                weight * np.log((bad * total_good) / (good * total_bad))
            ).sum()
    elif method == 3:
        # Entropy of the overall good/bad distribution.
        class_share = total_matrix[:2] / total_matrix[2]
        entropy_total = -(class_share * np.log(class_share)).sum()
        # Per-group entropy, weighted by the group's share of all samples.
        # A zero count yields 0 * log(0), i.e. NaN, for that group.
        in_group_share = good_bad_matrix[:, :2] / good_bad_matrix[:, [2]]
        group_entropy = -(in_group_share * np.log(in_group_share)).sum(axis=1)
        entropy_cond = (
            good_bad_matrix[:, 2] / total_matrix[2] * group_entropy
        ).sum()
        # Normalised information gain.
        M_value = 1 - (entropy_cond / entropy_total)
    else:
        # Best-KS binning is not implemented.
        raise ValueError("method must be 1, 2 or 3, got %r" % (method,))
    return M_value

127

128

def best_split(df_temp0, method, bin_num):
    """Find the best cut point among the candidates and perform one split.

    Intermediate helper of ``select_split_point``.

    Args:
        df_temp0: Result of the previous binning step (pandas DataFrame)
            with at least the columns ``bin``, ``bad_rate`` and ``bin_raw``
            plus the count columns read by ``cal_advantage``.
        method: 1 = Chi-merge, 2 = IV, 3 = information entropy.
        bin_num: Bin label whose rows are the split candidates.

    Returns:
        A new DataFrame with a ``split`` column (1 for rows with
        ``bin_raw`` at or below the chosen cut point, 0 otherwise) and a
        rebuilt ``bin_raw`` column that restarts at 1 within each side.
        Rows of the ``split == 0`` side come first.
    """
    # sort_values returns a new frame, so the caller's frame is not modified.
    df_temp0 = df_temp0.sort_values(by=['bin', 'bad_rate'])
    candidate_count = len(df_temp0[df_temp0['bin'] == bin_num])

    best_value = 0
    best_point = 1
    # Score every candidate cut point; the largest metric value wins.
    # A NaN score never compares greater, so it is never selected.
    for point in range(1, candidate_count):
        value = cal_advantage(df_temp0, point, method, flag='sel')
        if best_value < value:
            best_value, best_point = value, point

    # Mark which side of the chosen cut point each row falls on.
    df_temp0['split'] = np.where(df_temp0['bin_raw'] <= best_point, 1, 0)
    df_temp0 = df_temp0.drop('bin_raw', axis=1)
    ordered = df_temp0.sort_values(by=['split', 'bad_rate'])

    # Renumber bin_raw from 1 inside each side so each can be split again.
    sides = []
    for side in (0, 1):
        part = ordered[ordered['split'] == side].copy()
        part['bin_raw'] = range(1, len(part) + 1)
        sides.append(part)
    return pd.concat(sides, axis=0)

172

173

def select_split_point(temp_bin, method):
    """Pick the single best bin to split next and return the new binning.

    Binary-tree style splitting: every bin with more than one row is
    tentatively split with ``best_split``, the metric of the resulting
    binning is computed with ``cal_advantage(..., flag='gain')``, and the
    candidate with the largest metric is kept. Intermediate helper of
    ``cont_var_bin``.

    Args:
        temp_bin: Current binning result (pandas DataFrame) with at least
            the columns ``bin`` and ``bad_rate`` plus whatever
            ``best_split`` and ``cal_advantage`` read.
        method: 1 = Chi-merge, 2 = IV, 3 = information entropy.

    Returns:
        A tuple ``(new_bins, value)``: the new binning sorted by ``bin``
        and ``bad_rate``, and the winning metric rounded to 4 decimals.

    Raises:
        IndexError: If no bin contains more than one row, so there is no
            candidate to choose from.
    """
    temp_bin = temp_bin.sort_values(by=['bin', 'bad_rate'])
    # Largest bin label currently in use; a new bin is labelled max_num + 1.
    max_num = max(temp_bin['bin'])

    temp_main = {}
    bin_i_value = []
    for i in range(1, max_num + 1):
        df_temp = temp_bin[temp_bin['bin'] == i]
        # A bin with a single row cannot be split any further.
        if df_temp.shape[0] > 1:
            temp_split = best_split(df_temp, method, i)
            # Rows on the split == 1 side move to the new bin label;
            # the other side keeps label i.
            temp_split['bin'] = np.where(
                temp_split['split'] == 1, max_num + 1, temp_split['bin']
            )
            # Recombine with the untouched bins. temp_split carries an
            # extra 'split' column, which is NaN for the untouched rows.
            untouched = temp_bin[temp_bin['bin'] != i]
            temp_main[i] = pd.concat(
                [untouched, temp_split], axis=0, sort=False
            )
            # Metric of the whole binning after this tentative split.
            value = cal_advantage(temp_main[i], 0, method, flag='gain')
            bin_i_value.append([i, value])

    # Keep only the candidate with the largest metric value.
    bin_i_value.sort(key=lambda x: x[1], reverse=True)
    binNum = bin_i_value[0][0]
    # NOTE(review): the statement defining newBins was missing from the
    # listing; dropping the helper 'split' column from the winning candidate
    # is the reconstruction -- confirm against the original source.
    newBins = temp_main[binNum].drop('split', axis=1)
    return (
        newBins.sort_values(by=['bin', 'bad_rate']),
        round(bin_i_value[0][1], 4),
    )

222

223

def init_equal_bin(x, bin_rate):
    """Build the initial equal-width bins for a continuous variable.

    Intermediate helper of ``cont_var_bin``.

    Args:
        x: Values to bin (pandas Series).
        bin_rate: Ratio that fixes the bin count as ``int(1 / bin_rate)``.

    Returns:
        A DataFrame indexed by ``bin_num`` with the columns ``bin_up`` and
        ``bin_low``. The first bin starts at ``-inf`` and the last bin
        ends at ``inf``.

    Raises:
        ValueError: If ``bin_rate`` yields fewer than one bin.
    """
    bin_num = int(1 / bin_rate)
    if bin_num < 1:
        raise ValueError(
            "bin_rate must give at least one bin, got %r" % (bin_rate,)
        )

    # Outlier handling: the equal-width range is built between values just
    # outside the 5th and 95th percentiles; the tails fall into the two
    # open-ended edge bins.
    above = x[x > np.percentile(x, 95)]
    if len(above) > 0 and len(np.unique(x)) >= 30:
        var_up = above.min()
    else:
        var_up = x.max()

    below = x[x < np.percentile(x, 5)]
    if len(below) > 0:
        var_low = below.max()
    else:
        var_low = x.min()

    # Width of one bin.
    dist_bin = (var_up - var_low) / bin_num
    # Interior edges shared by neighbouring bins. With a single bin there
    # are none, so that bin spans (-inf, inf).
    inner_edges = [var_low + i * dist_bin for i in range(1, bin_num)]

    result = pd.DataFrame({
        'bin_up': inner_edges + [np.inf],
        'bin_low': [-np.inf] + inner_edges,
    })
    result.index.name = 'bin_num'
    return result

268

269

270def limit_min_sample(temp_cont, bin_min_num_0):

271"""

272分箱约束条件：每个箱内的样本数不能⼩于bin_min_num_0，cont_var_bin函数的中间过程函数273 # 参数

274 temp_cont: 初始化分箱后的结果 pandas dataframe

275 bin_min_num_0：每组内的最⼩样本限制

276 # 返回值

277合并后的分箱结果，pandas dataframe

278"""

279# print("合并前 temp_cont.shape = ", temp_cont.shape)

280# print("temp_cont.index.max() = ", temp_cont.index.max())

281for i in temp_cont.index:

282# 获取某⼀⾏的数据

283 rowdata = temp_cont.loc[i, :]

284# print("rowdata = ", rowdata)

285if i == temp_cont.index.max():

286# 如果是最后⼀个箱就，取倒数第⼆个值

688IT编程网

数据分析中的变量分箱——德国信贷数据集(

发表评论

推荐文章

Linux怎么直接执行PHP脚本文件

php文件写入或追加数据

php中实现文件上传的函数

php文件上传类程序代码

413 request entity too large 解决方法 -回复

热门文章

php中用来导入其他文件的语句

php获取文件后缀名的方法

创建php文件方法

国家电网公司电子商务平台常见问题

【2018-2019】别克英朗说明书-实用word文档 (12页)

诺基亚E71常见问题以及解决方法

HXD3型电力机车故障应急处理

卫星电视中星9号解密方法及节目参数,长期可用

硬盘U盘等启动奶瓶beini详细步骤教程

BT3使用教程

破解网通铁通电信封路由器的几种方法

手把手教你WPA2加密无线网络

教你如何破解搜索到的无线网络

Get清风OD入门系列图文详细教程、破解做辅助起步

java rar破解原理

同余方程在密码学中的应用与破解

无限网络解码

winrar破解方法

macOS终端中的文件加密和解密技巧

rar加密原理

最新文章

php中实现文件上传的函数

413 request entity too large 解决方法 -回复

php实现编辑和保存文件的方法

php 配置文件的用法 -回复

突破php网站上传文件大小限制

php(实现url重写)

标签列表