数据预处理

获取数据

!wget https://raw.githubusercontent.com/Rosefinch-Midsummer/Rosefinch-Midsummer.github.io/main/content/posts/file/bankpep.csv

读入数据并以id为索引,展示前五个数据

1
2
3
4
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('bankpep.csv',index_col='id')
print(df.head(5))

把字符型数据替换成数值型数据

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
seq = ['married' ,'car','save_act','current_act', 'mortgage' ,'pep']
for feature in seq:

  df.loc[df[feature]=='YES',feature] = 1

  df.loc[df[feature]=='NO',feature] = 0


#替换性别
df.loc[df['sex']=='MALE','sex'] = 1
df.loc[df['sex']=='FEMALE','sex'] = 0

print(df[0:5])

利用dummmies矩阵处理多个离散值的特征项如把children分成children1,children2,children3

1
2
3
4
5
#将norminal数据转换成dummies矩阵
dumm_region = pd.get_dummies(df['region'], prefix='region')
dumm_children = pd.get_dummies(df['children'], prefix='children')

print(dumm_region)

1
2
3
4
5
6
7
#删除df中原来两列,再连结dummies

df1 = df.drop(['region','children'],axis=1)

df = df1.join([dumm_region,dumm_children],how='outer')

print(df[0:5])

1
2
3
4
5
#将df删除'pep'后作为x

x = df.drop(['pep'],axis=1).values.astype(float)

y = df['pep'].values.astype(int)

可视化

1.客户年龄分布的直方图和密度图

1
df['age'].plot(kind='hist',bins=20,title='Age Distribution')

1
df['age'].plot(kind='kde',xlim=[15,85],style='k--')

2.客户年龄和收入关系的散点图

1
df.plot(kind='scatter',x='age',y='income',title='The Relation of Age and Incomde',marker='*',grid=True,xlim=[15,70],ylim=[5000,99999],label='(age,income)')

3.绘制散点图矩阵观察客户各项信息之间的关系

如研究银行客户的年龄、收入和孩子数之间的关系,对角线显示直方图

1
2
3
data = df[['age','income','children']]#children列未拆分可以画下面的散点图矩阵

pd.plotting.scatter_matrix(data,diagonal='hist',color='k')

4.按区域展示平均收入的柱状图,并显示标准差

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
import matplotlib.pyplot as plt
import numpy as np

mean = df.groupby(['region']).agg({'income':np.mean})
std = df.groupby(['region']).agg({'income':np.std})

plt.bar(mean.index,mean.income,yerr=std.income,width=0.6)
plt.xlabel('Region')
plt.ylabel('Customer Income')
plt.xticks(rotation=45)

plt.show()

5.多子图绘制三种性别占比饼图

账户中性别占比饼图,有车的性别占比饼图,按孩子数的账户占比饼图

1
2
3
4
5
piedata = df.groupby(['sex'])['sex'].count()

#piedata.plot(kind='pie',labels=['FEMALE','MALE'],startangle=45,autopct='%0.1f%%')

piedata.plot(kind='pie',figsize=(6,6),labels=['FEMALE','MALE'],title='The_Relation_Between_Gender_And_Income',fontsize=14,shadow=True,startangle=45,autopct='%0.1f%%')

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6,6))

ax1 = fig.add_subplot(2,2,1)
piedata1 = df.groupby(['sex'])['sex'].count()
plt.pie(piedata1,labels=['FEMALE','MALE'],startangle=45,autopct='%0.1f%%')
plt.ylabel('sex')
plt.title('Customer Sex')

  

ax2=fig.add_subplot(2,2,2)
piedata2=df[df['car']=='YES'].groupby(['sex'])['sex'].count()
plt.pie(piedata2,labels=['FEMALE','MALE'],startangle=45,autopct='%0.1f%%')
#若前面把YES变成1,这里会报错
plt.ylabel('sex')
plt.title('Customer Car & Sex')

  

ax3 = fig.add_subplot(223)#223之间可加也可不加,
piedata3 = df.groupby(['children'])['children'].count()
plt.pie(piedata3,labels=[0,1,2,3],explode=[0,0,0.2,0],startangle=45,autopct='%0.1f%%')
plt.ylabel('Children')
plt.title('Customer Children')

plt.show()

6.各性别收入的箱须图

作用:表达数据的分位数分布,观察异常值

  • 将样本居中的50%值域用一个长方形表示
  • 较大和较小的25%值域各用一根线表示
  • 异常值用’o’表示
1
df[['sex','income']].boxplot(by='sex')

训练模型,测试性能

svm

原理:将数据看作多维空间的点,用数学优化的方法求解一个最优的超平面将不同类别的点分开,“越胖越好”。若数据集在低维空间无法使用超平面切割,则可以用核函数将低维数据映射到高维空间,从而实现分割

1
2
3
4
5
6
7
from sklearn import svm

clf = svm.SVC(kernel='rbf',gamma=0.6,C=1.0)
#参数详解:kernel包括linear、poly、rbf(高斯核函数)和sigmoid,gamma指poly、rbf或sigmoid的核系数,一般取值在(0,1),C指误差项的惩罚参数一般取10^n,如1,0.1,0.01等
clf.fit(x,y)

print("Accuracy: ",clf.score(x,y))#Accuracy:  1.0

评估分类器性能

1
2
3
4
5
6
7
#评估分类器性能

from sklearn import metrics

y_predicted = clf.predict(x)

print(metrics.classification_report(y,y_predicted))

划分训练集和测试集,并在测试集上测试性能

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
from sklearn import model_selection

x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size=0.3,random_state=1)

clf = svm.SVC(kernel='rbf',gamma=0.6,C=1.0)
#参数详解:kernel包括linear、poly、rbf(高斯核函数)和sigmoid,gamma指poly、rbf或sigmoid的核系数,一般取值在(0,1),C指误差项的惩罚参数一般取10^n,如1,0.1,0.01等
clf.fit(x_train,y_train)

print("Performance on train set: ",clf.score(x_train,y_train))

print("Performance on test set: ",clf.score(x_test,y_test))

svm样本距离计算,数值型数据需标准化处理,标准化分为StandardScaler(标准化)、MinMaxScaler(归一化也可以写成Normalization)、QuantileTransformer等。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
from sklearn import preprocessing

x_scale = preprocessing.scale(x)

x_train,x_test,y_train,y_test = model_selection.train_test_split(x_scale,y,test_size=0.3,random_state=1)

clf = svm.SVC(kernel='rbf',gamma=0.7,C=1.0)
#参数详解:kernel包括linear、poly、rbf(高斯核函数)和sigmoid,gamma指poly、rbf或sigmoid的核系数,一般取值在(0,1),C指误差项的惩罚参数一般取10^n,如1,0.1,0.01等

clf.fit(x_train,y_train)

print("Performance on train set: ",clf.score(x_train,y_train))
print("Performance on test set: ",clf.score(x_test,y_test))

DecisionTree

DecisionTreeClassifier

训练分类器

1
2
3
4
5
6
7
from sklearn import tree

clf = tree.DecisionTreeClassifier()

clf.fit(x,y)

clf.score(x,y)#1.0

评估分类器性能

1
2
3
4
5
6
7
8
9
from sklearn import metrics

predicted_y = clf.predict(x)

print(metrics.classification_report(y,predicted_y))

print('Confusion matrix:')

print(metrics.confusion_matrix(y,predicted_y))

分割训练集训练

1
2
3
4
5
6
7
8
9
from sklearn import model_selection

x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size=0.3,random_state=1)

clf = tree.DecisionTreeClassifier()

clf.fit(x_train,y_train)
print(clf.score(x_train,y_train))#1.0
print(clf.score(x_test,y_test))#0.75,自动取两位小数了

GradientBoostingClassifier

1
2
3
4
5
6
7
8
9
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection

x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size=0.3,random_state=1)

clf = GradientBoostingClassifier()
clf.fit(x_train,y_train)
print(clf.score(x_train,y_train))#0.9547619047619048
print(clf.score(x_test,y_test))#0.8555555555555555

MLPClassifier

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
import datetime #计时

x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size=0.3,random_state=1)

  

start = datetime.datetime.now()
clf = MLPClassifier(solver='sgd',activation='relu',alpha=1e-5,hidden_layer_sizes=(9,9),random_state=1)

#clf = MLPClassifier(solver='lbfgs',activation='identity',alpha=1e-5,hidden_layer_sizes=(9,9),random_state=1)

clf.fit(x_train,y_train)
end = datetime.datetime.now()

print("运行时长:",end-start)#运行时长: 0:00:00.034772
print(clf.score(x_train,y_train))#0.5380952380952381
print(clf.score(x_test,y_test))#0.5555555555555556

Naive_Bayes

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
import datetime #计时

x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size=0.3,random_state=1)

start = datetime.datetime.now()
clf = GaussianNB()
clf.fit(x_train,y_train)
end = datetime.datetime.now()

print("运行时长:",end-start)#运行时长: 0:00:00.001123
print(clf.score(x_train,y_train))#0.719047619047619
print(clf.score(x_test,y_test))#0.6888888888888889

XGBClassifier

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection
import datetime #计时

x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size=0.3,random_state=1)

start = datetime.datetime.now()
clf = XGBClassifier(max_depth=6,gamma=0.1,subsample=1,colsample_bytree=1)
clf.fit(x_train,y_train)
end = datetime.datetime.now()

print("运行时长:",end-start)#运行时长: 0:00:00.335160
print(clf.score(x_train,y_train))#1.0
print(clf.score(x_test,y_test))#0.8333333333333334