-
Notifications
You must be signed in to change notification settings - Fork 4
/
logistic_regression.py
171 lines (155 loc) · 5.98 KB
/
logistic_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 16 21:21:43 2018
@author: htshinichi

Logistic-regression demo on the sklearn digits dataset: a binary task
(digit > 4) with an L1 penalty, then the full 10-class task with
one-vs-rest and multinomial solvers.
"""
# Plotting / visualisation
import matplotlib.pyplot as plt
# Scientific computing
import numpy as np
# Data analysis
import pandas as pd
# Built-in datasets (load_digits)
from sklearn import datasets
# Linear models (LogisticRegression)
from sklearn import linear_model
# Train/test split and cross-validated prediction.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
# Scoring functions, performance metrics, distance computations
from sklearn import metrics
# Data preprocessing (StandardScaler)
from sklearn import preprocessing
# Load the 8x8 handwritten-digit dataset and take a first look at it.
digits = datasets.load_digits()
print(digits.keys())
print(digits.data.shape)
print(digits.target_names)

# Dump label, image matrix and flattened pixel vector for the first two samples.
for idx in range(2):
    print(digits.target[idx])
    print(digits.images[idx])
    print(digits.data[idx])

# Render the first two samples as grayscale images.
plt.gray()
for idx in range(2):
    plt.matshow(digits.images[idx])
    plt.show()
    print(digits.target[idx])

# Thumbnail grid: the first 30 digits (8*8 pixels each),
# each annotated with its true label.
fig = plt.figure(figsize=(8, 8))
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for idx in range(30):
    ax = fig.add_subplot(6, 5, idx + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[idx], cmap=plt.cm.binary, interpolation='nearest')
    # Label the image with its target value.
    ax.text(0, 7, str(digits.target[idx]))
plt.show()
# Inputs (64 pixel features) and outputs (digit class labels).
digits_X, digits_y = digits.data, digits.target

# Hold out 10% of the samples as the test set (test_size = fraction held out).
X_train, X_test, y_train, y_test = train_test_split(digits_X, digits_y, test_size=0.1)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Show the first (still unscaled) training sample as an 8x8 image.
plt.gray()
plt.matshow(np.abs(X_train[0].reshape(8, 8)))
plt.show()
# Standardise the pixel features. Keep the fitted scaler so the SAME
# transform (train-set statistics) can be applied to the test set below —
# fitting a fresh scaler on the test set would leak test statistics.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
plt.gray()
plt.matshow(np.abs(X_train[0].reshape(8, 8)))
plt.show()
print(X_train[0])
print("第一个特征的均值为", X_train[0].mean())
print("第一个特征的方差为", X_train[0].var())
print(y_train[0:20])

# Binarise the task: "is the digit greater than 4?".
# np.int was removed in NumPy 1.24 — use the builtin int instead.
y_train_bin = (y_train > 4).astype(int)
print(y_train_bin[0:20])

# L1-penalised logistic regression. solver='liblinear' is stated explicitly:
# it supports the L1 penalty (the modern default, lbfgs, does not) and was
# the implicit default in older scikit-learn, so behaviour is unchanged.
model_LR_l1 = linear_model.LogisticRegression(C=0.5, penalty='l1', tol=0.01,
                                              solver='liblinear')
# Fit on the (scaled) training data.
model_LR_l1.fit(X_train, y_train_bin)

# Transform — NOT re-fit — the test set with the train-fitted scaler.
X_test_scale = scaler.transform(X_test)
y_test_bin = (y_test > 4).astype(int)
print(model_LR_l1.score(X_test_scale, y_test_bin))
y_pred = model_LR_l1.predict(X_test_scale)

# First 30 test samples: true label top-left, predicted label bottom-left.
fig1 = plt.figure(figsize=(8, 8))
fig1.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(30):
    ax = fig1.add_subplot(6, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(np.abs(X_test[i].reshape(8, 8)), cmap=plt.cm.binary, interpolation='nearest')
    ax.text(0, 1, str(y_test[i]))
    ax.text(0, 7, str(y_pred[i]))
plt.show()

# Plot every misclassified test sample. Iterate the actual test-set length
# (not a hard-coded 180) so a different split size cannot break this loop.
fig2 = plt.figure(figsize=(8, 8))
fig2.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
num = 0
for i in range(len(y_test)):
    if y_test_bin[i] != y_pred[i]:
        num = num + 1
        if num > 60:  # 12x5 grid holds at most 60 panels; stop instead of crashing
            break
        ax = fig2.add_subplot(12, 5, num, xticks=[], yticks=[])
        ax.imshow(np.abs(X_test[i].reshape(8, 8)), cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 1, str(y_test[i]))
        ax.text(0, 7, str(y_pred[i]))
plt.show()
print(num)
# Multiclass logistic regression: stochastic average gradient (SAG) solver,
# one-vs-rest decomposition of the 10-class problem.
model_LR_ovr = linear_model.LogisticRegression(solver='sag', max_iter=3000,
                                               random_state=42, multi_class='ovr')
# Fit on the (scaled) training data.
model_LR_ovr.fit(X_train, y_train)

# NOTE(review): this fits a fresh scaler on the test set itself, which leaks
# test statistics — ideally the scaler fitted on the training data would be
# reused here (kept as-is to preserve the script's original behaviour).
X_test_scale = preprocessing.StandardScaler().fit_transform(X_test)
print(model_LR_ovr.score(X_test_scale, y_test))
y_pred = model_LR_ovr.predict(X_test_scale)

# First 30 test samples: true label top-left, predicted label bottom-left.
fig3 = plt.figure(figsize=(8, 8))
fig3.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(30):
    ax = fig3.add_subplot(6, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(np.abs(X_test[i].reshape(8, 8)), cmap=plt.cm.binary, interpolation='nearest')
    ax.text(0, 1, str(y_test[i]))
    ax.text(0, 7, str(y_pred[i]))
plt.show()

# Plot every misclassified test sample. Iterate the actual test-set length
# (not a hard-coded 180) so a different split size cannot break this loop.
fig4 = plt.figure(figsize=(8, 8))
fig4.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
num = 0
for i in range(len(y_test)):
    if y_test[i] != y_pred[i]:
        num = num + 1
        if num > 30:  # 6x5 grid holds at most 30 panels; stop instead of crashing
            break
        ax = fig4.add_subplot(6, 5, num, xticks=[], yticks=[])
        ax.imshow(np.abs(X_test[i].reshape(8, 8)), cmap=plt.cm.binary, interpolation='nearest')
        ax.text(0, 1, str(y_test[i]))
        ax.text(0, 7, str(y_pred[i]))
plt.show()
print(num)
# Multiclass logistic regression: SAG solver with a true multinomial
# (softmax) loss instead of one-vs-rest.
model_LR_mult = linear_model.LogisticRegression(solver='sag', max_iter=3000,
                                                random_state=42, multi_class='multinomial')
# Fit on the (scaled) training data.
model_LR_mult.fit(X_train, y_train)

# NOTE(review): this fits a fresh scaler on the test set itself, which leaks
# test statistics — ideally the scaler fitted on the training data would be
# reused here (kept as-is to preserve the script's original behaviour).
X_test_scale = preprocessing.StandardScaler().fit_transform(X_test)
print(model_LR_mult.score(X_test_scale, y_test))
y_pred = model_LR_mult.predict(X_test_scale)

# First 30 test samples: true label top-left, predicted label bottom-left.
fig5 = plt.figure(figsize=(8, 8))
fig5.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
for i in range(30):
    ax = fig5.add_subplot(6, 5, i + 1, xticks=[], yticks=[])
    ax.imshow(np.abs(X_test[i].reshape(8, 8)), cmap=plt.cm.binary, interpolation='nearest')
    ax.text(0, 1, str(y_test[i]))
    ax.text(0, 7, str(y_pred[i]))
plt.show()

# Plot every misclassified test sample. Iterate the actual test-set length
# (not a hard-coded 180) so a different split size cannot break this loop.
fig6 = plt.figure(figsize=(8, 8))
fig6.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
num = 0
for i in range(len(y_test)):
    if y_test[i] != y_pred[i]:
        num = num + 1
        if num > 30:  # 6x5 grid holds at most 30 panels; stop instead of crashing
            break
        ax = fig6.add_subplot(6, 5, num, xticks=[], yticks=[])
        ax.imshow(np.abs(X_test[i].reshape(8, 8)), cmap=plt.cm.binary, interpolation='nearest')
        # Label the image with its true value (top) and prediction (bottom).
        ax.text(0, 1, str(y_test[i]))
        ax.text(0, 7, str(y_pred[i]))
plt.show()
print(num)