classLogisticsRegression.py
# Logistic regression class
import numpy as np
from sklearn.metrics import accuracy_score


class logisticsRegression:

    def __init__(self):
        # Initialize the logistic regression model
        self.coef_ = None          # coefficient vector, theta_1..theta_n
        self.interception_ = None  # intercept, theta_0
        self._theta = None         # private: the full theta vector used in the computation
    # On sigmoid overflow, see: https://blog.csdn.net/wofanzheng/article/details/103976889
    # Naive private sigmoid (commented out): the direct form
    #     1. / (1. + np.exp(-t))
    # overflows np.exp for large negative t, so the version below branches on sign.
    def _sigmoid(self, t):
        t = np.asarray(t, dtype=float)
        out = np.empty_like(t)
        pos = t >= 0
        # For t >= 0, exp(-t) <= 1, so 1 / (1 + exp(-t)) cannot overflow
        out[pos] = 1.0 / (1.0 + np.exp(-t[pos]))
        # For t < 0, use the equivalent form exp(t) / (exp(t) + 1); the exponent stays negative
        exp_neg = np.exp(t[~pos])
        out[~pos] = exp_neg / (exp_neg + 1.0)
        return out
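    # Illustrative note (not in the original file): with the naive formula,
    # _sigmoid on an input like -1000.0 would evaluate np.exp(1000.0), which
    # overflows float64 to inf and raises a RuntimeWarning; the t < 0 branch
    # instead computes exp(-1000) / (exp(-1000) + 1) ~ 0.0 without overflow.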
    '''
    Gradient descent
    '''
    def fit(self, X_train, y_train, eta=5.0, n_iters=1e4):
        # Train the logistic regression model on the training set X_train, y_train
        # X_train must have as many samples as y_train has labels;
        # shape[0] is the length of the first dimension, i.e. the number of rows (samples)
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        # Cross-entropy loss: J(theta) = -(1/m) * sum(y*log(y_hat) + (1-y)*log(1-y_hat))
        def J(theta, X_b, y):
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)
            except Exception:
                return float('inf')  # treat a numerical failure as infinite loss

        # Gradient: dJ/dtheta = (1/m) * X_b.T @ (sigmoid(X_b @ theta) - y)
        def dJ(theta, X_b, y):
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(X_b)

        # Solve for theta by batch gradient descent
        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            theta = initial_theta
            cur_iters = 0
            while cur_iters < n_iters:
                gradient = dJ(theta, X_b, y)    # compute the gradient
                last_theta = theta              # remember the previous theta before updating
                theta = theta - eta * gradient  # step to the next theta with learning rate eta
                # Stop once the loss change between two consecutive steps is below epsilon
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iters += 1
            return theta

        # Build X_b by prepending a column of ones so theta_0 acts as the intercept
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])  # (n+1)-dim vector; X_b.shape[1] is the number of columns
        # Obtain theta
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.interception_ = self._theta[0]  # intercept
        self.coef_ = self._theta[1:]         # coefficients
        return self
    '''
    Predict probabilities
    '''
    def predict_prob(self, X_predict):
        # Given a dataset X_predict, return the vector of predicted probabilities
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))
    '''
    Predict labels
    '''
    def predict(self, X_predict):
        # Given a dataset X_predict, return the vector of predicted labels
        prob = self.predict_prob(X_predict)  # prob holds floats in [0, 1]
        # Classify by thresholding at 0.5 (the boolean array is cast to int)
        return np.array(prob >= 0.5, dtype='int')
    '''
    Prediction accuracy
    '''
    def score(self, X_test, y_test):
        # Evaluate the current model's accuracy on the test set X_test, y_test
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)
    '''
    Display representation
    '''
    def __repr__(self):
        return "logisticsRegression()"