-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
136 lines (108 loc) · 4.72 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
'''
author:Donglin Zhou
time:2021.12.4
Utilities: dataset splitting, data loading, and model evaluation helpers
'''
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import keras.backend as K
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Load the data
def get_data(filename, type, look_back, test_size):
    """Read a CSV file and convert it into windowed train/test arrays.

    Args:
        filename: Path of the comma-separated file; its first row is the header.
        type: Dataset variant. 'MC', 'stanford' and 'senticnetAR' each keep a
            fixed set of columns (selected by position); any other value keeps
            every column of the file.
        look_back: Number of consecutive rows per input window.
        test_size: Test split size, forwarded to ``df_to_cnn_rnn_format``.

    Returns:
        ``(X_train, y_train, X_test, y_test, look_back, num_features)``.
        ``num_features`` counts every selected column, including the last one,
        which ``df_to_cnn_rnn_format`` uses as the prediction target.
    """
    # Positional column indices kept for each known dataset variant.
    columns_by_type = (
        ('MC', [1, 2, 3, 4, 5, 6, 7]),
        ('stanford', [1, 2, 3, 4, 5, 6, 7, 8, 9]),
        ('senticnetAR', [1, 2, 3, 5, 6, 7]),
    )

    frame = pd.read_csv(filename, delimiter=',', header=0)
    data = frame.copy()
    for variant, columns in columns_by_type:
        if type == variant:
            data = data.iloc[:, columns]

    # Selected column count: input features plus the target column.
    num_features = data.shape[1]

    # Split, standardise and window the data for model training.
    X_train, y_train, X_test, y_test = df_to_cnn_rnn_format(
        df=data, test_size=test_size, look_back=look_back, scale_X=True)
    return X_train, y_train, X_test, y_test, look_back, num_features
# Split the dataset and reshape it into 3-D windows
def _windowed(features, targets, look_back):
    """Pair each run of ``look_back`` consecutive rows with the target that follows it."""
    n_windows = features.shape[0] - look_back
    if n_windows < 0:
        raise ValueError(
            'look_back (%d) is larger than the number of available rows (%d)'
            % (look_back, features.shape[0]))
    windows = np.zeros((n_windows, look_back, features.shape[1]))
    labels = np.zeros(n_windows)
    for start in range(n_windows):
        stop = start + look_back
        windows[start] = features[start:stop]
        # The label is the target of the row right after the window.
        labels[start] = targets[stop]
    return windows, labels


def df_to_cnn_rnn_format(df, test_size=0.5, look_back=5, scale_X=True,
                         shuffle=True, random_state=1):
    """Split a DataFrame into train/test sets and reshape them into 3-D windows.

    The last column of ``df`` is the target; all other columns are input
    features. Each sample is ``look_back`` consecutive rows of the split, and
    its label is the target value of the row that immediately follows.

    NOTE(review): with the default ``shuffle=True`` the rows are shuffled by
    ``train_test_split`` *before* the windows are built, so a window does not
    contain rows that were adjacent in ``df``. For time-ordered data pass
    ``shuffle=False`` to get a chronological split with contiguous windows.
    The default is kept for backward compatibility.

    Args:
        df: DataFrame whose last column is the target.
        test_size: Forwarded to ``train_test_split``.
        look_back: Number of rows per window.
        scale_X: If true, standardise the features with a ``StandardScaler``
            fitted on the training rows only and applied to both splits.
        shuffle: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` where each ``X`` has shape
        ``(samples, look_back, n_features)`` and each ``y`` has shape
        ``(samples,)``, with ``samples = rows_in_split - look_back``.

    Raises:
        ValueError: If ``look_back`` exceeds the number of rows in a split.
    """
    target_location = df.shape[1] - 1  # the target is the last column

    # Separate features from the target, then split into train and test.
    values = df.values
    X = values[:, :target_location]
    y = values[:, target_location]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=shuffle, random_state=random_state)

    # Fit the scaler on the training rows only, then reuse it for the test rows.
    if scale_X:
        scalerX = StandardScaler()
        X_train = scalerX.fit_transform(X_train)
        X_test = scalerX.transform(X_test)

    X_train_reshaped, y_train_reshaped = _windowed(X_train, y_train, look_back)
    X_test_reshaped, y_test_reshaped = _windowed(X_test, y_test, look_back)
    return X_train_reshaped, y_train_reshaped, X_test_reshaped, y_test_reshaped
# Evaluation metric: MAPE
def mape(y_true, y_pred):
    """Element-wise absolute percentage error, in percent.

    The error is expressed relative to the true value, as in the standard
    MAPE definition: ``100 * |y_true - y_pred| / |y_true|``. The result is
    not averaged here; it has the same shape as the inputs.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predicted values.

    Returns:
        Tensor of percentage errors.
    """
    # Clamp the denominator so a true value of zero cannot produce inf/NaN.
    denominator = K.maximum(K.abs(y_true), K.epsilon())
    return (K.abs(y_true - y_pred) / denominator) * 100
# Evaluation metric: SMAPE
def smape(y_true, y_pred):
    """Element-wise symmetric absolute percentage error, in percent.

    Computes ``100 * |y_pred - y_true| / (|y_true| + |y_pred|)``. The result
    is not averaged here; it has the same shape as the inputs.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predicted values.

    Returns:
        Tensor of percentage errors.
    """
    abs_error = K.abs(y_pred - y_true)
    magnitude = K.abs(y_true) + K.abs(y_pred)
    return (abs_error / magnitude) * 100
# Plot the prediction results
def downsample_results(y_pred, y_true, model_name, path, savefig=False):
    """Save predictions to CSV, compute error metrics and plot real vs. predicted.

    The predictions are written to ``path + model_name + '__predictions.csv'``.
    MSE, MAPE and SMAPE are computed over all samples and shown in the plot
    title. MAPE is taken relative to the true values.

    Args:
        y_pred: Array of predictions; flattened with ``reshape(shape[0])``.
        y_true: Array of true values; flattened the same way.
        model_name: Prefix used in the output file names and the plot title.
        path: Prefix (directory or path stem) prepended to the output file names.
        savefig: If true, the plot is also saved as
            ``path + model_name + '_predict.png'``.

    Returns:
        None.
    """
    y_pred = y_pred.reshape(y_pred.shape[0])
    y_true = y_true.reshape(y_true.shape[0])

    result = pd.DataFrame({'y_pred': y_pred, 'y_true': y_true})
    result.to_csv(path + model_name + '_' + '_predictions.csv')

    ytrue = result['y_true']
    ypred = result['y_pred']
    n = len(result)

    # Error metrics; MAPE divides by the true value, not the prediction.
    mse_result = (1 / n) * np.sum((ypred - ytrue) ** 2)
    mape_result = (100 / n) * np.sum(np.abs((ytrue - ypred) / ytrue))
    smape_result = (100 / n) * np.sum(np.abs((ytrue - ypred)) / (np.abs(ytrue) + np.abs(ypred)))

    # Plot real and predicted values against the sample index.
    plt.figure(figsize=(20, 10))
    plt.plot(result.index, result['y_true'], '.-', color='red', label='Real values', alpha=0.5,
             ms=10)  # ms is markersize
    plt.plot(result.index, result['y_pred'], '.-', color='blue', label='Predicted values', ms=10)
    plt.ylabel(r'price', fontsize=14)
    plt.xlabel('datetime [-]', fontsize=14)  # TODO: set x values as actual dates
    plt.xticks(fontsize=14, rotation=45)
    plt.yticks(fontsize=14)
    plt.legend(loc='upper left', borderaxespad=0, frameon=False, fontsize=14, markerscale=3)
    # '%' binds tighter than '+', so only the literal is formatted before
    # model_name is prepended.
    plt.title(
        model_name + 'predictions \n MSE = %.2f \n MAPE = %.1f [%%] \n SMAPE = %.1f [%%]' % (
            mse_result, mape_result, smape_result), fontsize=14)
    if savefig:
        # NOTE(review): dpi=1300 on a 20x10 inch figure is a very large image.
        plt.savefig(path + model_name + '_predict.png', dpi=1300)
    # Always close the figure so repeated calls do not accumulate open figures.
    plt.close()
# Plot how the loss changes during training
def loss_plot(result, loss_png_path, history_path, title='INTC_LSTM_stanford_loss'):
    """Plot the training loss curves, save the figure and dump the history to CSV.

    Args:
        result: Object with a ``history`` mapping that contains a ``'loss'``
            series and, optionally, a ``'val_loss'`` series.
        loss_png_path: File path the figure is saved to.
        history_path: File path the full history is written to as CSV.
        title: Plot title. Defaults to the previously hard-coded title.

    Returns:
        None.
    """
    history = result.history

    plt.plot(history['loss'])
    labels = ['train']
    # The validation curve only exists when training used validation data.
    if 'val_loss' in history:
        plt.plot(history['val_loss'])
        labels.append('validation')

    plt.title(title)
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(labels, loc='upper right')
    # Save before showing: the figure is closed right after being displayed.
    plt.savefig(loss_png_path)
    plt.show()
    plt.close()

    pd.DataFrame(history).to_csv(history_path)