-
Notifications
You must be signed in to change notification settings - Fork 0
/
prophet.py
394 lines (326 loc) · 17.5 KB
/
prophet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import os, sys, inspect
import configparser #for .ini file
from datetime import datetime
import random
import time
class Prophet:
    """Tier-based guesser for the daily moves of one ticker.

    The instance keeps the run parameters (ticker, history length in days,
    environment, log file) and the parsed ini configuration. Its methods add
    derived columns to a price DataFrame, split a numeric column into
    low/medium/high tiers, encode every day as a state such as '+M', and guess
    the next state from how often each state followed the same history string.
    """

    def __init__(self, ticker, days, inifile='./conf/prophet.ini', env='default', debug=5, logfile='log/prophet.log'):
        """Store the run parameters and load the ini configuration.

        Args:
            ticker: ticker symbol; only stored here.
            days: number of preceding days that form the history string in
                set_hmm_state / set_hmm_state_v1.
            inifile: path of the configuration file, must exist.
            env: environment name, reserved for selecting ini sections or
                deployment environments (dev, staging, prod, ...).
            debug: verbosity level for screen output and logging; not used so far.
            logfile: path of the application log file; a local default is used
                because /var/log may not be writable.

        Raises:
            Exception: if inifile does not exist or cannot be read/parsed.
        """
        self.env = env
        self.ticker = ticker
        self.days = days
        self.logfile = logfile
        self.debug = debug
        if not os.path.isfile(inifile):
            error = "Config file does not exist: %s" % inifile
            self.log(error)
            raise Exception(error)
        self.config = configparser.ConfigParser()
        try:
            self.config.read(inifile)
        except Exception as err:
            # log and re-raise with the original cause attached; unlike a bare
            # except this lets KeyboardInterrupt / SystemExit pass through
            error = "failed to read %s" % inifile
            self.log(error)
            raise Exception(error) from err

    def log(self, msg):
        """Append a timestamped message to self.logfile (avoids a logging setup).

        The parent directory of the log file is created when it is missing.

        Args:
            msg: text to write; it is prefixed with 'YYYY-mm-dd HH:MM:SS: '.

        Returns:
            0 on success.

        Raises:
            Exception: if the directory cannot be created or the file cannot be
                written.
        """
        # derive the directory from the configured log file instead of assuming 'log'
        log_directory = os.path.dirname(self.logfile)
        try:
            if log_directory:
                os.makedirs(log_directory, exist_ok=True)
            with open(self.logfile, "a") as filehandle_log:
                filehandle_log.write("%s: %s\n" % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), msg))
        except OSError as err:
            raise Exception('Can not write to the log file: %s' % self.logfile) from err
        return 0

    def parse_template(self, file_template, dict_content):
        """Fill an html template file with the given values.

        The file content is used as a str.format template, so every {name}
        placeholder must have a matching key in dict_content (a missing key
        raises KeyError) and literal braces in the template must be doubled.

        Args:
            file_template: path of the template file.
            dict_content: mapping of placeholder name to value.

        Returns:
            The rendered string. When the template file does not exist an
            error message string is returned instead of raising.
        """
        if not os.path.exists(file_template):
            return "template file %s does not exist" % file_template
        with open(file_template, "r") as handle_template:
            string_template = handle_template.read()
        return string_template.format(**dict_content)  # send it in as kwargs

    def enrich(self, df):
        """Add the derived columns '#', 'Yield' and 'Yield % Open' to df.

        '#' is the running row number, 'Yield' is Close - Open and
        'Yield % Open' is the yield as a percentage of Open, rounded to 4
        decimals.

        Args:
            df: DataFrame with 'Open' and 'Close' columns; modified in place.

        Returns:
            The same DataFrame object.
        """
        df['#'] = range(len(df))
        df['Yield'] = df['Close'] - df['Open']
        df['Yield % Open'] = (df['Yield'] / df['Open'] * 100).round(4)
        return df

    def set_tiers(self, df, column_names=('Yield % Open',)):
        """Split the value range of numeric columns into three equal-width tiers.

        Only float64 and int64 columns are processed; other columns are
        skipped silently.

        Args:
            df: source DataFrame.
            column_names: names of the columns to analyse.

        Returns:
            dict keyed by column name. Each value is a dict with 'min', 'max',
            'one_third' (tier width), the upper bounds 'tier_low_top',
            'tier_medium_top', 'tier_high_top' and 'tier_low_mean',
            'tier_medium_mean', 'tier_high_mean' (half of each tier's width).
        """
        dict_tiers = {}
        for column_name in column_names:
            column = df[column_name]
            if column.dtype not in (np.float64, np.int64):  # checked once per column
                continue
            # TODO: try on <days> range instead of the whole column
            value_min = column.min()
            value_max = column.max()
            one_third = (value_max - value_min) / 3
            tier_low_top = round(value_min + one_third, 4)
            tier_medium_top = round(value_min + one_third * 2, 4)
            # NOTE(review): rounded to a whole number, unlike the two tops above
            # (4 decimals); kept as is because tier_high_mean depends on it
            tier_high_top = round(value_max)
            dict_tiers[column_name] = {
                'min': value_min,
                'max': value_max,
                'one_third': one_third,
                'tier_low_top': tier_low_top,
                'tier_medium_top': tier_medium_top,
                'tier_high_top': tier_high_top,
                'tier_low_mean': round((tier_low_top - value_min) / 2, 2),
                'tier_medium_mean': round((tier_medium_top - tier_low_top) / 2, 2),
                'tier_high_mean': round((tier_high_top - tier_medium_top) / 2, 2),
            }
        return dict_tiers

    def set_hmm_state(self, df, source_key, dict_tiers_source, backpropagate=True):
        """Assign the actual state (up/down and tier char, e.g. +M) and the guesses to every row.

        Writes the columns 'Tier' (state of the row), 'History' (states of the
        preceding self.days rows joined with '|'), 'Tier Guess', 'Price Guess',
        'Match' and 'Up/Down Guess' into df. Rows are addressed by position, so
        any index type works.

        Args:
            df: DataFrame with a 'Close' column and the source_key column;
                modified in place.
            source_key: name of the numeric column the states are derived from.
            dict_tiers_source: tier boundaries/means of that column, i.e. one
                entry of the dict returned by set_tiers.
            backpropagate: when True (default, the previous behaviour) the
                guesses are recomputed with the counters of the whole date
                range; when False only the chronologically earlier rows
                influence a guess.

        Returns:
            None.
        """
        # created up front so these columns keep their position in the frame
        df['History'] = ''
        df['Tier Guess'] = ''
        df['Match'] = ''

        source = df[source_key]
        close = df['Close']
        row_count = len(df)
        counters = {}  # history string -> {state that followed it: number of occurrences}
        tier = []
        history = []
        guesses = []  # tuples (tier guess, price guess, match, up/down guess)

        for counter in range(row_count):
            # actual state
            tier_item = set_days_state([source.iloc[counter]], dict_tiers_source)
            tier.append(tier_item)
            # the slice ends before the current row; while fewer than self.days
            # rows precede it the slice is normally empty and the history is ''
            history_item = set_days_state(source.iloc[counter - self.days:counter], dict_tiers_source)
            history.append(history_item)
            # keep track of frequency of current state by previous chain
            followers = counters.setdefault(history_item, {})
            followers[tier_item] = followers.get(tier_item, 0) + 1
            # for the first row the index -1 wraps around to the last close
            close_item_previous = close.iloc[counter - 1]
            # computed even if the next pass replaces it, so the sequence of
            # random tie-breaks stays the same as before
            guesses.append(set_guesses(counter, counters, tier_item, history_item, close_item_previous, dict_tiers_source))

        if backpropagate:
            # backpropagate later-set assumptions - use all date range as one
            # unit, i.e. not only chronologically sequential assumptions
            guesses = [
                set_guesses(counter, counters, tier[counter], history[counter], close.iloc[counter - 1], dict_tiers_source)
                for counter in range(row_count)
            ]
            self.log("tier guess after: %s" % [guess[0] for guess in guesses])

        df['Tier'] = tier
        df['History'] = history
        df['Tier Guess'] = [guess[0] for guess in guesses]
        df['Price Guess'] = [guess[1] for guess in guesses]
        df['Match'] = [guess[2] for guess in guesses]
        df['Up/Down Guess'] = [guess[3] for guess in guesses]
        return

    def set_hmm_state_v1(self, df, source_key, dict_tiers_source):
        """Legacy per-cell variant of set_hmm_state, kept for reference.

        Fills 'Tier' and 'History' cell by cell and delegates the guesses to
        set_guesses_old; the first self.days rows get no guess.

        NOTE(review): this method never creates the 'Tier' column and writes
        cells through chained indexing (df[col][i] = value), so it depends on
        the caller preparing the columns and on pandas propagating chained
        writes - prefer set_hmm_state.

        Args:
            df: DataFrame holding the source_key column; modified in place.
            source_key: name of the numeric column the states are derived from.
            dict_tiers_source: tier boundaries/means of that column.

        Returns:
            None.
        """
        df['History'] = ''
        df['Tier Guess'] = ''
        df['Match'] = ''
        counters = {}
        for counter in range(len(df)):
            # actual state
            df['Tier'][counter] = set_days_state([df[source_key][counter],], dict_tiers_source)
            # History of <days> days; counter in the range means counter-1
            df['History'][counter] = set_days_state(df[source_key][counter-self.days:counter], dict_tiers_source)
            tier_item = df['Tier'][counter]
            history_item = df['History'][counter]
            if history_item not in counters:
                counters[history_item] = {}
            if tier_item not in counters[history_item]:
                counters[history_item][tier_item] = 0
            if counter < self.days:
                # starting days only feed the following days; they have no
                # predecessors to produce predictions of their own
                continue
            counters[history_item][tier_item] += 1  # keep track of frequency
            set_guesses_old(df, counters, counter, dict_tiers_source)
        # backpropagate later-set assumptions - use all date range as one unit;
        # can be dropped for the case of only causal chronology
        for counter in range(len(df)):
            if counter < self.days:
                continue
            set_guesses_old(df, counters, counter, dict_tiers_source)
        return
def set_guesses(counter, counters, tier_item, history_item, close_item_previuos, dict_tiers_source):
    """Pick the most popular follower of a history string and derive the guess values.

    counters[history_item] maps every state seen after this history to its
    number of occurrences, e.g.
        '-M|+M|-M': {'+M': 15, '-M': 11, '-L': 1}  -> guess '+M'
        '+H|+H|+H': {'-L': 1}                      -> 'no data', a single
            occurrence is the current record itself
        '-L|+H|-M': {'+M': 1, '-M': 1}             -> the entry that is not the
            current record
        '-L|+H|-M': {'+M': 1, '-M': 1, '+L': 1}    -> random pick among the
            entries that are not the current record

    Args:
        counter: position of the current record; not used in the body.
        counters: dict of history string -> {state: count}.
        tier_item: actual state of the current record, e.g. '+M'.
        history_item: history string of the current record.
        close_item_previuos: close value the price guess starts from.
        dict_tiers_source: dict holding 'tier_low_mean', 'tier_medium_mean'
            and 'tier_high_mean'.

    Returns:
        Tuple (tier guess, price guess, match, up/down guess). match and
        up/down guess are 'OK' or ''. All four are 'no data' when nothing
        usable was counted.
    """
    no_data = ('no data', 'no data', 'no data', 'no data')
    followers = counters[history_item]
    counts = list(followers.values())

    if not counts:
        # nothing assigned so far
        return no_data

    if len(counts) == 1:
        if counts[0] == 1:
            # just one entry of 1, which means the current record itself
            return no_data
        # one entry only, but seen more than once - use it
        tier_guess_item = next(iter(followers))
    elif check_all_values_equal(counts):
        # more than one entry, all with the same count
        if counts[0] == 1:
            # choose randomly among everything but the current record
            candidates = dict(followers)
            candidates.pop(tier_item)
            tier_guess_item = random.choice(list(candidates))
        else:
            # inconclusive statistics, break the tie randomly
            # TODO: maybe assign to the tier that is close the latest entry in the HMM instead?
            tier_guess_item = random.choice(list(followers))
    else:
        # substantial data: the highest count wins
        tier_guess_item = max(followers, key=lambda state: followers[state])

    match_item = 'OK' if tier_guess_item == tier_item else ''
    up_down_guess_item = 'OK' if tier_guess_item[0] == tier_item[0] else ''

    # the tier letter of the guess selects the amount added to the previous close
    mean_key_by_letter = {'L': 'tier_low_mean', 'M': 'tier_medium_mean', 'H': 'tier_high_mean'}
    price_guess_item = close_item_previuos
    mean_key = mean_key_by_letter.get(tier_guess_item[-1])
    if mean_key is not None:
        price_guess_item += dict_tiers_source[mean_key]
    return tier_guess_item, price_guess_item, match_item, up_down_guess_item
def set_guesses_old(df, counters, counter, dict_tiers_source):
    """deprecated - set 'Tier Guess', 'Match', 'Up/Down Guess' and 'Price Guess' for one row.

    The guess is the most popular follower of the row's history string:
        '-M|+M|-M': {'+M': 15, '-M': 11, '-L': 1}  -> '+M'
        '+H|+H|+H': {'-L': 1}                      -> 'no data', a single
            occurrence is the current record itself
        '-L|+H|-M': {'+M': 1, '-M': 1}             -> the entry that is not the
            current record
        '-L|+H|-M': {'+M': 1, '-M': 1, '+L': 1}    -> random pick among the
            entries that are not the current record

    Cells are read and written by position in a single indexing step, so the
    writes reach df itself and do not depend on chained assignment.

    Args:
        df: DataFrame with the columns 'History', 'Tier', 'Close',
            'Tier Guess', 'Match', 'Up/Down Guess' and 'Price Guess';
            modified in place.
        counters: dict of history string -> {state: count}.
        counter: position of the row to fill.
        dict_tiers_source: dict holding 'tier_low_mean', 'tier_medium_mean'
            and 'tier_high_mean'.

    Returns:
        1 when a guess was written, None when the row was marked 'no data'.
    """
    def put(column_name, value):
        # one positional write instead of df[column][counter] = value
        df.iloc[counter, df.columns.get_loc(column_name)] = value

    history_item = df['History'].iloc[counter]
    tier_item = df['Tier'].iloc[counter]
    followers = counters[history_item]
    list_counter_value = list(followers.values())

    if not list_counter_value:
        # nothing assigned so far, set default
        put('Tier Guess', 'no data')
        return None

    if len(list_counter_value) == 1:
        if list_counter_value[0] == 1:
            # just one entry of 1, which means the current record itself
            put('Tier Guess', 'no data')
            return None
        # one entry only, but seen more than once - use it
        tier_guess_item = next(iter(followers))
    elif check_all_values_equal(list_counter_value):
        # more than one entry, all with the same count
        if list_counter_value[0] == 1:
            # choose randomly among everything but the current record
            dict_temp = followers.copy()
            del dict_temp[tier_item]
            tier_guess_item = random.choice(list(dict_temp.keys()))
        else:
            # inconclusive statistics, break the tie randomly
            # TODO: maybe assign to the tier that is close the latest entry in the HMM instead?
            tier_guess_item = random.choice(list(followers.keys()))
    else:
        # substantial data: the highest count wins
        tier_guess_item = max(followers, key=followers.get)

    put('Tier Guess', tier_guess_item)
    put('Match', 'OK' if tier_guess_item == tier_item else '')
    if tier_guess_item[0] == tier_item[0]:
        # left untouched when the direction does not match
        put('Up/Down Guess', 'OK')

    # start from the previous close and add the amount of the guessed tier
    price_guess_item = df['Close'].iloc[counter - 1]
    if tier_guess_item[-1] == 'L':
        price_guess_item += dict_tiers_source['tier_low_mean']
    elif tier_guess_item[-1] == 'M':
        price_guess_item += dict_tiers_source['tier_medium_mean']
    elif tier_guess_item[-1] == 'H':
        price_guess_item += dict_tiers_source['tier_high_mean']
    put('Price Guess', price_guess_item)
    return 1
def set_days_state(day_range, dict_tiers_source):
    """Translate day values into HMM symbols and join them with '|'.

    Each value becomes a sign plus a tier letter, e.g. +M (up medium) or
    -H (down high): the letter is L below 'tier_low_top', M below
    'tier_medium_top' and H otherwise; the sign is '+' for values greater
    than zero and '-' for everything else. Works on a single day as well.

    Args:
        day_range: iterable of numeric day values.
        dict_tiers_source: dict holding 'tier_low_top' and 'tier_medium_top'.

    Returns:
        The symbols joined with '|'; an empty string for an empty input.
    """
    def symbol(value):
        # tier letter first, then the direction sign in front of it
        if value < dict_tiers_source['tier_low_top']:
            letter = 'L'  # low
        elif value < dict_tiers_source['tier_medium_top']:
            letter = 'M'  # medium
        else:
            letter = 'H'  # high
        sign = '+' if value > 0 else '-'
        return sign + letter

    return '|'.join(symbol(day_record) for day_record in day_range)
def check_all_values_equal(list_values):
    """Tell whether all counts in the list are the same.

    The list shifted by one element is compared with the list itself, which
    only matches when every element equals its neighbour. Empty and
    single-element lists count as equal.
    """
    without_first = list_values[1:]
    without_last = list_values[:-1]
    return without_first == without_last
# does not work - returns NaN no matter how the input data is formatted
# def generate_new_column(func, column1, column2):
# """a generator for various calculations"""
# yield func(column1, column2)
# def func_diff(x, y):
# return x-y