-
Notifications
You must be signed in to change notification settings - Fork 2
/
clustering_datasets.py
148 lines (115 loc) · 4.11 KB
/
clustering_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import numpy as np
import os
def is_number(s):
    """Return True if ``s`` can be converted by ``float()``, otherwise False.

    Anything ``float()`` accepts counts as a number, including strings with
    surrounding whitespace and the special spellings ``"nan"`` and ``"inf"``.

    Args:
        s: Value to test, normally a string field read from a data file.

    Returns:
        bool: True when ``float(s)`` succeeds, False when it is rejected.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: text that is not numeric (e.g. "X" or "").
        # TypeError: objects float() cannot handle at all (e.g. None).
        return False
    return True
def load_mydata(length, first_year, data_region, path='./data'):
    """Load the first ``length`` weekly values of every complete season.

    Reads ``ILINet.csv`` from ``path``. The first two lines of the file are
    skipped as headers. Each remaining line is split on commas and the
    fields at positions 1, 2, 3 and 4 are used as region, year, week and
    value (the value is presumably the ILI percentage -- confirm against the
    CSV header). Values that cannot be parsed as a float are stored as 0.

    Rows with ``week <= 20`` are assigned to the previous year's season, so
    a season starts at week 21 and index 0 of a season is week 21. Only
    seasons with exactly 52 rows and ``year >= first_year`` are returned,
    in ascending year order.

    Args:
        length: Number of leading weeks to keep from each season.
        first_year: Earliest season (inclusive) to include.
        data_region: ``'X'`` for the national series; any other value is
            rewritten by inserting a space after its first ``'n'``
            (``'Region1'`` -> ``'Region 1'``) before the lookup.
        path: Directory containing ``ILINet.csv``.

    Returns:
        numpy.ndarray: One row per qualifying season, each row holding the
        first ``length`` values of that season.

    Raises:
        KeyError: If the (rewritten) region does not occur in the file.
        IndexError: If ``data_region`` is not ``'X'`` and contains no ``'n'``.
    """
    if data_region != 'X':  # if not national region
        str_arr = data_region.split('n')
        data_region = str_arr[0] + 'n ' + str_arr[1]
    input_file = os.path.join(path, 'ILINet.csv')

    # region -> season year -> weekly values in file order
    all_data = {}
    with open(input_file) as in_f:
        # The file starts with two header lines.
        in_f.readline()
        in_f.readline()
        for line in in_f:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines.
                continue
            raw = line.split(',')
            region = raw[1].strip()
            year = int(raw[2].strip())
            week = int(raw[3].strip())
            # Weeks up to the 20th belong to the previous year's cycle.
            if week <= 20:
                year -= 1
            try:
                inf = float(raw[4].strip())
            except ValueError:
                # Non-numeric placeholder in the data: treat as zero.
                inf = 0
            all_data.setdefault(region, {}).setdefault(year, []).append(inf)

    seasons = all_data[data_region]
    x = [
        seasons[year][0:length]
        for year in sorted(seasons)
        if year >= first_year and len(seasons[year]) == 52
    ]
    return np.array(x)
def load_RNNdata(length, first_year, data_region, path='./data'):
    """Load per-season inputs and targets (next value, peak, peak week, onset).

    Reads two files from ``path``:

    * ``baseline``: whitespace-separated lines whose first two fields are a
      year and a baseline value for that season.
    * ``ILINet.csv``: the first two lines are skipped as headers; each
      remaining line is split on commas and the fields at positions 1, 2, 3
      and 4 are used as region, year, week and value (presumably the ILI
      percentage -- confirm against the CSV header). Values that cannot be
      parsed as a float are stored as 0.

    Rows with ``week <= 20`` are assigned to the previous year's season, so
    index 0 of a season is week 21 and, for example, index 37 is week
    21 + 37 - 52 = 6 of the following calendar year. Only seasons with
    exactly 52 rows and ``year >= first_year`` are used, in ascending year
    order.

    Args:
        length: Number of leading weeks used as the input sequence; the
            value at index ``length`` is the prediction target.
        first_year: Earliest season (inclusive) to include.
        data_region: ``'X'`` for the national series; any other value is
            rewritten by inserting a space after its first ``'n'``
            (``'Region1'`` -> ``'Region 1'``) before the lookup.
        path: Directory containing ``ILINet.csv`` and ``baseline``.

    Returns:
        tuple: ``(x, y, peak, peak_time, onset_time)`` as numpy arrays with
        one entry per qualifying season:

        * ``x``: first ``length`` values with a trailing axis of size 1
          added.
        * ``y``: the value at index ``length``.
        * ``peak``: the season maximum.
        * ``peak_time``: one-hot vector of size 52 marking the first index
          at which the maximum occurs.
        * ``onset_time``: one-hot vector of size 53. Slots 0..51 mark the
          first index starting three consecutive values at or above the
          season's baseline; the last slot (52) is set when no onset
          exists.

    Raises:
        KeyError: If the region is absent from the CSV, or a qualifying
            season has no entry in the baseline file.
        IndexError: If ``length`` is 52 or more, if no season qualifies
            (``x`` is then not 2-D), or if ``data_region`` is not ``'X'``
            and contains no ``'n'``.
    """
    if data_region != 'X':  # if not national region
        str_arr = data_region.split('n')
        data_region = str_arr[0] + 'n ' + str_arr[1]
    input_file = os.path.join(path, 'ILINet.csv')

    # season year -> baseline value
    cdc_baselines = {}
    with open(os.path.join(path, 'baseline')) as baseline_file:
        for line in baseline_file:
            arr = line.split()
            if not arr:
                # Tolerate blank / trailing empty lines.
                continue
            cdc_baselines[int(arr[0])] = float(arr[1])

    # region -> season year -> weekly values in file order
    all_data = {}
    with open(input_file) as in_f:
        # The file starts with two header lines.
        in_f.readline()
        in_f.readline()
        for line in in_f:
            line = line.strip()
            if not line:
                continue
            raw = line.split(',')
            region = raw[1].strip()
            year = int(raw[2].strip())
            week = int(raw[3].strip())
            # Weeks up to the 20th belong to the previous year's cycle.
            if week <= 20:
                year -= 1
            try:
                inf = float(raw[4].strip())
            except ValueError:
                # Non-numeric placeholder in the data: treat as zero.
                inf = 0
            all_data.setdefault(region, {}).setdefault(year, []).append(inf)

    x = []
    y = []
    peak = []
    peak_time = []
    onset_time = []

    seasons = all_data[data_region]
    for year in sorted(seasons):
        series = seasons[year]
        if year < first_year or len(series) != 52:
            continue

        x.append(series[0:length])
        y.append(series[length])

        peak_val = max(series)
        peak.append(peak_val)
        # Careful: index 0 is the 21st week, so index 37 means
        # 21 + 37 - 52 = week 6 of the next calendar year.
        peak_time_vec = [0] * 52
        peak_time_vec[series.index(peak_val)] = 1
        peak_time.append(peak_time_vec)

        # Onset = first index that starts three consecutive values at or
        # above the baseline. The last valid start is len(series) - 3, so
        # the range must stop at len(series) - 2 to include it.
        baseline_val = cdc_baselines[year]
        onset = -1
        for i in range(len(series) - 2):
            if all(val >= baseline_val for val in series[i:i + 3]):
                onset = i
                break
        # 53 slots: indices 0..51 use the same week offset as peak_time;
        # onset == -1 deliberately selects the last slot, meaning "no onset".
        onset_vec = [0] * 53
        onset_vec[onset] = 1
        onset_time.append(onset_vec)

    x = np.array(x)
    x = x[:, :, np.newaxis]
    return x, np.array(y), np.array(peak), np.array(peak_time), np.array(onset_time)