-
Notifications
You must be signed in to change notification settings - Fork 0
/
macho_u_set.py
88 lines (69 loc) · 2.5 KB
/
macho_u_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# coding=utf-8
# Takes a directory of files containing sampled light-curve features.
# For each set of sampled features it computes the Gaussian associated with
# each variable, appends the class and a weight of 1, assembles the rows into
# a dataframe-style table and saves it as a training set.
# --------------------------------------------------------------------------
import sys
import os
import pandas as pd
import FATS
import lightcurves.lc_utils as lu
from config import *
def get_paths(directory):
    """Yield the absolute path of every CSV-like file below *directory*.

    The tree is traversed with ``os.walk``. A file is selected when the
    substring ``'.csv'`` occurs anywhere in its name (not only as a suffix).
    Paths are produced lazily, in the order ``os.walk`` reports them.
    """
    for root, _subdirs, names in os.walk(directory):
        matching = (name for name in names if '.csv' in name)
        for name in matching:
            full_path = os.path.join(root, name)
            yield os.path.abspath(full_path)
def calc_gaussian(file_path):
    """Summarise one sampled-features CSV as a single training-set line.

    The file is loaded with ``pd.read_csv`` and, for every column, the mean
    and standard deviation are computed together with the interval
    ``mean - 3*std`` (left) and ``mean + 3*std`` (right).

    Parameters:
        file_path: path of the CSV file holding the sampled features.

    Returns:
        A comma-separated string terminated by a newline with, in order:
        the id given by ``lu.get_lightcurve_id(file_path)``; then for each
        column its left bound, mean, right bound and std; the constant
        weight ``'1.0'``; and the class name given by
        ``lu.get_lc_class_name(file_path)``.
    """
    df = pd.read_csv(file_path)

    means = df.mean()
    stds = df.std()
    lower = means - 3 * stds
    upper = means + 3 * stds

    fields = [lu.get_lightcurve_id(file_path)]
    columns = zip(lower.tolist(), means.tolist(), upper.tolist(), stds.tolist())
    for left, mean, right, std in columns:
        fields.extend([str(left), str(mean), str(right), str(std)])

    fields.append('1.0')
    # Use the function's own argument here. The previous code read a name `f`
    # that only existed as a variable leaked by the calling script.
    fields.append(lu.get_lc_class_name(file_path))

    return ','.join(fields) + '\n'
if __name__ == '__main__':
    # Optional single command-line argument: the sampling percentage. It is
    # used verbatim in the input directory name and the output file name.
    if len(sys.argv) == 2:
        percentage = sys.argv[1]
    else:
        percentage = '100'

    output_file_name = 'gp_u_set_' + percentage + '.csv'
    output_path = (TRAINING_SETS_DIR_PATH + '/MACHO_GP_Reduced/' +
                   output_file_name)

    path = LAB_PATH + 'Samples_Features/MACHO/' + percentage + '%/'
    files = list(get_paths(path))
    # Parenthesised form prints the same text on Python 2 and is valid on 3.
    print(files)

    # Fail with a clear message instead of an IndexError on files[0] below.
    if not files:
        sys.exit('No .csv feature files found under ' + path)

    # WARNING: it is unclear whether this stays valid once the id is added to
    # the feature computation, because the data will then come with an index.
    feature_list = pd.read_csv(files[0]).columns.tolist()
    exclude_list = None

    fs = FATS.FeatureSpace(Data=['magnitude', 'time', 'error'],
                           featureList=feature_list,
                           excludeList=exclude_list)

    # Leading empty field is the header for the light-curve id column
    # (the pandas index).
    header = ['']
    for name in fs.featureList:
        header.extend([name + '.l', name + '.mean', name + '.r',
                       name + '.std'])
    header.extend(['weight', 'class'])

    # Single open under a context manager: the handle is closed even if a
    # file fails to process, and the header and rows go through one handle.
    with open(output_path, 'w') as out:
        out.write(','.join(header) + '\n')
        # NOTE(review): the loop variable stays named `f` at module scope, as
        # in the original script; calc_gaussian has historically looked up a
        # global `f`, so do not rename it without checking that function.
        for f in files:
            out.write(calc_gaussian(f))