-
Notifications
You must be signed in to change notification settings - Fork 21
/
dataset_script.py
123 lines (99 loc) · 3.52 KB
/
dataset_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
The purpose of this script is to run experiments on each of the toy datasets...
"""
import argparse
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from see import GeneticSearch
from see.base_classes import pipedata
from see.classifiers import Classifier
from see.classifier_fitness import ClassifierFitness
from see.classifier_helpers import helpers
from see.Workflow import workflow
# Command-line interface: an output-file suffix plus the two genetic-search
# sizing knobs.
parser = argparse.ArgumentParser(description="Create some csv data files.")

# Suffix appended to generated filenames so separate runs do not collide.
parser.add_argument(
    "--filename-tail",
    help="tail to add to the end of the filenames generated (default: <filename>_0.csv)",
    default="0",
)

# Both integer options share the same type and default, so declare them
# from a small table instead of repeating the call.
for flag, help_text in (
    ("--num-gen", "number of generations to run genetic search (default: 20)"),
    (
        "--pop-size",
        "population size of each generation to run genetic search (default: 20)",
    ),
):
    parser.add_argument(flag, type=int, default=20, help=help_text)

args = parser.parse_args()
# Initialize algorithm space and workflow.
# Handle on the classifier lookup exposed by Classifier; the evaluation code
# at the end of this script indexes Classifier.algorithmspace by name.
# NOTE(review): `algorithm_space` itself is not referenced again in the
# visible script.
algorithm_space = Classifier.algorithmspace
# Register the classifier stage and its fitness stage on the workflow class.
# This is done before the instance below is created.
workflow.addalgos([Classifier, ClassifierFitness])
# NOTE(review): `wf` is not referenced again in the visible script; the
# genetic search below is handed the `workflow` class, not this instance.
wf = workflow()
# Create data: the toy datasets from the scikit-learn classifier tutorial.
def _as_pipedata(name, features, labels):
    """Wrap a feature matrix and label vector in a named ``pipedata``."""
    dataset = pipedata()
    dataset.name = name
    dataset.X = features
    dataset.y = labels
    return dataset


# Moons
moons_ds = _as_pipedata("Moons", *make_moons(noise=0.3, random_state=0))

# Circles
circles_ds = _as_pipedata(
    "Circles", *make_circles(noise=0.2, factor=0.5, random_state=1)
)

# Linearly separable dataset: generated samples shifted by seeded uniform
# noise scaled by 2.
lin_X, lin_y = make_classification(
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=1,
)
rng = np.random.RandomState(2)
lin_ds = _as_pipedata(
    "Linearly Separable",
    lin_X + 2 * rng.uniform(size=lin_X.shape),
    lin_y,
)

datasets = [moons_ds, circles_ds, lin_ds]
# Filled in by the splitting step, one entry per dataset, in the same order.
validation_sets = []
# Preprocess data: standardize each dataset's features with its own scaler.
for ds in datasets:
    scaler = StandardScaler()
    ds.X = scaler.fit_transform(ds.X)

# Replace each dataset with a train/test split for the genetic search, and
# record a second split of the same data (random_state=31) for the final
# re-evaluation of the best individuals.
# NOTE: an earlier variant carved the search split out of the training half
# of a first split; the two splits are now drawn independently from the
# full data.
for i, ds in enumerate(datasets):
    search_split = helpers.generate_train_test_set(ds.X, ds.y)
    datasets[i] = search_split
    # The split object needs the dataset name for progress output and
    # output filenames later in the script.
    search_split.name = ds.name
    validation_sets.append(
        helpers.generate_train_test_set(ds.X, ds.y, random_state=31)
    )
# Genetic-search settings taken from the command line.
NUM_GENERATIONS = args.num_gen
POP_SIZE = args.pop_size

# One hall of fame per dataset, in the same order as `datasets`.
hof_per_dataset = []
for ds in datasets:
    print(f"Running {ds.name} Dataset")
    # Per-dataset, per-run CSV that receives the fitness log.
    fitness_csv = f"{ds.name}_fitness_{args.filename_tail}.csv"
    # NOTE(review): the `workflow` class (not an instance) is passed here;
    # presumably Evolver instantiates it itself — confirm against
    # GeneticSearch.Evolver.
    my_evolver = GeneticSearch.Evolver(workflow, ds, pop_size=POP_SIZE)
    my_evolver.run(
        ngen=NUM_GENERATIONS,
        print_fitness_to_file=True,
        print_fitness_filename=fitness_csv,
    )
    # Store the best solutions found for this dataset.
    hof_per_dataset.append(my_evolver.hof)
# Re-score the leading hall-of-fame individuals of each dataset on that
# dataset's validation split, printing "|search fitness|validation score|".
top_n = 5
for hof, validation in zip(hof_per_dataset, validation_sets):
    print('----------------\n')
    for ind in hof[:top_n]:
        # The first gene names the algorithm; the whole individual is
        # passed to the classifier as its parameter list.
        make_classifier = Classifier.algorithmspace[ind[0]]
        clf = make_classifier(ind)
        training_set = validation.training_set
        testing_set = validation.testing_set
        predictions = clf.evaluate(training_set, testing_set)
        score = ClassifierFitness().evaluate(predictions, testing_set.y)
        print(f'|{ind.fitness.values[0]}|{score}|')