sa_classify.py
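"""Feature selection for a k-NN classifier via simulated annealing.

Starting from a random feature subset, the script repeatedly swaps features
in and out, scores each candidate subset by the k-NN misclassification rate
(via k_nn.get_accuracy), and accepts worse candidates with a
temperature-dependent probability before cooling geometrically.
"""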
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from k_nn import get_accuracy

def get_table_subset(table, columns):
    return table[columns]

def get_initial_solution(feature_size, selected_features):
    # Random subset of distinct feature indices to start the search from.
    return np.random.choice(feature_size, size=selected_features, replace=False)

def get_neighbor(current_solution, feature_size, temperature):
    # Swap a temperature-dependent number of selected features for unselected ones.
    all_features = range(feature_size)
    selected = np.array(current_solution)
    not_selected = np.setdiff1d(all_features, selected)
    # More swaps while the temperature is high, but never more than ~10% of the subset.
    num_swaps = int(min(np.ceil(np.abs(np.random.normal(0, 0.1 * len(selected) * temperature))),
                        np.ceil(0.1 * len(selected))))
    feature_out = np.random.choice(len(selected), num_swaps, replace=False)
    feature_in = np.random.choice(len(not_selected), num_swaps, replace=False)
    selected = np.delete(selected, feature_out)
    selected = np.append(selected, not_selected[feature_in])
    return list(selected), num_swaps

def get_cost(solution):
    # Cost = k-NN misclassification rate using only the selected feature columns
    # (the target attribute is kept as the last column).
    limited_test_data = get_table_subset(test_data, train_data.columns[np.append(solution, [-1])])
    limited_train_data = get_table_subset(train_data, train_data.columns[np.append(solution, [-1])])
    correct_dict, incorrect_dict = get_accuracy(limited_train_data, limited_test_data, k)
    correct_count = sum(correct_dict.values())
    incorrect_count = sum(incorrect_dict.values())
    return 1.0 * incorrect_count / (correct_count + incorrect_count)

def get_probability(temperature, delta_cost):
    # Metropolis acceptance probability for a worse move (delta_cost <= 0 here).
    return np.exp(delta_cost / temperature)

def simulated_annealing(init_soln, init_temp, max_iterations, alpha):
    temperature = init_temp
    solution = init_soln
    cost = get_cost(solution)
    best_solution = solution
    best_cost = cost
    iteration = 0
    temp_history = [temperature]
    cost_history = [cost]
    prob_history = [0]
    swaps_history = [0]
    best_history = [best_cost]
    while iteration < max_iterations:
        next_solution, swaps = get_neighbor(solution, len(features) - 1, temperature)
        next_cost = get_cost(next_solution)
        probability = 0
        if next_cost > cost:
            # Worse neighbour: accept only with the Metropolis probability.
            probability = get_probability(temperature, cost - next_cost)
        if next_cost < cost or np.random.random() < probability:
            cost = next_cost
            solution = next_solution
        if cost < best_cost:
            best_cost = cost
            best_solution = solution
        iteration += 1
        # Geometric cooling schedule.
        temperature *= alpha
        temp_history.append(temperature)
        cost_history.append(cost)
        prob_history.append(probability)
        swaps_history.append(swaps)
        best_history.append(best_cost)
    return best_solution, best_cost, temp_history, cost_history, prob_history, swaps_history, best_history

if len(sys.argv) != 7:
    print("Invalid number of arguments!\nUsage:\npython " + sys.argv[0] +
          " <TRAINING_DATA> <TEST_DATA> <ITERATIONS> <K> <NUM_FEATURES> <ALPHA>\n")
    sys.exit(1)

train_data = pd.read_csv(sys.argv[1])
test_data = pd.read_csv(sys.argv[2])
target_attribute = train_data.columns[-1]
max_iterations = int(sys.argv[3])
k = int(sys.argv[4])
selected_features = int(sys.argv[5])
alpha = float(sys.argv[6])
features = train_data.columns

# Start from a random feature subset (the last column is the target attribute).
random_solution = get_initial_solution(len(features) - 1, selected_features)
init_temp = 1.0
solution, cost, temp_history, cost_history, prob_history, swaps_history, best_history = \
    simulated_annealing(random_solution, init_temp, max_iterations, alpha)

print("Best solution is", train_data.columns[solution])
print("Cost is", cost)
# Diagnostic plots: temperature, best cost, current cost,
# acceptance probability, and swaps per iteration.
plt.figure(1)
plt.subplot(511)
plt.plot(temp_history)
plt.subplot(512)
plt.plot(best_history)
plt.subplot(513)
plt.plot(cost_history)
plt.subplot(514)
plt.plot(prob_history)
plt.subplot(515)
plt.plot(swaps_history)
plt.show()
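
# Example invocation (file names and parameter values below are placeholders):
#   python sa_classify.py train.csv test.csv 500 3 10 0.95
# i.e. 500 annealing iterations, k=3 neighbours, a subset of 10 features,
# and a cooling rate alpha of 0.95.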