-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans_pp.py
119 lines (95 loc) · 3.29 KB
/
kmeans_pp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
import numpy as np
import copy
import sys
import math
import mykmeanssp
# import mykmeanssp
# START FUNCTIONS AREA OF CODE
# def read_file_to_array(file_path):
# data = []
# with open(file_path, 'r') as file:
# for line in file:
# vector = [float(num) for num in line.strip().split(',')]
# data.append(vector)
# return data
def kmeans_plusplus(vectors_np, K):
    """Select K initial centers from ``vectors_np`` with k-means++ style seeding.

    The first center is drawn uniformly at random. Every later center is drawn
    with probability proportional to the Euclidean distance between a row and
    its nearest already-chosen center (the plain distance, not its square).
    The NumPy global RNG is re-seeded with 0 on every call, so repeated calls
    on the same input return the same selection.

    Args:
        vectors_np: NumPy array holding one data point per row.
        K: Number of centers to select. Values below 2 yield a single center.

    Returns:
        A tuple ``(centers, center_indexes)``: the list of chosen rows and the
        list of their row positions in ``vectors_np``, both in selection order.
    """
    np.random.seed(0)
    n = vectors_np.shape[0]  # number of data points

    first_index = np.random.choice(n)
    centers = [vectors_np[first_index]]
    center_indexes = [first_index]

    # Distance from every point to its nearest chosen center. It is maintained
    # incrementally: each round only measures against the newest center and
    # folds that into the running minimum, instead of re-measuring against all
    # centers chosen so far.
    nearest = None
    for _ in range(1, K):
        # The per-row norm call is kept (rather than a single axis=1 call) so
        # the distances, and therefore the sampled indexes, stay the same as
        # with a direct min-over-centers computation.
        to_newest = np.array(
            [np.linalg.norm(vector - centers[-1]) for vector in vectors_np]
        )
        nearest = to_newest if nearest is None else np.minimum(nearest, to_newest)

        total = np.sum(nearest)
        if total == 0:
            # Every point coincides with a chosen center (e.g. duplicate rows):
            # the weights would be 0/0, so fall back to a uniform draw.
            chosen_index = np.random.choice(n)
        else:
            chosen_index = np.random.choice(n, p=nearest / total)

        centers.append(vectors_np[chosen_index])
        center_indexes.append(chosen_index)

    return centers, center_indexes
# DEFINING INPUT VARIABLES AREA OF CODE
# Usage: kmeans_pp.py K [iter] epsilon file_path_1 file_path_2
# 'iter' is optional; when it is omitted the default of 300 iterations is used.
def _parse_int(text):
    """Return ``text`` converted with ``int``, or None if it is not an integer."""
    try:
        return int(text)
    except ValueError:
        return None


if len(sys.argv) >= 6:
    K, max_iter, epsilon = sys.argv[1], sys.argv[2], sys.argv[3]
    file_path_1, file_path_2 = str(sys.argv[4]), str(sys.argv[5])
elif len(sys.argv) == 5:
    max_iter = "300"
    K, epsilon = sys.argv[1], sys.argv[2]
    file_path_1, file_path_2 = str(sys.argv[3]), str(sys.argv[4])
else:
    # Too few arguments: report it instead of failing with an IndexError.
    print("An Error Has Occurred")
    sys.exit(1)

# INNERJOIN AREA OF CODE
try:
    df1 = pd.read_csv(file_path_1, header=None)
    df2 = pd.read_csv(file_path_2, header=None)
    # Name the first column 'index' so it can serve as the join key.
    df1.columns = ['index'] + list(df1.columns[1:])
    df2.columns = ['index'] + list(df2.columns[1:])
    # Convert the 'index' column to integer type
    df1['index'] = df1['index'].astype(int)
    df2['index'] = df2['index'].astype(int)
except (OSError, ValueError):
    # Missing/unreadable file, empty or malformed CSV, or a non-numeric key.
    print("An Error Has Occurred")
    sys.exit(1)

# Perform an inner join on the first column
vectors_pre_sorted_pd = pd.merge(df1, df2, on='index', how='inner')
# Sort the data points by the 'index' column in ascending order
vectors_pd = vectors_pre_sorted_pd.sort_values(by='index', ascending=True)
# Drop the key column and keep the coordinates as a 2D NumPy array
vectors = vectors_pd.iloc[:, 1:].values

# CHECKING FOR INPUT ERRORS
# K can only be validated once the number of joined data points is known.
K = _parse_int(K)
if K is None or not 1 < K < len(vectors):
    print("Invalid number of clusters!")
    sys.exit(1)

max_iter = _parse_int(max_iter)
if max_iter is None or not 1 < max_iter < 1000:
    print("Invalid maximum iteration!")
    sys.exit(1)

try:
    epsilon = float(epsilon)
except ValueError:
    print("An Error Has Occurred")
    sys.exit(1)

# Kmeans++ AREA OF CODE
centroids, center_indexes = kmeans_plusplus(vectors, K)

# Using C module mykmeanssp to get final centroids
# The call below hands both the centroids and the vectors over as lists of lists.
final_centroids = mykmeanssp.fit(
    np.array(centroids).tolist(), vectors.tolist(), K, max_iter, epsilon
)
if final_centroids is None:
    print("An Error Has Occurred")
    sys.exit(1)

# NOTE(review): center_indexes are row positions in the sorted, joined data.
# They equal the 'index' keys only when the keys are 0..N-1 with none dropped
# by the inner join - confirm which of the two the output is expected to show.
print(",".join([str(index) for index in center_indexes]))
print("\n".join([",".join(["{:.4f}".format(value) for value in centroid]) for centroid in final_centroids]))