-
Notifications
You must be signed in to change notification settings - Fork 1
/
main_addition_.py
116 lines (93 loc) · 3.34 KB
/
main_addition_.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
import sys
sys.path.append('../train_code')
import numpy as np
import pandas as pd
from utils.utils import *
from train_config import args
from sklearn.neighbors import NearestNeighbors
import joblib
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
def f(x):
    """Return the number of distinct values in ``x`` minus one.

    ``x`` is wrapped in a ``pandas.Series`` first, so any sequence the
    Series constructor accepts can be passed in.
    """
    distinct_values = set(pd.Series(x).values)
    return len(distinct_values) - 1
def _plot_metric(cluster_counts, values, ylabel):
    """Show a line plot of one clustering metric against the cluster count k."""
    plt.xlabel('k')
    plt.ylabel(ylabel)
    plt.plot(cluster_counts, values, 'o-')
    plt.show()


def addtion_func(addtion_path):
    """Sweep KMeans cluster counts over the label-1 test rows and plot the scores.

    The test CSV and the result CSV are merged on ``eventId`` and only rows
    whose ``label`` equals 1 are kept. The rows are run through
    ``ProcessData`` and the stored feature transformer, then KMeans is fitted
    for k = 5, 10, ..., 55. For every k the inertia (SSE) and the euclidean
    silhouette score are printed, and both are finally plotted against k.

    Args:
        addtion_path: Currently unused; kept so existing callers keep working.
            No cluster labels are written to disk by this function.

    Returns:
        None. Results are reported via ``print`` and matplotlib windows.
    """
    test_path = '../../data/test_1.csv'
    res_path = "../../finalA/S3_finalB.csv"
    encode_type = 'utf-8'

    test = pd.read_csv(test_path, keep_default_na=False, encoding=encode_type)
    res = pd.read_csv(res_path, keep_default_na=False, encoding=encode_type)
    test = pd.merge(test, res, on=['eventId'])
    test = test[test['label'] == 1]
    print(len(test))
    print(test.columns)

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load encoder files from a trusted source.
    encoders = joblib.load('../train_code//dataSet/encoders.pkl')
    test, _ = ProcessData(test, encoders)
    drop_cols = args['drop_cols'] + ['label']
    print(drop_cols)
    x_columns = [x for x in test.columns if x not in drop_cols]

    # The entry at index len(encoders) - 1 is used as the feature transformer.
    # Indexed this way (not [-1]) because the container type is not visible
    # here -- presumably a list; verify against the training code.
    X = pd.DataFrame(
        encoders[len(encoders) - 1].transform(test[x_columns]),
        columns=x_columns,
    )

    n_samples = len(X)
    cluster_counts = []  # the k values that were actually evaluated
    SSE = []
    SIS = []
    for n_clusters in range(5, 60, 5):
        # silhouette_score needs between 2 and n_samples - 1 distinct labels,
        # so larger k cannot be scored on this data.
        if n_clusters >= n_samples:
            print("stop: n_cluster", n_clusters, "needs more than", n_samples, "samples")
            break
        print("n_cluster:", n_clusters)
        km = KMeans(n_clusters=n_clusters).fit(X)
        sis = silhouette_score(X, km.labels_, metric='euclidean')
        print("score:", sis, km.inertia_)
        cluster_counts.append(n_clusters)
        SSE.append(km.inertia_)
        SIS.append(sis)

    if not cluster_counts:
        print("no cluster count could be evaluated; nothing to plot")
        return

    # Plot against the evaluated k values so x and y always have equal length
    # and the axis shows the real cluster counts.
    _plot_metric(cluster_counts, SSE, 'SSE')
    _plot_metric(cluster_counts, SIS, 'SIS')
if __name__ == '__main__':
    # This path is for reference only.
    addtion_path = '../../finalA/S3_addtion.csv'
    addtion_func(addtion_path=addtion_path)