# The Mean Squares - TrackML Challenge 2018
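# Clusters detector hits into track candidates: each hit's azimuthal angle is
# unrolled along a scanned helix-pitch hypothesis, DBSCAN groups hits in the
# transformed feature space, and results are scored with trackml's score_event.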
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from sklearn.cluster import DBSCAN
from trackml.dataset import load_event, load_dataset
from trackml.score import score_event
# update path according to your folder setup
path_to_train = "/train_sample/train_100_events"
event_prefix = "event000001000"
hits, cells, particles, truth = load_event(os.path.join(path_to_train, event_prefix))
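# Optional sanity check on the sample event (the standard TrackML hits schema
# is hit_id, x, y, z plus volume/layer/module ids):
# print(hits.shape, cells.shape, particles.shape, truth.shape)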
class Clusterer(object):
    def processLabels(self, ht):
        # helix transform equations
        ht['rt'] = np.sqrt(ht.x**2 + ht.y**2)
        ht['a0'] = np.arctan2(ht.y, ht.x)
        ht['r'] = np.sqrt(ht.x**2 + ht.y**2 + ht.z**2)
        ht['z1'] = ht.z / ht['rt']
        ht['z2'] = ht.z / ht['r']
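        # The scan below perturbs a0 in proportion to z (an unrolled-helix
        # approximation), then clusters hits in the (sin a1, cos a1, z/rt, z/r)
        # feature space; hits lying along the same helix hypothesis end up in
        # the same DBSCAN cluster.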
        dz0 = -0.00070
        stepdz = 0.00001
        stepeps = 0.000005
        inv = 1
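        # dz is the helix-pitch hypothesis, stepped by stepdz each iteration;
        # inv alternates its sign so both winding directions are tried, and
        # eps grows by stepeps per pass so later passes can form looser clusters.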
        # 100 iterations of clustering
        for i in tqdm(range(100)):
            inv = inv * -1
            dz = inv * (dz0 + i * stepdz)
            ht['a1'] = ht['a0'] + dz * ht['z'] * np.sign(ht['z'].values)
            ht['sina1'] = np.sin(ht['a1'])
            ht['cosa1'] = np.cos(ht['a1'])
            ss = StandardScaler()
            dfs = ss.fit_transform(ht[['sina1', 'cosa1', 'z1', 'z2']].values)
            # feature weights for the Euclidean distance
            sc = np.array([1.0, 1.0, 0.4, 0.4])
            for j in range(np.shape(dfs)[1]):
                dfs[:, j] *= sc[j]
            # eps is incremented with each iteration
            clusters = DBSCAN(eps=0.0035 + i * stepeps, min_samples=1,
                              metric='euclidean', n_jobs=8).fit(dfs).labels_
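            # Merge pass: adopt the new labels only where they form a larger
            # cluster than the current assignment; the N2 < 20 cap rejects
            # implausibly large clusters.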
            if i == 0:
                ht['s1'] = clusters
                ht['N1'] = ht.groupby('s1')['s1'].transform('count')
            else:
                ht['s2'] = clusters
                ht['N2'] = ht.groupby('s2')['s2'].transform('count')
                max_s1 = ht['s1'].max()
                cond = np.where((ht['N2'].values > ht['N1'].values) & (ht['N2'].values < 20))
                s1 = ht['s1'].values
                # offset by max_s1 + 1 so relabelled clusters cannot collide
                # with the existing label max_s1
                s1[cond] = ht['s2'].values[cond] + max_s1 + 1
                ht['s1'] = s1
                ht['s1'] = ht['s1'].astype('int64')
                self.clusters = ht['s1'].values
                ht['N1'] = ht.groupby('s1')['s1'].transform('count')
        return ht['s1'].values
    def predict(self, hits):
        self.clusters = self.processLabels(hits)
        return self.clusters
def create_one_event_submission(event_id, hits, labels):
    sub_data = np.column_stack(([event_id] * len(hits), hits.hit_id.values, labels))
    submission = pd.DataFrame(data=sub_data, columns=["event_id", "hit_id", "track_id"]).astype(int)
    return submission
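# Each per-event submission is a three-column integer DataFrame (illustrative
# values):
#   event_id  hit_id  track_id
#       1000       1        17
#       1000       2        17
# score_event() compares these track assignments against the ground truth.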
# Training/Validation set
train_submissions = []
dataset_scores = []
for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=0, nevents=5):
    # Track pattern recognition
    model = Clusterer()
    labels = model.predict(hits)
    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    train_submissions.append(one_submission)
    # Score for the event
    score = score_event(truth, one_submission)
    dataset_scores.append(score)
    print("Score for event %d: %.8f" % (event_id, score))
print('Mean score: %.8f' % (np.mean(dataset_scores)))
# Test set
path_to_test = "/test/test"
test_submission = []
# 125 test events
for event_id, hits, cells in load_dataset(path_to_test, parts=['hits', 'cells']):
    # Track pattern recognition
    model = Clusterer()
    labels = model.predict(hits)
    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    test_submission.append(one_submission)
    print('Event ID: ', event_id)
# Create submission file
submission = pd.concat(test_submission, axis=0)
submission.to_csv('submission-001.csv', index=False)  # size is roughly 200 MB
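# The raw CSV is large; a minimal alternative (same data) is to let pandas
# compress it directly:
# submission.to_csv('submission-001.csv.gz', index=False, compression='gzip')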