-
Notifications
You must be signed in to change notification settings - Fork 1
/
2019_train_find_dublicates.py
56 lines (36 loc) · 1.88 KB
/
2019_train_find_dublicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import hashlib
from os.path import join
import pandas as pd
import psutil
from joblib import Parallel, delayed
# Folder holding the original (unedited) APTOS 2019 training images.
train_path = "/path/to/2019_train/images/folder"

# Label file of the original APTOS 2019 training set; the rest of the script
# reads its `id_code` and `diagnosis` columns.
train_df = pd.read_csv("../input/trainLabels19.csv")
# Optional look at the label distribution:
# train_df["diagnosis"].value_counts()
def get_hash(file, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the contents of *file*.

    The file is opened in binary mode and fed to the hash in pieces of
    ``chunk_size`` bytes, so memory use stays bounded no matter how large
    the file is. The digest is identical to hashing the whole content at
    once.

    Args:
        file: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The MD5 digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5_hash = hashlib.md5()
    with open(file, "rb") as f:
        # iter() with a sentinel stops as soon as read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
def get_full_path(path, file_name, ext=".png"):
    """Build the full path of a file from its folder and extension-less name.

    Args:
        path: Folder that contains the file.
        file_name: File name without its extension.
        ext: Extension appended to ``file_name``, including the leading dot.
            Defaults to ".png", so existing two-argument calls behave as
            before.

    Returns:
        ``path`` joined with ``file_name + ext``.
    """
    return join(path, file_name + ext)
# Hash every training image in parallel. The hashes are assigned straight to
# a new column, which relies on joblib returning results in input order.
train_file_hashes = Parallel(n_jobs=psutil.cpu_count(), verbose=1)(
    delayed(get_hash)(get_full_path(train_path, x)) for x in train_df.id_code
)
train_df["md5"] = train_file_hashes

# Per-row group statistics: how many rows share this row's hash, and how many
# distinct diagnoses occur among the rows with that hash.
train_df["md5_count"] = train_df.groupby("md5").id_code.transform("count")
train_df["md5_dub_no_unique"] = (
    train_df.groupby("md5").diagnosis.transform("nunique").astype("int")
)

# Rows whose file content occurs more than once AND with differing diagnoses.
df_uni = train_df[(train_df.md5_count > 1) & (train_df.md5_dub_no_unique > 1)]
# Previously a bare expression whose value was discarded when run as a
# script; print it so the diagnostic is actually visible.
print(len(df_uni), df_uni["md5"].nunique())

# Keep one row per (md5, diagnosis) pair, then recount per hash: after this
# step md5_count > 1 means the same file content carries different diagnoses.
# NOTE(review): the pd.DataFrame(...) wrapper is kept from the original;
# presumably it avoids a SettingWithCopyWarning on the assignment below.
df_train_no_dub = pd.DataFrame(
    train_df.drop_duplicates(subset=["md5", "diagnosis"], keep="first")
)
df_train_no_dub["md5_count"] = (
    df_train_no_dub.groupby("md5").id_code.transform("count")
)
# Renumber rows 0..n-1 and discard the old index in a single step.
df_train_no_dub.reset_index(drop=True, inplace=True)

# Distribution of how many conflicting rows each remaining hash has
# (previously computed and discarded).
print(df_train_no_dub[df_train_no_dub.md5_count > 1].md5_count.value_counts())

# Rows that still share a hash with another row, grouped together by hash.
df_train_dublicates = df_train_no_dub[df_train_no_dub.md5_count > 1].sort_values("md5")
df_train_dublicates.to_csv(
    "../input/trainLabels19_duplicates.csv",
    columns=["id_code", "diagnosis", "md5"],
    index=False,
)

# keep=False drops every row whose hash occurs more than once, so only
# unambiguous rows remain in the "unique" label file.
df_train_unique = pd.DataFrame(
    df_train_no_dub.drop_duplicates(subset=["md5"], keep=False)
)
df_train_unique.to_csv(
    "../input/trainLabels19_unique.csv",
    columns=["id_code", "diagnosis"],
    index=False,
)