# -*- coding: utf-8 -*-
"""3.0 Data Cleaning (Missing Values, Duplicates, Outliers).ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1XvNSyPO3DsggTBfDGzaq9NbykhGJ7PPE
# Data Cleaning
Une fois qu'on a identifié les problèmes dans nos données, on procède à leur nettoyage.
Cela consiste en :
- la suppression des données dupliquées
- le traitement des valeurs manquantes
- la suppression des valeurs aberrantes
Cette étape est cruciale pour garantir la qualité et la fiabilité des données avant de les utiliser pour l'analyse et la modélisation.
"""
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

from params import SOUNDS_DATASET_PATH, SAMPLE_RATE, CLASS_COLORS
from tools import *

# change plot style
plt.style.use('ggplot')
tqdm.pandas()

N_JOBS = -1
# euclidean distance with np.linalg.norm
def euclidean_distance(vector1: np.ndarray, vector2: np.ndarray):
"""
Compute euclidean distance between two vectors
"""
return np.linalg.norm(vector1 - vector2)
# cosine similarity with np.dot
def cosine_similarity(vector1: np.ndarray, vector2: np.ndarray):
"""
Compute cosine similarity between two vectors
"""
return np.dot(vector1, vector2) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
def similarity(vector1: np.ndarray, vector2: np.ndarray, metric: str = "euclidean"):
    """
    Compute a similarity score between two vectors using the specified metric ("cosine" or "euclidean").
    """
    if metric == "cosine":
        sim = cosine_similarity(vector1, vector2)
        return sim
        # To normalize cosine similarity to [0, 1] instead:
        # return (sim + 1) / 2
    elif metric == "euclidean":
        dist = euclidean_distance(vector1, vector2)
        # Normalize the euclidean distance to [0, 1] by dividing by the maximum possible
        # distance (assumes features scaled to the [0, 1] range), then invert it.
        max_dist = np.sqrt(len(vector1))
        return 1 - (dist / max_dist)
    else:
        raise ValueError(f"Unknown metric: {metric}")
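# A minimal sanity check of the similarity helper above (illustrative values,
# assuming feature vectors scaled to [0, 1] so the euclidean normalization holds):
#   similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0]), metric="cosine")     -> 1.0
#   similarity(np.array([0.0, 0.0]), np.array([1.0, 1.0]), metric="euclidean")  -> 0.0 (maximally distant)
#   similarity(np.array([0.2, 0.2]), np.array([0.2, 0.2]), metric="euclidean")  -> 1.0 (identical)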
def compute_similarity_pair(file_i, row_i, df_X, metric="euclidean", threshold: float = None):
    """
    Compute the similarity between one row (file_i) and every other row of df_X,
    keeping only the pairs whose similarity is at least the threshold.
    """
    if threshold is None:
        threshold = 0 if metric == "cosine" else 0.5
    similarities = {}
    vectori = row_i.to_numpy()
    for file_j, row_j in df_X.iterrows():
        if file_i == file_j:
            continue
        # Note: this check only deduplicates within a single call; when the calls run
        # in parallel, each worker has its own dict, so (i, j) and (j, i) are both computed.
        if (file_i, file_j) in similarities or (file_j, file_i) in similarities:
            continue
        vectorj = row_j.to_numpy()
        sim = similarity(vectori, vectorj, metric=metric)
        if sim >= threshold:
            similarities[(file_i, file_j)] = sim
    return similarities
def compute_similarities_parallel(df_X: pd.DataFrame, metric: str = "euclidean", threshold: float = None,
                                  n_jobs: int = -1):
    """
    Compute pairwise similarities over df_X in parallel (one job per row).
    """
    similarities = {}
    # n_jobs = -1  # use all available cores
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_similarity_pair)(file_i, row_i, df_X, metric=metric, threshold=threshold)
        for file_i, row_i in tqdm(df_X.iterrows(), total=len(df_X))
    )
    for r in results:
        similarities.update(r)
    return similarities
def compute_df_similarities(df_X: pd.DataFrame, metric: str = "euclidean", threshold: float = None, n_jobs: int = -1):
    """
    Build a DataFrame of similar file pairs (columns: similarity, file_i, file_j),
    sorted by decreasing similarity.
    """
    similarities = compute_similarities_parallel(df_X, metric=metric, threshold=threshold, n_jobs=n_jobs)
    df_similarities = pd.DataFrame.from_dict(
        similarities, orient='index', columns=['similarity']
    ).sort_values(by=['similarity'], ascending=False)
    print(f"Number of similar audio file pairs (similarity > {threshold}): {len(df_similarities)}")
    df_similarities.reset_index(inplace=True)
    df_similarities["file_i"] = df_similarities["index"].progress_apply(lambda x: x[0])
    df_similarities["file_j"] = df_similarities["index"].progress_apply(lambda x: x[1])
    df_similarities.drop(columns=["index"], inplace=True)
    return df_similarities
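# The resulting table looks like this (file paths and scores are hypothetical):
#     similarity    file_i             file_j
# 0   0.9998        .../kick_01.wav    .../kick_01_copy.wav
# 1   0.9991        .../snare_a.wav    .../snare_a2.wav
# Each row is a pair of audio files whose feature vectors are more similar than `threshold`.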
def get_outliers_iqr_per_class(df, column, class_column, multiplier=1.5):
    """
    Return the indices of rows whose value in `column` falls outside the IQR bounds,
    computed separately for each class.
    """
    outliers_indices = []
    # Split the dataframe into subgroups according to the classes.
    for class_value in df[class_column].unique():
        class_df = df[df[class_column] == class_value]
        # Apply the IQR method to each subgroup.
        Q1 = class_df[column].quantile(0.25)
        Q3 = class_df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR
        outliers_class_df = class_df[(class_df[column] < lower_bound) | (class_df[column] > upper_bound)]
        outliers_indices.extend(outliers_class_df.index.tolist())
    return outliers_indices
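# Worked example of the per-class IQR rule above (illustrative numbers):
# if, for one class, Q1 = 10 and Q3 = 30, then IQR = 20 and, with multiplier = 1.5,
# lower_bound = 10 - 1.5 * 20 = -20 and upper_bound = 30 + 1.5 * 20 = 60,
# so any row of that class whose value in `column` is below -20 or above 60
# is flagged and its index is returned.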
def main():
    # LOAD DATASET -------------------------------------------------------------
    print("*" * 20, "LOAD DATASET", "*" * 20)
    now_day_str = "20230511"
    dataset_csv_path = os.path.join(SOUNDS_DATASET_PATH, f'dataset_features_{now_day_str}.csv')
    if not os.path.exists(dataset_csv_path):
        raise Exception(f"Dataset csv file not found: {dataset_csv_path}")
    print("> Dataset path: ", dataset_csv_path)
    df_drums = pd.read_csv(dataset_csv_path)
    df_drums.set_index('file_path', inplace=True)  # set index to file_path
    print("> Dataset shape: ", df_drums.shape)
    print(f"> Number of columns: {len(df_drums.columns)}")
    ## FEATURES COLUMNS
    features_columns = [k for k, v in df_drums.dtypes.to_dict().items() if v == 'float64' or v == 'int64']
    print("> Features columns: ", features_columns[:3], "...", features_columns[-3:])
    print(f"> Number of features: {len(features_columns)}")
    columns_by_prefix = get_columns_by_prefix_(features_columns)
    print(columns_by_prefix)
    ## PROCESS MISSING VALUES
    print("*" * 20, "MISSING VALUES", "*" * 20)
    print(f"> Number of rows with missing values: {df_drums.isna().any(axis=1).sum()}")
    print(
        f"> Number of rows with missing values in all features columns: {df_drums[features_columns].isna().all(axis=1).sum()}")
    # Delete rows with missing values in all features columns
    print(f"> Dataset shape before: {df_drums.shape}")
    df_drums.dropna(axis=0, how='all', subset=features_columns, inplace=True)
    print(f"> Dataset shape after: {df_drums.shape}")
    # DUPLICATES ---------------------------------------------------------------
    print("*" * 20, "DUPLICATES", "*" * 20)
    ## 1. DUPLICATED ROWS ------------------------------------------------------
    print(f"Number of fully duplicated rows: {df_drums.duplicated().sum()}")
    duplicated_focus_on_features = df_drums.duplicated(subset=features_columns)
    print(
        f"Number of duplicated rows (features only): {duplicated_focus_on_features.sum()} rows (candidates for removal)")
    # print per class
    print(df_drums[duplicated_focus_on_features].groupby("class").count()["file_name"].sort_values(ascending=False))
    duplicated_focus_on_features = df_drums.duplicated(subset=features_columns, keep=False)
    # Collect the duplicated rows (all copies)
    duplicates_df = df_drums[duplicated_focus_on_features].sort_values(by=features_columns)
    # Group the duplicated rows by their feature values
    grouped_duplicates = duplicates_df.groupby(features_columns)
    # Build a list of file_path lists, one per group of duplicates
    duplicate_groups = []
    for _, group in grouped_duplicates:
        duplicate_groups.append(list(group.index))
    # Display the duplicate groups
    # for i, group in enumerate(duplicate_groups):
    #     print(f"# Duplicate Group {i + 1}:")
    #     for file_path in group:
    #         print(f" - {file_path}")
    #     print()
    duplicates_idx_to_delete = []
    file_to_delete_num_group_map = {}
    # Go through each group of duplicates
    for num_group, group in enumerate(duplicate_groups, start=1):
        # Sort the audio files of the group by the length of their file name
        file_name_len = lambda file_path: len(os.path.basename(file_path))
        sorted_group = sorted(group, key=file_name_len)
        # Keep the audio file with the shortest file name (the first one after sorting)
        to_keep = sorted_group[0]
        # Add the other audio files of the group to the list of files to delete
        duplicates_idx_to_delete.extend(sorted_group[1:])
        for file_path in sorted_group[1:]:
            file_to_delete_num_group_map[file_path] = num_group
    print(f"> {len(set(duplicates_idx_to_delete))} duplicated rows removed")
    # Drop the other audio files of each duplicate group from the DataFrame
    df_drums = df_drums.drop(index=duplicates_idx_to_delete)
    # df_drums now contains the data without the unwanted duplicates
    print(
        f"> After cleaning, number of duplicated rows (features only): {df_drums.duplicated(subset=features_columns).sum()} rows (which could still potentially be removed)")
    print("> Dataset shape:", df_drums.shape)
    # Save the duplicated file paths to a csv file (so they can be deleted manually)
    if not duplicates_idx_to_delete:
        duplicates_idx_to_delete.append("No duplicates")
    backup_output_path = os.path.join(SOUNDS_DATASET_PATH, f"__duplicates_rows_{now_day_str}.csv")
    backup_series = pd.Series(duplicates_idx_to_delete, name="file_path")
    backup_series.to_csv(backup_output_path, index=False, header=True)
    ## 2. DUPLICATED FILE_NAME -------------------------------------------------
    print(
        f"> Number of rows duplicated on the 'file_name' column: {df_drums.duplicated(subset=['file_name', 'file_extension']).sum()} rows (candidates for removal)")
    # Leftover notebook display (has no effect when run as a script):
    df_drums[df_drums.duplicated(subset=['file_name', 'file_extension'], keep=False)].sort_values(
        by=['file_name', 'file_extension'])
    ## 3. DUPLICATED AUDIO (TOO SIMILAR) ---------------------------------------
    df_similarities = compute_df_similarities(df_drums[features_columns], metric="cosine", threshold=0.9,
                                              n_jobs=N_JOBS)
    #### Similarity > 0.999
    threshold = 0.999725
    df_similarities_0_999 = df_similarities.query(f"similarity > {threshold}")
    print(
        f"> Number of pairs with similarity > {threshold}: {df_similarities_0_999.shape[0]} rows (candidates for removal)")
    #### Delete similar files
    # For each file, count how many near-duplicate pairs it appears in
    table_loser_0_999 = pd.Series(
        df_similarities_0_999["file_i"].to_list() + df_similarities_0_999["file_j"].to_list()).value_counts()
    similar_file_to_delete = []  # list of files to delete
    for i, row in df_similarities_0_999.iterrows():
        if row["file_i"] in similar_file_to_delete or row["file_j"] in similar_file_to_delete:
            continue
        # For each pair, delete the file that appears in the most near-duplicate pairs
        if table_loser_0_999[row["file_j"]] > table_loser_0_999[row["file_i"]]:
            # print(f"{os.path.basename(row['file_j'])} loses vs. {os.path.basename(row['file_i'])} (because {table_loser_0_999[row['file_j']]} > {table_loser_0_999[row['file_i']]})")
            similar_file_to_delete.append(row["file_j"])
        else:
            # print(f"{os.path.basename(row['file_i'])} loses vs. {os.path.basename(row['file_j'])} (because {table_loser_0_999[row['file_i']]} > {table_loser_0_999[row['file_j']]})")
            similar_file_to_delete.append(row["file_i"])
    print(f"> {len(similar_file_to_delete)} files to delete (similarity > {threshold})")
    # Class value counts among the files to delete
    similar_class_value_counts = df_drums.loc[df_drums.index.isin(similar_file_to_delete), "class"].value_counts()
    print("similar_class_value_counts", similar_class_value_counts)
    # Drop the too-similar files from df_drums
    if similar_file_to_delete:
        df_drums = df_drums.drop(similar_file_to_delete)
    # Save the list of deleted files (use a placeholder so the csv is never empty)
    if not similar_file_to_delete:
        similar_file_to_delete.append("x")
    backup_output_path = os.path.join(SOUNDS_DATASET_PATH, f"__duplicates_too_similar_{threshold}_{now_day_str}.csv")
    backup_series = pd.Series(similar_file_to_delete, name="file_path")
    backup_series.to_csv(backup_output_path, index=False, header=True)
    # Outliers detection (IQR) -------------------------------------------------
    print()
    print("*" * 20, "Outliers detection (IQR)", "*" * 20)
    # Replace 'class' with the name of the column containing the classes in your dataframe.
    class_column = 'class'
    # Go through every feature column of df_drums in which we want to detect outliers.
    outliers_counter = Counter()
    for col in tqdm(features_columns):
        outliers_indices = get_outliers_iqr_per_class(df_drums, col, class_column)
        outliers_counter.update(outliers_indices)
    outliers_counter_df = pd.DataFrame(outliers_counter.items()).set_index(0).sort_values(by=1, ascending=False)
    limit_outliers_count = int(len(features_columns) * 0.5)
    print(f"> limit_outliers_count: {limit_outliers_count}")
    # Find the indices of the rows flagged as outliers in at least limit_outliers_count columns.
    outliers_to_remove = outliers_counter_df[outliers_counter_df[1] >= limit_outliers_count].index.tolist()
    print(f"> Number of outliers to remove: {len(outliers_to_remove)}")
    ### Delete outliers
    # Remove the outliers from the dataframe.
    print(f"> Dataframe shape before outlier cleaning: {df_drums.shape}")
    if outliers_to_remove:
        df_drums = df_drums.drop(outliers_to_remove)
    print(f"> Dataframe shape after outlier cleaning: {df_drums.shape}")
    # Save the removed outliers (use a placeholder so the csv is never empty)
    if not outliers_to_remove:
        outliers_to_remove.append("x")
    backup_output_path = os.path.join(SOUNDS_DATASET_PATH, f"__outliers_{limit_outliers_count}_{now_day_str}.csv")
    backup_series = pd.Series(outliers_to_remove, name="file_path")
    backup_series.to_csv(backup_output_path, index=False, header=True)
# SAVE CLEANED DATASET -----------------------------------------------------
final_output_path = os.path.join(SOUNDS_DATASET_PATH, f"dataset_features_cleaned_{now_day_str}.csv")
df_drums.to_csv(final_output_path, index=True)
print("#" * 20, "Cleaned dataset", "#" * 20)
print(f"-> {final_output_path}")
if __name__ == '__main__':
main()