-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_preprocessor.py
50 lines (40 loc) · 1.98 KB
/
data_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from csv import DictReader
import pandas as pd
from io import StringIO
import re
class DataPreprocessor:
    """Repair a malformed CSV export and expose its rows via ``csv.DictReader``.

    The file at ``file_path`` is read as latin1 text, passed line by line
    through a fixed sequence of textual repairs (see :meth:`preprocess`) and
    then parsed with ``DictReader``.  :meth:`extract_title_and_text` reads the
    ``"Titre"`` and ``"Texte"`` columns of the parsed rows.
    """

    # A double quote that has no other double quote directly before or after it.
    _LONE_DOUBLE_QUOTE = re.compile(r'(?<!")"(?!")')

    def __init__(self, file_path: str):
        """Remember the path of the CSV file; nothing is read yet."""
        self.file_path = file_path
        # Filled in by preprocess(); None means "not parsed yet".
        self.dict_reader = None

    def extract_title_and_text(self):
        """Return one ``"<title>. <text>"`` string per parsed row.

        The title is the ``"Titre"`` field stripped of surrounding whitespace
        with every ``"."`` removed; the text is the ``"Texte"`` field with
        leading whitespace removed.

        If :meth:`preprocess` has not been called yet it is called here, so
        this method can be used directly after construction.  The underlying
        ``DictReader`` is a one-shot iterator: once its rows have been
        consumed, calling this method again without a new :meth:`preprocess`
        yields an empty list.

        Raises:
            KeyError: if a row has no ``"Titre"`` or ``"Texte"`` column.
        """
        # getattr() also covers instances created without running __init__.
        reader = getattr(self, "dict_reader", None)
        if reader is None:
            reader = self.preprocess()
        return [
            row["Titre"].strip().replace(".", "") + ". " + row["Texte"].lstrip()
            for row in reader
        ]

    def preprocess(self) -> DictReader:
        """Read the file, repair its lines and return a fresh ``DictReader``.

        The repairs are applied in this order: special-character cleanup,
        misplaced-comma fixes, removal of the last column, and double-quote
        cleanup.  The reader is also stored on ``self.dict_reader``.
        """
        with open(self.file_path, "r", encoding="latin1") as f:
            csv_lines = f.readlines()
        csv_lines = self._clean_special_characters_html(csv_lines)
        csv_lines = self._clean_misplaced_commas(csv_lines)
        # Cutting at the last comma also drops each line's trailing newline,
        # which is why the lines are re-joined with "\n" below.
        csv_lines = self._remove_last_header(csv_lines)
        csv_lines = self._clean_double_quotes_inside_fields(csv_lines)
        self.dict_reader = DictReader(StringIO("\n".join(csv_lines)))
        return self.dict_reader

    def _preprocess_dataframe(self, df: pd.DataFrame) -> None:
        """Add a ``whole_text`` column to ``df`` in place.

        Missing values in ``"Titre"`` and ``"Texte"`` are replaced by empty
        strings (this overwrites both columns), then ``whole_text`` is set to
        ``Titre + ". " + Texte`` and cast to the pandas ``"string"`` dtype.
        """
        df["Titre"] = df["Titre"].fillna('')
        df["Texte"] = df["Texte"].fillna('')
        df["whole_text"] = df["Titre"] + ". " + df["Texte"]
        df['whole_text'] = df['whole_text'].astype("string")

    def _clean_special_characters_html(self, csv_lines):
        """Return the lines with typographic apostrophes normalised to ``'``.

        The longer patterns (apostrophe followed by ``";`` or ``"``) are
        replaced first so the trailing quote/semicolon is removed with them.
        """
        # NOTE(review): the last replace maps "…" to itself, so it is a no-op
        # as written; the search strings may originally have been HTML
        # entities -- confirm against the raw input data.
        return [
            csv_line.replace("’\";", "'").replace("’\"", "'").replace("’", "'").replace("…", "…")
            for csv_line in csv_lines]

    def _clean_misplaced_commas(self, csv_lines):
        """Return the lines with known misplaced commas repaired.

        The first line gets the ``objet`` fix; every following line gets the
        ``article`` and ``publie`` fixes.  An empty input yields an empty list.
        """
        if not csv_lines:
            return []
        first_line, *other_lines = csv_lines
        repaired = [first_line.replace('"objet,"', '""objet"","')]
        repaired.extend(
            csv_line.replace('"article,"', '""article"","').replace('"",publie"""', '""publie"""')
            for csv_line in other_lines)
        return repaired

    def _remove_last_header(self, csv_lines):
        """Return each line truncated just before its last comma.

        This drops the last comma-separated column (header and values alike)
        together with the line's trailing newline.  A line without any comma
        only loses its trailing newline, so a final line that has no newline
        is kept intact instead of losing its last character.
        """
        trimmed = []
        for csv_line in csv_lines:
            last_comma = csv_line.rfind(',')
            if last_comma == -1:
                # rfind() returned -1: slicing with it would eat one character.
                trimmed.append(csv_line.rstrip("\n"))
            else:
                trimmed.append(csv_line[:last_comma])
        return trimmed

    def _clean_double_quotes_inside_fields(self, csv_lines):
        """Return the lines with lone quotes removed and doubled quotes halved.

        Every double quote that is not adjacent to another double quote is
        deleted, then each remaining ``""`` pair is collapsed to a single ``"``.
        """
        return [
            self._LONE_DOUBLE_QUOTE.sub("", csv_line).replace('""', '"')
            for csv_line in csv_lines]