-
Notifications
You must be signed in to change notification settings - Fork 2
/
process-output.py
32 lines (24 loc) · 1.49 KB
/
process-output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# coding: utf-8
import glob
import os
import pandas as pd
# Filter and normalise the word table: rows whose `manual_raw_value` contains
# punctuation or unsupported accented characters are dropped, the remaining
# cells have accented "e" variants folded to "e" and are lower-cased, and the
# result is written to a new CSV file.

INPUT_CSV = '/home/ahmed/Pictures/cogedis/cogedis_words_3/words.csv'
OUTPUT_CSV = '/home/ahmed/Pictures/cogedis/cogedis_words_large/words_processed.csv'

# A row is discarded when the filtered column contains any of these characters.
# Written as a raw-string character class so that no invalid string escapes
# (e.g. '\,' or '\:') are needed; inside the class only '-' and '\' have to be
# escaped, while '.' and '$' are already literal.
FORBIDDEN_CHARS = r"[,àâÈÀ\-_;:\\/.$€%°<>]"

# Accented "e" variants that are replaced by a plain "e" in every column.
ACCENTED_E = r"[éèÈÉ]"


def clean_words(frame, column='manual_raw_value'):
    """Return a filtered, normalised copy of *frame*.

    Every column is cast to ``str`` first so the ``.str`` accessor can be
    used on all of them. Rows where *column* contains a character from
    ``FORBIDDEN_CHARS`` are removed. In the surviving rows, every cell of
    every column has the characters in ``ACCENTED_E`` replaced by ``'e'``
    and is then lower-cased. The index is reset to ``0..n-1``.

    Args:
        frame: the input ``pandas.DataFrame``; it is not modified.
        column: name of the column the forbidden-character filter is
            applied to.

    Returns:
        A new ``pandas.DataFrame`` with the same columns as *frame*.

    Raises:
        KeyError: if *column* is not a column of *frame*.
    """
    frame = frame.astype(str)
    keep = ~frame[column].str.contains(FORBIDDEN_CHARS, regex=True)
    return (
        frame[keep]
        .replace(ACCENTED_E, 'e', regex=True)
        # Lower-casing happens after the accent fold, so 'É' ends up as 'e'.
        .apply(lambda col: col.str.lower())
        .reset_index(drop=True)
    )


def main():
    """Read ``INPUT_CSV``, print its row count, clean it, write ``OUTPUT_CSV``."""
    df = pd.read_csv(INPUT_CSV, sep=',')
    # Row count before filtering, as a quick sanity check on the input.
    print(len(df))
    df = clean_words(df)
    df.to_csv(OUTPUT_CSV, index=False, sep=',')


if __name__ == '__main__':
    main()