-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranslate.py
88 lines (64 loc) · 3.03 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pandas as pd
from googletrans import Translator
from tqdm import tqdm
import datetime
from concurrent.futures import ThreadPoolExecutor
from Utils import Config
# Translation targets per source language: every configured language code is
# mapped to the list of all *other* configured language codes.
dests = {
    src_lang: list({other for other in Config.langs.values() if other != src_lang})
    for src_lang in Config.langs.values()
}
# train = pd.read_csv(Config.TRAIN_DIR + 'train.csv', header=0)
# train['source'] = train.country.apply(lambda country: Config.langs[country])
# batch_size = 10000
# dfs = [train[i:i+batch_size] for i in range(0, train.shape[0], batch_size)]
def process_row(row):
    """
    Translate a span into every other language and return all variants.

    The original span is always the first element of the returned list. It is
    followed by its distinct translations, in the order of the destination
    languages listed in ``dests[row.source]``. Translations that are identical
    to the original span or to an earlier translation are dropped.
    Every returned span carries both its Text and its Code so that it can be
    easily converted into a dataframe row.

    :param row: a row of the dataframe exposing the ``Text``, ``Code`` and
        ``source`` attributes (e.g. a tuple produced by ``DataFrame.itertuples``)
    :return: list of ``(Text, Code)`` tuples, original span first
    """
    original = (row.Text, row.Code)
    spans = [original]
    # Track what was already emitted so duplicates are dropped while the
    # order (original first) stays deterministic, unlike list(set(...)).
    seen = {original}
    # One request per destination language; the API is not very stable, so any
    # exception raised here is left to the caller to handle.
    for dest in dests[row.source]:
        translated = translator.translate(row.Text, src=row.source, dest=dest).text
        candidate = (translated, row.Code)
        if candidate not in seen:
            seen.add(candidate)
            spans.append(candidate)
    return spans
# # Select which dataframe to process
# df = dfs[14]
# Results of this run: one list of (Text, Code) tuples per translated row
row_lst = []
# Training data split into chunks of 500 rows; translate_batches indexes into
# this list, so a chunk index can be used as a restart point
train = list(pd.read_csv(Config.DATA_DIR + 'train.csv', header=0, chunksize=500))
try:
    prev_backup = pd.read_csv(Config.DATA_DIR + 'backup.csv', header=0)
except FileNotFoundError:
    # backup.csv is only created by translate_batches, so it does not exist
    # on a first run; start from an empty backup with the expected columns
    prev_backup = pd.DataFrame(columns=['Text', 'Code'])
# Index of the chunk in `train` at which translation starts
df_idx = 0
translator = Translator()
print(datetime.datetime.now())
def translate_batches(idx, max_retries=None):
    """
    Translate every chunk of ``train`` starting at position ``idx``.

    For each chunk a ``source`` column is derived from ``country`` through
    ``Config.langs``, a fresh ``Translator`` is created, and the rows are
    passed to ``process_row`` on a thread pool. The per-row results are
    appended to the global ``row_lst``; after every chunk, all rows collected
    so far are written to ``backup.csv`` together with ``prev_backup``.

    When translating a chunk raises, that same chunk is retried with a new
    ``Translator``. Results are only added to ``row_lst`` once the whole chunk
    succeeded, so a retry never leaves partial or duplicated rows behind, and
    chunks that are already done are never translated a second time.

    :param idx: index into ``train`` of the first chunk to translate
    :param max_retries: maximum number of retries for a single chunk before
        the last exception is re-raised; ``None`` (default) retries until the
        chunk succeeds
    :return: None
    """
    global translator
    global row_lst
    global df_idx
    print("Starting translation on batch #{}".format(idx))
    pending = train[idx:]
    for i, d in tqdm(enumerate(pending), total=len(pending)):
        d['source'] = d.country.apply(lambda country: Config.langs[country])
        attempts = 0
        while True:
            # Get new Translator object
            translator = Translator()
            try:
                # Distribute processing to avoid 2h/dataframe
                with ThreadPoolExecutor(max_workers=20) as executor:
                    # Materialise the whole chunk first: executor.map raises
                    # lazily while iterating, and a failed chunk must not
                    # leave some of its rows in row_lst.
                    batch_rows = list(
                        executor.map(process_row, d.itertuples(index=False))
                    )
                break
            except Exception as e:
                print("Restarting translations: {}".format(e))
                # Remember the absolute position of the failing chunk
                df_idx = idx + i
                attempts += 1
                if max_retries is not None and attempts > max_retries:
                    raise
                print("Starting translation on batch #{}".format(df_idx))
        row_lst.extend(batch_rows)
        # Persist everything translated so far, on top of the previous backup
        backup_rows = [r for trsl in row_lst for r in trsl]
        backup = pd.DataFrame(backup_rows, columns=['Text', 'Code'])
        pd.concat([prev_backup, backup]).to_csv(Config.DATA_DIR + 'backup.csv', index=False)
        # Drop the translator used for this chunk; the next one gets a new one
        del translator
# Translate everything from the configured starting chunk, then log the end time
translate_batches(df_idx)
print(datetime.datetime.now())
# Flatten the per-row result lists into a single list of (Text, Code) tuples
row_lst = [span for spans in row_lst for span in spans]
# Shuffle with a fixed seed and store the augmented training set
translated_df = pd.DataFrame(row_lst, columns=['Text', 'Code'])
shuffled_df = translated_df.sample(frac=1, random_state=42)
shuffled_df.to_csv(Config.DATA_DIR + 'train_trans.csv', index=False)