extract_err-cor-pair_new.py
# coding: utf-8
# Extract (original, corrected) sentence pairs from a learner-corpus dump
# (one JSON record per line) and write them to a CSV file.
import argparse
import codecs
import json
import re
import platform
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Languages accepted for the --learn-lang (L2) and --native-lang (L1) options.
language = ['Korean', 'English', 'Japanese', 'Mandarin', 'Traditional Chinese',
            'Vietnamese', 'German', 'French', 'Other language', 'Spanish',
            'Indonesian', 'Russian', 'Arabic', 'Thai', 'Swedish', 'Dutch',
            'Hebrew', 'Tagalog', 'Portuguese(Brazil)', 'Cantonese', 'Italian',
            'Esperanto', 'Hawaiian', 'Afrikaans', 'Mongolian', 'Hindi', 'Polish',
            'Finnish', 'Greek', 'Bihari', 'Farsi', 'Urdu', 'Turkish', 'Portuguese(Portugal)',
            'Bulgarian', 'Norwegian', 'Romanian', 'Albanian', 'Ukrainian', 'Catalan',
            'Latvian', 'Danish', 'Serbian', 'Slovak', 'Georgian', 'Hungarian', 'Malaysian',
            'Icelandic', 'Latin', 'Laotian', 'Croatian', 'Lithuanian', 'Bengali', 'Tongan',
            'Slovenian', 'Swahili', 'Irish', 'Czech', 'Estonian', 'Khmer', 'Javanese', 'Sinhalese',
            'Sanskrit', 'Armenian', 'Tamil', 'Basque', 'Welsh', 'Bosnian', 'Macedonian', 'Telugu',
            'Uzbek', 'Gaelic', 'Azerbaijanian', 'Tibetan', 'Panjabi', 'Marathi', 'Yiddish', 'Ainu',
            'Haitian', 'Slavic']

# Inline markup tags used by the corpus to highlight corrections.
color_tags = ["f-red", "f-blue", "f-bold"]
sline_tag = "sline]"
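
# Expected record layout, inferred from the indexing in main() below
# (this is not documented in the script itself, so treat it as an assumption):
#   record[2] -> list of L2 languages the entry is written in
#   record[3] -> the author's L1 (native) language
#   record[4] -> list of original (learner) sentences
#   record[5] -> list of correction lists, one list per original sentence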

def main():
    assert platform.python_version_tuple()[0] == '3', \
        'This program supports only python3'
    args = parse_args()
    data_num = 0    # number of input lines read
    error_num = 0   # number of lines that could not be parsed
    original_sentences = []
    corrected_sentences = []
    with codecs.open(args.data_path, 'r', encoding='utf8') as f:
        for line in f:
            data_num += 1
            try:
                jsonData = json.loads(line, strict=False)
                l2_langs, l1_lang = jsonData[2], jsonData[3]
                orig_sents, corr_sents = jsonData[4], jsonData[5]
                if (args.l1 is None or args.l1 == l1_lang) and args.l2 in l2_langs:
                    for i, orig_sent in enumerate(orig_sents):
                        orig_sent = orig_sent.replace('\t', ' ')
                        if len(corr_sents[i]) > 0:
                            tag_err = False
                            for corr_sent in corr_sents[i]:
                                corr_sent = corr_sent.replace('\t', ' ')
                                text, tag_err = delete_tags_color(corr_sent, tag_err, args)
                                if sline_tag in text:
                                    text, tag_err = delete_tags_sline(text, tag_err, args)
                                if not tag_err and text != "":
                                    # print(orig_sent)
                                    # print(text)
                                    original_sentences.append(orig_sent)
                                    corrected_sentences.append(text)
                        else:
                            # No correction was provided for this sentence.
                            original_sentences.append(orig_sent)
                            corrected_sentences.append("Uncorrected")
            except Exception:
                error_num += 1
    df = pd.DataFrame(list(zip(original_sentences, corrected_sentences)),
                      columns=['original', 'corrected'])
    print(args.l1)
    print(args.l2)
    # args.l1 may be None (no L1 filter); use a placeholder in the file name.
    l1_name = args.l1 if args.l1 is not None else "Any"
    df.to_csv("2. Raw Data/" + l1_name + "_to_" + args.l2 + ".csv", index=False)

# def make_sent_pair(orig_sents, corr_sents, args):
#     outputs = []
#     for i, orig_sent in enumerate(orig_sents):
#         orig_sent = orig_sent.replace('\t', ' ')
#         if len(corr_sents[i]) > 0:
#             tag_err = False
#             for corr_sent in corr_sents[i]:
#                 corr_sent = corr_sent.replace('\t', ' ')
#                 text, tag_err = delete_tags_color(corr_sent, tag_err, args)
#                 if sline_tag in text:
#                     text, tag_err = delete_tags_sline(text, tag_err, args)
#                 if not tag_err and text != "":
#                     output = orig_sent + "\t" + text
#                     outputs.append(output)
#         else:
#             output = orig_sent + "\t" + orig_sent
#             outputs.append(output)
#     return outputs

def make_dataframe(orig_sents, corr_sents, args):
    # Build an (original, corrected) DataFrame for a single record.
    # Note: this helper is not called from main(), which accumulates plain
    # lists instead. Rows are collected in lists first, because appending to
    # a DataFrame column does not modify the DataFrame in place.
    originals, correcteds = [], []
    for i, orig_sent in enumerate(orig_sents):
        orig_sent = orig_sent.replace('\t', ' ')
        if len(corr_sents[i]) > 0:
            tag_err = False
            for corr_sent in corr_sents[i]:
                corr_sent = corr_sent.replace('\t', ' ')
                text, tag_err = delete_tags_color(corr_sent, tag_err, args)
                if sline_tag in text:
                    text, tag_err = delete_tags_sline(text, tag_err, args)
                if not tag_err and text != "":
                    originals.append(orig_sent)
                    correcteds.append(text)
        else:
            originals.append(orig_sent)
            correcteds.append("uncorrected")
    return pd.DataFrame({'original': originals, 'corrected': correcteds})

def delete_tags_sline(text, tag_err, args):
    # Remove [sline]...[/sline] (strike-through) spans from a correction.
    # Text inside a balanced sline span is deleted; leftover (unbalanced)
    # sline markup marks the sentence as a tag error so the caller skips it.
    s_sline = "[sline]"
    e_sline = "[/sline]"
    if args.tags:
        # --remain-tags: leave the markup untouched.
        return text, tag_err
    words = text.split(" ")
    total_s = total_e = 0
    output_lists, tmp_list = [], []
    for word in words:
        num_s = word.count(s_sline)
        num_e = word.count(e_sline)
        total_s += num_s
        total_e += num_e
        tmp_list.append(word)
        if total_s == 0 and total_e == 0:
            # Not inside an sline span: keep the word as-is.
            output_lists.append(word)
            tmp_list = []
        elif total_s == total_e:
            # A balanced span just closed: drop everything inside it.
            tmp_text = " ".join(tmp_list)
            tmp_text = re.sub(r"\[sline\](.*)\[/sline\]", "", tmp_text)
            if tmp_text != "":
                output_lists.append(tmp_text)
            total_s = total_e = 0
            tmp_list = []
    text = " ".join(output_lists)
    if sline_tag in text:
        tag_err = True
    text = re.sub(r'\s+', ' ', text)
    return text, tag_err
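
# A minimal sketch of delete_tags_sline on a made-up correction string
# (argparse.Namespace stands in for the parsed command-line arguments):
#
#     args = argparse.Namespace(tags=False)
#     delete_tags_sline("I [sline]goed[/sline] went home .", False, args)
#     # -> ("I went home .", False)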

def delete_tags_color(text, tag_err, args):
    # Strip colour/bold markup such as [f-red]...[/f-red] from a correction,
    # after normalising the Japanese colour tags via replace_tags().
    if args.tags:
        # --remain-tags: leave the markup untouched.
        return text, tag_err
    text = replace_tags(text)
    if text is None:
        text = ""
    for tag in color_tags:
        s = r"\[" + tag + r"\]"
        e = r"\[/" + tag + r"\]"
        text = re.sub(s, "", text)
        text = re.sub(e, "", text)
        if tag in text:
            # A leftover tag name means the markup was malformed.
            tag_err = True
    return text, tag_err
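
# A minimal sketch of delete_tags_color on a made-up correction string
# (again using argparse.Namespace as a stand-in for the parsed arguments):
#
#     args = argparse.Namespace(tags=False)
#     delete_tags_color("I [f-red]went[/f-red] home .", False, args)
#     # -> ("I went home .", False)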

def replace_tags(s):
    # Normalise the Japanese colour tags used in some entries
    # (赤 = red, 青 = blue) to the English [f-red]/[f-blue] forms.
    s = s.replace("[赤]", "[f-red]")
    s = s.replace("[/赤]", "[/f-red]")
    s = s.replace("[青]", "[f-blue]")
    s = s.replace("[/青]", "[/f-blue]")
    return s

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data", dest="data_path", type=str,
                        metavar='<str>', required=True, help="The path to the data set")
    parser.add_argument("-l2", "--learn-lang", dest="l2", type=str,
                        metavar='<str>', required=False, default='English', help="L2 language")
    parser.add_argument("-l1", "--native-lang", dest="l1", type=str,
                        metavar='<str>', required=False, default=None, help="L1 language")
    parser.add_argument("-tags", "--remain-tags", dest="tags", default=False, action='store_true',
                        help="Keep markup tags (e.g. [f-red]) in the output instead of stripping them")
    args = parser.parse_args()
    assert args.l2 in language
    if args.l1 is not None:
        assert args.l1 in language
    return args

if __name__ == "__main__":
    main()
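
# Example invocation (a sketch; the input file name is an assumption, and the
# output folder "2. Raw Data/" must already exist):
#
#     python extract_err-cor-pair_new.py -d lang-8.dat -l1 Japanese -l2 English
#
# This writes "2. Raw Data/Japanese_to_English.csv" with 'original' and
# 'corrected' columns.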