-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdatahelper.py
23 lines (23 loc) · 934 Bytes
/
datahelper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import re
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Splits common English contractions into separate tokens, strips a fixed
    set of ASCII/CJK punctuation, then trims and lowercases the result.

    Args:
        string: Raw input text.

    Returns:
        The cleaned, lowercased string.
    """
    # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    # Split contractions so "don't" -> "do n't", "he's" -> "he 's", etc.
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # Remove punctuation in a single pass. Inside a character class '?' is a
    # literal, which fixes the re.error ("nothing to repeat") that a bare
    # r"?" pattern raises; '-' is escaped so it is not read as a range.
    string = re.sub(r"[,!。\-《:》?]", "", string)
    # string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()