-
Notifications
You must be signed in to change notification settings - Fork 11
/
mrclean.py
95 lines (81 loc) · 4.08 KB
/
mrclean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import re
# Special tokens marking document boundaries.  The tagged forms below are
# currently disabled — all three are set to the empty string, so the cleanup
# functions concatenate documents without visible separators.
# START_TOKEN = '<s>'
# END_TOKEN = '</s>'
# PADDING_TOKEN = '<pad>'
START_TOKEN = ''
END_TOKEN = ''
PADDING_TOKEN = ''
def _make_padding_sequence(seq_length):
    """Return END_TOKEN followed by `seq_length` copies of PADDING_TOKEN."""
    return END_TOKEN + PADDING_TOKEN * seq_length
def cleanup_simple_wikipedia(text, seq_length):
    """Delimit Simple Wikipedia articles with start/end/padding tokens.

    Blank lines (double newlines) are treated as article boundaries.
    """
    padding = _make_padding_sequence(seq_length)
    delimited = re.sub(r'\n\n', padding + START_TOKEN, text)
    return START_TOKEN + delimited + padding
def cleanup_wikipedia(text, seq_length):
    """Clean a Wikipedia dump: unwrap '= = = heading = = =' markers, strip
    per-line whitespace, and delimit articles with start/end/padding tokens.
    """
    padding = _make_padding_sequence(seq_length)
    # Replace the heading markup by the bare heading title; the trailing
    # newline is part of the match, so headings merge with the next line.
    no_headings = re.sub(r'= = = (.+?) = = =\n', r'\1', text)
    stripped = '\n'.join(line.strip() for line in no_headings.splitlines())
    # NOTE(review): the [1:] drops the very first character — presumably a
    # spurious leading blank in the source dump; confirm against the data.
    delimited = re.sub(r'\n\n', padding + START_TOKEN, stripped[1:])
    return START_TOKEN + delimited + padding
def cleanup_qed(text, seq_length):
    """Clean QED subtitle text: re-case lines that are mostly uppercase.

    A line is considered "shouting" when at least half of its characters —
    ignoring parenthesised spans and sentence punctuation — are uppercase.
    Such lines get 'l' swapped for 'I' (a common subtitle OCR confusion) and
    each sentence re-capitalized.  Empty lines are dropped.

    Fix: a non-empty line can become empty once parenthesised spans and
    punctuation are removed (e.g. a line that is just "(...)"); the original
    code divided by len(line_body) == 0 and raised ZeroDivisionError.  Such
    lines are now kept as-is.
    """
    # TODO: this should probably be padded too, but it's difficult to detect
    # when subtitles start and end.
    # The handling of proper nouns and of parentheses isn't perfect, but this
    # is still an improvement over the base text.
    punctuation_ex = re.compile(r'([.!?]\s*)')
    unimportant_chars_ex = re.compile(r'\(.*?\)|[.!?]')
    lines = []
    for line in text.splitlines():
        if len(line) > 0:
            line_body = unimportant_chars_ex.sub('', line)
            # Guard against an empty body (line made only of parentheses
            # and/or punctuation) before computing the uppercase fraction.
            if line_body:
                f_upper = sum(c.isupper() for c in line_body) / len(line_body)
                if f_upper >= 0.5:  # Mostly uppercase characters
                    # Taken from https://stackoverflow.com/a/41662260
                    split_on_punctuation = punctuation_ex.split(line.replace('l', 'I'))
                    line = ''.join([sentence.capitalize() for sentence in split_on_punctuation])
            lines.append(line.strip())
    return START_TOKEN + '\n'.join(lines) + END_TOKEN + ''.join(seq_length * [PADDING_TOKEN])
def cleanup_extra_spaces(text):
    """Collapse runs of spaces/tabs/NBSP to one space and remove the space
    left dangling before sentence punctuation.
    """
    collapsed = re.sub(r'[ \t\u00A0]+', ' ', text)
    return re.sub(r'[ \t\u00A0]([.,;!?])', r'\1', collapsed)
def cleanup_bnc_spoken(text, seq_length):
    """Normalize spacing in BNC spoken transcripts and delimit documents
    (separated by blank lines) with start/end/padding tokens.
    """
    padding = _make_padding_sequence(seq_length)
    spaced = cleanup_extra_spaces(text)
    delimited = re.sub(r'\n\n', padding + START_TOKEN, spaced)
    return START_TOKEN + delimited + padding
def cleanup_aochildes(text, seq_length):
    """Normalize spacing in AOCHILDES transcripts and wrap the whole corpus
    in start/end/padding tokens as a single document.
    """
    return START_TOKEN + cleanup_extra_spaces(text) + _make_padding_sequence(seq_length)
def cleanup_cbt(text, seq_length):
    """Clean the Children's Book Test corpus.

    Normalizes spacing, then re-attaches apostrophes that tokenization
    detached from their word ("it 's" -> "it's").  Spacing around
    backquotes/quotes was previously handled here but is left untouched.
    """
    normalized = cleanup_extra_spaces(text)
    detached_apostrophe = re.compile(r"([\w\d])[ \t\u00A0](['’]\w)")
    normalized = detached_apostrophe.sub(r'\1\2', normalized)
    return START_TOKEN + normalized + _make_padding_sequence(seq_length)
def cleanup_children_stories(text, seq_length):
    """Wrap the children's stories corpus in start/end/padding tokens.

    A blank line sometimes marks the start of a new story but also occurs
    inside a single story, so it cannot reliably be used to split
    independent stories; the corpus is kept as one document.
    """
    return START_TOKEN + text + _make_padding_sequence(seq_length)
def cleanup_gutenberg(text, seq_length):
    """Append padding tokens to Gutenberg text.

    The text is mostly clean.  Some entries (e.g. figure captions preceded
    by a number) look unhelpful, but removing them would also drop bullet
    lists that fit the surrounding text.  No start or end tokens because
    the text seems to be cut.
    """
    return text + PADDING_TOKEN * seq_length
def cleanup_open_subtitles(text, seq_length):
    """Remove subtitle-credit lines ("Subtitles by ...") and wrap the text
    in start/end/padding tokens.  The text is otherwise mostly clean.
    """
    credit_line_ex = re.compile(r'^.*subtitle.*$\n', re.MULTILINE | re.IGNORECASE)
    without_credits = credit_line_ex.sub('', text)
    return START_TOKEN + without_credits + _make_padding_sequence(seq_length)
def cleanup_switchboard(text, seq_length):
    """Append padding tokens to Switchboard transcripts.

    No start or end tokens because the text seems to be cut.
    """
    return text + PADDING_TOKEN * seq_length