Skip to content

Commit

Permalink
Title prefix format: both ` = title =` (with a leading space) and `= title =` (without) are OK (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
lovit committed Jan 28, 2021
1 parent 9001e30 commit 54feff4
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 5 deletions.
2 changes: 1 addition & 1 deletion Korpora/korpus_kowiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def __init__(self, root_dir=None, force_download=False):
def load(self, path, num_lines=-1):
def split_title_text(wikitext):
    """Split one wiki article chunk into its title and body text.

    Args:
        wikitext: Raw article string whose first line is the title heading.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the first line with
        surrounding whitespace removed. ``text`` is every line from the
        third onward, each stripped, with empty lines dropped, joined by
        newlines.
    """
    lines = wikitext.split('\n')
    # Strip so headings with and without a leading space yield the same title.
    title = lines[0].strip()
    # Index 1 is skipped -- presumably the blank line that follows the
    # heading; TODO confirm against the corpus format.
    text = '\n'.join(
        stripped for stripped in (line.strip() for line in lines[2:]) if stripped
    )
    return title, text

Expand Down
9 changes: 5 additions & 4 deletions Korpora/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,18 @@ def load_wikitext(path, num_lines=-1):
"""
if num_lines <= 0:
with open(path, encoding='utf-8') as f:
texts = f.read().split('\n =')
# noise robust
texts = f.read().replace('\n =', '\n=').split('\n=')
else:
lines = []
with open(path, encoding='utf-8') as f:
for i, line in enumerate(f):
if (i >= num_lines):
break
lines.append(line)
texts = ''.join(lines).split('\n =')
lines.append(line.strip() + "\n")
texts = ''.join(lines).split('\n=')
# fix missing prefix
texts = [texts[0]] + [f' ={text}' for text in texts[1:]]
texts = [texts[0]] + [f'={text}' for text in texts[1:]]
return texts


Expand Down

0 comments on commit 54feff4

Please sign in to comment.