Skip to content

Commit

Permalink
Merge pull request #198 from ko-nlp/dev#186
Browse files: browse the repository at this point in the history
Dev#186
  • Loading branch information
lovit authored Jan 24, 2021
2 parents 5282935 + 9c3be60 commit e592a79
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Korpora/about.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__author__ = 'ko-nlp'
__version__ = '0.2.0rc1'
__version__ = '0.3.0dev'
__description__ = 'This package provides easy-download and easy-usage for various Korean corpora.'
8 changes: 4 additions & 4 deletions Korpora/korpus_kowiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@

KOWIKITEXT_FETCH_INFORMATION = [
{
'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.train.zip',
'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v3/kowikitext_20200920.train.zip',
'destination': 'kowikitext/kowikitext_20200920.train.zip',
'method': 'download & unzip'
},
{
'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.test.zip',
'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v3/kowikitext_20200920.test.zip',
'destination': 'kowikitext/kowikitext_20200920.test.zip',
'method': 'download & unzip'
},
{
'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.dev.zip',
'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v3/kowikitext_20200920.dev.zip',
'destination': 'kowikitext/kowikitext_20200920.dev.zip',
'method': 'download & unzip'
}
Expand Down Expand Up @@ -72,7 +72,7 @@ def __init__(self, root_dir=None, force_download=False):
def load(self, path, num_lines=-1):
def split_title_text(wikitext):
lines = wikitext.split('\n')
title = lines[0]
title = lines[0].strip()
text = '\n'.join([line.strip() for line in lines[2:] if line.strip()])
return title, text

Expand Down
9 changes: 5 additions & 4 deletions Korpora/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,18 @@ def load_wikitext(path, num_lines=-1):
"""
if num_lines <= 0:
with open(path, encoding='utf-8') as f:
texts = f.read().split('\n =')
# noise robust
texts = f.read().replace('\n =', '\n=').split('\n=')
else:
lines = []
with open(path, encoding='utf-8') as f:
for i, line in enumerate(f):
if (i >= num_lines):
break
lines.append(line)
texts = ''.join(lines).split('\n =')
lines.append(line.strip() + "\n")
texts = ''.join(lines).split('\n=')
# fix missing prefix
texts = [texts[0]] + [f' ={text}' for text in texts[1:]]
texts = [texts[0]] + [f'={text}' for text in texts[1:]]
return texts


Expand Down

0 comments on commit e592a79

Please sign in to comment.