diff --git a/Korpora/about.py b/Korpora/about.py
index cf4b044..bde0659 100644
--- a/Korpora/about.py
+++ b/Korpora/about.py
@@ -1,3 +1,3 @@
 __author__ = 'ko-nlp'
-__version__ = '0.2.0rc1'
+__version__ = '0.3.0dev'
 __description__ = 'This package provides easy-download and easy-usage for various Korean corpora.'
diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py
index 76a0930..3c0665c 100644
--- a/Korpora/korpus_kowiki.py
+++ b/Korpora/korpus_kowiki.py
@@ -5,17 +5,17 @@
 
 KOWIKITEXT_FETCH_INFORMATION = [
     {
-        'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.train.zip',
+        'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v3/kowikitext_20200920.train.zip',
         'destination': 'kowikitext/kowikitext_20200920.train.zip',
         'method': 'download & unzip'
     },
     {
-        'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.test.zip',
+        'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v3/kowikitext_20200920.test.zip',
         'destination': 'kowikitext/kowikitext_20200920.test.zip',
         'method': 'download & unzip'
     },
     {
-        'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.dev.zip',
+        'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v3/kowikitext_20200920.dev.zip',
         'destination': 'kowikitext/kowikitext_20200920.dev.zip',
         'method': 'download & unzip'
     }
@@ -72,7 +72,7 @@ def __init__(self, root_dir=None, force_download=False):
     def load(self, path, num_lines=-1):
         def split_title_text(wikitext):
             lines = wikitext.split('\n')
-            title = lines[0]
+            title = lines[0].strip()
             text = '\n'.join([line.strip() for line in lines[2:] if line.strip()])
             return title, text
 
diff --git a/Korpora/utils.py b/Korpora/utils.py
index bce402c..0a3d28c 100644
--- a/Korpora/utils.py
+++ b/Korpora/utils.py
@@ -60,17 +60,18 @@
     """
     if num_lines <= 0:
         with open(path, encoding='utf-8') as f:
-            texts = f.read().split('\n =')
+            # noise robust
+            texts = f.read().replace('\n =', '\n=').split('\n=')
     else:
         lines = []
         with open(path, encoding='utf-8') as f:
             for i, line in enumerate(f):
                 if (i >= num_lines):
                     break
-                lines.append(line)
-        texts = ''.join(lines).split('\n =')
+                lines.append(line.strip() + "\n")
+        texts = ''.join(lines).split('\n=')
     # fix missing prefix
-    texts = [texts[0]] + [f' ={text}' for text in texts[1:]]
+    texts = [texts[0]] + [f'={text}' for text in texts[1:]]
     return texts
 
 