From 9c3be60f2b2a89d4b261644e58bae66138c1704a Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 24 Jan 2021 20:34:27 +0900 Subject: [PATCH] Title prefix format: both ` = title = ` and `= title =` are OK (#186) --- Korpora/korpus_kowiki.py | 2 +- Korpora/utils.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py index 8384d97..3c0665c 100644 --- a/Korpora/korpus_kowiki.py +++ b/Korpora/korpus_kowiki.py @@ -72,7 +72,7 @@ def __init__(self, root_dir=None, force_download=False): def load(self, path, num_lines=-1): def split_title_text(wikitext): lines = wikitext.split('\n') - title = lines[0] + title = lines[0].strip() text = '\n'.join([line.strip() for line in lines[2:] if line.strip()]) return title, text diff --git a/Korpora/utils.py b/Korpora/utils.py index bce402c..0a3d28c 100644 --- a/Korpora/utils.py +++ b/Korpora/utils.py @@ -60,17 +60,18 @@ def load_wikitext(path, num_lines=-1): """ if num_lines <= 0: with open(path, encoding='utf-8') as f: - texts = f.read().split('\n =') + # noise robust + texts = f.read().replace('\n =', '\n=').split('\n=') else: lines = [] with open(path, encoding='utf-8') as f: for i, line in enumerate(f): if (i >= num_lines): break - lines.append(line) - texts = ''.join(lines).split('\n =') + lines.append(line.strip() + "\n") + texts = ''.join(lines).split('\n=') # fix missing prefix - texts = [texts[0]] + [f' ={text}' for text in texts[1:]] + texts = [texts[0]] + [f'={text}' for text in texts[1:]] return texts