From 54feff4543c6b913551643f05cb772cdcfda9dcf Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 24 Jan 2021 20:34:27 +0900 Subject: [PATCH] Title prefix format: both ` = title = ` and `= title =` are OK (#186) --- Korpora/korpus_kowiki.py | 2 +- Korpora/utils.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py index 65fc603..a150905 100644 --- a/Korpora/korpus_kowiki.py +++ b/Korpora/korpus_kowiki.py @@ -72,7 +72,7 @@ def __init__(self, root_dir=None, force_download=False): def load(self, path, num_lines=-1): def split_title_text(wikitext): lines = wikitext.split('\n') - title = lines[0] + title = lines[0].strip() text = '\n'.join([line.strip() for line in lines[2:] if line.strip()]) return title, text diff --git a/Korpora/utils.py b/Korpora/utils.py index 933e4aa..3e0dc81 100644 --- a/Korpora/utils.py +++ b/Korpora/utils.py @@ -77,17 +77,18 @@ def load_wikitext(path, num_lines=-1): """ if num_lines <= 0: with open(path, encoding='utf-8') as f: - texts = f.read().split('\n =') + # noise robust + texts = f.read().replace('\n =', '\n=').split('\n=') else: lines = [] with open(path, encoding='utf-8') as f: for i, line in enumerate(f): if (i >= num_lines): break - lines.append(line) - texts = ''.join(lines).split('\n =') + lines.append(line.strip() + "\n") + texts = ''.join(lines).split('\n=') # fix missing prefix - texts = [texts[0]] + [f' ={text}' for text in texts[1:]] + texts = [texts[0]] + [f'={text}' for text in texts[1:]] return texts