From 362d984d0607d8030d12634251aad124c4221bed Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 03:57:42 +0900 Subject: [PATCH 01/16] Implement News corpus (#103) --- Korpora/korpora.py | 3 ++ Korpora/korpus_modu.py | 78 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/Korpora/korpora.py b/Korpora/korpora.py index 6189494..3e4409b 100644 --- a/Korpora/korpora.py +++ b/Korpora/korpora.py @@ -28,6 +28,9 @@ def __str__(self): s = f"""{self.name}: size={len(self.texts)}\n{attributes}""" return s + def __repr__(self): + return self.__str__() + @dataclass class LabeledSentence: diff --git a/Korpora/korpus_modu.py b/Korpora/korpus_modu.py index cf3f9e1..a5e29cc 100644 --- a/Korpora/korpus_modu.py +++ b/Korpora/korpus_modu.py @@ -1,4 +1,9 @@ -from .korpora import Korpus, KorpusData +import json +from dataclasses import dataclass +from glob import glob +from tqdm import tqdm +from typing import List +from Korpora.korpora import Korpus, KorpusData description = """ 모두의 말뭉치는 문화체육관광부 산하 국립국어원에서 제공하는 말뭉치로 @@ -27,7 +32,76 @@ class ModuKorpus(Korpus): def __init__(self, root_dir=None, force_download=False): super().__init__(description, license) - fetch_modu() + + +class ModuNewsKorpus(Korpus): + def __init__(self, root_dir_or_paths, load_light=True, force_download=False): + super().__init__(description, license) + if isinstance(root_dir_or_paths, str): + paths = sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) + else: + paths = root_dir_or_paths + self.train = ModuNewsData(load_modu_news(paths, load_light)) + + +class ModuNewsData(KorpusData): + def __init__(self, news): + super().__init__('모두의 말뭉치: 뉴스 말뭉치', news) + self.news = self.texts + + +@dataclass +class ModuNews: + document_id: str + title: str + author: str + author: str + publisher: str + date: str + topic: str + original_topic: str + paragraph: List[str] + + +@dataclass +class ModuNewsLight: + document_id: str + title: str + paragraph: str + + +def 
document_to_a_news(document): + document_id = document['id'] + meta = document['metadata'] + title = meta['title'] + author = meta['author'] + publisher = meta['publisher'] + date = meta['date'] + topic = meta['topic'] + original_topic = meta['original_topic'] + paragraph = [p['form'] for p in document['paragraph']] + return ModuNews(document_id, title, author, publisher, date, topic, original_topic, paragraph) + + +def document_to_a_news_light(document): + document_id = document['id'] + meta = document['metadata'] + title = meta['title'] + paragraph = '\n'.join([p['form'] for p in document['paragraph']]) + return ModuNewsLight(document_id, title, paragraph) + + +def load_modu_news(paths, load_light): + transform = document_to_a_news_light if load_light else document_to_a_news + news = [] + for i_path, path in enumerate(paths): + with open(path, encoding='utf-8') as f: + data = json.load(f) + documents = data['document'] + desc = f'Transform to ModuNews {i_path}/{len(paths)} files' + document_iterator = tqdm(documents, desc=desc, total=len(documents)) + news += [transform(document) for document in document_iterator] + return news def fetch_modu(): From f1d4ad0de3b2ddd8fe2e182153b85d787d6cb629 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 13:47:34 +0900 Subject: [PATCH 02/16] Add document_id to row index mapper (#103) --- Korpora/korpus_modu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Korpora/korpus_modu.py b/Korpora/korpus_modu.py index a5e29cc..d626b7a 100644 --- a/Korpora/korpus_modu.py +++ b/Korpora/korpus_modu.py @@ -1,4 +1,5 @@ import json +import os from dataclasses import dataclass from glob import glob from tqdm import tqdm @@ -29,19 +30,20 @@ 정확한 라이센스는 확인 중 입니다.""" -class ModuKorpus(Korpus): - def __init__(self, root_dir=None, force_download=False): - super().__init__(description, license) - - class ModuNewsKorpus(Korpus): def __init__(self, root_dir_or_paths, load_light=True, force_download=False): 
super().__init__(description, license) if isinstance(root_dir_or_paths, str): - paths = sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) + if os.path.isdir(root_dir_or_paths): + paths = sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) + else: + # wildcard + paths = sorted(glob(root_dir_or_paths)) else: paths = root_dir_or_paths self.train = ModuNewsData(load_modu_news(paths, load_light)) + self.row_to_documentid = [news.document_id for news in self.train] + self.documentid_to_row = {document_id: idx for idx, document_id in enumerate(self.row_to_documentid)} class ModuNewsData(KorpusData): From c03b6b3dba77ac5c566541c6820477ca6f5ace92 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 13:48:18 +0900 Subject: [PATCH 03/16] Print only `KorpusData` class instances in `Korpus.__str__` --- Korpora/korpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Korpora/korpora.py b/Korpora/korpora.py index 3e4409b..898d954 100644 --- a/Korpora/korpora.py +++ b/Korpora/korpora.py @@ -157,7 +157,7 @@ def __str__(self): classname = self.__class__.__name__ s = f"{classname}\n{self.description}\n\nAttributes\n----------\n" for var_name, var in self.__dict__.items(): - if var_name not in {'description', 'license', 'self'}: + if isinstance(var, KorpusData): s += f'{str(var)}' return s From 8d9011dc49e31c83b24b6804f33779f3c8888ef4 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 13:50:02 +0900 Subject: [PATCH 04/16] Rename: A modu corpus, a file (#103) --- Korpora/{korpus_modu.py => korpus_modu_news.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Korpora/{korpus_modu.py => korpus_modu_news.py} (100%) diff --git a/Korpora/korpus_modu.py b/Korpora/korpus_modu_news.py similarity index 100% rename from Korpora/korpus_modu.py rename to Korpora/korpus_modu_news.py From 0bcaffede22ac2c186b45c5ce933439afbd87b97 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 03:57:42 +0900 Subject: [PATCH 05/16] Implement News corpus (#103) --- 
Korpora/korpora.py | 3 ++ Korpora/korpus_modu.py | 78 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/Korpora/korpora.py b/Korpora/korpora.py index 6189494..3e4409b 100644 --- a/Korpora/korpora.py +++ b/Korpora/korpora.py @@ -28,6 +28,9 @@ def __str__(self): s = f"""{self.name}: size={len(self.texts)}\n{attributes}""" return s + def __repr__(self): + return self.__str__() + @dataclass class LabeledSentence: diff --git a/Korpora/korpus_modu.py b/Korpora/korpus_modu.py index cf3f9e1..a5e29cc 100644 --- a/Korpora/korpus_modu.py +++ b/Korpora/korpus_modu.py @@ -1,4 +1,9 @@ -from .korpora import Korpus, KorpusData +import json +from dataclasses import dataclass +from glob import glob +from tqdm import tqdm +from typing import List +from Korpora.korpora import Korpus, KorpusData description = """ 모두의 말뭉치는 문화체육관광부 산하 국립국어원에서 제공하는 말뭉치로 @@ -27,7 +32,76 @@ class ModuKorpus(Korpus): def __init__(self, root_dir=None, force_download=False): super().__init__(description, license) - fetch_modu() + + +class ModuNewsKorpus(Korpus): + def __init__(self, root_dir_or_paths, load_light=True, force_download=False): + super().__init__(description, license) + if isinstance(root_dir_or_paths, str): + paths = sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) + else: + paths = root_dir_or_paths + self.train = ModuNewsData(load_modu_news(paths, load_light)) + + +class ModuNewsData(KorpusData): + def __init__(self, news): + super().__init__('모두의 말뭉치: 뉴스 말뭉치', news) + self.news = self.texts + + +@dataclass +class ModuNews: + document_id: str + title: str + author: str + author: str + publisher: str + date: str + topic: str + original_topic: str + paragraph: List[str] + + +@dataclass +class ModuNewsLight: + document_id: str + title: str + paragraph: str + + +def document_to_a_news(document): + document_id = document['id'] + meta = document['metadata'] + title = meta['title'] + author = meta['author'] + publisher = meta['publisher'] + date = 
meta['date'] + topic = meta['topic'] + original_topic = meta['original_topic'] + paragraph = [p['form'] for p in document['paragraph']] + return ModuNews(document_id, title, author, publisher, date, topic, original_topic, paragraph) + + +def document_to_a_news_light(document): + document_id = document['id'] + meta = document['metadata'] + title = meta['title'] + paragraph = '\n'.join([p['form'] for p in document['paragraph']]) + return ModuNewsLight(document_id, title, paragraph) + + +def load_modu_news(paths, load_light): + transform = document_to_a_news_light if load_light else document_to_a_news + news = [] + for i_path, path in enumerate(paths): + with open(path, encoding='utf-8') as f: + data = json.load(f) + documents = data['document'] + desc = f'Transform to ModuNews {i_path}/{len(paths)} files' + document_iterator = tqdm(documents, desc=desc, total=len(documents)) + news += [transform(document) for document in document_iterator] + return news def fetch_modu(): From 608118b131374e103e43c31104a5d63f61899426 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 13:47:34 +0900 Subject: [PATCH 06/16] Add document_id to row index mapper (#103) --- Korpora/korpus_modu.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Korpora/korpus_modu.py b/Korpora/korpus_modu.py index a5e29cc..d626b7a 100644 --- a/Korpora/korpus_modu.py +++ b/Korpora/korpus_modu.py @@ -1,4 +1,5 @@ import json +import os from dataclasses import dataclass from glob import glob from tqdm import tqdm @@ -29,19 +30,20 @@ 정확한 라이센스는 확인 중 입니다.""" -class ModuKorpus(Korpus): - def __init__(self, root_dir=None, force_download=False): - super().__init__(description, license) - - class ModuNewsKorpus(Korpus): def __init__(self, root_dir_or_paths, load_light=True, force_download=False): super().__init__(description, license) if isinstance(root_dir_or_paths, str): - paths = sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) + if os.path.isdir(root_dir_or_paths): + paths = 
sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) + else: + # wildcard + paths = sorted(glob(root_dir_or_paths)) else: paths = root_dir_or_paths self.train = ModuNewsData(load_modu_news(paths, load_light)) + self.row_to_documentid = [news.document_id for news in self.train] + self.documentid_to_row = {document_id: idx for idx, document_id in enumerate(self.row_to_documentid)} class ModuNewsData(KorpusData): From 78d50820d71500153ed9929ac294605a33007314 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 13:48:18 +0900 Subject: [PATCH 07/16] Print only `KorpusData` class instances in `Korpus.__str__` --- Korpora/korpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Korpora/korpora.py b/Korpora/korpora.py index 3e4409b..898d954 100644 --- a/Korpora/korpora.py +++ b/Korpora/korpora.py @@ -157,7 +157,7 @@ def __str__(self): classname = self.__class__.__name__ s = f"{classname}\n{self.description}\n\nAttributes\n----------\n" for var_name, var in self.__dict__.items(): - if var_name not in {'description', 'license', 'self'}: + if isinstance(var, KorpusData): s += f'{str(var)}' return s From 9c81359884ee1f1bb9013e9ed6f0cddfd488af78 Mon Sep 17 00:00:00 2001 From: lovit Date: Mon, 5 Oct 2020 13:50:02 +0900 Subject: [PATCH 08/16] Rename: A modu corpus, a file (#103, #107) --- Korpora/{korpus_modu.py => korpus_modu_news.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Korpora/{korpus_modu.py => korpus_modu_news.py} (100%) diff --git a/Korpora/korpus_modu.py b/Korpora/korpus_modu_news.py similarity index 100% rename from Korpora/korpus_modu.py rename to Korpora/korpus_modu_news.py From 4df301550174ca571475b2b543bf32966970b15e Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 7 Oct 2020 04:02:00 +0900 Subject: [PATCH 09/16] First file index is 1, not 0 (#103, #107) --- Korpora/korpus_modu_news.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Korpora/korpus_modu_news.py b/Korpora/korpus_modu_news.py index 
d626b7a..c6ca469 100644 --- a/Korpora/korpus_modu_news.py +++ b/Korpora/korpus_modu_news.py @@ -100,7 +100,7 @@ def load_modu_news(paths, load_light): with open(path, encoding='utf-8') as f: data = json.load(f) documents = data['document'] - desc = f'Transform to ModuNews {i_path}/{len(paths)} files' + desc = f'Transform to ModuNews {i_path + 1}/{len(paths)} files' document_iterator = tqdm(documents, desc=desc, total=len(documents)) news += [transform(document) for document in document_iterator] return news From 9f82b4a7aa193b180912d9e15764837dd87c9861 Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 7 Oct 2020 04:20:25 +0900 Subject: [PATCH 10/16] Change ModuNewsData attributes (#103, #107) --- Korpora/korpus_modu_news.py | 48 ++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/Korpora/korpus_modu_news.py b/Korpora/korpus_modu_news.py index c6ca469..a69fede 100644 --- a/Korpora/korpus_modu_news.py +++ b/Korpora/korpus_modu_news.py @@ -41,15 +41,52 @@ def __init__(self, root_dir_or_paths, load_light=True, force_download=False): paths = sorted(glob(root_dir_or_paths)) else: paths = root_dir_or_paths - self.train = ModuNewsData(load_modu_news(paths, load_light)) + if load_light: + self.train = ModuNewsDataLight('모두의_뉴스_말뭉치(light).train', load_modu_news(paths, load_light)) + else: + self.train = ModuNewsData('모두의_뉴스_말뭉치.train', load_modu_news(paths, load_light)) self.row_to_documentid = [news.document_id for news in self.train] self.documentid_to_row = {document_id: idx for idx, document_id in enumerate(self.row_to_documentid)} class ModuNewsData(KorpusData): - def __init__(self, news): - super().__init__('모두의 말뭉치: 뉴스 말뭉치', news) - self.news = self.texts + def __init__(self, name, news): + super().__init__(name, news) + self.document_ids = [doc.document_id for doc in news] + self.titles = [doc.title for doc in news] + self.authors = [doc.author for doc in news] + self.publishers = [doc.publisher for doc in news] + 
self.dates = [doc.date for doc in news] + self.topics = [doc.topic for doc in news] + self.original_topics = [doc.original_topic for doc in news] + self.texts = [doc.paragraph for doc in news] + + def __getitem__(self, index): + news = ModuNews( + self.document_ids[index], + self.titles[index], + self.authors[index], + self.publishers[index], + self.dates[index], + self.topics[index], + self.original_topics[index], + self.texts[index].split('\n')) + return news + + +class ModuNewsDataLight(KorpusData): + def __init__(self, name, news): + super().__init__(name, news) + self.texts = [doc.paragraph for doc in news] + self.titles = [doc.title for doc in news] + self.document_ids = [doc.document_id for doc in news] + + def __getitem__(self, index): + news = ModuNewsLight( + self.document_ids[index], + self.titles[index], + self.texts[index]) + return news @dataclass @@ -57,7 +94,6 @@ class ModuNews: document_id: str title: str author: str - author: str publisher: str date: str topic: str @@ -81,7 +117,7 @@ def document_to_a_news(document): date = meta['date'] topic = meta['topic'] original_topic = meta['original_topic'] - paragraph = [p['form'] for p in document['paragraph']] + paragraph = '\n'.join([p['form'] for p in document['paragraph']]) return ModuNews(document_id, title, author, publisher, date, topic, original_topic, paragraph) From 585d400ee14e41210085d8426c11bcf430caa0b7 Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 7 Oct 2020 04:27:02 +0900 Subject: [PATCH 11/16] No print KorpusData.name in KorpusData.__str__ --- Korpora/korpora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Korpora/korpora.py b/Korpora/korpora.py index 898d954..4581bd4 100644 --- a/Korpora/korpora.py +++ b/Korpora/korpora.py @@ -23,7 +23,7 @@ def get_all_texts(self): def __str__(self): attributes = "" for var_name, var in self.__dict__.items(): - if var_name not in {'dataname', 'description', 'self'}: + if var_name not in {'name', 'description', 'self'}: attributes += 
f' - {self.name}.{var_name} : list[{var[0].__class__.__name__}]\n' s = f"""{self.name}: size={len(self.texts)}\n{attributes}""" return s From f0acef1f0dc31b54725ec947257e29d0fc62b2be Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 7 Oct 2020 04:41:55 +0900 Subject: [PATCH 12/16] Update usage (#103, #107) --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index 60e9aeb..557a41a 100644 --- a/README.md +++ b/README.md @@ -483,3 +483,32 @@ koen_news.dev[0] |---|---| |text|`ko` 문장| |pair|`en` 문장| + +### 모두의 말뭉치: 신문 말뭉치 (loader) +- author: 국립국어원 +- repository: https://corpus.korean.go.kr/ +- example +```python +from Korpora.korpus_modu_news import ModuNewsKorpus + +news_paths_or_news_dir = 'path/to/NIKL_NEWSPAPER(v1.0)/NPRW190000001*.json' # wildcard +news_paths_or_news_dir = 'path/to/NIKL_NEWSPAPER(v1.0)' + +# LOAD ONLY TITLE & PARAGRAPH +news_corpus = ModuNewsKorpus(news_paths_or_news_dir, load_light=True) +news_corpus.train[0] +# ModuNewsLight(document_id='NPRW1900000010.1', title='한국경제 2018년 기사', paragraph='"라니냐로 겨울 가뭄 온다"… ...') +news_corpus.train[0].document_id +# 'NPRW1900000010.1' + +# LOAD ALL ATTRIBUTES IN CORPUS +news_corpus = ModuNewsKorpus(news_paths_or_news_dir, load_light=False) +news_corpus.train[0] +# ModuNews(document_id='NPRW1900000010.1', title='한국경제 2018년 기사', author='김현석', publisher='한국경제신문사', date='20180101', topic='생활', original_topic='국제', paragraph=['"라니냐로 겨울 가뭄 온다"…', '...']) + +# DOCUMENT ID INDEX +news_corpus.row_to_documentid[:3] +# ['NPRW1900000010.1', 'NPRW1900000010.2', 'NPRW1900000010.3'] +news_corpus.documentid_to_row['NPRW1900000010.2'] +# 1 +``` \ No newline at end of file From d114e40659db2eb0da40706420f097b9cc50b126 Mon Sep 17 00:00:00 2001 From: lovit Date: Wed, 7 Oct 2020 04:47:02 +0900 Subject: [PATCH 13/16] Update description of attributes (#103, #107) --- README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/README.md 
b/README.md index 557a41a..c3300c9 100644 --- a/README.md +++ b/README.md @@ -511,4 +511,16 @@ news_corpus.row_to_documentid[:3] # ['NPRW1900000010.1', 'NPRW1900000010.2', 'NPRW1900000010.3'] news_corpus.documentid_to_row['NPRW1900000010.2'] # 1 -``` \ No newline at end of file +``` +- data structure + +| 속성명 | 내용 | +| --- | --- | +| document_id | 뉴스 고유 아이디 | +| title | metadata 의 title (기사 제목이 아님) | +| author | 기사 작성자 | +| publisher | 언론사 | +| date | 기사 작성 일자 | +| topic | 통합 분류 (정치, 경제, 사회, 생활, IT/과학, 연예, 스포츠, 문화, 미용/건강) | +| original_topic | 신문 매체의 자체 주제 분류 | +| paragraph | 뉴스 기사 본문 (첫 줄이 기사의 제목으로 추정) | \ No newline at end of file From a0377551a3bbc065660133ec46b7f872d4e0f4d0 Mon Sep 17 00:00:00 2001 From: lovit Date: Sat, 10 Oct 2020 16:16:43 +0900 Subject: [PATCH 14/16] Raise exception when corpus file is not found (#103, #107) --- Korpora/korpus_modu_news.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Korpora/korpus_modu_news.py b/Korpora/korpus_modu_news.py index a69fede..b865145 100644 --- a/Korpora/korpus_modu_news.py +++ b/Korpora/korpus_modu_news.py @@ -41,6 +41,9 @@ def __init__(self, root_dir_or_paths, load_light=True, force_download=False): paths = sorted(glob(root_dir_or_paths)) else: paths = root_dir_or_paths + if not paths: + raise ValueError('Not found corpus files. 
Check `root_dir_or_paths`') + if load_light: self.train = ModuNewsDataLight('모두의_뉴스_말뭉치(light).train', load_modu_news(paths, load_light)) else: From 08073110325756251811e90492d8003e931566da Mon Sep 17 00:00:00 2001 From: lovit Date: Sat, 10 Oct 2020 16:49:00 +0900 Subject: [PATCH 15/16] Separate corpus path finding functions (#103, #107) --- Korpora/korpus_modu_news.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/Korpora/korpus_modu_news.py b/Korpora/korpus_modu_news.py index b865145..1264862 100644 --- a/Korpora/korpus_modu_news.py +++ b/Korpora/korpus_modu_news.py @@ -1,5 +1,6 @@ import json import os +import re from dataclasses import dataclass from glob import glob from tqdm import tqdm @@ -33,17 +34,7 @@ class ModuNewsKorpus(Korpus): def __init__(self, root_dir_or_paths, load_light=True, force_download=False): super().__init__(description, license) - if isinstance(root_dir_or_paths, str): - if os.path.isdir(root_dir_or_paths): - paths = sorted(glob(f'{root_dir_or_paths}/N*RW*.json')) - else: - # wildcard - paths = sorted(glob(root_dir_or_paths)) - else: - paths = root_dir_or_paths - if not paths: - raise ValueError('Not found corpus files. 
Check `root_dir_or_paths`') - + paths = find_corpus_paths(root_dir_or_paths) if load_light: self.train = ModuNewsDataLight('모두의_뉴스_말뭉치(light).train', load_modu_news(paths, load_light)) else: @@ -132,6 +123,24 @@ def document_to_a_news_light(document): return ModuNewsLight(document_id, title, paragraph) +def find_corpus_paths(root_dir_or_paths): + prefix_pattern = re.compile('N[WLPIZ]RW') + def match(path): + prefix = path.split(os.path.sep)[-1][:4] + return prefix_pattern.match(prefix) + + # directory + wildcard + if isinstance(root_dir_or_paths, str): + paths = sorted(glob(f'{root_dir_or_paths}/*.json') + glob(root_dir_or_paths)) + else: + paths = root_dir_or_paths + + paths = [path for path in paths if match(path)] + if not paths: + raise ValueError('Not found corpus files. Check `root_dir_or_paths`') + return paths + + def load_modu_news(paths, load_light): transform = document_to_a_news_light if load_light else document_to_a_news news = [] From 6ede350f25e852e23e8ddfb0136a16c120a1adc3 Mon Sep 17 00:00:00 2001 From: lovit Date: Sat, 10 Oct 2020 16:51:34 +0900 Subject: [PATCH 16/16] Change tqdm unit: lines in a file -> files (#103, #107) --- Korpora/korpus_modu_news.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Korpora/korpus_modu_news.py b/Korpora/korpus_modu_news.py index 1264862..842a381 100644 --- a/Korpora/korpus_modu_news.py +++ b/Korpora/korpus_modu_news.py @@ -144,13 +144,11 @@ def match(path): def load_modu_news(paths, load_light): transform = document_to_a_news_light if load_light else document_to_a_news news = [] - for i_path, path in enumerate(paths): + for i_path, path in enumerate(tqdm(paths, desc='Loading ModuNews', total=len(paths))): with open(path, encoding='utf-8') as f: data = json.load(f) documents = data['document'] - desc = f'Transform to ModuNews {i_path + 1}/{len(paths)} files' - document_iterator = tqdm(documents, desc=desc, total=len(documents)) - news += [transform(document) for document in 
document_iterator] + news += [transform(document) for document in documents] return news