
Merge pull request #183 from ko-nlp/dev#135
Dev#135
lovit authored Jan 18, 2021
2 parents 4456c94 + daad8ca commit 5282935
Showing 4 changed files with 182 additions and 9 deletions.
31 changes: 23 additions & 8 deletions Korpora/cli.py
@@ -5,6 +5,7 @@
from .loader import KORPUS_DESCRIPTION
from .task_fetch import fetch
from .task_lmdata import create_lmdata
+from .task_parallel_corpus import create_parallel_corpus


def listup(args):
@@ -33,9 +34,9 @@ def main():

# fetch
parser_fetch = subparsers.add_parser('fetch', help='Fetch `corpus` to `root`')
-parser_fetch.add_argument('--corpus', type=str, default='all', nargs='+', help='corpus name')
-parser_fetch.add_argument('--root', type=str, default=None, help='path/to/Korpora/')
-parser_fetch.add_argument('--force_download', dest='force_download', action='store_true')
+parser_fetch.add_argument('--corpus', '-c', type=str, default='all', nargs='+', help='corpus name')
+parser_fetch.add_argument('--root', '-r', type=str, default=None, help='path/to/Korpora/')
+parser_fetch.add_argument('--force_download', '-f', dest='force_download', action='store_true')
parser_fetch.set_defaults(func=fetch)

# list
@@ -44,19 +45,33 @@ def main():

# create language model train data
parser_lmdata = subparsers.add_parser('lmdata', help='Create language model train data')
-parser_lmdata.add_argument('--corpus', type=str, required=True, nargs='+', help='corpus names')
-parser_lmdata.add_argument('--root_dir', type=str, default=None, help='path/to/Korpora')
-parser_lmdata.add_argument('--output_dir', type=str, required=True, help='output file path')
+parser_lmdata.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names')
+parser_lmdata.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora')
+parser_lmdata.add_argument('--output_dir', '-o', type=str, required=True, help='output file path')
parser_lmdata.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio')
-parser_lmdata.add_argument('--n_first_samples', type=int, default=None, help='Number of first samples')
+parser_lmdata.add_argument('--head', type=int, default=None, help='Number of first samples')
parser_lmdata.add_argument('--min_length', type=int, default=None, help='Minimum length of text')
parser_lmdata.add_argument('--max_length', type=int, default=None, help='Maximum length of text')
parser_lmdata.add_argument('--seed', type=int, default=None, help='Random seed')
-parser_lmdata.add_argument('--force_download', dest='force_download', action='store_true')
+parser_lmdata.add_argument('--force_download', '-f', dest='force_download', action='store_true')
parser_lmdata.add_argument('--multilingual', dest='multilingual', action='store_true', help='If set, include foreign-language text in the train data')
parser_lmdata.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file')
parser_lmdata.set_defaults(func=create_lmdata)

+# create parallel corpus data
+parser_parallel = subparsers.add_parser('parallel', help='Create parallel corpus data')
+parser_parallel.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names')
+parser_parallel.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora')
+parser_parallel.add_argument('--output_dir', '-o', type=str, required=True, help='output file path')
+parser_parallel.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio')
+parser_parallel.add_argument('--head', type=int, default=None, help='Number of first samples')
+parser_parallel.add_argument('--min_length', type=int, default=None, help='Minimum length of text')
+parser_parallel.add_argument('--max_length', type=int, default=None, help='Maximum length of text')
+parser_parallel.add_argument('--seed', type=int, default=None, help='Random seed')
+parser_parallel.add_argument('--force_download', '-f', dest='force_download', action='store_true')
+parser_parallel.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file')
+parser_parallel.set_defaults(func=create_parallel_corpus)

# Do task
args = parser.parse_args()
show_arguments(args)
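The upshot of the `cli.py` changes: the I/O options gain single-letter aliases (`-c`, `-r`, `-o`, `-f`) and a new `parallel` subcommand is registered. A minimal sketch of the shortened `fetch` invocation (the root path is illustrative):

```bash
# Same as: korpora fetch --corpus all --root ~/Korpora --force_download
korpora fetch -c all -r ~/Korpora -f
```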
2 changes: 1 addition & 1 deletion Korpora/task_lmdata.py
@@ -15,7 +15,7 @@ def create_lmdata(args):
sampling_ratio = float(sampling_ratio)
if not (0 < sampling_ratio < 1):
raise ValueError('`sampling_ratio` must be None or (0, 1) float')
-n_first_samples = args.n_first_samples
+n_first_samples = args.head
np.random.seed(args.seed)
selector = Selector(sampling_ratio, args.min_length, args.max_length)

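This one-line change follows the CLI rename of `--n_first_samples` to `--head`; the value still caps how many samples are written. An illustrative call (corpus name and output path are placeholders):

```bash
korpora lmdata \
    --corpus kcbert \
    --output_dir ~/works/lmdata \
    --head 10000
```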
132 changes: 132 additions & 0 deletions Korpora/task_parallel_corpus.py
@@ -0,0 +1,132 @@
import numpy as np
import os
from tqdm import tqdm

from .loader import Korpora
from .utils import default_korpora_path


def create_parallel_corpus(args):
corpus_names = check_corpus(args.corpus)
os.makedirs(os.path.abspath(args.output_dir), exist_ok=True)

sampling_ratio = args.sampling_ratio
if sampling_ratio is not None:
sampling_ratio = float(sampling_ratio)
if not (0 < sampling_ratio < 1):
raise ValueError('`sampling_ratio` must be None or (0, 1) float')
n_first_samples = args.head
np.random.seed(args.seed)
selector = Selector(sampling_ratio, args.min_length, args.max_length)

status = [['', name, ' - ', ''] for name in corpus_names]

for i_corpus, name in enumerate(corpus_names):
if not args.save_each and i_corpus > 0:
mode = 'a'
else:
mode = 'w'

source_filename = f'{name}.source' if args.save_each else 'all.source'
target_filename = f'{name}.target' if args.save_each else 'all.target'
source_corpus_path = f'{args.output_dir}/{source_filename}'
target_corpus_path = f'{args.output_dir}/{target_filename}'

pair_iterator = tqdm(
Korpora.load(name, root_dir=args.root_dir, force_download=args.force_download).train,
desc=f'Create train data from {name}'
)
print_status(status)

n_sampled = 0
fs = open(source_corpus_path, mode, encoding='utf-8')
ft = open(target_corpus_path, mode, encoding='utf-8')
for i_sent, pair in enumerate(pair_iterator):
if not selector.use(pair.text) or not selector.use(pair.pair):
continue
source = pair.text.replace('\n', ' ')
target = pair.pair.replace('\n', ' ')
fs.write(f'{source}\n')
ft.write(f'{target}\n')
n_sampled += 1
if (n_first_samples is not None) and (n_first_samples <= n_sampled):
break
fs.close()
ft.close()

status[i_corpus][0] = ' x '
status[i_corpus][2] = n_sampled
status[i_corpus][3] = f'{source_filename} & *.target'
print_status(status)


class Selector:
def __init__(self, sampling_ratio, min_length, max_length):
if isinstance(min_length, int) and min_length < 0:
min_length = None
if isinstance(max_length, int) and max_length < 0:
max_length = None
self.sampling_ratio = sampling_ratio
self.min_length = min_length
self.max_length = max_length

def use(self, text):
length = len(text)
if (self.min_length is not None) and (length < self.min_length):
return False
if (self.max_length is not None) and (length > self.max_length):
return False
if self.sampling_ratio is None:
return True
return np.random.rand() < self.sampling_ratio


def check_corpus(corpus_names):
if (corpus_names == 'all') or (corpus_names[0] == 'all'):
corpus_names = list(ITERATE_TEXTS)
if isinstance(corpus_names, str):
corpus_names = [corpus_names]
available = []
for name in corpus_names:
if name not in ITERATE_TEXTS:
print(f'{name} corpus not provided. Check the `corpus` argument')
continue
available.append(name)
if 'aihub_translation' in available:
available = [name for name in available if (name[:6] != 'aihub_')]
available = ['aihub_spoken_translation',
'aihub_conversation_translation',
'aihub_news_translation',
'aihub_korean_culture_translation',
'aihub_decree_translation',
'aihub_government_website_translation'
] + available
if not available:
raise ValueError('No available corpus found. Check the `corpus` argument')
return available


def print_status(status):
max_len = max(max(len(row[3]) for row in status), 9)
form = '| {:4} | {:40} | {:10} | {} |'
print('\n\n' + form.format('Done', 'Corpus name', 'Num pairs', 'File name' + ' ' * (max_len - 9)))
print(form.format('-' * 4, '-' * 40, '-' * 10, '-' * max_len))
for finish, name, num_pairs, filename in status:
if not filename:
filename = ' ' * max_len
else:
filename += ' ' * (max_len - len(filename))
print(form.format(finish, name, num_pairs, filename))


ITERATE_TEXTS = {
'aihub_translation',
'aihub_spoken_translation',
'aihub_conversation_translation',
'aihub_news_translation',
'aihub_korean_culture_translation',
'aihub_decree_translation',
'aihub_government_website_translation',
'korean_parallel_koen_news',
'open_subtitles'
}
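A quick sketch of how the two helpers above behave, assuming this branch of `Korpora` is installed:

```python
import numpy as np

from Korpora.task_parallel_corpus import Selector, check_corpus

np.random.seed(0)  # only affects Selector when sampling_ratio is set
selector = Selector(sampling_ratio=None, min_length=5, max_length=500)
print(selector.use('hi'))      # False: 2 characters, below min_length
print(selector.use('a' * 50))  # True: within both bounds, no sampling applied

# 'aihub_translation' is an umbrella name that expands to the six AI Hub corpora
print(check_corpus(['aihub_translation']))
```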
26 changes: 26 additions & 0 deletions README.md
@@ -179,6 +179,18 @@ korpora lmdata \
--output_dir ~/works/lmdata
```

From the terminal, you can also build training data for a translation model.
The files are saved under the names `xx.source` and `xx.target`.
The output is written to `output_dir`.

```bash
korpora parallel \
--corpus aihub_translation open_subtitles \
--output_dir ~/parallel \
--min_length 5 \
--max_length 500
```

## License

- Korpora is licensed under the Creative Commons License (CCL) 4.0 [CC-BY](https://creativecommons.org/licenses/by/4.0). This license applies to the Korpora package and its components.
@@ -364,6 +376,7 @@ A sample command is as follows.
It simultaneously processes all corpora provided by `Korpora` and creates a single training dataset for a language model.
Downloading the corpus and preprocessing its text occur simultaneously as well.
If the corpus does not exist in the local directory, it is downloaded to `~/Korpora`.
It also provides simple length-filtering options (`min_length`, `max_length`).
A single output file named `all.train` will be created.
It is created within `output_dir`.

@@ -373,6 +386,19 @@ korpora lmdata \
--output_dir ~/works/lmdata
```

From your terminal, you can also create a dataset for training a translation model.
A sample command for creating this parallel corpus is as follows.
It also provides simple length-filtering options (`min_length`, `max_length`).
Two output files named `xx.source` and `xx.target` are created within `output_dir`.

```bash
korpora parallel \
--corpus aihub_translation open_subtitles \
--output_dir ~/parallel \
--min_length 5 \
--max_length 500
```
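Since `--save_each` is not passed above, all selected corpora are concatenated into one aligned pair of files; with `--save_each`, per-corpus `{name}.source` and `{name}.target` files are written instead. An illustrative look at the result (path as in the example):

```bash
ls ~/parallel
# all.source  all.target  (line i of all.source pairs with line i of all.target)
```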

## License

- Korpora is licensed under the Creative Commons License(CCL) 4.0 [CC-BY](https://creativecommons.org/licenses/by/4.0). This license covers the Korpora package and all of its components.
