diff --git a/Korpora/cli.py b/Korpora/cli.py
index 43ff928..cc795be 100644
--- a/Korpora/cli.py
+++ b/Korpora/cli.py
@@ -5,6 +5,7 @@
 from .loader import KORPUS_DESCRIPTION
 from .task_fetch import fetch
 from .task_lmdata import create_lmdata
+from .task_parallel_corpus import create_parallel_corpus
 
 
 def listup(args):
@@ -33,9 +34,9 @@ def main():
 
     # fetch
     parser_fetch = subparsers.add_parser('fetch', help='Fetch `corpus` to `root`')
-    parser_fetch.add_argument('--corpus', type=str, default='all', nargs='+', help='corpus name')
-    parser_fetch.add_argument('--root', type=str, default=None, help='path/to/Korpora/')
-    parser_fetch.add_argument('--force_download', dest='force_download', action='store_true')
+    parser_fetch.add_argument('--corpus', '-c', type=str, default='all', nargs='+', help='corpus name')
+    parser_fetch.add_argument('--root', '-r', type=str, default=None, help='path/to/Korpora/')
+    parser_fetch.add_argument('--force_download', '-f', dest='force_download', action='store_true')
     parser_fetch.set_defaults(func=fetch)
 
     # list
@@ -44,19 +45,33 @@ def main():
     # create language model train data
     parser_lmdata = subparsers.add_parser('lmdata', help='Create language model train data')
-    parser_lmdata.add_argument('--corpus', type=str, required=True, nargs='+', help='corpus names')
-    parser_lmdata.add_argument('--root_dir', type=str, default=None, help='path/to/Korpora')
-    parser_lmdata.add_argument('--output_dir', type=str, required=True, help='output file path')
+    parser_lmdata.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names')
+    parser_lmdata.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora')
+    parser_lmdata.add_argument('--output_dir', '-o', type=str, required=True, help='output file path')
     parser_lmdata.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio')
-    parser_lmdata.add_argument('--n_first_samples', type=int, default=None, help='Number of first samples')
+    parser_lmdata.add_argument('--head', type=int, default=None, help='Number of first samples')
     parser_lmdata.add_argument('--min_length', type=int, default=None, help='Mininum length of text')
     parser_lmdata.add_argument('--max_length', type=int, default=None, help='Maximum length of text')
     parser_lmdata.add_argument('--seed', type=int, default=None, help='Random seed')
-    parser_lmdata.add_argument('--force_download', dest='force_download', action='store_true')
+    parser_lmdata.add_argument('--force_download', '-f', dest='force_download', action='store_true')
     parser_lmdata.add_argument('--multilingual', dest='multilingual', action='store_true',
                                help='If True, make include train data foreign language text')
     parser_lmdata.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file')
     parser_lmdata.set_defaults(func=create_lmdata)
 
+    # create parallel corpus data
+    parser_parallel = subparsers.add_parser('parallel', help='Create parallel corpus data')
+    parser_parallel.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names')
+    parser_parallel.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora')
+    parser_parallel.add_argument('--output_dir', '-o', type=str, required=True, help='output file path')
+    parser_parallel.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio')
+    parser_parallel.add_argument('--head', type=int, default=None, help='Number of first samples')
+    parser_parallel.add_argument('--min_length', type=int, default=None, help='Minimum length of text')
+    parser_parallel.add_argument('--max_length', type=int, default=None, help='Maximum length of text')
+    parser_parallel.add_argument('--seed', type=int, default=None, help='Random seed')
+    parser_parallel.add_argument('--force_download', '-f', dest='force_download', action='store_true')
+    parser_parallel.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file')
+    parser_parallel.set_defaults(func=create_parallel_corpus)
+
     # Do task
     args = parser.parse_args()
     show_arguments(args)
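The new `parallel` subcommand mirrors the `lmdata` option set (minus `--multilingual`) and dispatches to `create_parallel_corpus` via `set_defaults`. For testing the task outside the CLI, a minimal sketch follows: it builds an `argparse.Namespace` whose fields mirror the options defined above and calls the task function directly. All values are illustrative, not part of this patch.

```python
# Hypothetical sketch: drive the new task without going through `korpora parallel`.
# Field names mirror the argparse options above; the values are made up.
from argparse import Namespace

from Korpora.task_parallel_corpus import create_parallel_corpus

args = Namespace(
    corpus=['korean_parallel_koen_news'],  # any name in ITERATE_TEXTS below
    root_dir=None,            # default Korpora root
    output_dir='./parallel',
    sampling_ratio=None,      # keep every pair that passes the length filter
    head=1000,                # stop after the first 1000 sampled pairs
    min_length=5,
    max_length=500,
    seed=None,
    force_download=False,
    save_each=False,          # single all.source / all.target output
)
create_parallel_corpus(args)
```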
diff --git a/Korpora/task_lmdata.py b/Korpora/task_lmdata.py
index d94cca1..8dd9051 100644
--- a/Korpora/task_lmdata.py
+++ b/Korpora/task_lmdata.py
@@ -15,7 +15,7 @@ def create_lmdata(args):
         sampling_ratio = float(sampling_ratio)
         if not (0 < sampling_ratio < 1):
             raise ValueError('`sampling_ratio` must be None or (0, 1) float')
-    n_first_samples = args.n_first_samples
+    n_first_samples = args.head
     np.random.seed(args.seed)
     selector = Selector(sampling_ratio, args.min_length, args.max_length)
 
diff --git a/Korpora/task_parallel_corpus.py b/Korpora/task_parallel_corpus.py
new file mode 100644
index 0000000..4bd47ab
--- /dev/null
+++ b/Korpora/task_parallel_corpus.py
@@ -0,0 +1,132 @@
+import numpy as np
+import os
+from tqdm import tqdm
+
+from .loader import Korpora
+from .utils import default_korpora_path
+
+
+def create_parallel_corpus(args):
+    corpus_names = check_corpus(args.corpus)
+    os.makedirs(os.path.abspath(args.output_dir), exist_ok=True)
+
+    sampling_ratio = args.sampling_ratio
+    if sampling_ratio is not None:
+        sampling_ratio = float(sampling_ratio)
+        if not (0 < sampling_ratio < 1):
+            raise ValueError('`sampling_ratio` must be None or (0, 1) float')
+    n_first_samples = args.head
+    np.random.seed(args.seed)
+    selector = Selector(sampling_ratio, args.min_length, args.max_length)
+
+    status = [['', name, ' - ', ''] for name in corpus_names]
+
+    for i_corpus, name in enumerate(corpus_names):
+        if not args.save_each and i_corpus > 0:
+            mode = 'a'
+        else:
+            mode = 'w'
+
+        source_filename = f'{name}.source' if args.save_each else 'all.source'
+        target_filename = f'{name}.target' if args.save_each else 'all.target'
+        source_corpus_path = f'{args.output_dir}/{source_filename}'
+        target_corpus_path = f'{args.output_dir}/{target_filename}'
+
+        pair_iterator = tqdm(
+            Korpora.load(name, root_dir=args.root_dir, force_download=args.force_download).train,
+            desc=f'Create train data from {name}'
+        )
+        print_status(status)
+
+        n_sampled = 0
+        fs = open(source_corpus_path, mode, encoding='utf-8')
+        ft = open(target_corpus_path, mode, encoding='utf-8')
+        for i_sent, pair in enumerate(pair_iterator):
+            if not selector.use(pair.text) or not selector.use(pair.pair):
+                continue
+            source = pair.text.replace('\n', ' ')
+            target = pair.pair.replace('\n', ' ')
+            fs.write(f'{source}\n')
+            ft.write(f'{target}\n')
+            n_sampled += 1
+            if (n_first_samples is not None) and (n_first_samples <= n_sampled):
+                break
+        fs.close()
+        ft.close()
+
+        status[i_corpus][0] = ' x '
+        status[i_corpus][2] = n_sampled
+        status[i_corpus][3] = f'{source_filename} & *.target'
+        print_status(status)
+
+
+class Selector:
+    def __init__(self, sampling_ratio, min_length, max_length):
+        if isinstance(min_length, int) and min_length < 0:
+            min_length = None
+        if isinstance(max_length, int) and max_length < 0:
+            max_length = None
+        self.sampling_ratio = sampling_ratio
+        self.min_length = min_length
+        self.max_length = max_length
+
+    def use(self, text):
+        length = len(text)
+        if (self.min_length is not None) and (length < self.min_length):
+            return False
+        if (self.max_length is not None) and (length > self.max_length):
+            return False
+        if self.sampling_ratio is None:
+            return True
+        return np.random.rand() < self.sampling_ratio
+
+
+def check_corpus(corpus_names):
+    if (corpus_names == 'all') or (corpus_names[0] == 'all'):
+        corpus_names = list(ITERATE_TEXTS)
+    if isinstance(corpus_names, str):
+        corpus_names = [corpus_names]
+    available = []
+    for name in corpus_names:
+        if name not in ITERATE_TEXTS:
+            print(f'{name} corpus not provided. Check the `corpus` argument')
+            continue
+        available.append(name)
+    if 'aihub_translation' in available:
+        available = [name for name in available if (name[:6] != 'aihub_')]
+        available = ['aihub_spoken_translation',
+                     'aihub_conversation_translation',
+                     'aihub_news_translation',
+                     'aihub_korean_culture_translation',
+                     'aihub_decree_translation',
+                     'aihub_government_website_translation'
+                     ] + available
+    if not available:
+        raise ValueError('No valid corpus name found. Check the `corpus` argument')
+    return available
+
+
+def print_status(status):
+    max_len = max(max(len(row[3]) for row in status), 9)
+    form = '| {:4} | {:40} | {:10} | {} |'
+    print('\n\n' + form.format('Done', 'Corpus name', 'Num pairs', 'File name' + ' ' * (max_len - 9)))
+    print(form.format('-' * 4, '-' * 40, '-' * 10, '-' * max_len))
+    for finish, name, num_pairs, filename in status:
+        if not filename:
+            filename = ' ' * max_len
+        else:
+            filename += ' ' * (max_len - len(filename))
+        print(form.format(finish, name, num_pairs, filename))
+
+
+ITERATE_TEXTS = {
+    'aihub_translation',
+    'aihub_spoken_translation',
+    'aihub_conversation_translation',
+    'aihub_news_translation',
+    'aihub_korean_culture_translation',
+    'aihub_decree_translation',
+    'aihub_government_website_translation',
+    'korean_parallel_koen_news',
+    'open_subtitles'
+}
\ No newline at end of file
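Two behaviors of this new module are worth spelling out: `check_corpus` treats `aihub_translation` as an umbrella name and expands it into the six AI Hub sub-corpora, and `Selector.use` applies the length filter before the optional random sampling, so a pair survives only if both of its sides pass. A small illustrative session (the sentences are made up):

```python
from Korpora.task_parallel_corpus import Selector, check_corpus

# 'aihub_translation' is replaced by its six aihub_*_translation sub-corpora.
print(check_corpus(['aihub_translation', 'open_subtitles']))

# With sampling_ratio=None, `use` reduces to a pure length filter.
selector = Selector(sampling_ratio=None, min_length=5, max_length=500)
print(selector.use('Hello world'))  # True: 11 characters fall inside [5, 500]
print(selector.use('Hi'))           # False: shorter than min_length
```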
diff --git a/README.md b/README.md
index 5e2cd4c..3239d9e 100644
--- a/README.md
+++ b/README.md
@@ -179,6 +179,18 @@ korpora lmdata \
     --output_dir ~/works/lmdata
 ```
 
+You can create training data for a translation model from the terminal.
+The files are saved under the names `xx.source` and `xx.target`.
+The results are stored in `output_dir`.
+
+```bash
+korpora parallel \
+    --corpus aihub_translation open_subtitles \
+    --output_dir ~/parallel \
+    --min_length 5 \
+    --max_length 500
+```
+
 ## License
 
 - Korpora is licensed under the Creative Commons License(CCL) 4.0 [CC-BY](https://creativecommons.org/licenses/by/4.0). This license covers the Korpora package and its components.
@@ -364,6 +376,7 @@
 A sample command is as follows.
 It simultaneously processes all corpora provided by `Korpora` and creates a single training dataset for a language model.
 Downloading the corpus and preprocessing its text occur simultaneously as well.
 If the corpus does not exist in the local directory, it is downloaded to `~/Korpora`.
+It also provides simple length-filtering options (`min_length`, `max_length`).
 A single output file named `all.train` will be created.
 It is created within `output_dir`.
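To make the new filters concrete, here is a hypothetical `lmdata` invocation; the command is illustrative and assumes `all` is accepted for `--corpus`, as it is in the `parallel` task's `check_corpus`:

```bash
# Hypothetical: build all.train while dropping texts shorter than 5
# or longer than 500 characters.
korpora lmdata \
    --corpus all \
    --output_dir ~/works/lmdata \
    --min_length 5 \
    --max_length 500
```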
@@ -373,6 +386,19 @@ korpora lmdata \
     --output_dir ~/works/lmdata
 ```
 
+From your terminal, you can also create a dataset for training a translation model.
+A sample command for creating this parallel corpus is as follows.
+It also provides simple length-filtering options (`min_length`, `max_length`).
+Two output files named `xx.source` and `xx.target` are created within `output_dir`.
+
+```bash
+korpora parallel \
+    --corpus aihub_translation open_subtitles \
+    --output_dir ~/parallel \
+    --min_length 5 \
+    --max_length 500
+```
+
 ## License
 
 - Korpora is licensed under the Creative Commons License(CCL) 4.0 [CC-BY](https://creativecommons.org/licenses/by/4.0). This license covers the Korpora package and all of its components.
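The `parallel` command also supports flags the README samples do not cover. For instance, a hypothetical run with `--save_each`, which (per `create_parallel_corpus` above) writes one source/target file pair per corpus instead of a single `all.source`/`all.target` pair:

```bash
# Hypothetical: per-corpus outputs, e.g. open_subtitles.source / open_subtitles.target
korpora parallel \
    --corpus korean_parallel_koen_news open_subtitles \
    --output_dir ~/parallel \
    --save_each
```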