
Merge pull request #183 from ko-nlp/dev#135
Dev#135
lovit authored Jan 18, 2021
2 parents 4456c94 + daad8ca commit 5282935
Showing 4 changed files with 182 additions and 9 deletions.
31 changes: 23 additions & 8 deletions Korpora/cli.py
@@ -5,6 +5,7 @@
from .loader import KORPUS_DESCRIPTION
from .task_fetch import fetch
from .task_lmdata import create_lmdata
+from .task_parallel_corpus import create_parallel_corpus


def listup(args):
@@ -33,9 +34,9 @@ def main():

# fetch
parser_fetch = subparsers.add_parser('fetch', help='Fetch `corpus` to `root`')
-parser_fetch.add_argument('--corpus', type=str, default='all', nargs='+', help='corpus name')
-parser_fetch.add_argument('--root', type=str, default=None, help='path/to/Korpora/')
-parser_fetch.add_argument('--force_download', dest='force_download', action='store_true')
+parser_fetch.add_argument('--corpus', '-c', type=str, default='all', nargs='+', help='corpus name')
+parser_fetch.add_argument('--root', '-r', type=str, default=None, help='path/to/Korpora/')
+parser_fetch.add_argument('--force_download', '-f', dest='force_download', action='store_true')
parser_fetch.set_defaults(func=fetch)

# list
@@ -44,19 +45,33 @@ def main():

# create language model train data
parser_lmdata = subparsers.add_parser('lmdata', help='Create language model train data')
-parser_lmdata.add_argument('--corpus', type=str, required=True, nargs='+', help='corpus names')
-parser_lmdata.add_argument('--root_dir', type=str, default=None, help='path/to/Korpora')
-parser_lmdata.add_argument('--output_dir', type=str, required=True, help='output file path')
+parser_lmdata.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names')
+parser_lmdata.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora')
+parser_lmdata.add_argument('--output_dir', '-o', type=str, required=True, help='output file path')
parser_lmdata.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio')
-parser_lmdata.add_argument('--n_first_samples', type=int, default=None, help='Number of first samples')
+parser_lmdata.add_argument('--head', type=int, default=None, help='Number of first samples')
parser_lmdata.add_argument('--min_length', type=int, default=None, help='Minimum length of text')
parser_lmdata.add_argument('--max_length', type=int, default=None, help='Maximum length of text')
parser_lmdata.add_argument('--seed', type=int, default=None, help='Random seed')
-parser_lmdata.add_argument('--force_download', dest='force_download', action='store_true')
+parser_lmdata.add_argument('--force_download', '-f', dest='force_download', action='store_true')
parser_lmdata.add_argument('--multilingual', dest='multilingual', action='store_true', help='If set, include foreign-language text in the train data')
parser_lmdata.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file')
parser_lmdata.set_defaults(func=create_lmdata)

+# create parallel corpus data
+parser_parallel = subparsers.add_parser('parallel', help='Create parallel corpus data')
+parser_parallel.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names')
+parser_parallel.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora')
+parser_parallel.add_argument('--output_dir', '-o', type=str, required=True, help='output file path')
+parser_parallel.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio')
+parser_parallel.add_argument('--head', type=int, default=None, help='Number of first samples')
+parser_parallel.add_argument('--min_length', type=int, default=None, help='Minimum length of text')
+parser_parallel.add_argument('--max_length', type=int, default=None, help='Maximum length of text')
+parser_parallel.add_argument('--seed', type=int, default=None, help='Random seed')
+parser_parallel.add_argument('--force_download', '-f', dest='force_download', action='store_true')
+parser_parallel.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file')
+parser_parallel.set_defaults(func=create_parallel_corpus)

# Do task
args = parser.parse_args()
show_arguments(args)
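The upshot of the `cli.py` changes: the I/O options gain single-letter aliases (`-c`, `-r`, `-o`, `-f`) and a new `parallel` subcommand is registered. A minimal sketch of the shortened `fetch` invocation (the root path is illustrative):

```bash
# Same as: korpora fetch --corpus all --root ~/Korpora --force_download
korpora fetch -c all -r ~/Korpora -f
```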
2 changes: 1 addition & 1 deletion Korpora/task_lmdata.py
@@ -15,7 +15,7 @@ def create_lmdata(args):
sampling_ratio = float(sampling_ratio)
if not (0 < sampling_ratio < 1):
raise ValueError('`sampling_ratio` must be None or (0, 1) float')
-n_first_samples = args.n_first_samples
+n_first_samples = args.head
np.random.seed(args.seed)
selector = Selector(sampling_ratio, args.min_length, args.max_length)

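This one-line change follows the CLI rename of `--n_first_samples` to `--head`; the value still caps how many samples are written. An illustrative call (corpus name and output path are placeholders):

```bash
korpora lmdata \
    --corpus kcbert \
    --output_dir ~/works/lmdata \
    --head 10000
```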
132 changes: 132 additions & 0 deletions Korpora/task_parallel_corpus.py
@@ -0,0 +1,132 @@
import numpy as np
import os
from tqdm import tqdm

from .loader import Korpora
from .utils import default_korpora_path


def create_parallel_corpus(args):
corpus_names = check_corpus(args.corpus)
os.makedirs(os.path.abspath(args.output_dir), exist_ok=True)

sampling_ratio = args.sampling_ratio
if sampling_ratio is not None:
sampling_ratio = float(sampling_ratio)
if not (0 < sampling_ratio < 1):
raise ValueError('`sampling_ratio` must be None or (0, 1) float')
n_first_samples = args.head
np.random.seed(args.seed)
selector = Selector(sampling_ratio, args.min_length, args.max_length)

status = [['', name, ' - ', ''] for name in corpus_names]

for i_corpus, name in enumerate(corpus_names):
if not args.save_each and i_corpus > 0:
mode = 'a'
else:
mode = 'w'

source_filename = f'{name}.source' if args.save_each else 'all.source'
target_filename = f'{name}.target' if args.save_each else 'all.target'
source_corpus_path = f'{args.output_dir}/{source_filename}'
target_corpus_path = f'{args.output_dir}/{target_filename}'

pair_iterator = tqdm(
Korpora.load(name, root_dir=args.root_dir, force_download=args.force_download).train,
desc=f'Create train data from {name}'
)
print_status(status)

n_sampled = 0
fs = open(source_corpus_path, mode, encoding='utf-8')
ft = open(target_corpus_path, mode, encoding='utf-8')
for i_sent, pair in enumerate(pair_iterator):
if not selector.use(pair.text) or not selector.use(pair.pair):
continue
source = pair.text.replace('\n', ' ')
target = pair.pair.replace('\n', ' ')
fs.write(f'{source}\n')
ft.write(f'{target}\n')
n_sampled += 1
if (n_first_samples is not None) and (n_first_samples <= n_sampled):
break
fs.close()
ft.close()

status[i_corpus][0] = ' x '
status[i_corpus][2] = n_sampled
status[i_corpus][3] = f'{source_filename} & *.target'
print_status(status)


class Selector:
def __init__(self, sampling_ratio, min_length, max_length):
if isinstance(min_length, int) and min_length < 0:
min_length = None
if isinstance(max_length, int) and max_length < 0:
max_length = None
self.sampling_ratio = sampling_ratio
self.min_length = min_length
self.max_length = max_length

def use(self, text):
length = len(text)
if (self.min_length is not None) and (length < self.min_length):
return False
if (self.max_length is not None) and (length > self.max_length):
return False
if self.sampling_ratio is None:
return True
return np.random.rand() < self.sampling_ratio


def check_corpus(corpus_names):
if (corpus_names == 'all') or (corpus_names[0] == 'all'):
corpus_names = list(ITERATE_TEXTS)
if isinstance(corpus_names, str):
corpus_names = [corpus_names]
available = []
for name in corpus_names:
if name not in ITERATE_TEXTS:
print(f'{name} corpus not provided. Check the `corpus` argument')
continue
available.append(name)
if 'aihub_translation' in available:
available = [name for name in available if (name[:6] != 'aihub_')]
available = ['aihub_spoken_translation',
'aihub_conversation_translation',
'aihub_news_translation',
'aihub_korean_culture_translation',
'aihub_decree_translation',
'aihub_government_website_translation'
] + available
if not available:
raise ValueError('No available corpus found. Check the `corpus` argument')
return available


def print_status(status):
max_len = max(max(len(row[3]) for row in status), 9)
form = '| {:4} | {:40} | {:10} | {} |'
print('\n\n' + form.format('Done', 'Corpus name', 'Num pairs', 'File name' + ' ' * (max_len - 9)))
print(form.format('-' * 4, '-' * 40, '-' * 10, '-' * max_len))
for finish, name, num_pairs, filename in status:
if not filename:
filename = ' ' * max_len
else:
filename += ' ' * (max_len - len(filename))
print(form.format(finish, name, num_pairs, filename))


ITERATE_TEXTS = {
'aihub_translation',
'aihub_spoken_translation',
'aihub_conversation_translation',
'aihub_news_translation',
'aihub_korean_culture_translation',
'aihub_decree_translation',
'aihub_government_website_translation',
'korean_parallel_koen_news',
'open_subtitles'
}
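A quick sketch of how the two helpers above behave, assuming this branch of `Korpora` is installed:

```python
import numpy as np

from Korpora.task_parallel_corpus import Selector, check_corpus

np.random.seed(0)  # only affects Selector when sampling_ratio is set
selector = Selector(sampling_ratio=None, min_length=5, max_length=500)
print(selector.use('hi'))      # False: 2 characters, below min_length
print(selector.use('a' * 50))  # True: within both bounds, no sampling applied

# 'aihub_translation' is an umbrella name that expands to the six AI Hub corpora
print(check_corpus(['aihub_translation']))
```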
26 changes: 26 additions & 0 deletions README.md
@@ -179,6 +179,18 @@ korpora lmdata \
--output_dir ~/works/lmdata
```

From the terminal, you can also build training data for a translation model.
The files are saved under the names `xx.source` and `xx.target`.
The output is written to `output_dir`.

```bash
korpora parallel \
--corpus aihub_translation open_subtitles \
--output_dir ~/parallel \
--min_length 5 \
--max_length 500
```

## License

- Korpora is licensed under the Creative Commons License (CCL) 4.0 [CC-BY](https://creativecommons.org/licenses/by/4.0). This license applies to the Korpora package and its components.
@@ -364,6 +376,7 @@ A sample command is as follows.
It simultaneously processes all corpora provided by `Korpora` and creates a single training dataset for a language model.
Downloading the corpus and preprocessing its text occur simultaneously as well.
If the corpus does not exist in the local directory, it is downloaded to `~/Korpora`.
It also provides simple length-filtering options (`min_length`, `max_length`).
A single output file named `all.train` will be created.
It is created within `output_dir`.

@@ -373,6 +386,19 @@ korpora lmdata \
--output_dir ~/works/lmdata
```

From your terminal, you can also create a dataset for training a translation model.
A sample command for creating this parallel corpus is as follows.
It also provides simple length-filtering options (`min_length`, `max_length`).
Two output files named `xx.source` and `xx.target` are created within `output_dir`.

```bash
korpora parallel \
--corpus aihub_translation open_subtitles \
--output_dir ~/parallel \
--min_length 5 \
--max_length 500
```
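Since `--save_each` is not passed above, all selected corpora are concatenated into one aligned pair of files; with `--save_each`, per-corpus `{name}.source` and `{name}.target` files are written instead. An illustrative look at the result (path as in the example):

```bash
ls ~/parallel
# all.source  all.target  (line i of all.source pairs with line i of all.target)
```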

## License

- Korpora is licensed under the Creative Commons License(CCL) 4.0 [CC-BY](https://creativecommons.org/licenses/by/4.0). This license covers the Korpora package and all of its components.
