-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Dev#135 #183
Dev#135 #183
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ | |
from .loader import KORPUS_DESCRIPTION | ||
from .task_fetch import fetch | ||
from .task_lmdata import create_lmdata | ||
from .task_parallel_corpus import create_parallel_corpus | ||
|
||
|
||
def listup(args): | ||
|
@@ -33,9 +34,9 @@ def main(): | |
|
||
# fetch | ||
parser_fetch = subparsers.add_parser('fetch', help='Fetch `corpus` to `root`') | ||
parser_fetch.add_argument('--corpus', type=str, default='all', nargs='+', help='corpus name') | ||
parser_fetch.add_argument('--root', type=str, default=None, help='path/to/Korpora/') | ||
parser_fetch.add_argument('--force_download', dest='force_download', action='store_true') | ||
parser_fetch.add_argument('--corpus', '-c', type=str, default='all', nargs='+', help='corpus name') | ||
parser_fetch.add_argument('--root', '-r', type=str, default=None, help='path/to/Korpora/') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 저희 패키지 전체적으로 |
||
parser_fetch.add_argument('--force_download', '-f', dest='force_download', action='store_true') | ||
parser_fetch.set_defaults(func=fetch) | ||
|
||
# list | ||
|
@@ -44,19 +45,33 @@ def main(): | |
|
||
# create language model train data | ||
parser_lmdata = subparsers.add_parser('lmdata', help='Create language model train data') | ||
parser_lmdata.add_argument('--corpus', type=str, required=True, nargs='+', help='corpus names') | ||
parser_lmdata.add_argument('--root_dir', type=str, default=None, help='path/to/Korpora') | ||
parser_lmdata.add_argument('--output_dir', type=str, required=True, help='output file path') | ||
parser_lmdata.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 저희 패키지 전체적으로 |
||
parser_lmdata.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora') | ||
parser_lmdata.add_argument('--output_dir', '-o', type=str, required=True, help='output file path') | ||
parser_lmdata.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio') | ||
parser_lmdata.add_argument('--n_first_samples', type=int, default=None, help='Number of first samples') | ||
parser_lmdata.add_argument('--head', type=int, default=None, help='Number of first samples') | ||
parser_lmdata.add_argument('--min_length', type=int, default=None, help='Mininum length of text') | ||
parser_lmdata.add_argument('--max_length', type=int, default=None, help='Maximum length of text') | ||
parser_lmdata.add_argument('--seed', type=int, default=None, help='Random seed') | ||
parser_lmdata.add_argument('--force_download', dest='force_download', action='store_true') | ||
parser_lmdata.add_argument('--force_download', '-f', dest='force_download', action='store_true') | ||
parser_lmdata.add_argument('--multilingual', dest='multilingual', action='store_true', help='If True, make include train data foreign language text') | ||
parser_lmdata.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file') | ||
parser_lmdata.set_defaults(func=create_lmdata) | ||
|
||
# create parallel corpus data | ||
parser_parallel = subparsers.add_parser('parallel', help='Create parallel corpus data') | ||
parser_parallel.add_argument('--corpus', '-c', type=str, required=True, nargs='+', help='corpus names') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 저희 패키지 전체적으로 |
||
parser_parallel.add_argument('--root_dir', '-r', type=str, default=None, help='path/to/Korpora') | ||
parser_parallel.add_argument('--output_dir', '-o', type=str, required=True, help='output file path') | ||
parser_parallel.add_argument('--sampling_ratio', type=float, default=None, help='Sampling ratio') | ||
parser_parallel.add_argument('--head', type=int, default=None, help='Number of first samples') | ||
parser_parallel.add_argument('--min_length', type=int, default=None, help='Mininum length of text') | ||
parser_parallel.add_argument('--max_length', type=int, default=None, help='Maximum length of text') | ||
parser_parallel.add_argument('--seed', type=int, default=None, help='Random seed') | ||
parser_parallel.add_argument('--force_download', '-f', dest='force_download', action='store_true') | ||
parser_parallel.add_argument('--save_each', dest='save_each', action='store_true', help='store each corpus as a file') | ||
parser_parallel.set_defaults(func=create_parallel_corpus) | ||
|
||
# Do task | ||
args = parser.parse_args() | ||
show_arguments(args) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
import numpy as np | ||
import os | ||
from tqdm import tqdm | ||
|
||
from .loader import Korpora | ||
from .utils import default_korpora_path | ||
|
||
|
||
def create_parallel_corpus(args):
    """Write source/target text files for the requested parallel corpora.

    For every corpus name accepted by ``check_corpus`` the ``train`` split is
    loaded through ``Korpora.load`` and each selected pair is written as one
    line to a ``*.source`` file (``pair.text``) and one line to a ``*.target``
    file (``pair.pair``). Newlines inside a sentence are replaced by spaces so
    that the two files stay line-aligned.

    Args:
        args: parsed command line namespace. The attributes read here are
            ``corpus``, ``output_dir``, ``root_dir``, ``force_download``,
            ``sampling_ratio``, ``head``, ``min_length``, ``max_length``,
            ``seed`` and ``save_each``.

    Raises:
        ValueError: if ``sampling_ratio`` is given but not inside (0, 1), or
            if ``check_corpus`` rejects every requested corpus name.
    """
    corpus_names = check_corpus(args.corpus)
    os.makedirs(os.path.abspath(args.output_dir), exist_ok=True)

    sampling_ratio = args.sampling_ratio
    if sampling_ratio is not None:
        sampling_ratio = float(sampling_ratio)
        if not (0 < sampling_ratio < 1):
            raise ValueError('`sampling_ratio` must be None or (0, 1) float')
    n_first_samples = args.head
    np.random.seed(args.seed)
    # The selector is used for the length checks only. Sampling is drawn once
    # per pair further down; asking the selector to sample both sides would
    # keep a pair with probability `sampling_ratio ** 2`.
    selector = Selector(None, args.min_length, args.max_length)

    status = [['', name, ' - ', ''] for name in corpus_names]

    for i_corpus, name in enumerate(corpus_names):
        # When everything goes into one shared file, only the first corpus
        # truncates it; the following ones append.
        mode = 'a' if (not args.save_each and i_corpus > 0) else 'w'

        source_filename = f'{name}.source' if args.save_each else 'all.source'
        target_filename = f'{name}.target' if args.save_each else 'all.target'
        source_corpus_path = os.path.join(args.output_dir, source_filename)
        target_corpus_path = os.path.join(args.output_dir, target_filename)

        pair_iterator = tqdm(
            Korpora.load(name, root_dir=args.root_dir, force_download=args.force_download).train,
            desc=f'Create train data from {name}'
        )
        print_status(status)

        n_sampled = 0
        # Context managers guarantee both files are closed even if loading or
        # writing raises part-way through.
        with open(source_corpus_path, mode, encoding='utf-8') as fs, \
                open(target_corpus_path, mode, encoding='utf-8') as ft:
            for pair in pair_iterator:
                if not (selector.use(pair.text) and selector.use(pair.pair)):
                    continue
                if (sampling_ratio is not None) and not (np.random.rand() < sampling_ratio):
                    continue
                source = pair.text.replace('\n', ' ')
                target = pair.pair.replace('\n', ' ')
                fs.write(f'{source}\n')
                ft.write(f'{target}\n')
                n_sampled += 1
                if (n_first_samples is not None) and (n_first_samples <= n_sampled):
                    break

        status[i_corpus][0] = ' x '
        status[i_corpus][2] = n_sampled
        status[i_corpus][3] = f'{source_filename} & *.target'
    print_status(status)
|
||
|
||
class Selector:
    """Decide whether a text is kept, based on its length and random sampling.

    Args:
        sampling_ratio: ``None`` to keep every text that passes the length
            checks, otherwise the threshold compared against
            ``np.random.rand()``.
        min_length: minimum ``len(text)``; ``None`` or a negative int
            disables the check.
        max_length: maximum ``len(text)``; ``None`` or a negative int
            disables the check.
    """

    def __init__(self, sampling_ratio, min_length, max_length):
        self.sampling_ratio = sampling_ratio
        # A negative integer bound means "no bound".
        self.min_length = (
            None if (isinstance(min_length, int) and min_length < 0) else min_length
        )
        self.max_length = (
            None if (isinstance(max_length, int) and max_length < 0) else max_length
        )

    def use(self, text):
        """Return a truthy value when ``text`` should be kept."""
        n_chars = len(text)
        too_short = (self.min_length is not None) and (n_chars < self.min_length)
        too_long = (self.max_length is not None) and (n_chars > self.max_length)
        if too_short or too_long:
            return False
        if self.sampling_ratio is None:
            return True
        return np.random.rand() < self.sampling_ratio
|
||
|
||
def check_corpus(corpus_names):
    """Validate the requested corpus names and return the usable ones.

    Args:
        corpus_names: a single name or a sequence of names. ``'all'`` (as the
            string itself or as the first element) selects every entry of
            ``ITERATE_TEXTS``.

    Returns:
        list of corpus names found in ``ITERATE_TEXTS``. If
        ``'aihub_translation'`` is among them, every ``aihub_*`` entry is
        replaced by the six individual AI Hub translation corpora, which are
        placed at the front of the list.

    Raises:
        ValueError: if no requested name is available (including empty or
            ``None`` input).
    """
    if isinstance(corpus_names, str):
        corpus_names = [corpus_names]
    # Empty / None input falls through to the ValueError below instead of
    # failing on `corpus_names[0]`.
    corpus_names = list(corpus_names) if corpus_names else []
    if corpus_names and corpus_names[0] == 'all':
        # Sorted so the processing order (and therefore the content of the
        # shared output file) does not depend on set iteration order.
        corpus_names = sorted(ITERATE_TEXTS)

    available = []
    for name in corpus_names:
        if name not in ITERATE_TEXTS:
            print(f'{name} corpus not provided. Check the `corpus` argument')
            continue
        available.append(name)

    if 'aihub_translation' in available:
        others = [name for name in available if not name.startswith('aihub_')]
        available = [
            'aihub_spoken_translation',
            'aihub_conversation_translation',
            'aihub_news_translation',
            'aihub_korean_culture_translation',
            'aihub_decree_translation',
            'aihub_government_website_translation',
        ] + others

    if not available:
        raise ValueError('Not found any proper corpus name. Check the `corpus` argument')
    return available
|
||
|
||
def print_status(status):
    """Print the progress table for the corpora being processed.

    Args:
        status: list of ``[done_mark, corpus_name, num_pairs, file_name]``
            rows. ``file_name`` must be a string (possibly empty). An empty
            list prints only the header and the separator line.
    """
    # The file-name column is at least as wide as its 9-character header.
    # `default=0` keeps an empty status list from raising in `max`.
    max_len = max(max((len(row[3]) for row in status), default=0), 9)
    form = '| {:4} | {:40} | {:10} | {} |'
    print('\n\n' + form.format('Done', 'Corpus name', 'Num pairs', 'File name'.ljust(max_len)))
    print(form.format('-' * 4, '-' * 40, '-' * 10, '-' * max_len))
    for finish, name, num_pairs, filename in status:
        # ljust pads both the empty and the non-empty file name to max_len.
        print(form.format(finish, name, num_pairs, filename.ljust(max_len)))
|
||
|
||
# Corpus names accepted by `check_corpus`: a requested name must be a member
# of this set, and the special name 'all' expands to every entry.
ITERATE_TEXTS = {
    'aihub_translation',
    'aihub_spoken_translation',
    'aihub_conversation_translation',
    'aihub_news_translation',
    'aihub_korean_culture_translation',
    'aihub_decree_translation',
    'aihub_government_website_translation',
    'korean_parallel_koen_news',
    'open_subtitles'
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
저희 패키지 전체적으로 `corpus_name`을 쓰고 있는 것으로 아는데 맞나요? 맞다면 이 역시 `corpus_name`으로 통일하면 어떨지 싶습니다.