Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unify variable name: root_dir_or_paths -> root_dir (#136) #149

Merged
merged 1 commit into from
Nov 2, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions Korpora/korpus_aihub_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,62 +55,62 @@


class AIHubTranslationKorpus(Korpus):
    """AIHub translation corpus exposed as a single ``train`` split.

    The constructor resolves where the corpus files live, collects them with
    ``find_corpus_paths`` and wraps whatever ``load_aihub_translation``
    returns in a ``SentencePairKorpusData`` stored on ``self.train``.

    Args:
        root_dir: Location of the corpus.
            * ``None`` -> ``default_korpora_path/AIHub_Translation/<prefix>``.
            * a string naming an existing directory -> that directory joined
              with ``prefix``.
            * anything else (e.g. a non-directory string or a collection of
              paths) is handed to ``find_corpus_paths`` unchanged.
        force_download: Accepted for signature compatibility; this
            constructor does not read it.
        prefix: Path component appended to the resolved directory. Subclasses
            pass wildcard patterns such as ``'1_spoken*'``.
        name: Corpus name; the split is labelled ``f'{name}.train'`` and the
            name is also forwarded to ``load_aihub_translation``.

    Raises:
        ValueError: Propagated from ``find_corpus_paths`` when no corpus file
            is found.
    """

    def __init__(self, root_dir=None, force_download=False, prefix='', name='AIHub_translation'):
        super().__init__(description, license)
        if root_dir is None:
            # No location given: fall back to the package-wide default folder.
            root_dir = os.path.join(default_korpora_path, 'AIHub_Translation', prefix)
        elif isinstance(root_dir, str) and os.path.isdir(root_dir):
            # Only an existing directory gets the prefix appended; other
            # inputs are passed through untouched.
            root_dir = os.path.join(root_dir, prefix)
        paths = find_corpus_paths(root_dir)
        self.train = SentencePairKorpusData(
            f'{name}.train',
            *load_aihub_translation(paths, name)
        )


class AIHubSpokenTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus selected with the ``'1_spoken*'`` path prefix.

    Args:
        root_dir: Forwarded to ``AIHubTranslationKorpus``.
        force_download: Forwarded to ``AIHubTranslationKorpus``.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='1_spoken*',
            name='AIHub_spoken_translation',
        )


class AIHubConversationTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus selected with the ``'2_conversation*'`` path prefix.

    Args:
        root_dir: Forwarded to ``AIHubTranslationKorpus``.
        force_download: Forwarded to ``AIHubTranslationKorpus``.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='2_conversation*',
            name='AIHub_conversation_translation',
        )


class AIHubNewsTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus selected with the ``'3_news*'`` path prefix.

    Args:
        root_dir: Forwarded to ``AIHubTranslationKorpus``.
        force_download: Forwarded to ``AIHubTranslationKorpus``.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='3_news*',
            name='AIHub_news_translation',
        )


class AIHubKoreanCultureTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus selected with the ``'4_korean_culture*'`` path prefix.

    Args:
        root_dir: Forwarded to ``AIHubTranslationKorpus``.
        force_download: Forwarded to ``AIHubTranslationKorpus``.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='4_korean_culture*',
            name='AIHub_korean_culture_translation',
        )


class AIHubDecreeTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus selected with the ``'5_decree*'`` path prefix.

    Args:
        root_dir: Forwarded to ``AIHubTranslationKorpus``.
        force_download: Forwarded to ``AIHubTranslationKorpus``.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='5_decree*',
            name='AIHub_decree_translation',
        )


class AIHubGovernmentWebsiteTranslationKorpus(AIHubTranslationKorpus):
    """AIHub translation corpus selected with the ``'6_government_website*'`` path prefix.

    Args:
        root_dir: Forwarded to ``AIHubTranslationKorpus``.
        force_download: Forwarded to ``AIHubTranslationKorpus``.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(
            root_dir,
            force_download,
            prefix='6_government_website*',
            name='AIHub_government_website_translation',
        )


def find_corpus_paths(root_dir, suffix='200226.xlsx'):
    """Return the corpus file paths under *root_dir* that end with *suffix*.

    Args:
        root_dir: Either a string or an iterable of candidate paths.
            A string is used as a glob pattern in two ways: ``root_dir``
            itself (so it may name a file or contain wildcards) and
            ``root_dir/*<suffix>`` (so it may name a directory). The combined
            matches are sorted. Any non-string value is treated as an
            iterable of paths and is filtered in its given order.
        suffix: Required ending of every returned path.

    Returns:
        list: The candidate paths whose text ends with ``suffix``.

    Raises:
        ValueError: If no candidate path ends with ``suffix``.
    """
    if isinstance(root_dir, str):
        # directory + wildcard
        candidates = sorted(glob(f'{root_dir}/*{suffix}') + glob(root_dir))
    else:
        candidates = root_dir

    # `endswith` honours the actual suffix length; the earlier fixed-width
    # slice (`path[-11:]`) only worked for suffixes of exactly 11 characters.
    paths = [path for path in candidates if path.endswith(suffix)]
    if not paths:
        raise ValueError('Not found corpus files. Check `root_dir`')
    return paths


Expand Down