#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Provenance: this is the file created as train/bin/map_char_to_tag.py
# (mode 100755) by commit 4c040535c7d3ac9caad31bb31f1cb1900983a8f4,
# authored by Jamie on Mon, 21 Jan 2019 16:59:24 +0900. Commit subject
# (translated from Korean): "Add a script that performs syllable-level
# alignment from the Sejong corpus #30".


"""
Align the raw text of each word with its morphemes and generate a
syllable-based training corpus.

__author__ = 'Jamie (jamie.lim@kakaocorp.com)'
__copyright__ = 'Copyright (C) 2019-, Kakao Corp. All rights reserved.'
"""


###########
# imports #
###########
from argparse import ArgumentParser, Namespace
from collections import defaultdict
import logging
import os
import random
import sys
from typing import Iterator, List, Optional, TextIO

from khaiii.munjong.sejong_corpus import Sentence, sents, Word
from khaiii.resource.char_align import Aligner, AlignError, MrpChr


#############
# variables #
#############
_MAP_DIC = defaultdict(list)    # char/tag => morpheme result mapping dictionary
_MAP_CASE = defaultdict(list)    # char/tag => case(word) dictionary


#############
# functions #
#############
def _print_restore_dic(args: Namespace):
    """
    Write the restore dictionary collected in _MAP_DIC, plus a companion
    '<restore_dic>.case' file listing the word that first produced each entry.
    Does nothing when args.restore_dic is not set.
    Args:
        args:  program arguments
    """
    if not args.restore_dic:
        return
    case_path = '{}.case'.format(args.restore_dic)
    with open(args.restore_dic, 'w', encoding='UTF-8') as fout, \
            open(case_path, 'w', encoding='UTF-8') as fcase:
        for (char, tag), vals in _MAP_DIC.items():
            # _MAP_CASE is appended to in lock-step with _MAP_DIC (see
            # _maps_to_tag), so the same index addresses the matching word.
            cases = _MAP_CASE[char, tag]
            for idx, val in enumerate(vals):
                key = '{}/{}:{}'.format(char, tag, idx)
                print('{}\t{}'.format(key, MrpChr.to_str(val)), file=fout)
                print('{}\t{}'.format(key, cases[idx]), file=fcase)


def sent_iter(args: Namespace) -> Iterator[Sentence]:
    """
    sentence generator over every '*.txt' file in args.corpus_dir, visited in
    sorted file-name order
    Args:
        args:  program arguments
    Yields:
        sentence
    """
    for name in sorted(os.listdir(args.corpus_dir)):
        if not name.endswith('.txt'):
            continue
        logging.info(name)
        path = os.path.join(args.corpus_dir, name)
        # Keep the file open only while its sentences are being consumed so
        # that handles do not pile up across a large corpus directory.
        with open(path, 'r', encoding='UTF-8') as fin:
            yield from sents(fin)


def _maps_to_tag(word: Word, maps: List[List[MrpChr]]) -> List[str]:
    """
    Generate the output tag for every syllable of a word from its mapping
    information. A syllable aligned to exactly one MrpChr with the same
    character gets that tag as is; any other alignment gets the joined tags
    plus the index of the alignment in _MAP_DIC ('TAG1:TAG2:IDX'), and new
    alignments are registered in _MAP_DIC / _MAP_CASE as a side effect.
    Args:
        word:  Word object
        maps:  per-syllable mapping information
    Returns:
        list of output tags
    """
    tags = []
    for char, align in zip(word.raw, maps):
        if len(align) == 1 and align[0].char == char:
            tags.append(align[0].tag)
            continue
        tag = ':'.join([mrp_chr.tag for mrp_chr in align])
        # _MAP_DIC is a defaultdict, so an unseen key yields an empty list and
        # the lookup below falls through to the "register new entry" path.
        vals = _MAP_DIC[char, tag]
        try:
            idx = vals.index(align)
        except ValueError:
            idx = len(vals)
            vals.append(align)
            _MAP_CASE[char, tag].append(word)
        tags.append('{}:{}'.format(tag, idx))
    return tags


def _print_sent(sent: Sentence, word_per_maps: List[List[List[MrpChr]]]):
    """
    Print one sentence to stdout, one 'raw<TAB>tags' line per word followed by
    a blank line. The sentence is printed only if every word has a non-empty
    mapping; otherwise nothing is printed.
    Args:
        sent:  Sentence object
        word_per_maps:  list holding the syllable mapping information of each word
    Raises:
        RuntimeError: if a word's mapping length differs from its raw length
    """
    lines = []
    has_error = False
    for word, maps in zip(sent.words, word_per_maps):
        if not maps:
            has_error = True
            logging.debug(word)
            continue
        if len(word.raw) != len(maps):
            raise RuntimeError('length of maps is different from length of word')
        # NOTE(review): _maps_to_tag() registers entries in the module-level
        # dictionaries even when another word causes this sentence to be
        # dropped -- confirm that is intended.
        lines.append('{}\t{}'.format(word.raw, ' '.join(_maps_to_tag(word, maps))))
    if not has_error:
        print('\n'.join(lines))
        print()


def _align_words(aligner: Aligner, sent: Sentence,
                 funmap: Optional[TextIO]) -> List[List[List[MrpChr]]]:
    """
    Align every word of a sentence; a word that fails to align gets an empty
    mapping and, when a log file is given, is reported there.
    Args:
        aligner:  Aligner object
        sent:  Sentence object
        funmap:  open log file for alignment failures, or None
    Returns:
        list holding the syllable mapping information of each word
    """
    word_per_maps = []
    for word in sent.words:
        try:
            maps = aligner.align(word)
        except AlignError as algn_err:
            if funmap is not None:
                algn_err.add_msg(str(word))
                print(algn_err, file=funmap)
            maps = []
        word_per_maps.append(maps)
    return word_per_maps


def run(args: Namespace):
    """
    run function which is the start point of program
    Args:
        args:  program arguments
    """
    aligner = Aligner(args.rsc_src)
    funmap = open(args.unmapped, 'w', encoding='UTF-8') if args.unmapped else None
    try:
        for sent in sent_iter(args):
            # Sampling only applies for a ratio strictly between 0 and 1;
            # any other value keeps every sentence.
            if 0.0 < args.sample < 1.0 and random.random() >= args.sample:
                continue
            _print_sent(sent, _align_words(aligner, sent, funmap))
    finally:
        # Close the log even when alignment or printing raises.
        if funmap is not None:
            funmap.close()

    _print_restore_dic(args)
    aligner.print_middle_cnt()


########
# main #
########
def main():
    """
    main function processes only argument parsing
    """
    parser = ArgumentParser(description='어절 내에서 원문과 형태소 사이에 정렬을 수행하고'
                                        ' 음절기반 학습 코퍼스를 생성')
    parser.add_argument('-c', '--corpus-dir', help='corpus dir', metavar='DIR', required=True)
    parser.add_argument('--rsc-src', help='train resource dir ', metavar='DIR',
                        default='../rsc/src')
    parser.add_argument('--output', help='output file ', metavar='FILE')
    parser.add_argument('--restore-dic', help='restore dic output file', metavar='FILE')
    parser.add_argument('--unmapped', help='unmapped log file', metavar='FILE')
    parser.add_argument('--sample', help='sampling ratio', metavar='REAL', type=float, default=1.0)
    parser.add_argument('--debug', help='enable debug', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)

    if not args.output:
        run(args)
        return

    # Redirect stdout to the output file for the duration of the run, then
    # restore it and close the file so the data is flushed deterministically.
    with open(args.output, 'w', encoding='UTF-8') as fout:
        saved_stdout, sys.stdout = sys.stdout, fout
        try:
            run(args)
        finally:
            sys.stdout = saved_stdout


if __name__ == '__main__':
    main()