# Origin: git patch 3c8ec2094303f214576d80ea49dc703d2a6c2991
# From: Michal Harakal, Date: Tue, 5 Oct 2021 11:29:54 +0200
# Subject: [PATCH] Add initial version of block splitter. Related-To: #17
# Creates transformer_tools/text_utils.py (this module) and an empty
# tests/test_block_split.py.

# Copyright (c) 2021 Michal Harakal, Deutsche Telekom AG
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT


"""Text utils."""


class BlockSplitter:
    """Split longer texts into blocks of complete sentences.

    The length of each block is limited by a given number of characters.
    """

    def __init__(self, is_german: bool = True) -> None:
        """Create the SoMaJo tokenizer used for sentence splitting.

        Args:
            is_german: Indicates whether the German model ("de_CMC") has to be
                loaded for sentence splitting with SoMaJo. Otherwise the
                English model ("en_PTB") is loaded.
        """
        # Imported here so that importing this module does not require SoMaJo;
        # only constructing a splitter does.
        from somajo import SoMaJo

        self.tokenizer = (
            SoMaJo("de_CMC", split_camel_case=True)
            if is_german
            else SoMaJo(language="en_PTB")
        )

    def split_text_to_blocks(self, text: str, block_size: int = 800) -> list[str]:
        """Split text into blocks whose length is limited by ``block_size``.

        Sentences are produced by ``self.tokenizer``. The tokens of a sentence
        are joined with single spaces, "." tokens are dropped, and the
        sentences inside one block are joined with ". ".

        Args:
            text: Text to be split.
            block_size: Desired maximal block length in characters, including
                the ". " separators between sentences.

        Returns:
            The blocks in reading order. Sentences are never cut, so a single
            sentence longer than ``block_size`` forms a block of its own that
            exceeds the limit. Empty blocks are never returned.
        """
        separator = ". "
        blocks: list[str] = []
        current: list[str] = []  # sentences collected for the block in progress
        current_length = 0  # length of separator.join(current)

        for sentence_tokens in self.tokenizer.tokenize_text([text]):
            # Join the tokens of one sentence into a single string.
            sentence_text = " ".join(
                token.text for token in sentence_tokens if token.text != "."
            )
            if not sentence_text:
                # Nothing left after dropping "." tokens; skipping avoids a
                # dangling separator.
                continue

            if not current:
                # The first sentence of a block is taken without a separator,
                # even if it is longer than block_size on its own.
                current = [sentence_text]
                current_length = len(sentence_text)
                continue

            # Account for the separator so the joined block respects the limit.
            grown_length = current_length + len(separator) + len(sentence_text)
            if grown_length <= block_size:
                current.append(sentence_text)
                current_length = grown_length
            else:
                blocks.append(separator.join(current))
                current = [sentence_text]
                current_length = len(sentence_text)

        if current:
            blocks.append(separator.join(current))
        return blocks