From 3c8ec2094303f214576d80ea49dc703d2a6c2991 Mon Sep 17 00:00:00 2001
From: Michal Harakal <michal.harakal@googlemail.com>
Date: Tue, 5 Oct 2021 11:29:54 +0200
Subject: [PATCH] Add initial version of block splitter.

Related-To: #17
---
 tests/test_block_split.py       |  0
 transformer_tools/text_utils.py | 57 +++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 tests/test_block_split.py
 create mode 100644 transformer_tools/text_utils.py

diff --git a/tests/test_block_split.py b/tests/test_block_split.py
new file mode 100644
index 0000000..e69de29
diff --git a/transformer_tools/text_utils.py b/transformer_tools/text_utils.py
new file mode 100644
index 0000000..2bd0d5f
--- /dev/null
+++ b/transformer_tools/text_utils.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 Michal Harakal, Deutsche Telekom AG
+# This software is distributed under the terms of the MIT license
+# which is available at https://opensource.org/licenses/MIT
+
+
+"""Text utils."""
+
+from somajo import SoMaJo
+
+
+class BlockSplitter:
+    """Split longer texts into blocks of complete sentences limited by a given character length."""
+
+    def __init__(self, is_german: bool = True) -> None:
+        """Create the SoMaJo tokenizer used for sentence splitting.
+
+        Args:
+            is_german: If true, load the German "de_CMC" SoMaJo model (with
+                camel-case splitting); otherwise load the English "en_PTB" model.
+        """
+        self.tokenizer = (
+            SoMaJo("de_CMC", split_camel_case=True)
+            if is_german
+            else SoMaJo(language="en_PTB")
+        )
+
+    def split_text_to_blocks(self, text: str, block_size: int = 800) -> list[str]:
+        """Split text into blocks of whole sentences, each at most "block_size" long.
+
+        Tokens are joined with spaces ("." tokens dropped), sentences with ". ".
+        A sentence longer than "block_size" is kept whole as its own block.
+
+        Args:
+            text: Text to be split.
+            block_size: Desired maximal block length in characters.
+
+        Returns:
+            The blocks in text order; an empty list if no sentences are found.
+        """
+        separator = ". "
+        result = []
+        block = ""
+        for sentence_tokens in self.tokenizer.tokenize_text([text]):
+            # "." tokens are left out; the separator stands in for them
+            sentence_text = " ".join(t.text for t in sentence_tokens if t.text != ".")
+            if not block:
+                # a block's first sentence gets no separator (no leading ". ")
+                block = sentence_text
+            elif len(block) + len(separator) + len(sentence_text) <= block_size:
+                # count the separator too, so the block really fits the limit
+                block = block + separator + sentence_text
+            else:
+                result.append(block)
+                block = sentence_text
+        if block:
+            result.append(block)
+        return result