Skip to content

Commit

Permalink
Add initial version of block splitter.
Browse files Browse the repository at this point in the history
Related-To: telekom#17
  • Loading branch information
michalharakal committed Oct 5, 2021
1 parent a85e0e8 commit 3c8ec20
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 0 deletions.
Empty file added tests/test_block_split.py
Empty file.
57 changes: 57 additions & 0 deletions transformer_tools/text_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) 2021 Michal Harakal, Deutsche Telekom AG
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT


"""Text utils."""

from somajo import SoMaJo


class BlockSplitter:
"""Splittinglongers texts into a blocks with complete sentences limited by gived character length."""

def __init__(self, is_german: bool = True) -> None:
"""Constructor.
Args:
is_german: Indicate if a german vocabulary has to be loaded for senteces
splitting with SoMaJo. Otherwise load english.
"""
self.tokenizer = (
SoMaJo("de_CMC", split_camel_case=True)
if is_german
else SoMaJo(language="en_PTB")
)

def split_text_to_blocks(self, text: str, block_size: int = 800) -> list[str]:
"""Split text to blocks with length limited be "block_size".
Args:
text: Text to be splitted.
block_size: desired maximal block length.
"""
# tokenize text into senteces
sentences = self.tokenizer.tokenize_text([text])
counter = 0
result = list()
block = ""
# process sentences
for sentence_tokens in sentences:
sentence_text_list = list()
# join tokens of one sentes into single string
for token in sentence_tokens:
if token.text != ".":
sentence_text_list.append(token.text)
sentence_text = " ".join(sentence_text_list)

if len(sentence_text) + len(block) < block_size:
block = block + ". " + sentence_text
counter += len(sentence_text)
else:
result.append(block)
block = sentence_text
counter = 0
if len(block) > 0:
result.append(block)
return result

0 comments on commit 3c8ec20

Please sign in to comment.