From 87d8f2e821ea16dee2d27150dd4c7af2e2a7da19 Mon Sep 17 00:00:00 2001 From: Elijas <4084885+Elijas@users.noreply.github.com> Date: Thu, 21 Dec 2023 02:45:17 +0200 Subject: [PATCH] Revert "Merge branch 'main' of github.com:alphanome-ai/sec-parser" This reverts commit 62c2dcf2acb7e2c4cae0d8643fea8f7673277183, reversing changes made to cf34079720e09c265407425e011af8d6e92a77d4. --- .../processing_steps/title_classifier.py | 12 ++----- sec_parser/semantic_elements/page_element.py | 32 ------------------- 2 files changed, 2 insertions(+), 42 deletions(-) delete mode 100644 sec_parser/semantic_elements/page_element.py diff --git a/sec_parser/processing_steps/title_classifier.py b/sec_parser/processing_steps/title_classifier.py index 927a2ec..7e3b0cc 100644 --- a/sec_parser/processing_steps/title_classifier.py +++ b/sec_parser/processing_steps/title_classifier.py @@ -11,7 +11,6 @@ TextStyle, ) from sec_parser.semantic_elements.title_element import TitleElement -from sec_parser.semantic_elements.page_element import PageElement if TYPE_CHECKING: # pragma: no cover from sec_parser.semantic_elements.abstract_semantic_element import ( @@ -60,18 +59,11 @@ def _process_element( """Process each element and convert to TitleElement if necessary.""" if not isinstance(element, HighlightedTextElement): return element - + # Ensure the style is tracked self._add_unique_style(element.style) - level = self._unique_styles_by_order.index(element.style) - - if PageElement.is_page(source = element): - return PageElement.create_from_element( - element, - log_origin=self.__class__.__name__, - ) - + level = self._unique_styles_by_order.index(element.style) return TitleElement.create_from_element( element, level=level, diff --git a/sec_parser/semantic_elements/page_element.py b/sec_parser/semantic_elements/page_element.py deleted file mode 100644 index 2b8084b..0000000 --- a/sec_parser/semantic_elements/page_element.py +++ /dev/null @@ -1,32 +0,0 @@ -from __future__ import annotations - -from sec_parser.semantic_elements import IrrelevantElement -from sec_parser.semantic_elements.abstract_semantic_element import AbstractSemanticElement -from sec_parser.processing_engine.processing_log import LogItemOrigin, ProcessingLog - - -class PageElement(IrrelevantElement): - """ - The PageElement class represents the page content of a paragraph or other content object. - It relates to an irrelevant element, storing page numbers and context for the document. - """ - - PageNum = 0 - - def is_page(source: AbstractSemanticElement): - try: - if source.text.__contains__('|'): - pageNum = source.text.replace(" ","").split('|')[-1] - else: - pageNum = int(source.text.strip()) - except ValueError: - return False - - if PageElement.PageNum == 0: - PageElement.PageNum = int(pageNum) - - if int(pageNum) == PageElement.PageNum: - PageElement.PageNum = PageElement.PageNum + 1 - return True - - return False