Skip to content

Commit

Permalink
Escape HTML output
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval authored Aug 16, 2024
2 parents 3fbe596 + 00023b1 commit a833acf
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 5 deletions.
7 changes: 4 additions & 3 deletions textractor/entities/line.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from textractor.exceptions import InputError
from textractor.entities.document_entity import DocumentEntity
from textractor.visualizers.entitylist import EntityList

from textractor.utils.html_utils import escape_text
from textractor.data.text_linearization_config import TextLinearizationConfig

class Line(DocumentEntity):
"""
Expand Down Expand Up @@ -64,13 +65,13 @@ def words(self):
"""
return self._children

def get_text_and_words(self, config):
def get_text_and_words(self, config: TextLinearizationConfig = TextLinearizationConfig()):
if not self.bbox:
self.bbox = BoundingBox.enclosing_bbox(self.words)
for w in self.words:
w.line_id = self.id
w.line_bbox = self.bbox
return self.text, self.words
return escape_text(self.text, config), self.words

@property
def page(self):
Expand Down
4 changes: 2 additions & 2 deletions textractor/entities/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from textractor.data.text_linearization_config import TextLinearizationConfig
from textractor.entities.bbox import BoundingBox
from textractor.entities.document_entity import DocumentEntity

from textractor.utils.html_utils import escape_text

class Word(DocumentEntity):
"""
Expand Down Expand Up @@ -148,7 +148,7 @@ def words(self):
def get_text_and_words(
self, config: TextLinearizationConfig = TextLinearizationConfig()
):
return self.text, [self]
return escape_text(self.text, config), [self]

def __repr__(self) -> str:
"""
Expand Down
7 changes: 7 additions & 0 deletions textractor/utils/html_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html
from textractor.data.html_linearization_config import HTMLLinearizationConfig

def add_id_to_html_tag(prefix, id, config):
Expand All @@ -9,3 +10,9 @@ def add_id_to_html_tag(prefix, id, config):
return prefix[:-1] + f' id="{id}"' + prefix[-1]
else:
return prefix

def escape_text(text, config):
    """Escape ``text`` for HTML output when linearizing to HTML.

    :param text: The string to return, escaped or as-is.
    :param config: The linearization config in use. Escaping is applied
        only when this is an ``HTMLLinearizationConfig`` instance.
    :return: ``html.escape(text)`` for an HTML config, otherwise ``text``
        unchanged.
    """
    # Only HTML output needs entity escaping; every other config passes
    # the text through untouched.
    if isinstance(config, HTMLLinearizationConfig):
        return html.escape(text)
    return text

0 comments on commit a833acf

Please sign in to comment.