Skip to content

Commit

Permalink
When removing HTML tags, also replace <div> with \n
Browse files Browse the repository at this point in the history
Treat <div> the same way <p> is treated.
  • Loading branch information
tomtung authored and ilius committed Oct 21, 2022
1 parent ffbc91a commit 3eaea77
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pyglossary/entry_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ def __init__(self, glos: "GlossaryType"):
'<p( [^<>]*?)?>(.*?)</p>',
re.DOTALL,
)
self._div_pattern = re.compile(
'<div( [^<>]*?)?>(.*?)</div>',
re.DOTALL,
)
self._br_pattern = re.compile(
"<br[ /]*>",
re.IGNORECASE,
Expand All @@ -140,6 +144,11 @@ def fixStr(st: str) -> str:
st = self._p_pattern.sub("\\2\n", st)
# if there is a </p> left without an opening tag, replace it with a newline
st = st.replace("</p>", "\n")

st = self._div_pattern.sub("\\2\n", st)
# if there is a </div> left without an opening tag, replace it with a newline
st = st.replace("</div>", "\n")

st = self._br_pattern.sub("\n", st)
return BeautifulSoup(st, "lxml").text

Expand Down

0 comments on commit 3eaea77

Please sign in to comment.