From 5519c46712ad5a29449fd12d44fbb5f6159eb376 Mon Sep 17 00:00:00 2001 From: Tom Dong Date: Fri, 21 Oct 2022 01:28:00 -0700 Subject: [PATCH 1/2] Decode mdict title & description if they're bytes Like words, they are decoded as utf-8. --- pyglossary/plugins/octopus_mdict_new/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyglossary/plugins/octopus_mdict_new/__init__.py b/pyglossary/plugins/octopus_mdict_new/__init__.py index d71ba5869..a8f3b94a3 100644 --- a/pyglossary/plugins/octopus_mdict_new/__init__.py +++ b/pyglossary/plugins/octopus_mdict_new/__init__.py @@ -118,6 +118,8 @@ def open(self, filename): # self._glos.setInfo(key, value) try: title = self._mdx.header[b"Title"] + if isinstance(title, bytes): + title = title.decode("utf-8") except KeyError: pass else: @@ -125,6 +127,8 @@ def open(self, filename): if title: self._glos.setInfo("name", title) desc = self._mdx.header.get(b"Description", "") + if isinstance(desc, bytes): + desc = desc.decode("utf-8") if desc: self._glos.setInfo("description", desc) From fbc8f8ddd80649d9878d4caa26a899c808734b97 Mon Sep 17 00:00:00 2001 From: Tom Dong Date: Fri, 21 Oct 2022 01:31:15 -0700 Subject: [PATCH 2/2] When remove HTML tags, also replace
<div> with \n Treat <div> the same way <p> is treated. --- pyglossary/entry_filters.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pyglossary/entry_filters.py b/pyglossary/entry_filters.py index e08c24905..4a0a1bb6f 100644 --- a/pyglossary/entry_filters.py +++ b/pyglossary/entry_filters.py @@ -128,6 +128,10 @@ def __init__(self, glos: "GlossaryType"): '<p( [^<>]*?)?>(.*?)</p>
', re.DOTALL, ) + self._div_pattern = re.compile( + '<div( [^<>]*?)?>(.*?)</div>', + re.DOTALL, + ) self._br_pattern = re.compile( "<br[ /]*>", re.IGNORECASE, @@ -140,6 +144,11 @@ def fixStr(st: str) -> str: st = self._p_pattern.sub("\\2\n", st) # if there is </p>
left without opening, replace with <br> st = st.replace("</p>", "\n") + + st = self._div_pattern.sub("\\2\n", st) + # if there is </div>
left without opening, replace with <br> + st = st.replace("</div>", "\n") + st = self._br_pattern.sub("\n", st) return BeautifulSoup(st, "lxml").text