From 5519c46712ad5a29449fd12d44fbb5f6159eb376 Mon Sep 17 00:00:00 2001
From: Tom Dong <tomdong@google.com>
Date: Fri, 21 Oct 2022 01:28:00 -0700
Subject: [PATCH 1/2] Decode mdict title & description if they're bytes

Like words, they are decoded as utf-8.
---
 pyglossary/plugins/octopus_mdict_new/__init__.py | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/pyglossary/plugins/octopus_mdict_new/__init__.py b/pyglossary/plugins/octopus_mdict_new/__init__.py
index d71ba5869..a8f3b94a3 100644
--- a/pyglossary/plugins/octopus_mdict_new/__init__.py
+++ b/pyglossary/plugins/octopus_mdict_new/__init__.py
@@ -118,6 +118,8 @@ def open(self, filename):
 		# 	self._glos.setInfo(key, value)
 		try:
 			title = self._mdx.header[b"Title"]
+			if isinstance(title, bytes):
+				title = title.decode("utf-8")
 		except KeyError:
 			pass
 		else:
@@ -125,6 +127,8 @@ def open(self, filename):
 			if title:
 				self._glos.setInfo("name", title)
 		desc = self._mdx.header.get(b"Description", "")
+		if isinstance(desc, bytes):
+			desc = desc.decode("utf-8")
 		if desc:
 			self._glos.setInfo("description", desc)
 

From fbc8f8ddd80649d9878d4caa26a899c808734b97 Mon Sep 17 00:00:00 2001
From: Tom Dong <tomdong@google.com>
Date: Fri, 21 Oct 2022 01:31:15 -0700
Subject: [PATCH 2/2] When removing HTML tags, also replace <div> with \n

Treat <div> the same way <p> is treated.
---
 pyglossary/entry_filters.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pyglossary/entry_filters.py b/pyglossary/entry_filters.py
index e08c24905..4a0a1bb6f 100644
--- a/pyglossary/entry_filters.py
+++ b/pyglossary/entry_filters.py
@@ -128,6 +128,10 @@ def __init__(self, glos: "GlossaryType"):
 			'<p( [^<>]*?)?>(.*?)</p>',
 			re.DOTALL,
 		)
+		self._div_pattern = re.compile(
+			'<div( [^<>]*?)?>(.*?)</div>',
+			re.DOTALL,
+		)
 		self._br_pattern = re.compile(
 			"<br[ /]*>",
 			re.IGNORECASE,
@@ -140,6 +144,11 @@ def fixStr(st: str) -> str:
 			st = self._p_pattern.sub("\\2\n", st)
 			# if there is </p> left without opening, replace with <br>
 			st = st.replace("</p>", "\n")
+
+			st = self._div_pattern.sub("\\2\n", st)
+			# if there is a </div> left without an opening tag, replace it with a newline
+			st = st.replace("</div>", "\n")
+
 			st = self._br_pattern.sub("\n", st)
 			return BeautifulSoup(st, "lxml").text