StarDict: add write option large_file, support idxoffsetbits=64 on read, #392 #422
ilius · Feb 21, 2023 · 65a39c1 · 65a39c1
1 parent e5c602b
commit 65a39c1
Show file tree

Hide file tree

Showing 4 changed files with 99 additions and 16 deletions.
diff --git a/doc/p/stardict.md b/doc/p/stardict.md
@@ -27,15 +27,16 @@
 
 ### Write options
 
-| Name             | Default | Type | Comment                                        |
-| ---------------- | ------- | ---- | ---------------------------------------------- |
-| dictzip          | `True`  | bool | Compress .dict file to .dict.dz                |
-| sametypesequence |         | str  | Definition format: h=html, m=plaintext, x=xdxf |
-| stardict_client  | `False` | bool | Modify html entries for StarDict 3.0           |
-| merge_syns       | `False` | bool | Write alternates to .idx instead of .syn       |
-| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop)   |
-| audio_icon       | `True`  | bool | Add glossary's audio icon                      |
-| sqlite           | `False` | bool | Use SQLite to limit memory usage               |
+| Name             | Default | Type | Comment                                         |
+| ---------------- | ------- | ---- | ----------------------------------------------- |
+| large_file       | `False` | bool | Use idxoffsetbits=64 bits, for large files only |
+| dictzip          | `True`  | bool | Compress .dict file to .dict.dz                 |
+| sametypesequence |         | str  | Definition format: h=html, m=plaintext, x=xdxf  |
+| stardict_client  | `False` | bool | Modify html entries for StarDict 3.0            |
+| merge_syns       | `False` | bool | Write alternates to .idx instead of .syn        |
+| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop)    |
+| audio_icon       | `True`  | bool | Add glossary's audio icon                       |
+| sqlite           | `False` | bool | Use SQLite to limit memory usage                |
 
 
 

diff --git a/plugins-meta/index.json b/plugins-meta/index.json
@@ -1410,6 +1410,11 @@
 		],
 		"singleFile": false,
 		"optionsProp": {
+			"large_file": {
+				"class": "BoolOption",
+				"type": "bool",
+				"comment": "Use idxoffsetbits=64 bits, for large files only"
+			},
 			"stardict_client": {
 				"class": "BoolOption",
 				"type": "bool",
@@ -1486,6 +1491,7 @@
 			"unicode_errors": "strict"
 		},
 		"writeOptions": {
+			"large_file": false,
 			"dictzip": true,
 			"sametypesequence": "",
 			"stardict_client": false,

diff --git a/pyglossary/plugins/stardict.py b/pyglossary/plugins/stardict.py
@@ -16,7 +16,7 @@
 )
 from pprint import pformat
 from time import time as now
-from typing import Any, Dict, Generator, Iterator, List, Tuple
+from typing import Any, Callable, Dict, Generator, Iterator, List, Tuple
 
 from pyglossary.core import log
 from pyglossary.flags import ALWAYS, DEFAULT_YES
@@ -28,6 +28,8 @@
 from pyglossary.text_utils import (
 	uint32FromBytes,
 	uint32ToBytes,
+	uint64FromBytes,
+	uint64ToBytes,
 )
 
 enable = True
@@ -47,7 +49,11 @@
 	"http://huzheng.org/stardict/",
 	"huzheng.org/stardict",
 )
+# https://github.com/huzheng001/stardict-3/blob/master/dict/doc/StarDictFileFormat
 optionsProp = {
+	"large_file": BoolOption(
+		comment="Use idxoffsetbits=64 bits, for large files only",
+	),
 	"stardict_client": BoolOption(
 		comment="Modify html entries for StarDict 3.0",
 	),
@@ -247,6 +253,7 @@ def __init__(self, glos: GlossaryType) -> None:
 		self.clear()
 
 		self._xdxfTr = None
+		self._large_file = False
 
 		"""
 		indexData format
@@ -348,6 +355,15 @@ def readIfoFile(self) -> None:
 					continue
 				self._glos.setInfo(key, value)
 
+		idxoffsetbits = self._glos.getInfo("idxoffsetbits")
+		if idxoffsetbits:
+			if idxoffsetbits == "32":
+				self._large_file = False
+			elif idxoffsetbits == "64":
+				self._large_file = True
+			else:
+				raise ValueError(f"invalid {idxoffsetbits = }")
+
 	def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
 		if isfile(self._filename + ".idx.gz"):
 			with gzip.open(self._filename + ".idx.gz") as idxFile:
@@ -358,6 +374,14 @@ def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
 
 		indexData = []
 		pos = 0
+
+		if self._large_file:
+			def getOffset():
+				return uint64FromBytes(idxBytes[pos:pos + 8]), pos + 8
+		else:
+			def getOffset():
+				return uint32FromBytes(idxBytes[pos:pos + 4]), pos + 4
+
 		while pos < len(idxBytes):
 			beg = pos
 			pos = idxBytes.find(b"\x00", beg)
@@ -369,8 +393,7 @@ def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
 			if pos + 8 > len(idxBytes):
 				log.error("Index file is corrupted")
 				break
-			offset = uint32FromBytes(idxBytes[pos:pos + 4])
-			pos += 4
+			offset, pos = getOffset()
 			size = uint32FromBytes(idxBytes[pos:pos + 4])
 			pos += 4
 			indexData.append((b_word, offset, size))
@@ -682,6 +705,7 @@ def parseDefiBlockGeneral(self, b_block: bytes) -> "List[Tuple[bytes, int]]":
 
 
 class Writer(object):
+	_large_file: bool = False
 	_dictzip: bool = True
 	_sametypesequence: str = ""  # type: Literal["", "h", "m", "x", None]
 	_stardict_client: bool = False
@@ -800,6 +824,12 @@ def newSynList(self):
 			return MemList()
 		return SynSqList(join(self._glos.tmpDataDir, "stardict-syn.db"))
 
+	def dictMarkToBytesFunc(self) -> "Tuple[Callable, int]":
+		if self._large_file:
+			return uint64ToBytes, 0xffffffffffffffff
+
+		return uint32ToBytes, 0xffffffff
+
 	def writeCompact(self, defiFormat) -> None:
 		"""
 		Build StarDict dictionary with sametypesequence option specified.
@@ -816,6 +846,8 @@ def writeCompact(self, defiFormat) -> None:
 		dictFile = open(self._filename + ".dict", "wb")
 		idxFile = open(self._filename + ".idx", "wb")
 
+		dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()
+
 		t0 = now()
 		wordCount = 0
 		if not isdir(self._resDir):
@@ -844,13 +876,20 @@ def writeCompact(self, defiFormat) -> None:
 			blockLen = len(b_dictBlock)
 
 			b_idxBlock = word.encode("utf-8") + b"\x00" + \
-				uint32ToBytes(dictMark) + \
+				dictMarkToBytes(dictMark) + \
 				uint32ToBytes(blockLen)
 			idxFile.write(b_idxBlock)
 
 			dictMark += blockLen
 			wordCount += 1
 
+			if dictMark > dictMarkMax:
+				log.error(
+					f"StarDict: {dictMark = } is too big, "
+					f"set option large_file=true",
+				)
+				break
+
 		dictFile.close()
 		idxFile.close()
 		if not os.listdir(self._resDir):
@@ -883,6 +922,8 @@ def writeGeneral(self) -> None:
 		if not isdir(self._resDir):
 			os.mkdir(self._resDir)
 
+		dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()
+
 		entryIndex = -1
 		while True:
 			entry = yield
@@ -913,13 +954,20 @@ def writeGeneral(self) -> None:
 			blockLen = len(b_dictBlock)
 
 			b_idxBlock = word.encode("utf-8") + b"\x00" + \
-				uint32ToBytes(dictMark) + \
+				dictMarkToBytes(dictMark) + \
 				uint32ToBytes(blockLen)
 			idxFile.write(b_idxBlock)
 
 			dictMark += blockLen
 			wordCount += 1
 
+			if dictMark > dictMarkMax:
+				log.error(
+					f"StarDict: {dictMark = } is too big, "
+					f"set option large_file=true",
+				)
+				break
+
 		dictFile.close()
 		idxFile.close()
 		if not os.listdir(self._resDir):
@@ -984,6 +1032,8 @@ def writeCompactMergeSyns(self, defiFormat) -> None:
 		if not isdir(self._resDir):
 			os.mkdir(self._resDir)
 
+		dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()
+
 		entryIndex = -1
 		while True:
 			entry = yield
@@ -1003,12 +1053,19 @@ def writeCompactMergeSyns(self, defiFormat) -> None:
 			dictFile.write(b_dictBlock)
 			blockLen = len(b_dictBlock)
 
-			blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
+			blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
 			for word in words:
 				idxBlockList.append((word.encode("utf-8"), blockData))
 
 			dictMark += blockLen
 
+			if dictMark > dictMarkMax:
+				log.error(
+					f"StarDict: {dictMark = } is too big, "
+					f"set option large_file=true",
+				)
+				break
+
 		wordCount = self.writeIdxFile(idxBlockList)
 
 		dictFile.close()
@@ -1041,6 +1098,8 @@ def writeGeneralMergeSyns(self) -> None:
 		if not isdir(self._resDir):
 			os.mkdir(self._resDir)
 
+		dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()
+
 		entryIndex = -1
 		while True:
 			entry = yield
@@ -1067,12 +1126,19 @@ def writeGeneralMergeSyns(self) -> None:
 			dictFile.write(b_dictBlock)
 			blockLen = len(b_dictBlock)
 
-			blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
+			blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
 			for word in words:
 				idxBlockList.append((word.encode("utf-8"), blockData))
 
 			dictMark += blockLen
 
+			if dictMark > dictMarkMax:
+				log.error(
+					f"StarDict: {dictMark = } is too big, "
+					f"set option large_file=true",
+				)
+				break
+
 		wordCount = self.writeIdxFile(idxBlockList)
 
 		dictFile.close()
@@ -1138,6 +1204,8 @@ def writeIfoFile(
 			("wordcount", wordCount),
 			("idxfilesize", indexFileSize),
 		]
+		if self._large_file:
+			ifo.append(("idxoffsetbits", "64"))
 		if defiFormat:
 			ifo.append(("sametypesequence", defiFormat))
 		if synWordCount > 0:

diff --git a/pyglossary/text_utils.py b/pyglossary/text_utils.py
@@ -156,10 +156,18 @@ def uint32ToBytes(n: int) -> bytes:
 	return struct.pack('>I', n)
 
 
+def uint64ToBytes(n: int) -> bytes:
+	return struct.pack('>Q', n)
+
+
 def uint32FromBytes(bs: bytes) -> int:
 	return struct.unpack('>I', bs)[0]
 
 
+def uint64FromBytes(bs: bytes) -> int:
+	return struct.unpack('>Q', bs)[0]
+
+
 def uintFromBytes(bs: bytes) -> int:
 	n = 0
 	for c in bs: