Skip to content

Commit

Permalink
StarDict: add write option large_file, support idxoffsetbits=64 on re…
Browse files Browse the repository at this point in the history
…ad, #392 #422
  • Loading branch information
ilius committed Jan 29, 2023
1 parent d37c25f commit 9829130
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 15 deletions.
19 changes: 10 additions & 9 deletions doc/p/stardict.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@

### Write options

| Name | Default | Type | Comment |
| ---------------- | ------- | ---- | ---------------------------------------------- |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |
| sqlite | `False` | bool | Use SQLite to limit memory usage |
| Name | Default | Type | Comment |
| ---------------- | ------- | ---- | ----------------------------------------------- |
| large_file | `False` | bool | Use idxoffsetbits=64 bits, for large files only |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |
| sqlite | `False` | bool | Use SQLite to limit memory usage |



Expand Down
6 changes: 6 additions & 0 deletions plugins-meta/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,11 @@
],
"singleFile": false,
"optionsProp": {
"large_file": {
"class": "BoolOption",
"type": "bool",
"comment": "Use idxoffsetbits=64 bits, for large files only"
},
"stardict_client": {
"class": "BoolOption",
"type": "bool",
Expand Down Expand Up @@ -1519,6 +1524,7 @@
"unicode_errors": "strict"
},
"writeOptions": {
"large_file": false,
"dictzip": true,
"sametypesequence": "",
"stardict_client": false,
Expand Down
80 changes: 74 additions & 6 deletions pyglossary/plugins/stardict.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

from pyglossary.text_utils import (
uint32ToBytes,
uint64ToBytes,
uint32FromBytes,
uint64FromBytes,
)

from pyglossary.plugins.formats_common import *
Expand All @@ -37,7 +39,11 @@
"http://huzheng.org/stardict/",
"huzheng.org/stardict",
)
# https://github.com/huzheng001/stardict-3/blob/master/dict/doc/StarDictFileFormat
optionsProp = {
"large_file": BoolOption(
comment="Use idxoffsetbits=64 bits, for large files only",
),
"stardict_client": BoolOption(
comment="Modify html entries for StarDict 3.0",
),
Expand Down Expand Up @@ -238,6 +244,7 @@ def __init__(self, glos: GlossaryType):
self.clear()

self._xdxfTr = None
self._large_file = False

"""
indexData format
Expand Down Expand Up @@ -334,6 +341,15 @@ def readIfoFile(self) -> None:
continue
self._glos.setInfo(key, value)

idxoffsetbits = self._glos.getInfo("idxoffsetbits")
if idxoffsetbits:
if idxoffsetbits == "32":
self._large_file = False
elif idxoffsetbits == "64":
self._large_file = True
else:
raise ValueError(f"invalid {idxoffsetbits = }")

def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
if isfile(self._filename + ".idx.gz"):
with gzip.open(self._filename + ".idx.gz") as idxFile:
Expand All @@ -344,6 +360,14 @@ def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":

indexData = []
pos = 0

# Choose the offset decoder once, before the scan loop. Each variant reads
# the enclosing function's current `pos` and returns (offset, position just
# past the offset field). The field is 8 bytes wide for idxoffsetbits=64
# files and 4 bytes wide otherwise.
# NOTE(review): the bounds check that precedes the call tests for 8
# remaining bytes; with an 8-byte offset plus the 4-byte size a record
# needs 12 -- confirm truncated 64-bit index files are handled.
if self._large_file:
    def getOffset():
        end = pos + 8
        return uint64FromBytes(idxBytes[pos:end]), end
else:
    def getOffset():
        end = pos + 4
        return uint32FromBytes(idxBytes[pos:end]), end

while pos < len(idxBytes):
beg = pos
pos = idxBytes.find(b"\x00", beg)
Expand All @@ -355,8 +379,7 @@ def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
if pos + 8 > len(idxBytes):
log.error("Index file is corrupted")
break
offset = uint32FromBytes(idxBytes[pos:pos + 4])
pos += 4
offset, pos = getOffset()
size = uint32FromBytes(idxBytes[pos:pos + 4])
pos += 4
indexData.append((b_word, offset, size))
Expand Down Expand Up @@ -668,6 +691,7 @@ def parseDefiBlockGeneral(self, b_block: bytes) -> "List[Tuple[bytes, int]]":


class Writer(object):
_large_file: bool = False
_dictzip: bool = True
_sametypesequence: str = "" # type: Literal["", "h", "m", "x", None]
_stardict_client: bool = False
Expand Down Expand Up @@ -786,6 +810,12 @@ def newSynList(self):
return MemList()
return SynSqList(join(self._glos.tmpDataDir, "stardict-syn.db"))

def dictMarkToBytesFunc(self) -> "Tuple[Callable, int]":
    """
    Pick the encoder used for .dict offsets written to the index.

    Returns a pair ``(packer, maxValue)``: ``packer`` turns an offset into
    its big-endian byte form, and ``maxValue`` is the largest offset that
    form can hold. The 64-bit pair is returned when the ``large_file``
    option is set, the 32-bit pair otherwise.
    """
    if not self._large_file:
        return uint32ToBytes, 0xffffffff

    return uint64ToBytes, 0xffffffffffffffff

def writeCompact(self, defiFormat):
"""
Build StarDict dictionary with sametypesequence option specified.
Expand All @@ -802,6 +832,8 @@ def writeCompact(self, defiFormat):
dictFile = open(self._filename + ".dict", "wb")
idxFile = open(self._filename + ".idx", "wb")

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

t0 = now()
wordCount = 0
if not isdir(self._resDir):
Expand Down Expand Up @@ -830,13 +862,20 @@ def writeCompact(self, defiFormat):
blockLen = len(b_dictBlock)

b_idxBlock = word.encode("utf-8") + b"\x00" + \
uint32ToBytes(dictMark) + \
dictMarkToBytes(dictMark) + \
uint32ToBytes(blockLen)
idxFile.write(b_idxBlock)

dictMark += blockLen
wordCount += 1

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

dictFile.close()
idxFile.close()
if not os.listdir(self._resDir):
Expand Down Expand Up @@ -869,6 +908,8 @@ def writeGeneral(self) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
while True:
entry = yield
Expand Down Expand Up @@ -899,13 +940,20 @@ def writeGeneral(self) -> None:
blockLen = len(b_dictBlock)

b_idxBlock = word.encode("utf-8") + b"\x00" + \
uint32ToBytes(dictMark) + \
dictMarkToBytes(dictMark) + \
uint32ToBytes(blockLen)
idxFile.write(b_idxBlock)

dictMark += blockLen
wordCount += 1

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

dictFile.close()
idxFile.close()
if not os.listdir(self._resDir):
Expand Down Expand Up @@ -970,6 +1018,8 @@ def writeCompactMergeSyns(self, defiFormat):
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
while True:
entry = yield
Expand All @@ -989,12 +1039,19 @@ def writeCompactMergeSyns(self, defiFormat):
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

wordCount = self.writeIdxFile(idxBlockList)

dictFile.close()
Expand Down Expand Up @@ -1027,6 +1084,8 @@ def writeGeneralMergeSyns(self) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
while True:
entry = yield
Expand All @@ -1053,12 +1112,19 @@ def writeGeneralMergeSyns(self) -> None:
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

wordCount = self.writeIdxFile(idxBlockList)

dictFile.close()
Expand Down Expand Up @@ -1124,6 +1190,8 @@ def writeIfoFile(
("wordcount", wordCount),
("idxfilesize", indexFileSize),
]
if self._large_file:
ifo.append(("idxoffsetbits", "64"))
if defiFormat:
ifo.append(("sametypesequence", defiFormat))
if synWordCount > 0:
Expand Down
8 changes: 8 additions & 0 deletions pyglossary/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,18 @@ def uint32ToBytes(n: int) -> bytes:
return struct.pack('>I', n)


def uint64ToBytes(n: int) -> bytes:
    """
    Encode ``n`` as an 8-byte big-endian unsigned integer.

    Raises ``struct.error`` when ``n`` does not fit in an unsigned 64-bit
    value.
    """
    packed = struct.pack(">Q", n)
    return packed


def uint32FromBytes(bs: bytes) -> int:
    """
    Decode a 4-byte big-endian unsigned integer.

    ``bs`` must be exactly 4 bytes long; ``struct.error`` is raised
    otherwise.
    """
    (value,) = struct.unpack(">I", bs)
    return value


def uint64FromBytes(bs: bytes) -> int:
    """
    Decode an 8-byte big-endian unsigned integer.

    ``bs`` must be exactly 8 bytes long; ``struct.error`` is raised
    otherwise.
    """
    (value,) = struct.unpack(">Q", bs)
    return value


def uintFromBytes(bs: bytes) -> int:
n = 0
for c in bs:
Expand Down

0 comments on commit 9829130

Please sign in to comment.