Skip to content

Commit

Permalink
StarDict: add write option large_file, support idxoffsetbits=64 on re…
Browse files Browse the repository at this point in the history
…ad, #392 #422
  • Loading branch information
ilius committed Feb 21, 2023
1 parent e5c602b commit 65a39c1
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 16 deletions.
19 changes: 10 additions & 9 deletions doc/p/stardict.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@

### Write options

| Name | Default | Type | Comment |
| ---------------- | ------- | ---- | ---------------------------------------------- |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |
| sqlite | `False` | bool | Use SQLite to limit memory usage |
| Name | Default | Type | Comment |
| ---------------- | ------- | ---- | ----------------------------------------------- |
| large_file | `False` | bool | Use idxoffsetbits=64 bits, for large files only |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |
| sqlite | `False` | bool | Use SQLite to limit memory usage |



Expand Down
6 changes: 6 additions & 0 deletions plugins-meta/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,11 @@
],
"singleFile": false,
"optionsProp": {
"large_file": {
"class": "BoolOption",
"type": "bool",
"comment": "Use idxoffsetbits=64 bits, for large files only"
},
"stardict_client": {
"class": "BoolOption",
"type": "bool",
Expand Down Expand Up @@ -1486,6 +1491,7 @@
"unicode_errors": "strict"
},
"writeOptions": {
"large_file": false,
"dictzip": true,
"sametypesequence": "",
"stardict_client": false,
Expand Down
82 changes: 75 additions & 7 deletions pyglossary/plugins/stardict.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
)
from pprint import pformat
from time import time as now
from typing import Any, Dict, Generator, Iterator, List, Tuple
from typing import Any, Callable, Dict, Generator, Iterator, List, Tuple

from pyglossary.core import log
from pyglossary.flags import ALWAYS, DEFAULT_YES
Expand All @@ -28,6 +28,8 @@
from pyglossary.text_utils import (
uint32FromBytes,
uint32ToBytes,
uint64FromBytes,
uint64ToBytes,
)

enable = True
Expand All @@ -47,7 +49,11 @@
"http://huzheng.org/stardict/",
"huzheng.org/stardict",
)
# https://github.com/huzheng001/stardict-3/blob/master/dict/doc/StarDictFileFormat
optionsProp = {
"large_file": BoolOption(
comment="Use idxoffsetbits=64 bits, for large files only",
),
"stardict_client": BoolOption(
comment="Modify html entries for StarDict 3.0",
),
Expand Down Expand Up @@ -247,6 +253,7 @@ def __init__(self, glos: GlossaryType) -> None:
self.clear()

self._xdxfTr = None
self._large_file = False

"""
indexData format
Expand Down Expand Up @@ -348,6 +355,15 @@ def readIfoFile(self) -> None:
continue
self._glos.setInfo(key, value)

idxoffsetbits = self._glos.getInfo("idxoffsetbits")
if idxoffsetbits:
if idxoffsetbits == "32":
self._large_file = False
elif idxoffsetbits == "64":
self._large_file = True
else:
raise ValueError(f"invalid {idxoffsetbits = }")

def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
if isfile(self._filename + ".idx.gz"):
with gzip.open(self._filename + ".idx.gz") as idxFile:
Expand All @@ -358,6 +374,14 @@ def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":

indexData = []
pos = 0

if self._large_file:
def getOffset():
return uint64FromBytes(idxBytes[pos:pos + 8]), pos + 8
else:
def getOffset():
return uint32FromBytes(idxBytes[pos:pos + 4]), pos + 4

while pos < len(idxBytes):
beg = pos
pos = idxBytes.find(b"\x00", beg)
Expand All @@ -369,8 +393,7 @@ def readIdxFile(self) -> "List[Tuple[bytes, int, int]]":
if pos + 8 > len(idxBytes):
log.error("Index file is corrupted")
break
offset = uint32FromBytes(idxBytes[pos:pos + 4])
pos += 4
offset, pos = getOffset()
size = uint32FromBytes(idxBytes[pos:pos + 4])
pos += 4
indexData.append((b_word, offset, size))
Expand Down Expand Up @@ -682,6 +705,7 @@ def parseDefiBlockGeneral(self, b_block: bytes) -> "List[Tuple[bytes, int]]":


class Writer(object):
_large_file: bool = False
_dictzip: bool = True
_sametypesequence: str = "" # type: Literal["", "h", "m", "x", None]
_stardict_client: bool = False
Expand Down Expand Up @@ -800,6 +824,12 @@ def newSynList(self):
return MemList()
return SynSqList(join(self._glos.tmpDataDir, "stardict-syn.db"))

def dictMarkToBytesFunc(self) -> "Tuple[Callable[[int], bytes], int]":
    """
    Pick the encoder used for .dict offsets ("dictMark") in the index.

    Returns a pair (toBytes, maxValue):
      - toBytes packs an integer offset into its fixed-width byte form
      - maxValue is the largest offset that width can represent

    The 64-bit pair is returned when self._large_file is set,
    otherwise the 32-bit pair.
    """
    if self._large_file:
        # 2**64 - 1: upper bound of an unsigned 64-bit offset
        return uint64ToBytes, 0xffffffffffffffff

    # 2**32 - 1: upper bound of an unsigned 32-bit offset
    return uint32ToBytes, 0xffffffff

def writeCompact(self, defiFormat) -> None:
"""
Build StarDict dictionary with sametypesequence option specified.
Expand All @@ -816,6 +846,8 @@ def writeCompact(self, defiFormat) -> None:
dictFile = open(self._filename + ".dict", "wb")
idxFile = open(self._filename + ".idx", "wb")

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

t0 = now()
wordCount = 0
if not isdir(self._resDir):
Expand Down Expand Up @@ -844,13 +876,20 @@ def writeCompact(self, defiFormat) -> None:
blockLen = len(b_dictBlock)

b_idxBlock = word.encode("utf-8") + b"\x00" + \
uint32ToBytes(dictMark) + \
dictMarkToBytes(dictMark) + \
uint32ToBytes(blockLen)
idxFile.write(b_idxBlock)

dictMark += blockLen
wordCount += 1

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

dictFile.close()
idxFile.close()
if not os.listdir(self._resDir):
Expand Down Expand Up @@ -883,6 +922,8 @@ def writeGeneral(self) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
while True:
entry = yield
Expand Down Expand Up @@ -913,13 +954,20 @@ def writeGeneral(self) -> None:
blockLen = len(b_dictBlock)

b_idxBlock = word.encode("utf-8") + b"\x00" + \
uint32ToBytes(dictMark) + \
dictMarkToBytes(dictMark) + \
uint32ToBytes(blockLen)
idxFile.write(b_idxBlock)

dictMark += blockLen
wordCount += 1

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

dictFile.close()
idxFile.close()
if not os.listdir(self._resDir):
Expand Down Expand Up @@ -984,6 +1032,8 @@ def writeCompactMergeSyns(self, defiFormat) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
while True:
entry = yield
Expand All @@ -1003,12 +1053,19 @@ def writeCompactMergeSyns(self, defiFormat) -> None:
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

wordCount = self.writeIdxFile(idxBlockList)

dictFile.close()
Expand Down Expand Up @@ -1041,6 +1098,8 @@ def writeGeneralMergeSyns(self) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc()

entryIndex = -1
while True:
entry = yield
Expand All @@ -1067,12 +1126,19 @@ def writeGeneralMergeSyns(self) -> None:
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

wordCount = self.writeIdxFile(idxBlockList)

dictFile.close()
Expand Down Expand Up @@ -1138,6 +1204,8 @@ def writeIfoFile(
("wordcount", wordCount),
("idxfilesize", indexFileSize),
]
if self._large_file:
ifo.append(("idxoffsetbits", "64"))
if defiFormat:
ifo.append(("sametypesequence", defiFormat))
if synWordCount > 0:
Expand Down
8 changes: 8 additions & 0 deletions pyglossary/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,18 @@ def uint32ToBytes(n: int) -> bytes:
return struct.pack('>I', n)


def uint64ToBytes(n: int) -> bytes:
    """
    Pack n as an 8-byte big-endian unsigned integer.

    Raises struct.error if n is negative or does not fit in 64 bits.
    """
    return struct.pack('>Q', n)


def uint32FromBytes(bs: bytes) -> int:
    """
    Decode a 4-byte big-endian unsigned integer.

    Raises struct.error if bs is not exactly 4 bytes long.
    """
    # struct.unpack always returns a tuple; '>I' yields exactly one item
    (value,) = struct.unpack('>I', bs)
    return value


def uint64FromBytes(bs: bytes) -> int:
    """
    Decode an 8-byte big-endian unsigned integer.

    Raises struct.error if bs is not exactly 8 bytes long.
    """
    # struct.unpack always returns a tuple; '>Q' yields exactly one item
    (value,) = struct.unpack('>Q', bs)
    return value


def uintFromBytes(bs: bytes) -> int:
n = 0
for c in bs:
Expand Down

0 comments on commit 65a39c1

Please sign in to comment.