Skip to content

Commit

Permalink
StarDict writer: add option large_file, #392 #422
Browse files Browse the repository at this point in the history
  • Loading branch information
ilius committed Jan 29, 2023
1 parent d37c25f commit 00b5bd9
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 13 deletions.
19 changes: 10 additions & 9 deletions doc/p/stardict.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@

### Write options

| Name | Default | Type | Comment |
| ---------------- | ------- | ---- | ---------------------------------------------- |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |
| sqlite | `False` | bool | Use SQLite to limit memory usage |
| Name | Default | Type | Comment |
| ---------------- | ------- | ---- | ----------------------------------------------- |
| large_file | `False` | bool | Use idxoffsetbits=64 bits, for large files only |
| dictzip | `True` | bool | Compress .dict file to .dict.dz |
| sametypesequence | | str | Definition format: h=html, m=plaintext, x=xdxf |
| stardict_client | `False` | bool | Modify html entries for StarDict 3.0 |
| merge_syns | `False` | bool | Write alternates to .idx instead of .syn |
| audio_goldendict | `False` | bool | Convert audio links for GoldenDict (desktop) |
| audio_icon | `True` | bool | Add glossary's audio icon |
| sqlite | `False` | bool | Use SQLite to limit memory usage |



Expand Down
6 changes: 6 additions & 0 deletions plugins-meta/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,11 @@
],
"singleFile": false,
"optionsProp": {
"large_file": {
"class": "BoolOption",
"type": "bool",
"comment": "Use idxoffsetbits=64 bits, for large files only"
},
"stardict_client": {
"class": "BoolOption",
"type": "bool",
Expand Down Expand Up @@ -1519,6 +1524,7 @@
"unicode_errors": "strict"
},
"writeOptions": {
"large_file": true,
"dictzip": true,
"sametypesequence": "",
"stardict_client": false,
Expand Down
73 changes: 69 additions & 4 deletions pyglossary/plugins/stardict.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

from pyglossary.text_utils import (
uint32ToBytes,
uint64ToBytes,
uint32FromBytes,
uint64FromBytes,
)

from pyglossary.plugins.formats_common import *
Expand All @@ -37,7 +39,11 @@
"http://huzheng.org/stardict/",
"huzheng.org/stardict",
)
# https://github.com/huzheng001/stardict-3/blob/master/dict/doc/StarDictFileFormat
optionsProp = {
"large_file": BoolOption(
comment="Use idxoffsetbits=64 bits, for large files only",
),
"stardict_client": BoolOption(
comment="Modify html entries for StarDict 3.0",
),
Expand Down Expand Up @@ -668,6 +674,7 @@ def parseDefiBlockGeneral(self, b_block: bytes) -> "List[Tuple[bytes, int]]":


class Writer(object):
_large_file: bool = False
_dictzip: bool = True
_sametypesequence: str = "" # type: Literal["", "h", "m", "x", None]
_stardict_client: bool = False
Expand Down Expand Up @@ -802,6 +809,13 @@ def writeCompact(self, defiFormat):
dictFile = open(self._filename + ".dict", "wb")
idxFile = open(self._filename + ".idx", "wb")

if self._large_file:
dictMarkToBytes = uint64ToBytes
dictMarkMax = 0xffffffffffffffff
else:
dictMarkToBytes = uint32ToBytes
dictMarkMax = 0xffffffff

t0 = now()
wordCount = 0
if not isdir(self._resDir):
Expand Down Expand Up @@ -830,13 +844,20 @@ def writeCompact(self, defiFormat):
blockLen = len(b_dictBlock)

b_idxBlock = word.encode("utf-8") + b"\x00" + \
uint32ToBytes(dictMark) + \
dictMarkToBytes(dictMark) + \
uint32ToBytes(blockLen)
idxFile.write(b_idxBlock)

dictMark += blockLen
wordCount += 1

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

dictFile.close()
idxFile.close()
if not os.listdir(self._resDir):
Expand Down Expand Up @@ -869,6 +890,13 @@ def writeGeneral(self) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

if self._large_file:
dictMarkToBytes = uint64ToBytes
dictMarkMax = 0xffffffffffffffff
else:
dictMarkToBytes = uint32ToBytes
dictMarkMax = 0xffffffff

entryIndex = -1
while True:
entry = yield
Expand Down Expand Up @@ -899,13 +927,20 @@ def writeGeneral(self) -> None:
blockLen = len(b_dictBlock)

b_idxBlock = word.encode("utf-8") + b"\x00" + \
uint32ToBytes(dictMark) + \
dictMarkToBytes(dictMark) + \
uint32ToBytes(blockLen)
idxFile.write(b_idxBlock)

dictMark += blockLen
wordCount += 1

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

dictFile.close()
idxFile.close()
if not os.listdir(self._resDir):
Expand Down Expand Up @@ -970,6 +1005,13 @@ def writeCompactMergeSyns(self, defiFormat):
if not isdir(self._resDir):
os.mkdir(self._resDir)

if self._large_file:
dictMarkToBytes = uint64ToBytes
dictMarkMax = 0xffffffffffffffff
else:
dictMarkToBytes = uint32ToBytes
dictMarkMax = 0xffffffff

entryIndex = -1
while True:
entry = yield
Expand All @@ -989,12 +1031,19 @@ def writeCompactMergeSyns(self, defiFormat):
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

wordCount = self.writeIdxFile(idxBlockList)

dictFile.close()
Expand Down Expand Up @@ -1027,6 +1076,13 @@ def writeGeneralMergeSyns(self) -> None:
if not isdir(self._resDir):
os.mkdir(self._resDir)

if self._large_file:
dictMarkToBytes = uint64ToBytes
dictMarkMax = 0xffffffffffffffff
else:
dictMarkToBytes = uint32ToBytes
dictMarkMax = 0xffffffff

entryIndex = -1
while True:
entry = yield
Expand All @@ -1053,12 +1109,19 @@ def writeGeneralMergeSyns(self) -> None:
dictFile.write(b_dictBlock)
blockLen = len(b_dictBlock)

blockData = uint32ToBytes(dictMark) + uint32ToBytes(blockLen)
blockData = dictMarkToBytes(dictMark) + uint32ToBytes(blockLen)
for word in words:
idxBlockList.append((word.encode("utf-8"), blockData))

dictMark += blockLen

if dictMark > dictMarkMax:
log.error(
f"StarDict: {dictMark = } is too big, "
f"set option large_file=true",
)
break

wordCount = self.writeIdxFile(idxBlockList)

dictFile.close()
Expand Down Expand Up @@ -1124,6 +1187,8 @@ def writeIfoFile(
("wordcount", wordCount),
("idxfilesize", indexFileSize),
]
if self._large_file:
ifo.append(("idxoffsetbits", "64"))
if defiFormat:
ifo.append(("sametypesequence", defiFormat))
if synWordCount > 0:
Expand Down
8 changes: 8 additions & 0 deletions pyglossary/text_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,10 +165,18 @@ def uint32ToBytes(n: int) -> bytes:
return struct.pack('>I', n)


def uint64ToBytes(n: int) -> bytes:
    """Pack ``n`` as an 8-byte big-endian unsigned integer."""
    packed = struct.pack(">Q", n)
    return packed


def uint32FromBytes(bs: bytes) -> int:
    """Decode exactly 4 bytes as a big-endian unsigned 32-bit integer."""
    (value,) = struct.unpack(">I", bs)
    return value


def uint64FromBytes(bs: bytes) -> int:
    """Decode exactly 8 bytes as a big-endian unsigned 64-bit integer."""
    (value,) = struct.unpack(">Q", bs)
    return value


def uintFromBytes(bs: bytes) -> int:
n = 0
for c in bs:
Expand Down

0 comments on commit 00b5bd9

Please sign in to comment.