From aa6765b9284c62ddc85174908d66c2ff0b95c3ed Mon Sep 17 00:00:00 2001 From: Saeed Rasooli Date: Mon, 2 Sep 2024 11:34:39 +0330 Subject: [PATCH] add new plugin xdxf_css (XdxfCss) based on PR #570 by @soshial - use CSS for all XDXF element (decrease size of output dict file) - created a list (ol / ul) for nested ``s - support loading of `` data - show js tooltips for abbreviations - fixed showing hidden `` - fixed losing spaces before XDXF tags - add braces to `` - show CSS instead of inlined HTML tags for: `, , , , , ` --- doc/p/__index__.md | 1 + doc/p/xdxf_css.md | 28 ++ plugins-meta/index.json | 20 + pyglossary/plugins/xdxf/__init__.py | 13 +- pyglossary/plugins/xdxf_css/__init__.py | 339 +++++++++++++++++ pyglossary/xdxf/css_js_transform.py | 473 ++++++++++++++++++++++++ pyglossary/xdxf/xdxf.css | 70 ++++ pyglossary/xdxf/xdxf.js | 56 +++ 8 files changed, 990 insertions(+), 10 deletions(-) create mode 100644 doc/p/xdxf_css.md create mode 100644 pyglossary/plugins/xdxf_css/__init__.py create mode 100644 pyglossary/xdxf/css_js_transform.py create mode 100644 pyglossary/xdxf/xdxf.css create mode 100644 pyglossary/xdxf/xdxf.js diff --git a/doc/p/__index__.md b/doc/p/__index__.md index cb2985087..438d4f596 100644 --- a/doc/p/__index__.md +++ b/doc/p/__index__.md @@ -44,6 +44,7 @@ | WordNet | Wordnet | [wordnet.md](./wordnet.md) | | Wordset.org JSON directory | Wordset | [wordset.md](./wordset.md) | | XDXF (.xdxf) | Xdxf | [xdxf.md](./xdxf.md) | +| XDXF with CSS and JS | XdxfCss | [xdxf_css.md](./xdxf_css.md) | | XDXF Lax (.xdxf) | XdxfLax | [xdxf_lax.md](./xdxf_lax.md) | | Yomichan (.zip) | Yomichan | [yomichan.md](./yomichan.md) | | Zim (.zim, for Kiwix) | Zim | [zim.md](./zim.md) | diff --git a/doc/p/xdxf_css.md b/doc/p/xdxf_css.md new file mode 100644 index 000000000..384afded8 --- /dev/null +++ b/doc/p/xdxf_css.md @@ -0,0 +1,28 @@ +## XDXF with CSS and JS + +### General Information + +| Attribute | Value | +| --------------- | -------------------------------------------------------------------------------------------------------------- | +| Name | XdxfCss | +| snake_case_name | xdxf_css | +| Description | XDXF with CSS and JS | +| Extensions | | +| Read support | Yes | +| Write support | No | +| Single-file | Yes | +| Kind | 📝 text | +| Sort-on-write | default_no | +| Sort key | (`headword_lower`) | +| Wiki | [XDXF](https://en.wikipedia.org/wiki/XDXF) | +| Website | [XDXF standard - @soshial/xdxf_makedict](https://github.com/soshial/xdxf_makedict/tree/master/format_standard) | + +### Dependencies for reading + +PyPI Links: [lxml](https://pypi.org/project/lxml) + +To install, run: + +```sh +pip3 install lxml +``` diff --git a/plugins-meta/index.json b/plugins-meta/index.json index 68c13cffd..5e1d7d0cd 100644 --- a/plugins-meta/index.json +++ b/plugins-meta/index.json @@ -1823,6 +1823,26 @@ "lzma" ] }, + { + "module": "xdxf_css", + "lname": "xdxf_css", + "name": "XdxfCss", + "description": "XDXF with CSS and JS", + "extensions": [], + "singleFile": true, + "optionsProp": {}, + "canRead": true, + "canWrite": false, + "readOptions": {}, + "readDepends": { + "lxml": "lxml" + }, + "readCompressions": [ + "gz", + "bz2", + "lzma" + ] + }, { "module": "xdxf_lax", "lname": "xdxf_lax", diff --git a/pyglossary/plugins/xdxf/__init__.py b/pyglossary/plugins/xdxf/__init__.py index d1712a17e..3a5193a9d 100644 --- a/pyglossary/plugins/xdxf/__init__.py +++ b/pyglossary/plugins/xdxf/__init__.py @@ -31,6 +31,8 @@ from pyglossary.lxml_types import Element +from lxml import etree as ET + from pyglossary.compression import ( compressionOpen, stdCompressions, @@ -79,7 +81,6 @@ ), } - """ new format @@ -110,9 +111,7 @@ """ - if TYPE_CHECKING: - class TransformerType(typing.Protocol): def transform(self, article: "Element") -> str: ... @@ -154,7 +153,6 @@ def makeTransformer(self) -> None: def open(self, filename: str) -> None: # noqa: PLR0912 # - from lxml import etree as ET self._filename = filename if self._html: @@ -220,9 +218,6 @@ def __len__(self) -> int: return 0 def __iter__(self) -> "Iterator[EntryType]": - from lxml import etree as ET - from lxml.etree import tostring - context = ET.iterparse( # type: ignore self._file, events=("end",), @@ -238,7 +233,7 @@ def __iter__(self) -> "Iterator[EntryType]": if len(words) == 1: defi = self._re_span_k.sub("", defi) else: - b_defi = cast(bytes, tostring(article, encoding=self._encoding)) + b_defi = cast(bytes, ET.tostring(article, encoding=self._encoding)) defi = b_defi[4:-5].decode(self._encoding).strip() defiFormat = "x" @@ -265,8 +260,6 @@ def close(self) -> None: def tostring( elem: "Element", ) -> str: - from lxml import etree as ET - return ( ET.tostring( elem, diff --git a/pyglossary/plugins/xdxf_css/__init__.py b/pyglossary/plugins/xdxf_css/__init__.py new file mode 100644 index 000000000..bc6418818 --- /dev/null +++ b/pyglossary/plugins/xdxf_css/__init__.py @@ -0,0 +1,339 @@ +# -*- coding: utf-8 -*- +# xdxf/__init__.py +"""xdxf file format reader and utils to convert xdxf to html.""" +# +# Copyright © 2023 Saeed Rasooli +# Copyright © 2016 ivan tkachenko me@ratijas.tk +# +# some parts of this file include code from: +# Aard Dictionary Tools . +# Copyright © 2008-2009 Igor Tkach +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# You can get a copy of GNU General Public License along this program +# But you can always get it from http://www.gnu.org/licenses/gpl.txt +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +import re +import typing +from collections.abc import Iterator, Sequence +from os.path import join +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + + from pyglossary.lxml_types import Element + from pyglossary.option import Option + + +from lxml import etree as ET + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import log, rootDir +from pyglossary.glossary_types import EntryType, GlossaryType +from pyglossary.io_utils import nullBinaryIO +from pyglossary.text_utils import toStr + +__all__ = [ + "Reader", + "description", + "enable", + "extensionCreate", + "extensions", + "format", + "kind", + "lname", + "optionsProp", + "singleFile", + "website", + "wiki", +] + +enable = True +lname = "xdxf_css" +format = "XdxfCss" +description = "XDXF with CSS and JS" +extensions = () +extensionCreate = ".xdxf" +singleFile = True +kind = "text" +wiki = "https://en.wikipedia.org/wiki/XDXF" +website = ( + "https://github.com/soshial/xdxf_makedict/tree/master/format_standard", + "XDXF standard - @soshial/xdxf_makedict", +) +optionsProp: "dict[str, Option]" = {} + +""" +new format + + + + ... + ... + ... + + + article 1 + article 2 + article 3 + article 4 + ... + + + +old format + + ... + ... + article 1 + article 2 + article 3 + article 4 + ... + +""" + +if TYPE_CHECKING: + class TransformerType(typing.Protocol): + def transform(self, article: "Element") -> str: ... + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _html: bool = True + + infoKeyMap = { + "full_name": "name", + "full_title": "name", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: "io.IOBase" = nullBinaryIO + self._encoding = "utf-8" + self._htmlTr: "TransformerType | None" = None + self._re_span_k = re.compile( + '[^<>]*(
)?', + ) + self._has_added_css: bool = False + self._has_added_js: bool = False + self._abbr_defs_js: bytes + + def makeTransformer(self) -> None: + from pyglossary.xdxf.css_js_transform import XdxfTransformer + + self._htmlTr = XdxfTransformer(encoding=self._encoding) + + def open(self, filename: str) -> None: # noqa: PLR0912 + # + + self._filename = filename + self.makeTransformer() + self._glos.setDefaultDefiFormat("h") + + cfile = self._file = cast( + "io.IOBase", + compressionOpen( + self._filename, + mode="rb", + ), + ) + + context = ET.iterparse( # type: ignore + cfile, + events=("end",), + ) + abbr_defs = [] + for _, _elem in context: + elem = cast("Element", _elem) + if elem.tag in {"meta_info", "ar", "k", "abr", "dtrn"}: + break + # every other tag before or is considered info + if elem.tag == "abbr_def": + abbr_defs.append(elem) + continue + # in case of multiple or multiple tags, the last one + # will be stored. + # Very few formats support more than one language pair in their + # metadata, so it's not very useful to have multiple + if elem.tag == "from": + for key, value in elem.attrib.items(): + if key.endswith("}lang"): + self._glos.sourceLangName = value.split("-")[0] + break + continue + if elem.tag == "to": + for key, value in elem.attrib.items(): + if key.endswith("}lang"): + self._glos.targetLangName = value.split("-")[0] + break + continue + if not elem.text: + log.warning(f"empty tag <{elem.tag}>") + continue + key = self.infoKeyMap.get(elem.tag, elem.tag) + self._glos.setInfo(key, elem.text) + self._abbr_defs_js = self.generate_abbr_js(abbr_defs) + del context + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("XDXF Reader: file is not seekable") + self._file.close() + self._file = compressionOpen(self._filename, mode="rb") + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> "Iterator[EntryType]": + context = ET.iterparse( # type: ignore + self._file, + events=("end",), + tag="ar", + ) + + if self._has_added_css is False: + self._has_added_css = True + with open(join(rootDir, "pyglossary", "xdxf", "xdxf.css"), "rb") as css_file: + yield self._glos.newDataEntry("css/xdxf.css", css_file.read()) + + if self._abbr_defs_js is not None and not self._has_added_js: + self._has_added_js = True + yield self._glos.newDataEntry("js/xdxf.js", self._abbr_defs_js) + + for _, _article in context: + article = cast("Element", _article) + article.tail = None + words = [toStr(w) for w in self.titles(article)] + + defi = self._htmlTr.transform(article) + defiFormat = "h" + if len(words) == 1: + defi = self._re_span_k.sub("", defi) + + defi = f""" + + + + + + {defi} + + +""" + # log.info(f"{defi=}, {words=}") + yield self._glos.newEntry( + words, + defi, + defiFormat=defiFormat, + byteProgress=(self._file.tell(), self._fileSize), + ) + # clean up preceding siblings to save memory + # this can reduce memory usage from 1 GB to ~25 MB + parent = article.getparent() + if parent is None: + continue + while article.getprevious() is not None: + del parent[0] + + def close(self) -> None: + self._file.close() + self._file = nullBinaryIO + + def generate_abbr_js(self, abbr_defs: list["Element"]) -> bytes: + abbr_map_js = """const abbr_map = new Map();\n""" + for abbr_def in abbr_defs: + abbr_k_list: list[str] = [] + abbr_v_text = "" + for child in abbr_def.xpath("child::node()"): + if child.tag == "abbr_k": + abbr_k_list.append(self._htmlTr.stringify_children(child)) + if child.tag == "abbr_v": + abbr_v_text = self._htmlTr.stringify_children(child) + # TODO escape apostrophes + for abbr_k in abbr_k_list: + if len(abbr_k) > 0 and len(abbr_v_text) > 0: + abbr_map_js += f"abbr_map.set('{abbr_k}', '{abbr_v_text}');\n" + with open(join(rootDir, "pyglossary", "xdxf", "xdxf.js"), "rb") as js_file: + return abbr_map_js.encode(encoding="utf-8") + js_file.read() + + @staticmethod + def tostring( + elem: "Element", + ) -> str: + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def titles(self, article: "Element") -> "list[str]": + """ + :param article: tag + :return: (title (str) | None, alternative titles (set)) + """ + from itertools import combinations + + titles: "list[str]" = [] + for title_element in article.findall("k"): + if title_element.text is None: + # TODO: look for tag? + log.warning(f"empty title element: {self.tostring(title_element)}") + continue + n_opts = len([c for c in title_element if c.tag == "opt"]) + if n_opts: + for j in range(n_opts + 1): + for comb in combinations(list(range(n_opts)), j): + titles.append(self._mktitle(title_element, comb)) + else: + titles.append(self._mktitle(title_element)) + + return titles + + def _mktitle( # noqa: PLR6301 + self, + title_element: "Element", + include_opts: "Sequence | None" = None, + ) -> str: + if include_opts is None: + include_opts = () + title = title_element.text or "" + opt_i = -1 + for c in title_element: + if c.tag == "nu" and c.tail: + if title: + title += c.tail + else: + title = c.tail + if c.tag == "opt" and c.text is not None: + opt_i += 1 + if opt_i in include_opts: + title += c.text + if c.tail: + title += c.tail + return title.strip() diff --git a/pyglossary/xdxf/css_js_transform.py b/pyglossary/xdxf/css_js_transform.py new file mode 100644 index 000000000..e1d5f0fae --- /dev/null +++ b/pyglossary/xdxf/css_js_transform.py @@ -0,0 +1,473 @@ +import logging +import sys +from io import BytesIO +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from pyglossary.lxml_types import Element, T_htmlfile + +log = logging.getLogger("pyglossary") + +__all__ = [ + "XdxfTransformer", +] + + +class XdxfTransformer: + def __init__(self, encoding: str = "utf-8") -> None: + self._encoding = encoding + self.logging_enabled = False + self._childTagWriteMapping = { + "br": self._write_br, + "u": self._write_basic_format, + "i": self._write_basic_format, + "b": self._write_basic_format, + "sub": self._write_basic_format, + "sup": self._write_basic_format, + "tt": self._write_basic_format, + "big": self._write_basic_format, + "small": self._write_basic_format, + "blockquote": self._write_blockquote, + "tr": self._write_tr, + "k": self._write_k, + "sr": self._write_sr, + "ex": self._write_example, + "mrkd": self._write_mrkd, + "kref": self._write_kref, + "iref": self._write_iref, + "pos": self._write_pos, + "abr": self._write_abr, + "abbr": self._write_abbr, + "dtrn": self._write_dtrn, + "co": self._write_co, + "c": self._write_c, + "rref": self._write_rref, + "def": self._write_def, + "deftext": self._write_deftext, + "span": self._write_span, + "gr": self._write_gr, + "ex_orig": self._write_ex_orig, + "categ": self._write_categ, + "opt": self._write_opt, + "img": self._write_img, + "etm": self._write_etm, + } + + @staticmethod + def tostring(elem: "Element") -> str: + from lxml import etree as ET + + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + @staticmethod + def hasPrevText(prev: "None | str | Element") -> bool: + if isinstance(prev, str): + return True + if prev is None: + return False + if prev.tag == "k": + return False + if prev.tag in { + "dtrn", + "def", + "span", + "co", + "i", + "b", + "sub", + "sup", + "tt", + "big", + "small", + }: + return True + if prev.text: # noqa: SIM103 + return True + # print(prev) + return False + + def writeString( # noqa: PLR0913 + self, + hf: "T_htmlfile", + child: str, + parent: "Element", + prev: "None | str | Element", + stringSep: "str | None" = None, + ) -> None: + from lxml import etree as ET + + def addSep() -> None: + if stringSep is None: + hf.write(ET.Element("br")) + else: + hf.write(stringSep) + + hasPrev = self.hasPrevText(prev) + trail = False + if parent.tag in {"ar", "font"}: + if child.startswith("\n"): + child = child.lstrip("\n") + if hasPrev: + hf.write(ET.Element("br")) + elif child.endswith("\n"): + child = child.rstrip("\n") + trail = True + if not hasPrev: + child = child.lstrip() + elif child.startswith("\n"): + # child = child.lstrip() + if hasPrev: + addSep() + + lines = [line for line in child.split("\n") if line] + for index, line in enumerate(lines): + if index > 0: + # and line[0] not in ".,;)" + addSep() + hf.write(line) + if trail: + addSep() + + def _write_example(self, hf: "T_htmlfile", elem: "Element") -> None: + prev = None + stringSep = " " + with hf.element( # noqa: PLR1702 + "div", + attrib={"class": elem.tag}, + ): + for child in elem.xpath("child::node()"): + if isinstance(child, str): + # if not child.strip(): + # continue + self.writeString(hf, child, elem, prev, stringSep=stringSep) + continue + if child.tag == "iref": + with hf.element("div"): + self._write_iref(hf, child) # NESTED 5 + continue + + if child.tag == "ex_orig": + with hf.element("span", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child, stringSep=stringSep) + continue + if child.tag == "ex_tran": + ex_trans = elem.xpath("./ex_tran") + if ex_trans.index(child) == 0: + # when several translations, make HTML unordered list of them + if len(ex_trans) > 1: + with hf.element("ul", attrib={}): + for ex_tran in ex_trans: + with hf.element("li", attrib={}): + self._write_ex_transl(hf, ex_tran) + else: + self._write_ex_transl(hf, child) + continue + # log.warning(f"unknown tag {child.tag} inside ") + self.writeChild(hf, child, elem, prev, stringSep=stringSep) + prev = child + + def _write_ex_orig(self, hf: "T_htmlfile", child: "Element") -> None: + # TODO NOT REACHABLE + sys.exit("NOT REACHABLE") + with hf.element("i"): + self.writeChildrenOf(hf, child) + + def _write_ex_transl(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child) + + def _write_iref(self, hf: "T_htmlfile", child: "Element") -> None: + iref_url = child.attrib.get("href", "") + if iref_url.endswith((".mp3", ".wav", ".aac", ".ogg")): + # with hf.element("audio", src=iref_url): + with hf.element( + "a", + attrib={ + "class": "iref", + "href": iref_url, + }, + ): + hf.write("🔊") + return + + with hf.element( + "a", + attrib={ + "class": "iref", + "href": child.attrib.get("href", child.text or ""), + }, + ): + self.writeChildrenOf(hf, child, stringSep=" ") + + def _write_blockquote(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("div", attrib={"class": "m"}): + self.writeChildrenOf(hf, child) + + def _write_tr(self, hf: "T_htmlfile", child: "Element") -> None: + from lxml import etree as ET + + hf.write("[") + self.writeChildrenOf(hf, child) + hf.write("]") + hf.write(ET.Element("br")) + + def _write_k(self, hf: "T_htmlfile", child: "Element") -> None: + self.logging_enabled = child.text == "iść" + + index = child.getparent().index(child) + if index == 0: + with (hf.element("div", attrib={"class": child.tag})): + # with hf.element(glos.titleTag(child.text)): + # ^ no glos object here! + self.writeChildrenOf(hf, child) + # TODO Lenny: show other forms in a collapsible list + # else: + # with (hf.element("span", attrib={"class": child.tag})): + # hf.write(str(index)) + # self.writeChildrenOf(hf, child) + + def _write_mrkd(self, hf: "T_htmlfile", child: "Element") -> None: # noqa: PLR6301 + if not child.text: + return + with hf.element("span", attrib={"class": child.tag}): + hf.write(child.text) + + def _write_kref(self, hf: "T_htmlfile", child: "Element") -> None: + if not child.text: + log.warning(f"kref with no text: {self.tostring(child)}") + return + with hf.element( + "a", + attrib={ + "class": "kref", + "href": f"bword://{child.attrib.get('k', child.text)}", + }, + ): + hf.write(child.text) + + def _write_sr(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("div", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child) + + def _write_pos(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child) + + def _write_abr(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span", attrib={"class": "abbr"}): + self.writeChildrenOf(hf, child) + + def _write_abbr(self, hf: "T_htmlfile", child: "Element") -> None: # noqa: PLR6301 + with hf.element("span", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child) + + def _write_dtrn(self, hf: "T_htmlfile", child: "Element") -> None: + self.writeChildrenOf(hf, child, sep=" ") + + def _write_co(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span", attrib={"class": child.tag}): + hf.write("(") + self.writeChildrenOf(hf, child, sep=" ") + hf.write(")") + + def _write_basic_format(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element(child.tag): + self.writeChildrenOf(hf, child) + # if child.text is not None: + # hf.write(child.text.strip("\n")) + + def _write_br(self, hf: "T_htmlfile", child: "Element") -> None: + from lxml import etree as ET + + hf.write(ET.Element("br")) + self.writeChildrenOf(hf, child) + + def _write_c(self, hf: "T_htmlfile", child: "Element") -> None: + color = child.attrib.get("c", "green") + with hf.element("font", color=color): + self.writeChildrenOf(hf, child) + + def _write_rref(self, _hf: "T_htmlfile", child: "Element") -> None: + if not child.text: + log.warning(f"rref with no text: {self.tostring(child)}") + return + + def _write_def(self, hf: "T_htmlfile", elem: "Element") -> None: + has_nested_def = False + has_deftext = False + for child in elem.iterchildren(): + if child.tag == "def": + has_nested_def = True + if child.tag == "deftext": + has_deftext = True + + if elem.getparent().tag == "ar": # this is a root + if has_nested_def: + with hf.element("ol"): + self.writeChildrenOf(hf, elem) + else: + with hf.element("div"): + self.writeChildrenOf(hf, elem) + elif has_deftext: + with hf.element("li"): + self.writeChildrenOf(hf, elem) + elif has_nested_def: + with hf.element("li"): + with hf.element("ol"): + self.writeChildrenOf(hf, elem) + else: + with hf.element("li"): + self.writeChildrenOf(hf, elem) + + def _write_deftext(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child, stringSep=" ", sep=" ") + + def _write_span(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span"): + self.writeChildrenOf(hf, child) + + def _write_gr(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("div", attrib={"class": child.tag}): + self.writeChildrenOf(hf, child) + + def _write_categ(self, hf: "T_htmlfile", child: "Element") -> None: + with hf.element("span", style="background-color: green;"): + self.writeChildrenOf(hf, child, stringSep=" ") + + def _write_opt(self, hf: "T_htmlfile", child: "Element") -> None: # noqa: PLR6301 + if child.text: + hf.write(" (") + hf.write(child.text) + hf.write(")") + + def _write_img(self, hf: "T_htmlfile", child: "Element") -> None: # noqa: PLR6301 + with hf.element("img", attrib=dict(child.attrib)): + pass + + def _write_etm(self, hf: "T_htmlfile", child: "Element") -> None: # noqa: PLR6301 + # Etymology (history and origin) + # TODO: formatting? + hf.write(f"{child.text}") + + def writeChildElem( # noqa: PLR0913 + self, + hf: "T_htmlfile", + child: "Element", + parent: "Element", # noqa: ARG002 + prev: "None | str | Element", + stringSep: "str | None" = None, # noqa: ARG002 + ) -> None: + func = self._childTagWriteMapping.get(child.tag, None) + if func is not None: + func(hf, child) + return + + if child.tag == "ex_transl" and prev is not None: + if isinstance(prev, str): + pass + elif prev.tag == "ex_orig": + if child.text != prev.text: + with hf.element("i"): + self.writeChildrenOf(hf, child) + return + + log.warning(f"unknown tag {child.tag}") + self.writeChildrenOf(hf, child) + + def writeChild( # noqa: PLR0913 + self, + hf: "T_htmlfile", + child: "str | Element", + parent: "Element", + prev: "None | str | Element", + stringSep: "str | None" = None, + ) -> None: + if isinstance(child, str): + self.writeString(hf, child, parent, prev, stringSep=stringSep) + else: + self.writeChildElem( + hf=hf, + child=child, + parent=parent, + prev=prev, + stringSep=stringSep, + ) + + def shouldAddSep( # noqa: PLR6301 + self, + child: "str | Element", + prev: "str | Element", + ) -> bool: + if isinstance(child, str): + return not (len(child) > 0 and child[0] in ".,;)") + + if child.tag in {"sub", "sup"}: + return False + + if isinstance(prev, str): + pass + elif prev.tag in {"sub", "sup"}: + return False + + return True + + def writeChildrenOf( + self, + hf: "T_htmlfile", + elem: "Element", + sep: "str | None" = None, + stringSep: "str | None" = None, + ) -> None: + prev = None + for child in elem.xpath("child::node()"): + if sep and prev is not None and self.shouldAddSep(child, prev): + hf.write(sep) + self.writeChild(hf, child, elem, prev, stringSep=stringSep) + prev = child + + @staticmethod + def stringify_children(elem: "Element") -> str: + from itertools import chain + + from lxml.etree import tostring + children = [chunk for chunk in chain( + (elem.text,), + chain(*((tostring(child, with_tail=False), child.tail) + for child in elem.getchildren())), + (elem.tail,)) if chunk] + normalized_children = "" + for chunk in children: + if isinstance(chunk, str): + normalized_children += chunk + if isinstance(chunk, bytes): + normalized_children += chunk.decode(encoding="utf-8") + return normalized_children + + def transform(self, article: "Element") -> str: + from lxml import etree as ET + + # encoding = self._encoding + f = BytesIO() + with ET.htmlfile(f, encoding="utf-8") as hf: + with hf.element("div", attrib={"class": "article"}): + self.writeChildrenOf(cast("T_htmlfile", hf), article) + + text = f.getvalue().decode("utf-8") + text = text.replace("
", "
") # for compatibility + return text # noqa: RET504 + + def transformByInnerString(self, articleInnerStr: str) -> str: + from lxml import etree as ET + + return self.transform( + ET.fromstring(f"{articleInnerStr}"), + ) diff --git a/pyglossary/xdxf/xdxf.css b/pyglossary/xdxf/xdxf.css new file mode 100644 index 000000000..0ee23a41d --- /dev/null +++ b/pyglossary/xdxf/xdxf.css @@ -0,0 +1,70 @@ +div.k { + font-weight: 700; + font-size: 150%; +} + +span.k { + font-size: 100%; +} + +.gr { + color: green; +} + +ol { + list-style-type: decimal; + padding-left: 20px; +} + +ol > li > ol > li > ol { + list-style-type: lower-latin; +} + +.ex { + margin: 0px 0px 0px 20px; + color: #888888; +} + +.ex i { + color: red; +} + +.ex_orig { + font-weight: 700; +} + +.ex .mrkd { + text-decoration: underline; +} + +.co { + color: #888888; + font-style: italic; +} + +.abbr { + color: green; + font-style: italic; + text-decoration: underline; + text-decoration-style: dotted; +} + +.pos { + color: red; + font-style: italic; +} + +.abbr_popup { + background: #feffca; + border: 1px solid rgba(0,0,0,.15); + border-radius: 2px; + box-shadow: 2px 2px 3px rgba(0,0,0,.1),0 2px 0 rgba(255,255,255,.4) inset,0 -2px 0 rgba(242,85,0,1) inset; + cursor: pointer; + display: none; + font-size: 100%; + font-style: normal; + padding: .05em .6em .2em; + position: absolute; + z-index: 999; + margin-bottom: 100px; +} \ No newline at end of file diff --git a/pyglossary/xdxf/xdxf.js b/pyglossary/xdxf/xdxf.js new file mode 100644 index 000000000..83257309e --- /dev/null +++ b/pyglossary/xdxf/xdxf.js @@ -0,0 +1,56 @@ + +prepare_tooltips() + +// iterate over all tags that can show tooltip +function prepare_tooltips() { + var pos_elems = document.querySelectorAll(".pos"); + var abbr_elems = document.querySelectorAll(".abbr"); + iterate_over_abbr_elems(pos_elems) + iterate_over_abbr_elems(abbr_elems) +} + +function iterate_over_abbr_elems(elems) { + for (var i = 0; i < elems.length; i++) { + var elem = elems[i]; + if (abbr_map.has(elem.textContent)) { + elem.classList.add("abbr"); + elem.classList.remove("pos"); + elem.addEventListener("mouseover", show_popup); + elem.addEventListener("mouseout", hide_popup); + } else { + elem.classList.add("pos"); + elem.classList.remove("abbr"); + } + } +} + +function show_popup(event) { + var pos_elem = event.target + var pos_text = pos_elem.textContent + var s = document.createElement("small"); + s.classList.add("abbr_popup"); + s.innerHTML = abbr_map.get(pos_text) + pos_elem.parentNode.insertBefore(s, pos_elem.nextSibling); + + if (s.offsetWidth > 200) { + if ((pos_elem.offsetLeft + 200) > document.body.offsetWidth) { + s.style.left = pos_elem.offsetLeft - ((pos_elem.offsetLeft + 200) - document.body.offsetWidth) + 'px'; + } else { + s.style.left = pos_elem.offsetLeft + 'px'; + } + } else { + if ((pos_elem.offsetLeft + s.offsetWidth) > document.body.offsetWidth) { + s.style.left = pos_elem.offsetLeft - ((pos_elem.offsetLeft + s.offsetWidth) - document.body.offsetWidth) + 'px'; + } else { + s.style.left = pos_elem.offsetLeft + 'px'; + } + } + s.style.display = 'block'; +} + +function hide_popup(event) { + var popups = document.getElementsByClassName('abbr_popup'); + for (var i = 0; i < popups.length; ++i) { + popups[i].remove(); + } +}