From 8fb3b2d4a6eb09342bccdea003f2e813573411a5 Mon Sep 17 00:00:00 2001 From: Saeed Rasooli Date: Tue, 31 Dec 2024 08:16:23 +0330 Subject: [PATCH] break up plugins --- pyglossary/plugins/aard2_slob/__init__.py | 393 +---------- pyglossary/plugins/aard2_slob/reader.py | 145 ++++ pyglossary/plugins/aard2_slob/tags.py | 29 + pyglossary/plugins/aard2_slob/writer.py | 260 +++++++ pyglossary/plugins/almaany/__init__.py | 84 +-- pyglossary/plugins/almaany/reader.py | 88 +++ .../plugins/ayandict_sqlite/__init__.py | 206 +----- pyglossary/plugins/ayandict_sqlite/reader.py | 66 ++ pyglossary/plugins/ayandict_sqlite/writer.py | 152 ++++ pyglossary/plugins/cc_kedict/__init__.py | 304 +------- pyglossary/plugins/cc_kedict/reader.py | 309 +++++++++ pyglossary/plugins/crawler_dir/__init__.py | 163 +---- pyglossary/plugins/crawler_dir/reader.py | 88 +++ pyglossary/plugins/crawler_dir/writer.py | 93 +++ pyglossary/plugins/csv_plugin/__init__.py | 244 +------ pyglossary/plugins/csv_plugin/reader.py | 182 +++++ pyglossary/plugins/csv_plugin/writer.py | 121 ++++ pyglossary/plugins/dicformids/__init__.py | 256 +------ pyglossary/plugins/dicformids/reader.py | 76 ++ pyglossary/plugins/dicformids/writer.py | 195 ++++++ pyglossary/plugins/dict_cc/__init__.py | 200 +----- pyglossary/plugins/dict_cc/reader.py | 205 ++++++ pyglossary/plugins/dict_cc_split/__init__.py | 77 +- pyglossary/plugins/dict_cc_split/reader.py | 83 +++ pyglossary/plugins/dict_org/__init__.py | 156 +---- pyglossary/plugins/dict_org/reader.py | 74 ++ pyglossary/plugins/dict_org/writer.py | 98 +++ .../plugins/dict_org_source/__init__.py | 40 +- pyglossary/plugins/dict_org_source/writer.py | 42 ++ pyglossary/plugins/dictunformat/__init__.py | 90 +-- pyglossary/plugins/dictunformat/reader.py | 90 +++ pyglossary/plugins/digitalnk/__init__.py | 55 +- pyglossary/plugins/digitalnk/reader.py | 59 ++ pyglossary/plugins/dikt_json/__init__.py | 74 +- pyglossary/plugins/dikt_json/writer.py | 80 +++ pyglossary/plugins/ebook_epub2/__init__.py | 231 +----- pyglossary/plugins/ebook_epub2/writer.py | 233 +++++++ pyglossary/plugins/ebook_kobo/__init__.py | 229 +----- pyglossary/plugins/ebook_kobo/writer.py | 233 +++++++ .../plugins/ebook_kobo_dictfile/__init__.py | 185 +---- .../plugins/ebook_kobo_dictfile/reader.py | 123 ++++ .../plugins/ebook_kobo_dictfile/writer.py | 89 +++ pyglossary/plugins/ebook_mobi/__init__.py | 306 +------- pyglossary/plugins/ebook_mobi/writer.py | 308 ++++++++ pyglossary/plugins/edict2/__init__.py | 88 +-- pyglossary/plugins/edict2/reader.py | 89 +++ pyglossary/plugins/edlin/__init__.py | 272 +------- pyglossary/plugins/edlin/reader.py | 131 ++++ pyglossary/plugins/edlin/writer.py | 141 ++++ pyglossary/plugins/gettext_po/__init__.py | 177 +---- pyglossary/plugins/gettext_po/reader.py | 128 ++++ pyglossary/plugins/gettext_po/writer.py | 66 ++ pyglossary/plugins/html_dir/__init__.py | 490 +------------ pyglossary/plugins/html_dir/writer.py | 491 +++++++++++++ pyglossary/plugins/info_plugin/__init__.py | 30 +- pyglossary/plugins/info_plugin/reader.py | 36 + pyglossary/plugins/jmdict/__init__.py | 416 +---------- pyglossary/plugins/jmdict/reader.py | 417 +++++++++++ pyglossary/plugins/jmnedict/__init__.py | 295 +------- pyglossary/plugins/jmnedict/reader.py | 298 ++++++++ pyglossary/plugins/json_plugin/__init__.py | 64 +- pyglossary/plugins/json_plugin/writer.py | 68 ++ pyglossary/plugins/lingoes_ldf/__init__.py | 134 +--- pyglossary/plugins/lingoes_ldf/reader.py | 77 ++ pyglossary/plugins/lingoes_ldf/writer.py | 66 ++ 
.../plugins/makindo_medical/__init__.py | 54 +- pyglossary/plugins/makindo_medical/reader.py | 58 ++ .../plugins/octopus_mdict_new/__init__.py | 220 +----- .../plugins/octopus_mdict_new/reader.py | 221 ++++++ pyglossary/plugins/sql/__init__.py | 138 +--- pyglossary/plugins/sql/writer.py | 140 ++++ .../plugins/stardict_merge_syns/__init__.py | 133 +--- .../plugins/stardict_merge_syns/writer.py | 137 ++++ .../plugins/stardict_textual/__init__.py | 359 +--------- pyglossary/plugins/stardict_textual/reader.py | 212 ++++++ pyglossary/plugins/stardict_textual/writer.py | 162 +++++ pyglossary/plugins/tabfile/__init__.py | 119 +--- pyglossary/plugins/tabfile/reader.py | 49 ++ pyglossary/plugins/tabfile/writer.py | 59 ++ pyglossary/plugins/testformat/__init__.py | 94 +-- pyglossary/plugins/testformat/reader.py | 57 ++ pyglossary/plugins/testformat/writer.py | 43 ++ pyglossary/plugins/wiktextract/__init__.py | 655 +---------------- pyglossary/plugins/wiktextract/reader.py | 656 ++++++++++++++++++ pyglossary/plugins/wordnet/__init__.py | 324 +-------- pyglossary/plugins/wordnet/reader.py | 330 +++++++++ pyglossary/plugins/wordset/__init__.py | 94 +-- pyglossary/plugins/wordset/reader.py | 97 +++ pyglossary/plugins/xdxf/__init__.py | 253 +------ pyglossary/plugins/xdxf/reader.py | 252 +++++++ pyglossary/plugins/xdxf_css/__init__.py | 282 +------- pyglossary/plugins/xdxf_css/reader.py | 284 ++++++++ pyglossary/plugins/xdxf_lax/__init__.py | 246 +------ pyglossary/plugins/xdxf_lax/reader.py | 246 +++++++ pyglossary/plugins/yomichan/__init__.py | 247 +------ pyglossary/plugins/yomichan/writer.py | 249 +++++++ pyglossary/plugins/zimfile/__init__.py | 184 +---- pyglossary/plugins/zimfile/reader.py | 184 +++++ tests/deprecated/glossary_security_test.py | 1 + 99 files changed, 9068 insertions(+), 8559 deletions(-) create mode 100644 pyglossary/plugins/aard2_slob/reader.py create mode 100644 pyglossary/plugins/aard2_slob/tags.py create mode 100644 pyglossary/plugins/aard2_slob/writer.py create mode 100644 pyglossary/plugins/almaany/reader.py create mode 100644 pyglossary/plugins/ayandict_sqlite/reader.py create mode 100644 pyglossary/plugins/ayandict_sqlite/writer.py create mode 100644 pyglossary/plugins/cc_kedict/reader.py create mode 100644 pyglossary/plugins/crawler_dir/reader.py create mode 100644 pyglossary/plugins/crawler_dir/writer.py create mode 100644 pyglossary/plugins/csv_plugin/reader.py create mode 100644 pyglossary/plugins/csv_plugin/writer.py create mode 100644 pyglossary/plugins/dicformids/reader.py create mode 100644 pyglossary/plugins/dicformids/writer.py create mode 100644 pyglossary/plugins/dict_cc/reader.py create mode 100644 pyglossary/plugins/dict_cc_split/reader.py create mode 100644 pyglossary/plugins/dict_org/reader.py create mode 100644 pyglossary/plugins/dict_org/writer.py create mode 100644 pyglossary/plugins/dict_org_source/writer.py create mode 100644 pyglossary/plugins/dictunformat/reader.py create mode 100644 pyglossary/plugins/digitalnk/reader.py create mode 100644 pyglossary/plugins/dikt_json/writer.py create mode 100644 pyglossary/plugins/ebook_epub2/writer.py create mode 100644 pyglossary/plugins/ebook_kobo/writer.py create mode 100644 pyglossary/plugins/ebook_kobo_dictfile/reader.py create mode 100644 pyglossary/plugins/ebook_kobo_dictfile/writer.py create mode 100644 pyglossary/plugins/ebook_mobi/writer.py create mode 100644 pyglossary/plugins/edict2/reader.py create mode 100644 pyglossary/plugins/edlin/reader.py create mode 100644 pyglossary/plugins/edlin/writer.py 
create mode 100644 pyglossary/plugins/gettext_po/reader.py create mode 100644 pyglossary/plugins/gettext_po/writer.py create mode 100644 pyglossary/plugins/html_dir/writer.py create mode 100644 pyglossary/plugins/info_plugin/reader.py create mode 100644 pyglossary/plugins/jmdict/reader.py create mode 100644 pyglossary/plugins/jmnedict/reader.py create mode 100644 pyglossary/plugins/json_plugin/writer.py create mode 100644 pyglossary/plugins/lingoes_ldf/reader.py create mode 100644 pyglossary/plugins/lingoes_ldf/writer.py create mode 100644 pyglossary/plugins/makindo_medical/reader.py create mode 100644 pyglossary/plugins/octopus_mdict_new/reader.py create mode 100644 pyglossary/plugins/sql/writer.py create mode 100644 pyglossary/plugins/stardict_merge_syns/writer.py create mode 100644 pyglossary/plugins/stardict_textual/reader.py create mode 100644 pyglossary/plugins/stardict_textual/writer.py create mode 100644 pyglossary/plugins/tabfile/reader.py create mode 100644 pyglossary/plugins/tabfile/writer.py create mode 100644 pyglossary/plugins/testformat/reader.py create mode 100644 pyglossary/plugins/testformat/writer.py create mode 100644 pyglossary/plugins/wiktextract/reader.py create mode 100644 pyglossary/plugins/wordnet/reader.py create mode 100644 pyglossary/plugins/wordset/reader.py create mode 100644 pyglossary/plugins/xdxf/reader.py create mode 100644 pyglossary/plugins/xdxf_css/reader.py create mode 100644 pyglossary/plugins/xdxf_lax/reader.py create mode 100644 pyglossary/plugins/yomichan/writer.py create mode 100644 pyglossary/plugins/zimfile/reader.py diff --git a/pyglossary/plugins/aard2_slob/__init__.py b/pyglossary/plugins/aard2_slob/__init__.py index 8d75434ff..6e63ead7a 100644 --- a/pyglossary/plugins/aard2_slob/__init__.py +++ b/pyglossary/plugins/aard2_slob/__init__.py @@ -1,19 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -import re -import shutil -from os.path import isfile, splitext -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - - from pyglossary import slob - from pyglossary.glossary_types import EntryType, GlossaryType - -from pyglossary.core import cacheDir, exc_note, log, pip from pyglossary.option import ( BoolOption, FileSizeOption, @@ -22,6 +9,9 @@ StrOption, ) +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -92,380 +82,3 @@ " instructions on how to install PyICU.", ), ] - -t_created_at = "created.at" -t_label = "label" -t_created_by = "created.by" -t_copyright = "copyright" -t_license_name = "license.name" -t_license_url = "license.url" -t_uri = "uri" -t_edition = "edition" - -supported_tags = { - t_label, - t_created_at, - t_created_by, - t_copyright, - t_uri, - t_edition, -} - - -class Reader: - depends = { - "icu": "PyICU", # >=1.5 - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - self._re_bword = re.compile( - "(]+?>)", - re.IGNORECASE, - ) - - def close(self) -> None: - if self._slobObj is not None: - self._slobObj.close() - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._slobObj: slob.Slob | None = None - - # TODO: PLR0912 Too many branches (13 > 12) - def open(self, filename: str) -> None: # noqa: PLR0912 - try: - import icu # type: ignore # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install PyICU` to install") - raise - from pyglossary import slob - - self._filename = filename - self._slobObj = slob.open(filename) - tags 
= dict(self._slobObj.tags.items()) - - if t_label in tags: - self._glos.setInfo("name", tags[t_label]) - - if t_created_at in tags: - self._glos.setInfo("creationTime", tags[t_created_at]) - - if t_created_by in tags: - self._glos.setInfo("author", tags[t_created_by]) - - copyrightLines: list[str] = [] - for key in (t_copyright, t_license_name, t_license_url): - try: - value = tags.pop(key) - except KeyError: - continue - copyrightLines.append(value) - if copyrightLines: - self._glos.setInfo("copyright", "\n".join(copyrightLines)) - - if t_uri in tags: - self._glos.setInfo("website", tags[t_uri]) - - if t_edition in tags: - self._glos.setInfo("edition", tags[t_edition]) - - for key, value in tags.items(): - if key in supported_tags: - continue - self._glos.setInfo(f"slob.{key}", value) - - def __len__(self) -> int: - if self._slobObj is None: - log.error("called len() on a reader which is not open") - return 0 - return len(self._slobObj) - - @staticmethod - def _href_sub(m: re.Match) -> str: - st = m.group(0) - if "//" in st: - return st - return st.replace('href="', 'href="bword://').replace( - "href='", - "href='bword://", - ) - - def __iter__(self) -> Iterator[EntryType | None]: - from pyglossary.slob import MIME_HTML, MIME_TEXT - - if self._slobObj is None: - raise RuntimeError("iterating over a reader while it's not open") - - slobObj = self._slobObj - blobSet = set() - - # slob library gives duplicate blobs when iterating over slobObj - # even keeping the last id is not enough, since duplicate blobs - # are not all consecutive. so we have to keep a set of blob IDs - - for blob in slobObj: - id_ = blob.identity - if id_ in blobSet: - yield None # update progressbar - continue - blobSet.add(id_) - - # blob.key is str, blob.content is bytes - word = blob.key - - ctype = blob.content_type.split(";")[0] - if ctype not in {MIME_HTML, MIME_TEXT}: - log.debug(f"unknown {blob.content_type=} in {word=}") - word = word.removeprefix("~/") - yield self._glos.newDataEntry(word, blob.content) - continue - defiFormat = "" - if ctype == MIME_HTML: - defiFormat = "h" - elif ctype == MIME_TEXT: - defiFormat = "m" - - defi = blob.content.decode("utf-8") - defi = self._re_bword.sub(self._href_sub, defi) - yield self._glos.newEntry(word, defi, defiFormat=defiFormat) - - -class Writer: - depends = { - "icu": "PyICU", - } - - _compression: str = "zlib" - _content_type: str = "" - _file_size_approx: int = 0 - _file_size_approx_check_num_entries = 100 - _separate_alternates: bool = False - _word_title: bool = False - _version_info: bool = False - - _audio_goldendict: bool = False - - resourceMimeTypes = { - "png": "image/png", - "jpeg": "image/jpeg", - "jpg": "image/jpeg", - "gif": "image/gif", - "svg": "image/svg+xml", - "webp": "image/webp", - "tiff": "image/tiff", - "tif": "image/tiff", - "bmp": "image/bmp", - "css": "text/css", - "js": "application/javascript", - "json": "application/json", - "woff": "application/font-woff", - "woff2": "application/font-woff2", - "ttf": "application/x-font-ttf", - "otf": "application/x-font-opentype", - "mp3": "audio/mpeg", - "ogg": "audio/ogg", - "spx": "audio/x-speex", - "wav": "audio/wav", - "ini": "text/plain", - # "application/octet-stream+xapian", - "eot": "application/vnd.ms-fontobject", - "pdf": "application/pdf", - "mp4": "video/mp4", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._resPrefix = "" - self._slobWriter: slob.Writer | None = None - - @staticmethod - def _slobObserver( - event: slob.WriterEvent, 
# noqa: F401, F821 - ) -> None: - log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}") - - def _open(self, filepath: str, namePostfix: str) -> slob.Writer: - from pyglossary import slob - - if isfile(filepath): - shutil.move(filepath, f"{filepath}.bak") - log.warning(f"renamed existing {filepath!r} to {filepath + '.bak'!r}") - self._slobWriter = slobWriter = slob.Writer( - filepath, - observer=self._slobObserver, - workdir=cacheDir, - compression=self._compression, - version_info=self._version_info, - ) - - # "label" tag is a dictionary name shown in UI - slobWriter.tag(t_label, self._glos.getInfo("name") + namePostfix) - - createdAt = self._glos.getInfo("creationTime") - if createdAt is not None: - slobWriter.tag(t_created_at, createdAt) - createdBy = self._glos.getInfo("author") - if createdBy is not None: - slobWriter.tag(t_created_by, createdBy) - - filename = os.path.basename(filepath) - dic_uri = re.sub(r"[^A-Za-z0-9_-]+", "_", filename) - # "uri" tag is not web url, it's a part of gloss addressing ID: uri + article ID - # setting the tag allows bookmark & history migration, if dict file is updated - # we use source filename as "uri", since it is stable (most likely) - slobWriter.tag(t_uri, dic_uri) - - return slobWriter - - def open(self, filename: str) -> None: - try: - import icu # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install PyICU` to install") - raise - if isfile(filename): - raise OSError(f"File '{filename}' already exists") - namePostfix = "" - if self._file_size_approx > 0: - namePostfix = " (part 1)" - self._open(filename, namePostfix) - self._filename = filename - - def finish(self) -> None: - from time import perf_counter - - self._filename = "" - if self._slobWriter is None: - return - log.info("Finalizing slob file...") - t0 = perf_counter() - self._slobWriter.finalize() - log.info(f"Finalizing slob file took {perf_counter() - t0:.1f} seconds") - self._slobWriter = None - - def addDataEntry(self, entry: EntryType) -> None: - slobWriter = self._slobWriter - if slobWriter is None: - raise ValueError("slobWriter is None") - rel_path = entry.s_word - _, ext = splitext(rel_path) - ext = ext.lstrip(os.path.extsep).lower() - content_type = self.resourceMimeTypes.get(ext) - if not content_type: - log.error(f"Aard2 slob: unknown content type for {rel_path!r}") - return - content = entry.data - key = self._resPrefix + rel_path - try: - key.encode(slobWriter.encoding) - except UnicodeEncodeError: - log.error(f"Failed to add, broken unicode in key: {key!a}") - return - slobWriter.add(content, key, content_type=content_type) - - def addEntry(self, entry: EntryType) -> None: - words = entry.l_word - b_defi = entry.defi.encode("utf-8") - ctype = self._content_type - writer = self._slobWriter - if writer is None: - raise ValueError("slobWriter is None") - - entry.detectDefiFormat() - defiFormat = entry.defiFormat - - if self._word_title and defiFormat in {"h", "m"}: - if defiFormat == "m": - defiFormat = "h" - title = self._glos.wordTitleStr( - words[0], - ) - b_defi = title.encode("utf-8") + b_defi - - if defiFormat == "h": - b_defi = b_defi.replace(b'"bword://', b'"') - b_defi = b_defi.replace(b"'bword://", b"'") - - if not self._audio_goldendict: - b_defi = b_defi.replace( - b"""href="sound://""", - b'''onclick="new Audio(this.href).play(); return false;" href="''', - ) - b_defi = b_defi.replace( - b"""href='sound://""", - b"""onclick="new Audio(this.href).play(); return false;" href='""", - ) - b_defi = b_defi.replace(b""" 
Generator[None, EntryType, None]: - slobWriter = self._slobWriter - if slobWriter is None: - raise ValueError("slobWriter is None") - file_size_approx = int(self._file_size_approx * 0.95) - entryCount = 0 - sumBlobSize = 0 - fileIndex = 0 - filenameNoExt, _ = splitext(self._filename) - while True: - entry = yield - if entry is None: - break - - if entry.isData(): - self.addDataEntry(entry) - else: - self.addEntry(entry) - - if file_size_approx <= 0: - continue - - # handle file_size_approx - check_every = self._file_size_approx_check_num_entries - entryCount += 1 - if entryCount % check_every == 0: - sumBlobSize = slobWriter.size_data() - if sumBlobSize >= file_size_approx: - slobWriter.finalize() - fileIndex += 1 - slobWriter = self._open( - f"{filenameNoExt}.{fileIndex}.slob", - f" (part {fileIndex + 1})", - ) - sumBlobSize = 0 - entryCount = 0 diff --git a/pyglossary/plugins/aard2_slob/reader.py b/pyglossary/plugins/aard2_slob/reader.py new file mode 100644 index 000000000..c80fdffb8 --- /dev/null +++ b/pyglossary/plugins/aard2_slob/reader.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary import slob + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import exc_note, log, pip +from pyglossary.plugins.aard2_slob.tags import ( + supported_tags, + t_copyright, + t_created_at, + t_created_by, + t_edition, + t_label, + t_license_name, + t_license_url, + t_uri, +) + + +class Reader: + depends = { + "icu": "PyICU", # >=1.5 + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + self._re_bword = re.compile( + "(]+?>)", + re.IGNORECASE, + ) + + def close(self) -> None: + if self._slobObj is not None: + self._slobObj.close() + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._slobObj: slob.Slob | None = None + + # TODO: PLR0912 Too many branches (13 > 12) + def open(self, filename: str) -> None: # noqa: PLR0912 + try: + import icu # type: ignore # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install PyICU` to install") + raise + from pyglossary import slob + + self._filename = filename + self._slobObj = slob.open(filename) + tags = dict(self._slobObj.tags.items()) + + if t_label in tags: + self._glos.setInfo("name", tags[t_label]) + + if t_created_at in tags: + self._glos.setInfo("creationTime", tags[t_created_at]) + + if t_created_by in tags: + self._glos.setInfo("author", tags[t_created_by]) + + copyrightLines: list[str] = [] + for key in (t_copyright, t_license_name, t_license_url): + try: + value = tags.pop(key) + except KeyError: + continue + copyrightLines.append(value) + if copyrightLines: + self._glos.setInfo("copyright", "\n".join(copyrightLines)) + + if t_uri in tags: + self._glos.setInfo("website", tags[t_uri]) + + if t_edition in tags: + self._glos.setInfo("edition", tags[t_edition]) + + for key, value in tags.items(): + if key in supported_tags: + continue + self._glos.setInfo(f"slob.{key}", value) + + def __len__(self) -> int: + if self._slobObj is None: + log.error("called len() on a reader which is not open") + return 0 + return len(self._slobObj) + + @staticmethod + def _href_sub(m: re.Match) -> str: + st = m.group(0) + if "//" in st: + return st + return st.replace('href="', 'href="bword://').replace( + "href='", + "href='bword://", + ) + + def __iter__(self) -> Iterator[EntryType | None]: + from 
pyglossary.slob import MIME_HTML, MIME_TEXT + + if self._slobObj is None: + raise RuntimeError("iterating over a reader while it's not open") + + slobObj = self._slobObj + blobSet = set() + + # slob library gives duplicate blobs when iterating over slobObj + # even keeping the last id is not enough, since duplicate blobs + # are not all consecutive. so we have to keep a set of blob IDs + + for blob in slobObj: + id_ = blob.identity + if id_ in blobSet: + yield None # update progressbar + continue + blobSet.add(id_) + + # blob.key is str, blob.content is bytes + word = blob.key + + ctype = blob.content_type.split(";")[0] + if ctype not in {MIME_HTML, MIME_TEXT}: + log.debug(f"unknown {blob.content_type=} in {word=}") + word = word.removeprefix("~/") + yield self._glos.newDataEntry(word, blob.content) + continue + defiFormat = "" + if ctype == MIME_HTML: + defiFormat = "h" + elif ctype == MIME_TEXT: + defiFormat = "m" + + defi = blob.content.decode("utf-8") + defi = self._re_bword.sub(self._href_sub, defi) + yield self._glos.newEntry(word, defi, defiFormat=defiFormat) diff --git a/pyglossary/plugins/aard2_slob/tags.py b/pyglossary/plugins/aard2_slob/tags.py new file mode 100644 index 000000000..e4336a02e --- /dev/null +++ b/pyglossary/plugins/aard2_slob/tags.py @@ -0,0 +1,29 @@ +t_created_at = "created.at" +t_label = "label" +t_created_by = "created.by" +t_copyright = "copyright" +t_license_name = "license.name" +t_license_url = "license.url" +t_uri = "uri" +t_edition = "edition" + +supported_tags = { + t_label, + t_created_at, + t_created_by, + t_copyright, + t_uri, + t_edition, +} + +__all__ = [ + "supported_tags", + "t_copyright", + "t_created_at", + "t_created_by", + "t_edition", + "t_label", + "t_license_name", + "t_license_url", + "t_uri", +] diff --git a/pyglossary/plugins/aard2_slob/writer.py b/pyglossary/plugins/aard2_slob/writer.py new file mode 100644 index 000000000..c8519f987 --- /dev/null +++ b/pyglossary/plugins/aard2_slob/writer.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import os +import re +import shutil +from os.path import isfile, splitext +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary import slob + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import cacheDir, exc_note, log, pip +from pyglossary.plugins.aard2_slob.tags import ( + t_created_at, + t_created_by, + t_label, + t_uri, +) + + +class Writer: + depends = { + "icu": "PyICU", + } + + _compression: str = "zlib" + _content_type: str = "" + _file_size_approx: int = 0 + _file_size_approx_check_num_entries = 100 + _separate_alternates: bool = False + _word_title: bool = False + _version_info: bool = False + + _audio_goldendict: bool = False + + resourceMimeTypes = { + "png": "image/png", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "gif": "image/gif", + "svg": "image/svg+xml", + "webp": "image/webp", + "tiff": "image/tiff", + "tif": "image/tiff", + "bmp": "image/bmp", + "css": "text/css", + "js": "application/javascript", + "json": "application/json", + "woff": "application/font-woff", + "woff2": "application/font-woff2", + "ttf": "application/x-font-ttf", + "otf": "application/x-font-opentype", + "mp3": "audio/mpeg", + "ogg": "audio/ogg", + "spx": "audio/x-speex", + "wav": "audio/wav", + "ini": "text/plain", + # "application/octet-stream+xapian", + "eot": "application/vnd.ms-fontobject", + "pdf": "application/pdf", + "mp4": "video/mp4", + } + + def __init__(self, 
glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._resPrefix = "" + self._slobWriter: slob.Writer | None = None + + @staticmethod + def _slobObserver( + event: slob.WriterEvent, # noqa: F401, F821 + ) -> None: + log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}") + + def _open(self, filepath: str, namePostfix: str) -> slob.Writer: + from pyglossary import slob + + if isfile(filepath): + shutil.move(filepath, f"{filepath}.bak") + log.warning(f"renamed existing {filepath!r} to {filepath + '.bak'!r}") + self._slobWriter = slobWriter = slob.Writer( + filepath, + observer=self._slobObserver, + workdir=cacheDir, + compression=self._compression, + version_info=self._version_info, + ) + + # "label" tag is a dictionary name shown in UI + slobWriter.tag(t_label, self._glos.getInfo("name") + namePostfix) + + createdAt = self._glos.getInfo("creationTime") + if createdAt is not None: + slobWriter.tag(t_created_at, createdAt) + createdBy = self._glos.getInfo("author") + if createdBy is not None: + slobWriter.tag(t_created_by, createdBy) + + filename = os.path.basename(filepath) + dic_uri = re.sub(r"[^A-Za-z0-9_-]+", "_", filename) + # "uri" tag is not web url, it's a part of gloss addressing ID: uri + article ID + # setting the tag allows bookmark & history migration, if dict file is updated + # we use source filename as "uri", since it is stable (most likely) + slobWriter.tag(t_uri, dic_uri) + + return slobWriter + + def open(self, filename: str) -> None: + try: + import icu # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install PyICU` to install") + raise + if isfile(filename): + raise OSError(f"File '{filename}' already exists") + namePostfix = "" + if self._file_size_approx > 0: + namePostfix = " (part 1)" + self._open(filename, namePostfix) + self._filename = filename + + def finish(self) -> None: + from time import perf_counter + + self._filename = "" + if self._slobWriter is None: + return + log.info("Finalizing slob file...") + t0 = perf_counter() + self._slobWriter.finalize() + log.info(f"Finalizing slob file took {perf_counter() - t0:.1f} seconds") + self._slobWriter = None + + def addDataEntry(self, entry: EntryType) -> None: + slobWriter = self._slobWriter + if slobWriter is None: + raise ValueError("slobWriter is None") + rel_path = entry.s_word + _, ext = splitext(rel_path) + ext = ext.lstrip(os.path.extsep).lower() + content_type = self.resourceMimeTypes.get(ext) + if not content_type: + log.error(f"Aard2 slob: unknown content type for {rel_path!r}") + return + content = entry.data + key = self._resPrefix + rel_path + try: + key.encode(slobWriter.encoding) + except UnicodeEncodeError: + log.error(f"Failed to add, broken unicode in key: {key!a}") + return + slobWriter.add(content, key, content_type=content_type) + + def addEntry(self, entry: EntryType) -> None: + words = entry.l_word + b_defi = entry.defi.encode("utf-8") + ctype = self._content_type + writer = self._slobWriter + if writer is None: + raise ValueError("slobWriter is None") + + entry.detectDefiFormat() + defiFormat = entry.defiFormat + + if self._word_title and defiFormat in {"h", "m"}: + if defiFormat == "m": + defiFormat = "h" + title = self._glos.wordTitleStr( + words[0], + ) + b_defi = title.encode("utf-8") + b_defi + + if defiFormat == "h": + b_defi = b_defi.replace(b'"bword://', b'"') + b_defi = b_defi.replace(b"'bword://", b"'") + + if not self._audio_goldendict: + b_defi = b_defi.replace( + b"""href="sound://""", + b'''onclick="new 
Audio(this.href).play(); return false;" href="''', + ) + b_defi = b_defi.replace( + b"""href='sound://""", + b"""onclick="new Audio(this.href).play(); return false;" href='""", + ) + b_defi = b_defi.replace(b""" Generator[None, EntryType, None]: + slobWriter = self._slobWriter + if slobWriter is None: + raise ValueError("slobWriter is None") + file_size_approx = int(self._file_size_approx * 0.95) + entryCount = 0 + sumBlobSize = 0 + fileIndex = 0 + filenameNoExt, _ = splitext(self._filename) + while True: + entry = yield + if entry is None: + break + + if entry.isData(): + self.addDataEntry(entry) + else: + self.addEntry(entry) + + if file_size_approx <= 0: + continue + + # handle file_size_approx + check_every = self._file_size_approx_check_num_entries + entryCount += 1 + if entryCount % check_every == 0: + sumBlobSize = slobWriter.size_data() + if sumBlobSize >= file_size_approx: + slobWriter.finalize() + fileIndex += 1 + slobWriter = self._open( + f"{filenameNoExt}.{fileIndex}.slob", + f" (part {fileIndex + 1})", + ) + sumBlobSize = 0 + entryCount = 0 diff --git a/pyglossary/plugins/almaany/__init__.py b/pyglossary/plugins/almaany/__init__.py index 9a49bb167..8838cfd62 100644 --- a/pyglossary/plugins/almaany/__init__.py +++ b/pyglossary/plugins/almaany/__init__.py @@ -1,16 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from .reader import Reader + __all__ = [ "Reader", "description", @@ -40,80 +37,3 @@ "Almaany.com Arabic Dictionary - Google Play", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) from WordsTable") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - if self._cur is None: - raise ValueError("cur is None") - from pyglossary.langs.writing_system import getWritingSystemFromText - - alternateDict: dict[str, list[str]] = {} - self._cur.execute("select wordkey, searchwordkey from Keys") - for row in self._cur.fetchall(): - if row[0] in alternateDict: - alternateDict[row[0]].append(row[1]) - else: - alternateDict[row[0]] = [row[1]] - - self._cur.execute( - "select word, searchword, root, meaning from WordsTable order by id", - ) - # FIXME: iteration over self._cur stops after one entry - # and self._cur.fetchone() returns None - # for row in self._cur: - for row in self._cur.fetchall(): - word = row[0] - searchword = row[1] - root = row[2] - meaning = row[3] - definition = meaning - definition = definition.replace("|", "
") - - if root: - definition += ( - f'
<br/>Root: <a href="bword://{html.escape(root)}">{root}</a>' - - ws = getWritingSystemFromText(meaning) - if ws and ws.direction == "rtl": - definition = f'<div dir="rtl">{definition}</div>
' - - words = [word, searchword] - if word in alternateDict: - words += alternateDict[word] - yield self._glos.newEntry( - words, - definition, - defiFormat="h", - ) - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/almaany/reader.py b/pyglossary/plugins/almaany/reader.py new file mode 100644 index 000000000..3447c1010 --- /dev/null +++ b/pyglossary/plugins/almaany/reader.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) from WordsTable") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + if self._cur is None: + raise ValueError("cur is None") + from pyglossary.langs.writing_system import getWritingSystemFromText + + alternateDict: dict[str, list[str]] = {} + self._cur.execute("select wordkey, searchwordkey from Keys") + for row in self._cur.fetchall(): + if row[0] in alternateDict: + alternateDict[row[0]].append(row[1]) + else: + alternateDict[row[0]] = [row[1]] + + self._cur.execute( + "select word, searchword, root, meaning from WordsTable order by id", + ) + # FIXME: iteration over self._cur stops after one entry + # and self._cur.fetchone() returns None + # for row in self._cur: + for row in self._cur.fetchall(): + word = row[0] + searchword = row[1] + root = row[2] + meaning = row[3] + definition = meaning + definition = definition.replace("|", "
") + + if root: + definition += ( + f'
<br/>Root: <a href="bword://{html.escape(root)}">{root}</a>' + + ws = getWritingSystemFromText(meaning) + if ws and ws.direction == "rtl": + definition = f'<div dir="rtl">{definition}</div>
' + + words = [word, searchword] + if word in alternateDict: + words += alternateDict[word] + yield self._glos.newEntry( + words, + definition, + defiFormat="h", + ) + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/ayandict_sqlite/__init__.py b/pyglossary/plugins/ayandict_sqlite/__init__.py index 5ac40b37b..a86e83029 100644 --- a/pyglossary/plugins/ayandict_sqlite/__init__.py +++ b/pyglossary/plugins/ayandict_sqlite/__init__.py @@ -1,20 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import ( - TYPE_CHECKING, -) - -if TYPE_CHECKING: - import sqlite3 - from collections.abc import Generator, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.xdxf.transform import XdxfTransformer - -from pyglossary.core import log from pyglossary.option import BoolOption, Option +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -49,194 +40,3 @@ comment="Create fuzzy search data", ), } - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - self._cur.execute("SELECT key, value FROM meta;") - for row in self._cur.fetchall(): - if row[0] == "hash": - continue - self._glos.setInfo(row[0], row[1]) - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(id) from entry") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - from json import loads - - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "SELECT entry.term, entry.article, " - "json_group_array(alt.term)" - "FROM entry LEFT JOIN alt ON entry.id=alt.id " - "GROUP BY entry.id;", - ) - for row in self._cur.fetchall(): - terms = [row[0]] + [alt for alt in loads(row[2]) if alt] - article = row[1] - yield self._glos.newEntry(terms, article, defiFormat="h") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() - - -class Writer: - _fuzzy: int = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - self._xdxfTr: XdxfTransformer | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - con = self._con = connect(filename) - self._cur = self._con.cursor() - - for query in ( - "CREATE TABLE meta ('key' TEXT PRIMARY KEY NOT NULL, 'value' TEXT);", - ( - "CREATE TABLE entry ('id' INTEGER PRIMARY KEY NOT NULL, " - "'term' TEXT, 'article' TEXT);" - ), - "CREATE TABLE alt ('id' INTEGER NOT NULL, 'term' TEXT);", - "CREATE INDEX idx_meta ON meta(key);", - "CREATE INDEX idx_entry_term ON entry(term COLLATE NOCASE);", - "CREATE INDEX idx_alt_id ON alt(id);", - "CREATE INDEX idx_alt_term ON alt(term COLLATE NOCASE);", - ): - try: - con.execute(query) - except Exception as e: # noqa: PERF203 - log.error(f"query: {query}") - raise e - - for key, value in 
self._glos.iterInfo(): - con.execute( - "INSERT INTO meta (key, value) VALUES (?, ?);", - (key, value), - ) - - if self._fuzzy: - con.execute( - "CREATE TABLE fuzzy3 ('sub' TEXT NOT NULL, " - "'term' TEXT NOT NULL, " - "id INTEGER NOT NULL);", - ) - con.execute( - "CREATE INDEX idx_fuzzy3_sub ON fuzzy3(sub COLLATE NOCASE);", - ) - - con.commit() - - def finish(self) -> None: - if self._con is None or self._cur is None: - return - - self._con.commit() - self._con.close() - self._con = None - self._cur = None - - def xdxf_setup(self) -> None: - from pyglossary.xdxf.transform import XdxfTransformer - - # if self._xsl: - # self._xdxfTr = XslXdxfTransformer(encoding="utf-8") - # return - self._xdxfTr = XdxfTransformer(encoding="utf-8") - - def xdxf_transform(self, text: str) -> str: - if self._xdxfTr is None: - self.xdxf_setup() - return self._xdxfTr.transformByInnerString(text) # type: ignore - - def write(self) -> Generator[None, EntryType, None]: - import hashlib - - cur = self._cur - if cur is None: - raise ValueError("cur is None") - hash_ = hashlib.md5() - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # can save it with entry.save(directory) - continue - defi = entry.defi - entry.detectDefiFormat() - if entry.defiFormat == "m": - if "\n" in defi: - defi = f"
<pre>{defi}</pre>
" - elif entry.defiFormat == "x": - defi = self.xdxf_transform(defi) - - cur.execute( - "INSERT INTO entry(term, article) VALUES (?, ?);", - (entry.l_word[0], defi), - ) - id_ = cur.lastrowid - if id_ is None: - raise ValueError("lastrowid is None") - for alt in entry.l_word[1:]: - cur.execute( - "INSERT INTO alt(id, term) VALUES (?, ?);", - (id_, alt), - ) - hash_.update(entry.s_word.encode("utf-8")) - if self._fuzzy: - self.addFuzzy(id_, entry.l_word) - - cur.execute( - "INSERT INTO meta (key, value) VALUES (?, ?);", - ("hash", hash_.hexdigest()), - ) - - def addFuzzy(self, id_: int, terms: list[str]) -> None: - cur = self._cur - if cur is None: - raise ValueError("cur is None") - for term in terms: - subs: set[str] = set() - for word in term.split(" "): - eword = "\n" + word - subs.update(eword[i : i + 3] for i in range(len(eword) - 2)) - for sub in subs: - cur.execute( - "INSERT INTO fuzzy3(sub, term, id) VALUES (?, ?, ?);", - (sub, term, id_), - ) diff --git a/pyglossary/plugins/ayandict_sqlite/reader.py b/pyglossary/plugins/ayandict_sqlite/reader.py new file mode 100644 index 000000000..b1ed0b6eb --- /dev/null +++ b/pyglossary/plugins/ayandict_sqlite/reader.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + self._cur.execute("SELECT key, value FROM meta;") + for row in self._cur.fetchall(): + if row[0] == "hash": + continue + self._glos.setInfo(row[0], row[1]) + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(id) from entry") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + from json import loads + + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "SELECT entry.term, entry.article, " + "json_group_array(alt.term)" + "FROM entry LEFT JOIN alt ON entry.id=alt.id " + "GROUP BY entry.id;", + ) + for row in self._cur.fetchall(): + terms = [row[0]] + [alt for alt in loads(row[2]) if alt] + article = row[1] + yield self._glos.newEntry(terms, article, defiFormat="h") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/ayandict_sqlite/writer.py b/pyglossary/plugins/ayandict_sqlite/writer.py new file mode 100644 index 000000000..810631c71 --- /dev/null +++ b/pyglossary/plugins/ayandict_sqlite/writer.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.xdxf.transform import XdxfTransformer + +from pyglossary.core import log + + +class Writer: + _fuzzy: int = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> 
None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + self._xdxfTr: XdxfTransformer | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + con = self._con = connect(filename) + self._cur = self._con.cursor() + + for query in ( + "CREATE TABLE meta ('key' TEXT PRIMARY KEY NOT NULL, 'value' TEXT);", + ( + "CREATE TABLE entry ('id' INTEGER PRIMARY KEY NOT NULL, " + "'term' TEXT, 'article' TEXT);" + ), + "CREATE TABLE alt ('id' INTEGER NOT NULL, 'term' TEXT);", + "CREATE INDEX idx_meta ON meta(key);", + "CREATE INDEX idx_entry_term ON entry(term COLLATE NOCASE);", + "CREATE INDEX idx_alt_id ON alt(id);", + "CREATE INDEX idx_alt_term ON alt(term COLLATE NOCASE);", + ): + try: + con.execute(query) + except Exception as e: # noqa: PERF203 + log.error(f"query: {query}") + raise e + + for key, value in self._glos.iterInfo(): + con.execute( + "INSERT INTO meta (key, value) VALUES (?, ?);", + (key, value), + ) + + if self._fuzzy: + con.execute( + "CREATE TABLE fuzzy3 ('sub' TEXT NOT NULL, " + "'term' TEXT NOT NULL, " + "id INTEGER NOT NULL);", + ) + con.execute( + "CREATE INDEX idx_fuzzy3_sub ON fuzzy3(sub COLLATE NOCASE);", + ) + + con.commit() + + def finish(self) -> None: + if self._con is None or self._cur is None: + return + + self._con.commit() + self._con.close() + self._con = None + self._cur = None + + def xdxf_setup(self) -> None: + from pyglossary.xdxf.transform import XdxfTransformer + + # if self._xsl: + # self._xdxfTr = XslXdxfTransformer(encoding="utf-8") + # return + self._xdxfTr = XdxfTransformer(encoding="utf-8") + + def xdxf_transform(self, text: str) -> str: + if self._xdxfTr is None: + self.xdxf_setup() + return self._xdxfTr.transformByInnerString(text) # type: ignore + + def write(self) -> Generator[None, EntryType, None]: + import hashlib + + cur = self._cur + if cur is None: + raise ValueError("cur is None") + hash_ = hashlib.md5() + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # can save it with entry.save(directory) + continue + defi = entry.defi + entry.detectDefiFormat() + if entry.defiFormat == "m": + if "\n" in defi: + defi = f"
<pre>{defi}</pre>
" + elif entry.defiFormat == "x": + defi = self.xdxf_transform(defi) + + cur.execute( + "INSERT INTO entry(term, article) VALUES (?, ?);", + (entry.l_word[0], defi), + ) + id_ = cur.lastrowid + if id_ is None: + raise ValueError("lastrowid is None") + for alt in entry.l_word[1:]: + cur.execute( + "INSERT INTO alt(id, term) VALUES (?, ?);", + (id_, alt), + ) + hash_.update(entry.s_word.encode("utf-8")) + if self._fuzzy: + self.addFuzzy(id_, entry.l_word) + + cur.execute( + "INSERT INTO meta (key, value) VALUES (?, ?);", + ("hash", hash_.hexdigest()), + ) + + def addFuzzy(self, id_: int, terms: list[str]) -> None: + cur = self._cur + if cur is None: + raise ValueError("cur is None") + for term in terms: + subs: set[str] = set() + for word in term.split(" "): + eword = "\n" + word + subs.update(eword[i : i + 3] for i in range(len(eword) - 2)) + for sub in subs: + cur.execute( + "INSERT INTO fuzzy3(sub, term, id) VALUES (?, ?, ?);", + (sub, term, id_), + ) diff --git a/pyglossary/plugins/cc_kedict/__init__.py b/pyglossary/plugins/cc_kedict/__init__.py index 772c2ff6b..5289633ef 100644 --- a/pyglossary/plugins/cc_kedict/__init__.py +++ b/pyglossary/plugins/cc_kedict/__init__.py @@ -2,20 +2,12 @@ # mypy: ignore-errors from __future__ import annotations -from io import BytesIO -from os.path import isdir, join -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING if TYPE_CHECKING: - from collections.abc import Callable, Iterator - - import lxml - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option -from pyglossary.core import exc_note, log, pip -from pyglossary.text_reader import TextGlossaryReader +from .reader import Reader __all__ = [ "Reader", @@ -46,295 +38,3 @@ "@mhagiwara/cc-kedict", ) optionsProp: dict[str, Option] = {} - - -class YamlReader(TextGlossaryReader): - tagStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - def __init__( # noqa: PLR0913 - self, - glos: GlossaryType, - spellKey: str = "", - posKey: str = "", - synsKey: str = "", - tagsKey: str = "", - ) -> None: - TextGlossaryReader.__init__(self, glos) - self._spellKey = spellKey - self._posKey = posKey - self._synsKey = synsKey - self._tagsKey = tagsKey - - self._posMapping = { - "n": "noun", - "v": "verb", - "a": "adjective", - "pron": "pronoun", - "propn": "proper noun", - "intj": "interjection", - "det": "determiner", - "part": "particle", - "adv": "adverb", - "num": "number", - "abbrev": "abbreviation", - "suf": "suffix", - "pref": "prefix", - } - - @classmethod - def isInfoWord(cls, _word: str) -> bool: - return False - - @classmethod - def fixInfoWord(cls, _word: str) -> str: - return "" - - @staticmethod - def _makeList( - hf: lxml.etree.htmlfile, - input_objects: list[Any], - processor: Callable, - single_prefix: str | None = None, - skip_single: bool = True, - ) -> None: - """Wrap elements into
    if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) == 1: - # if single_prefix is None: - # single_prefix = ET.Element("br") - if single_prefix: - hf.write(single_prefix) - processor(hf, input_objects[0], 1) - return - - with hf.element("ol"): - for el in input_objects: - with hf.element("li"): - processor(hf, el, len(input_objects)) - - def _processExample( # noqa: PLR6301 - self, - hf: lxml.etree.htmlfile, - exampleDict: dict, - _count: int, - ) -> None: - from lxml import etree as ET - - if not exampleDict.get("example"): - log.error(f"invalid example: {exampleDict}") - return - - hf.write(exampleDict["example"]) - - transliteration = exampleDict.get("transliteration") - if transliteration: - hf.write(ET.Element("br")) - with hf.element("font", color="green"): - hf.write(f"{transliteration}") - - translation = exampleDict.get("translation") - if translation: - hf.write(ET.Element("br")) - with hf.element("i"): - hf.write(f"{translation}") - - def _processDef( - self, - hf: lxml.etree.htmlfile, - defDict: dict, - count: int, - ) -> None: - from lxml import etree as ET - - text = defDict.get("def", "") - if text: - hf.write(text) - - examples = defDict.get("examples") - if examples: - if text: - if count == 1: - hf.write(ET.Element("br")) - hf.write(ET.Element("br")) - with hf.element("i"): - hf.write("Examples:") - self._makeList( - hf, - examples, - self._processExample, - skip_single=False, - ) - - def _processNote( # noqa: PLR6301 - self, - hf: lxml.etree.htmlfile, - note: str, - _count: int, - ) -> None: - hf.write(note) - - def _processEntry( - self, - hf: lxml.etree.htmlfile, - edict: dict, - ) -> None: - from lxml import etree as ET - - if self._spellKey and self._spellKey in edict: - spelling = edict[self._spellKey] - if not isinstance(spelling, str): - log.error(f"{spelling=}, {type(spelling)=}, {edict=}") - # https://github.com/mhagiwara/cc-kedict/pull/1 - spelling = "on" if spelling is True else "" - if spelling: - with hf.element("font", color="green"): - hf.write(spelling) - hf.write(ET.Element("br")) - - if self._posKey and self._posKey in edict: - pos = edict[self._posKey] - pos = self._posMapping.get(pos, pos) - with hf.element("i"): - hf.write(pos.capitalize()) - hf.write(ET.Element("br")) - - if self._tagsKey and self._tagsKey in edict: - tags = edict[self._tagsKey] - for i, tag in enumerate(tags): - if i > 0: - hf.write(" ") - with hf.element("span", style=self.tagStyle): - hf.write(tag) - hf.write(ET.Element("br")) - - defs = edict.get("defs") - if defs: - self._makeList( - hf, - defs, - self._processDef, - ) - - if self._synsKey and self._synsKey in edict: - hf.write("Synonyms: ") - for i, word in enumerate(edict[self._synsKey]): - if i > 0: - with hf.element("big"): - hf.write(" | ") # NESTED: 5 - with hf.element("a", href=f"bword://{word}"): - hf.write(word) - hf.write(ET.Element("br")) - - notes = edict.get("notes") - if notes: - hf.write(ET.Element("br")) - hf.write("Notes:") - self._makeList( - hf, - notes, - self._processNote, - skip_single=False, - ) - - def _createEntry( - self, - yamlBlock: str, - ) -> tuple[str, str, None] | None: - from lxml import etree as ET - from yaml import load - - try: - from yaml import CLoader as Loader - except ImportError: - from yaml import Loader - - edict = load(yamlBlock, Loader=Loader) - word = edict.get("word") - if not word: - log.error(f"no word in {edict}") - return None - - f = BytesIO() - - with ET.htmlfile(f, encoding="utf-8") as hf: - with hf.element("div"): - 
self._processEntry(hf, edict) - - defi = f.getvalue().decode("utf-8") - return word, defi, None - - def nextBlock(self) -> EntryType: - if not self._file: - raise StopIteration - lines: list[str] = [] - while True: - line = self.readline() - if not line: - break - line = line.rstrip("\n\r") - if not line: - continue - if line.startswith("- "): - line = " " + line[1:] - if lines: - self._bufferLine = line - return self._createEntry("\n".join(lines)) - - lines.append(line) - - if lines: - return self._createEntry("\n".join(lines)) - - raise StopIteration - - -class Reader: - depends = { - "yaml": "PyYAML", - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._yaml = YamlReader( - glos, - spellKey="romaja", - posKey="pos", - synsKey="syns", - tagsKey="tags", - ) - - def __len__(self) -> int: - return 0 - - def open(self, filename: str) -> None: - try: - from lxml import etree as ET # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - if isdir(filename): - filename = join(filename, "kedict.yml") - self._filename = filename - - self._glos.sourceLangName = "Korean" - self._glos.targetLangName = "English" - - self._glos.setDefaultDefiFormat("h") - self._yaml.open(filename) - - def close(self) -> None: - self._yaml.close() - - def __iter__(self) -> Iterator[EntryType]: - yield from self._yaml diff --git a/pyglossary/plugins/cc_kedict/reader.py b/pyglossary/plugins/cc_kedict/reader.py new file mode 100644 index 000000000..1a9efcb4f --- /dev/null +++ b/pyglossary/plugins/cc_kedict/reader.py @@ -0,0 +1,309 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +from __future__ import annotations + +from io import BytesIO +from os.path import isdir, join +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Callable, Iterator + + import lxml + + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import exc_note, log, pip +from pyglossary.text_reader import TextGlossaryReader + + +class YamlReader(TextGlossaryReader): + tagStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + def __init__( # noqa: PLR0913 + self, + glos: GlossaryType, + spellKey: str = "", + posKey: str = "", + synsKey: str = "", + tagsKey: str = "", + ) -> None: + TextGlossaryReader.__init__(self, glos) + self._spellKey = spellKey + self._posKey = posKey + self._synsKey = synsKey + self._tagsKey = tagsKey + + self._posMapping = { + "n": "noun", + "v": "verb", + "a": "adjective", + "pron": "pronoun", + "propn": "proper noun", + "intj": "interjection", + "det": "determiner", + "part": "particle", + "adv": "adverb", + "num": "number", + "abbrev": "abbreviation", + "suf": "suffix", + "pref": "prefix", + } + + @classmethod + def isInfoWord(cls, _word: str) -> bool: + return False + + @classmethod + def fixInfoWord(cls, _word: str) -> str: + return "" + + @staticmethod + def _makeList( + hf: lxml.etree.htmlfile, + input_objects: list[Any], + processor: Callable, + single_prefix: str | None = None, + skip_single: bool = True, + ) -> None: + """Wrap elements into
      if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + # if single_prefix is None: + # single_prefix = ET.Element("br") + if single_prefix: + hf.write(single_prefix) + processor(hf, input_objects[0], 1) + return + + with hf.element("ol"): + for el in input_objects: + with hf.element("li"): + processor(hf, el, len(input_objects)) + + def _processExample( # noqa: PLR6301 + self, + hf: lxml.etree.htmlfile, + exampleDict: dict, + _count: int, + ) -> None: + from lxml import etree as ET + + if not exampleDict.get("example"): + log.error(f"invalid example: {exampleDict}") + return + + hf.write(exampleDict["example"]) + + transliteration = exampleDict.get("transliteration") + if transliteration: + hf.write(ET.Element("br")) + with hf.element("font", color="green"): + hf.write(f"{transliteration}") + + translation = exampleDict.get("translation") + if translation: + hf.write(ET.Element("br")) + with hf.element("i"): + hf.write(f"{translation}") + + def _processDef( + self, + hf: lxml.etree.htmlfile, + defDict: dict, + count: int, + ) -> None: + from lxml import etree as ET + + text = defDict.get("def", "") + if text: + hf.write(text) + + examples = defDict.get("examples") + if examples: + if text: + if count == 1: + hf.write(ET.Element("br")) + hf.write(ET.Element("br")) + with hf.element("i"): + hf.write("Examples:") + self._makeList( + hf, + examples, + self._processExample, + skip_single=False, + ) + + def _processNote( # noqa: PLR6301 + self, + hf: lxml.etree.htmlfile, + note: str, + _count: int, + ) -> None: + hf.write(note) + + def _processEntry( + self, + hf: lxml.etree.htmlfile, + edict: dict, + ) -> None: + from lxml import etree as ET + + if self._spellKey and self._spellKey in edict: + spelling = edict[self._spellKey] + if not isinstance(spelling, str): + log.error(f"{spelling=}, {type(spelling)=}, {edict=}") + # https://github.com/mhagiwara/cc-kedict/pull/1 + spelling = "on" if spelling is True else "" + if spelling: + with hf.element("font", color="green"): + hf.write(spelling) + hf.write(ET.Element("br")) + + if self._posKey and self._posKey in edict: + pos = edict[self._posKey] + pos = self._posMapping.get(pos, pos) + with hf.element("i"): + hf.write(pos.capitalize()) + hf.write(ET.Element("br")) + + if self._tagsKey and self._tagsKey in edict: + tags = edict[self._tagsKey] + for i, tag in enumerate(tags): + if i > 0: + hf.write(" ") + with hf.element("span", style=self.tagStyle): + hf.write(tag) + hf.write(ET.Element("br")) + + defs = edict.get("defs") + if defs: + self._makeList( + hf, + defs, + self._processDef, + ) + + if self._synsKey and self._synsKey in edict: + hf.write("Synonyms: ") + for i, word in enumerate(edict[self._synsKey]): + if i > 0: + with hf.element("big"): + hf.write(" | ") # NESTED: 5 + with hf.element("a", href=f"bword://{word}"): + hf.write(word) + hf.write(ET.Element("br")) + + notes = edict.get("notes") + if notes: + hf.write(ET.Element("br")) + hf.write("Notes:") + self._makeList( + hf, + notes, + self._processNote, + skip_single=False, + ) + + def _createEntry( + self, + yamlBlock: str, + ) -> tuple[str, str, None] | None: + from lxml import etree as ET + from yaml import load + + try: + from yaml import CLoader as Loader + except ImportError: + from yaml import Loader + + edict = load(yamlBlock, Loader=Loader) + word = edict.get("word") + if not word: + log.error(f"no word in {edict}") + return None + + f = BytesIO() + + with ET.htmlfile(f, encoding="utf-8") as hf: + with hf.element("div"): 
+ self._processEntry(hf, edict) + + defi = f.getvalue().decode("utf-8") + return word, defi, None + + def nextBlock(self) -> EntryType: + if not self._file: + raise StopIteration + lines: list[str] = [] + while True: + line = self.readline() + if not line: + break + line = line.rstrip("\n\r") + if not line: + continue + if line.startswith("- "): + line = " " + line[1:] + if lines: + self._bufferLine = line + return self._createEntry("\n".join(lines)) + + lines.append(line) + + if lines: + return self._createEntry("\n".join(lines)) + + raise StopIteration + + +class Reader: + depends = { + "yaml": "PyYAML", + "lxml": "lxml", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._yaml = YamlReader( + glos, + spellKey="romaja", + posKey="pos", + synsKey="syns", + tagsKey="tags", + ) + + def __len__(self) -> int: + return 0 + + def open(self, filename: str) -> None: + try: + from lxml import etree as ET # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + if isdir(filename): + filename = join(filename, "kedict.yml") + self._filename = filename + + self._glos.sourceLangName = "Korean" + self._glos.targetLangName = "English" + + self._glos.setDefaultDefiFormat("h") + self._yaml.open(filename) + + def close(self) -> None: + self._yaml.close() + + def __iter__(self) -> Iterator[EntryType]: + yield from self._yaml diff --git a/pyglossary/plugins/crawler_dir/__init__.py b/pyglossary/plugins/crawler_dir/__init__.py index 9c0ec0557..ae64f6e5c 100644 --- a/pyglossary/plugins/crawler_dir/__init__.py +++ b/pyglossary/plugins/crawler_dir/__init__.py @@ -1,28 +1,13 @@ # mypy: ignore-errors from __future__ import annotations -from hashlib import sha1 -from os import listdir, makedirs -from os.path import dirname, isdir, isfile, join, splitext -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - compressionOpenFunc, -) -from pyglossary.core import log from pyglossary.option import ( Option, StrOption, ) -from pyglossary.text_utils import ( - escapeNTB, - splitByBarUnescapeNTB, -) - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -56,147 +41,3 @@ comment="Compression Algorithm", ), } - - -class Writer: - _compression: str = "" - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = None - - def finish(self) -> None: - pass - - def open(self, filename: str) -> None: - self._filename = filename - if not isdir(filename): - makedirs(filename) - - @staticmethod - def filePathFromWord(b_word: bytes) -> str: - bw = b_word.lower() - if len(bw) <= 2: - return bw.hex() - if len(bw) <= 4: - return join( - bw[:2].hex() + ".d", - bw[2:].hex(), - ) - return join( - bw[:2].hex() + ".d", - bw[2:4].hex() + ".d", - bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], # noqa: S324 - ) - - def write(self) -> None: - from pyglossary.json_utils import dataToPrettyJson - - filename = self._filename - - wordCount = 0 - compression = self._compression - c_open = compressionOpenFunc(compression) - if not c_open: - raise ValueError(f"invalid compression {compression!r}") - while True: - entry = yield - if entry is None: - break - if entry.isData(): - continue - fpath = join(filename, self.filePathFromWord(entry.b_word)) - if compression: - fpath = f"{fpath}.{compression}" - parentDir = dirname(fpath) - if not isdir(parentDir): - 
makedirs(parentDir) - if isfile(fpath): - log.warning(f"file exists: {fpath}") - fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" # noqa: S324 - with c_open(fpath, "wt", encoding="utf-8") as _file: - _file.write( - f"{escapeNTB(entry.s_word)}\n{entry.defi}", - ) - wordCount += 1 - - with open( - join(filename, "info.json"), - mode="w", - encoding="utf-8", - ) as infoFile: - info = {} - info["name"] = self._glos.getInfo("name") - info["wordCount"] = wordCount - info |= self._glos.getExtraInfos(["name", "wordCount"]) - - infoFile.write(dataToPrettyJson(info)) - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = None - self._wordCount = 0 - - def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToData - - self._filename = filename - - with open(join(filename, "info.json"), encoding="utf-8") as infoFp: - info = jsonToData(infoFp.read()) - self._wordCount = info.pop("wordCount") - for key, value in info.items(): - self._glos.setInfo(key, value) - - def close(self) -> None: - pass - - def __len__(self) -> int: - return self._wordCount - - def _fromFile(self, fpath: str) -> EntryType: - _, ext = splitext(fpath) - c_open = compressionOpenFunc(ext.lstrip(".")) - if not c_open: - log.error(f"invalid extension {ext}") - c_open = open - with c_open(fpath, "rt", encoding="utf-8") as _file: - words = splitByBarUnescapeNTB(_file.readline().rstrip("\n")) - defi = _file.read() - return self._glos.newEntry(words, defi) - - @staticmethod - def _listdirSortKey(name: str) -> str: - name_nox, ext = splitext(name) - if ext == ".d": - return name - return name_nox - - def _readDir( - self, - dpath: str, - exclude: set[str] | None, - ) -> Generator[EntryType, None, None]: - children = listdir(dpath) - if exclude: - children = [name for name in children if name not in exclude] - children.sort(key=self._listdirSortKey) - for name in children: - cpath = join(dpath, name) - if isfile(cpath): - yield self._fromFile(cpath) - continue - if isdir(cpath): - yield from self._readDir(cpath, None) - continue - log.error(f"Not a file nor a directory: {cpath}") - - def __iter__(self) -> Iterator[EntryType]: - yield from self._readDir( - self._filename, - { - "info.json", - }, - ) diff --git a/pyglossary/plugins/crawler_dir/reader.py b/pyglossary/plugins/crawler_dir/reader.py new file mode 100644 index 000000000..9bb6b0369 --- /dev/null +++ b/pyglossary/plugins/crawler_dir/reader.py @@ -0,0 +1,88 @@ +# mypy: ignore-errors +from __future__ import annotations + +from os import listdir +from os.path import isdir, isfile, join, splitext +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + compressionOpenFunc, +) +from pyglossary.core import log +from pyglossary.text_utils import ( + splitByBarUnescapeNTB, +) + +if TYPE_CHECKING: + from collections.abc import Generator, Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = None + self._wordCount = 0 + + def open(self, filename: str) -> None: + from pyglossary.json_utils import jsonToData + + self._filename = filename + + with open(join(filename, "info.json"), encoding="utf-8") as infoFp: + info = jsonToData(infoFp.read()) + self._wordCount = info.pop("wordCount") + for key, value in info.items(): + self._glos.setInfo(key, value) + + def close(self) -> None: + pass + + def __len__(self) -> int: + return self._wordCount + + def _fromFile(self, fpath: str) 
-> EntryType: + _, ext = splitext(fpath) + c_open = compressionOpenFunc(ext.lstrip(".")) + if not c_open: + log.error(f"invalid extension {ext}") + c_open = open + with c_open(fpath, "rt", encoding="utf-8") as _file: + words = splitByBarUnescapeNTB(_file.readline().rstrip("\n")) + defi = _file.read() + return self._glos.newEntry(words, defi) + + @staticmethod + def _listdirSortKey(name: str) -> str: + name_nox, ext = splitext(name) + if ext == ".d": + return name + return name_nox + + def _readDir( + self, + dpath: str, + exclude: set[str] | None, + ) -> Generator[EntryType, None, None]: + children = listdir(dpath) + if exclude: + children = [name for name in children if name not in exclude] + children.sort(key=self._listdirSortKey) + for name in children: + cpath = join(dpath, name) + if isfile(cpath): + yield self._fromFile(cpath) + continue + if isdir(cpath): + yield from self._readDir(cpath, None) + continue + log.error(f"Not a file nor a directory: {cpath}") + + def __iter__(self) -> Iterator[EntryType]: + yield from self._readDir( + self._filename, + { + "info.json", + }, + ) diff --git a/pyglossary/plugins/crawler_dir/writer.py b/pyglossary/plugins/crawler_dir/writer.py new file mode 100644 index 000000000..6171a341e --- /dev/null +++ b/pyglossary/plugins/crawler_dir/writer.py @@ -0,0 +1,93 @@ +# mypy: ignore-errors +from __future__ import annotations + +from hashlib import sha1 +from os import makedirs +from os.path import dirname, isdir, isfile, join +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + compressionOpenFunc, +) +from pyglossary.core import log +from pyglossary.text_utils import ( + escapeNTB, +) + +if TYPE_CHECKING: + from pyglossary.glossary_types import GlossaryType + + +class Writer: + _compression: str = "" + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = None + + def finish(self) -> None: + pass + + def open(self, filename: str) -> None: + self._filename = filename + if not isdir(filename): + makedirs(filename) + + @staticmethod + def filePathFromWord(b_word: bytes) -> str: + bw = b_word.lower() + if len(bw) <= 2: + return bw.hex() + if len(bw) <= 4: + return join( + bw[:2].hex() + ".d", + bw[2:].hex(), + ) + return join( + bw[:2].hex() + ".d", + bw[2:4].hex() + ".d", + bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], # noqa: S324 + ) + + def write(self) -> None: + from pyglossary.json_utils import dataToPrettyJson + + filename = self._filename + + wordCount = 0 + compression = self._compression + c_open = compressionOpenFunc(compression) + if not c_open: + raise ValueError(f"invalid compression {compression!r}") + while True: + entry = yield + if entry is None: + break + if entry.isData(): + continue + fpath = join(filename, self.filePathFromWord(entry.b_word)) + if compression: + fpath = f"{fpath}.{compression}" + parentDir = dirname(fpath) + if not isdir(parentDir): + makedirs(parentDir) + if isfile(fpath): + log.warning(f"file exists: {fpath}") + fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" # noqa: S324 + with c_open(fpath, "wt", encoding="utf-8") as _file: + _file.write( + f"{escapeNTB(entry.s_word)}\n{entry.defi}", + ) + wordCount += 1 + + with open( + join(filename, "info.json"), + mode="w", + encoding="utf-8", + ) as infoFile: + info = {} + info["name"] = self._glos.getInfo("name") + info["wordCount"] = wordCount + info |= self._glos.getExtraInfos(["name", "wordCount"]) + + infoFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/csv_plugin/__init__.py 
b/pyglossary/plugins/csv_plugin/__init__.py index 1f9aebb29..36916b243 100644 --- a/pyglossary/plugins/csv_plugin/__init__.py +++ b/pyglossary/plugins/csv_plugin/__init__.py @@ -20,16 +20,7 @@ from __future__ import annotations import csv -import os -from os.path import isdir, join -from typing import TYPE_CHECKING, cast -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import log -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, EncodingOption, @@ -37,11 +28,8 @@ Option, ) -if TYPE_CHECKING: - import io - from collections.abc import Generator, Iterable, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -94,231 +82,3 @@ } csv.field_size_limit(0x7FFFFFFF) - - -class Reader: - compressions = stdCompressions - - _encoding: str = "utf-8" - _newline: str = "\n" - _delimiter: str = "," - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self.clear() - - def clear(self) -> None: - self._filename = "" - self._file: io.TextIOBase = nullTextIO - self._fileSize = 0 - self._leadingLinesCount = 0 - self._wordCount: int | None = None - self._pos = -1 - self._csvReader: Iterable[list[str]] | None = None - self._resDir = "" - self._resFileNames: list[str] = [] - self._bufferRow: list[str] | None = None - - def open( - self, - filename: str, - ) -> None: - from pyglossary.text_reader import TextFilePosWrapper - - self._filename = filename - cfile = cast( - "io.TextIOBase", - compressionOpen( - filename, - mode="rt", - encoding=self._encoding, - newline=self._newline, - ), - ) - - if self._glos.progressbar: - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - # self._glos.setInfo("input_file_size", f"{self._fileSize}") - else: - log.warning("CSV Reader: file is not seekable") - - self._file = TextFilePosWrapper(cfile, self._encoding) - self._csvReader = csv.reader( - self._file, - dialect="excel", - delimiter=self._delimiter, - ) - self._resDir = filename + "_res" - if isdir(self._resDir): - self._resFileNames = os.listdir(self._resDir) - else: - self._resDir = "" - self._resFileNames = [] - for row in self._csvReader: - if not row: - continue - if not row[0].startswith("#"): - self._bufferRow = row - break - if len(row) < 2: - log.error(f"invalid row: {row}") - continue - self._glos.setInfo(row[0].lstrip("#"), row[1]) - - def close(self) -> None: - if self._file: - try: - self._file.close() - except Exception: - log.exception("error while closing csv file") - self.clear() - - def __len__(self) -> int: - from pyglossary.file_utils import fileCountLines - - if self._wordCount is None: - if hasattr(self._file, "compression"): - return 0 - log.debug("Try not to use len(reader) as it takes extra time") - self._wordCount = fileCountLines(self._filename) - self._leadingLinesCount - return self._wordCount + len(self._resFileNames) - - def _iterRows(self) -> Iterator[list[str]]: - if self._csvReader is None: - raise RuntimeError("self._csvReader is None") - if self._bufferRow: - yield self._bufferRow - yield from self._csvReader - - def _processRow(self, row: list[str]) -> EntryType | None: - if not row: - return None - - word: str | list[str] - try: - word = row[0] - defi = row[1] - except IndexError: - log.error(f"invalid row: {row!r}") - return None - - try: - alts = row[2].split(",") - except IndexError: - pass - else: - word = [word] + alts - - return 
self._glos.newEntry( - word, - defi, - byteProgress=( - (self._file.tell(), self._fileSize) if self._fileSize else None - ), - ) - - def __iter__(self) -> Iterator[EntryType | None]: - if not self._csvReader: - raise RuntimeError("iterating over a reader while it's not open") - - wordCount = 0 - for row in self._iterRows(): - wordCount += 1 - yield self._processRow(row) - - self._wordCount = wordCount - - resDir = self._resDir - for fname in self._resFileNames: - with open(join(resDir, fname), "rb") as _file: - yield self._glos.newDataEntry( - fname, - _file.read(), - ) - - -class Writer: - compressions = stdCompressions - - _encoding: str = "utf-8" - _newline: str = "\n" - _resources: bool = True - _delimiter: str = "," - _add_defi_format: bool = False - _enable_info: bool = True - _word_title: bool = False - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._file: io.TextIOBase = nullTextIO - - def open(self, filename: str) -> None: - self._filename = filename - self._file = cast( - "io.TextIOBase", - compressionOpen( - filename, - mode="wt", - encoding=self._encoding, - newline=self._newline, - ), - ) - self._resDir = resDir = filename + "_res" - self._csvWriter = csv.writer( - self._file, - dialect="excel", - quoting=csv.QUOTE_ALL, # FIXME - delimiter=self._delimiter, - ) - if not isdir(resDir): - os.mkdir(resDir) - if self._enable_info: - for key, value in self._glos.iterInfo(): - self._csvWriter.writerow([f"#{key}", value]) - - def finish(self) -> None: - self._filename = "" - self._file.close() - self._file = nullTextIO - if not os.listdir(self._resDir): - os.rmdir(self._resDir) - - def write(self) -> Generator[None, EntryType, None]: - resources = self._resources - add_defi_format = self._add_defi_format - glos = self._glos - resDir = self._resDir - writer = self._csvWriter - word_title = self._word_title - while True: - entry = yield - if entry is None: - break - if entry.isData(): - if resources: - entry.save(resDir) - continue - - words = entry.l_word - if not words: - continue - word, alts = words[0], words[1:] - defi = entry.defi - - if word_title: - defi = glos.wordTitleStr(words[0]) + defi - - row = [ - word, - defi, - ] - if add_defi_format: - entry.detectDefiFormat() - row.append(entry.defiFormat) - if alts: - row.append(",".join(alts)) - - writer.writerow(row) diff --git a/pyglossary/plugins/csv_plugin/reader.py b/pyglossary/plugins/csv_plugin/reader.py new file mode 100644 index 000000000..8087e9e92 --- /dev/null +++ b/pyglossary/plugins/csv_plugin/reader.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2013-2019 Saeed Rasooli (ilius) +# This file is part of PyGlossary project, https://github.com/ilius/pyglossary +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL +# If not, see . 
+ +from __future__ import annotations + +import csv +import os +from os.path import isdir, join +from typing import TYPE_CHECKING, cast + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import log +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Iterable, Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + compressions = stdCompressions + + _encoding: str = "utf-8" + _newline: str = "\n" + _delimiter: str = "," + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.clear() + + def clear(self) -> None: + self._filename = "" + self._file: io.TextIOBase = nullTextIO + self._fileSize = 0 + self._leadingLinesCount = 0 + self._wordCount: int | None = None + self._pos = -1 + self._csvReader: Iterable[list[str]] | None = None + self._resDir = "" + self._resFileNames: list[str] = [] + self._bufferRow: list[str] | None = None + + def open( + self, + filename: str, + ) -> None: + from pyglossary.text_reader import TextFilePosWrapper + + self._filename = filename + cfile = cast( + "io.TextIOBase", + compressionOpen( + filename, + mode="rt", + encoding=self._encoding, + newline=self._newline, + ), + ) + + if self._glos.progressbar: + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + # self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("CSV Reader: file is not seekable") + + self._file = TextFilePosWrapper(cfile, self._encoding) + self._csvReader = csv.reader( + self._file, + dialect="excel", + delimiter=self._delimiter, + ) + self._resDir = filename + "_res" + if isdir(self._resDir): + self._resFileNames = os.listdir(self._resDir) + else: + self._resDir = "" + self._resFileNames = [] + for row in self._csvReader: + if not row: + continue + if not row[0].startswith("#"): + self._bufferRow = row + break + if len(row) < 2: + log.error(f"invalid row: {row}") + continue + self._glos.setInfo(row[0].lstrip("#"), row[1]) + + def close(self) -> None: + if self._file: + try: + self._file.close() + except Exception: + log.exception("error while closing csv file") + self.clear() + + def __len__(self) -> int: + from pyglossary.file_utils import fileCountLines + + if self._wordCount is None: + if hasattr(self._file, "compression"): + return 0 + log.debug("Try not to use len(reader) as it takes extra time") + self._wordCount = fileCountLines(self._filename) - self._leadingLinesCount + return self._wordCount + len(self._resFileNames) + + def _iterRows(self) -> Iterator[list[str]]: + if self._csvReader is None: + raise RuntimeError("self._csvReader is None") + if self._bufferRow: + yield self._bufferRow + yield from self._csvReader + + def _processRow(self, row: list[str]) -> EntryType | None: + if not row: + return None + + word: str | list[str] + try: + word = row[0] + defi = row[1] + except IndexError: + log.error(f"invalid row: {row!r}") + return None + + try: + alts = row[2].split(",") + except IndexError: + pass + else: + word = [word] + alts + + return self._glos.newEntry( + word, + defi, + byteProgress=( + (self._file.tell(), self._fileSize) if self._fileSize else None + ), + ) + + def __iter__(self) -> Iterator[EntryType | None]: + if not self._csvReader: + raise RuntimeError("iterating over a reader while it's not open") + + wordCount = 0 + for row in self._iterRows(): + wordCount += 1 + yield self._processRow(row) + + self._wordCount = wordCount + + resDir = 
self._resDir + for fname in self._resFileNames: + with open(join(resDir, fname), "rb") as _file: + yield self._glos.newDataEntry( + fname, + _file.read(), + ) diff --git a/pyglossary/plugins/csv_plugin/writer.py b/pyglossary/plugins/csv_plugin/writer.py new file mode 100644 index 000000000..ff1c42920 --- /dev/null +++ b/pyglossary/plugins/csv_plugin/writer.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2013-2019 Saeed Rasooli (ilius) +# This file is part of PyGlossary project, https://github.com/ilius/pyglossary +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL +# If not, see . + +from __future__ import annotations + +import csv +import os +from os.path import isdir +from typing import TYPE_CHECKING, cast + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + compressions = stdCompressions + + _encoding: str = "utf-8" + _newline: str = "\n" + _resources: bool = True + _delimiter: str = "," + _add_defi_format: bool = False + _enable_info: bool = True + _word_title: bool = False + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._file: io.TextIOBase = nullTextIO + + def open(self, filename: str) -> None: + self._filename = filename + self._file = cast( + "io.TextIOBase", + compressionOpen( + filename, + mode="wt", + encoding=self._encoding, + newline=self._newline, + ), + ) + self._resDir = resDir = filename + "_res" + self._csvWriter = csv.writer( + self._file, + dialect="excel", + quoting=csv.QUOTE_ALL, # FIXME + delimiter=self._delimiter, + ) + if not isdir(resDir): + os.mkdir(resDir) + if self._enable_info: + for key, value in self._glos.iterInfo(): + self._csvWriter.writerow([f"#{key}", value]) + + def finish(self) -> None: + self._filename = "" + self._file.close() + self._file = nullTextIO + if not os.listdir(self._resDir): + os.rmdir(self._resDir) + + def write(self) -> Generator[None, EntryType, None]: + resources = self._resources + add_defi_format = self._add_defi_format + glos = self._glos + resDir = self._resDir + writer = self._csvWriter + word_title = self._word_title + while True: + entry = yield + if entry is None: + break + if entry.isData(): + if resources: + entry.save(resDir) + continue + + words = entry.l_word + if not words: + continue + word, alts = words[0], words[1:] + defi = entry.defi + + if word_title: + defi = glos.wordTitleStr(words[0]) + defi + + row = [ + word, + defi, + ] + if add_defi_format: + entry.detectDefiFormat() + row.append(entry.defiFormat) + if alts: + row.append(",".join(alts)) + + writer.writerow(row) diff --git a/pyglossary/plugins/dicformids/__init__.py b/pyglossary/plugins/dicformids/__init__.py index 625b9b7f3..8e1f4ca76 100644 --- a/pyglossary/plugins/dicformids/__init__.py +++ 
b/pyglossary/plugins/dicformids/__init__.py @@ -2,22 +2,16 @@ # mypy: ignore-errors from __future__ import annotations -import operator -import os -import re -from os.path import join from typing import TYPE_CHECKING -from pyglossary.core import log -from pyglossary.flags import ALWAYS -from pyglossary.plugins.tabfile import Reader as TabfileReader - if TYPE_CHECKING: - from collections.abc import Generator, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from pyglossary.flags import ALWAYS + +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -52,243 +46,3 @@ ) optionsProp: dict[str, Option] = {} - - -PROP_TEMPLATE = """#DictionaryForMIDs property file -infoText={name}, author: {author} -indexFileMaxSize={indexFileMaxSize}\n -language1IndexNumberOfSourceEntries={wordCount} -language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate -indexCharEncoding=ISO-8859-1 -dictionaryFileSeparationCharacter='\\t' -language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation -language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate -logLevel=0 -language1FilePostfix={directoryPostfix} -dictionaryCharEncoding=UTF-8 -numberOfAvailableLanguages=2 -language1IsSearchable=true -language2GenerateIndex=false -dictionaryFileMaxSize={dicMaxSize} -language2FilePostfix={language2FilePostfix} -searchListFileMaxSize=20000 -language2IsSearchable=false -fileEncodingFormat=plain_format1 -language1HasSeparateDictionaryFile=true -searchListCharEncoding=ISO-8859-1 -searchListFileSeparationCharacter='\t' -indexFileSeparationCharacter='\t' -language1DisplayText={sourceLang} -language2HasSeparateDictionaryFile=false -dictionaryGenerationInputCharEncoding=UTF-8 -language1GenerateIndex=true -language2DisplayText={targetLang} -language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng -""" - - -class Reader: - re_number = re.compile(r"\d+") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._tabFileNames: list[str] = [] - self._tabFileReader = None - - def open(self, dirname: str) -> None: - self._dirname = dirname - orderFileNames: list[tuple[int, str]] = [] - for fname in os.listdir(dirname): - if not fname.startswith("directory"): - continue - try: - num = self.re_number.findall(fname)[-1] - except IndexError: - pass - else: - orderFileNames.append((num, fname)) - orderFileNames.sort( - key=operator.itemgetter(0), - reverse=True, - ) - self._tabFileNames = [x[1] for x in orderFileNames] - self.nextTabFile() - - def __len__(self) -> int: - raise NotImplementedError # FIXME - - def __iter__(self) -> Iterator[EntryType]: - return self - - def __next__(self) -> EntryType: - for _ in range(10): - try: - return next(self._tabFileReader) - except StopIteration: # noqa: PERF203 - self._tabFileReader.close() - self.nextTabFile() - return None - - def nextTabFile(self) -> None: - try: - tabFileName = self._tabFileNames.pop() - except IndexError: - raise StopIteration from None - self._tabFileReader = TabfileReader(self._glos, hasInfo=False) - self._tabFileReader.open(join(self._dirname, tabFileName), newline="\n") - - def close(self) -> None: - if self._tabFileReader: - try: - self._tabFileReader.close() - except Exception: - pass # noqa: S110 - self._tabFileReader = None - self._tabFileNames = [] - - -class Writer: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - 
self.linesPerDirectoryFile = 500 # 200 - self.indexFileMaxSize = 32722 # 30000 - self.directoryPostfix = "" - self.indexPostfix = "" - self._dirname = "" - # looks like we need to remove tabs, because app gives error - # but based on the java code, all punctuations should be removed - # as well, including '|' - self.re_punc = re.compile( - r"""[!"$§%&/()=?´`\\{}\[\]^°+*~#'\-_.:,;<>@|]*""", # noqa: RUF001 - ) - self.re_spaces = re.compile(" +") - self.re_tabs = re.compile("\t+") - - def normateWord(self, word: str) -> str: - word = word.strip() - word = self.re_punc.sub("", word) - word = self.re_spaces.sub(" ", word) - word = self.re_tabs.sub(" ", word) - word = word.lower() - return word # noqa: RET504 - - def writeProbs(self) -> None: - glos = self._glos - probsPath = join( - self._dirname, - "DictionaryForMIDs.properties", - ) - with open(probsPath, mode="w", newline="\n", encoding="utf-8") as fileObj: - fileObj.write( - PROP_TEMPLATE.format( - name=glos.getInfo("name"), - author=glos.author, - indexFileMaxSize=self.indexFileMaxSize, - wordCount=self.wordCount, - directoryPostfix=self.directoryPostfix, - dicMaxSize=self.dicMaxSize + 1, - language2FilePostfix="fa", # FIXME - sourceLang=glos.sourceLangName, - targetLang=glos.targetLangName, - ), - ) - - def nextIndex(self) -> None: - try: - self.indexFp.close() - except AttributeError: - self.indexIndex = 0 - - self.indexIndex += 1 - fname = f"index{self.indexPostfix}{self.indexIndex}.csv" - fpath = join(self._dirname, fname) - self.indexFp = open(fpath, mode="w", encoding="utf-8", newline="\n") - - def finish(self) -> None: - pass - - def open(self, dirname: str) -> None: - self._dirname = dirname - if not os.path.isdir(dirname): - os.mkdir(dirname) - - def write(self) -> Generator[None, EntryType, None]: - self.nextIndex() - - dicMaxSize = 0 - indexData: list[tuple[str, int, int]] = [] - - def writeBucket(dicIndex: int, entryList: list[EntryType]) -> None: - nonlocal dicMaxSize - log.debug( - f"{dicIndex=}, {len(entryList)=}, {dicMaxSize=}", - ) - dicFp = open( - join( - self._dirname, - f"directory{self.directoryPostfix}{dicIndex + 1}.csv", - ), - mode="w", - encoding="utf-8", - newline="\n", - ) - for entry in entryList: - word = entry.s_word - n_word = self.normateWord(word) - defi = entry.defi - dicLine = word + "\t" + defi + "\n" - dicPos = dicFp.tell() - dicFp.write(dicLine) - indexData.append((n_word, dicIndex + 1, dicPos)) - - dicMaxSize = max(dicMaxSize, dicFp.tell()) - dicFp.close() - - bucketSize = self.linesPerDirectoryFile - wordCount = 0 - dicIndex = 0 - entryList: list[EntryType] = [] # aka bucket - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # FIXME - continue - wordCount += 1 - entryList.append(entry) - if len(entryList) >= bucketSize: - writeBucket(dicIndex, entryList) - dicIndex += 1 - entryList = [] - - if entryList: - writeBucket(dicIndex, entryList) - entryList = [] - - self.dicMaxSize = dicMaxSize - self.wordCount = wordCount - - langSearchListFp = open( - join( - self._dirname, - f"searchlist{self.directoryPostfix}.csv", - ), - mode="w", - newline="\n", - encoding="utf-8", - ) - - langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n") - - for word, dicIndex, dicPos in indexData: - indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n" - if (self.indexFp.tell() + len(indexLine)) > self.indexFileMaxSize - 10: - self.nextIndex() - langSearchListFp.write(f"{word}\t{self.indexIndex}\n") - self.indexFp.write(indexLine) - - self.indexFp.close() - langSearchListFp.close() - - 
self.writeProbs() diff --git a/pyglossary/plugins/dicformids/reader.py b/pyglossary/plugins/dicformids/reader.py new file mode 100644 index 000000000..9ae2bd1a8 --- /dev/null +++ b/pyglossary/plugins/dicformids/reader.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +from __future__ import annotations + +import operator +import os +import re +from os.path import join +from typing import TYPE_CHECKING + +from pyglossary.plugins.tabfile import Reader as TabfileReader + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + re_number = re.compile(r"\d+") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._tabFileNames: list[str] = [] + self._tabFileReader = None + + def open(self, dirname: str) -> None: + self._dirname = dirname + orderFileNames: list[tuple[int, str]] = [] + for fname in os.listdir(dirname): + if not fname.startswith("directory"): + continue + try: + num = self.re_number.findall(fname)[-1] + except IndexError: + pass + else: + orderFileNames.append((num, fname)) + orderFileNames.sort( + key=operator.itemgetter(0), + reverse=True, + ) + self._tabFileNames = [x[1] for x in orderFileNames] + self.nextTabFile() + + def __len__(self) -> int: + raise NotImplementedError # FIXME + + def __iter__(self) -> Iterator[EntryType]: + return self + + def __next__(self) -> EntryType: + for _ in range(10): + try: + return next(self._tabFileReader) + except StopIteration: # noqa: PERF203 + self._tabFileReader.close() + self.nextTabFile() + return None + + def nextTabFile(self) -> None: + try: + tabFileName = self._tabFileNames.pop() + except IndexError: + raise StopIteration from None + self._tabFileReader = TabfileReader(self._glos, hasInfo=False) + self._tabFileReader.open(join(self._dirname, tabFileName), newline="\n") + + def close(self) -> None: + if self._tabFileReader: + try: + self._tabFileReader.close() + except Exception: + pass # noqa: S110 + self._tabFileReader = None + self._tabFileNames = [] diff --git a/pyglossary/plugins/dicformids/writer.py b/pyglossary/plugins/dicformids/writer.py new file mode 100644 index 000000000..44dc07ebd --- /dev/null +++ b/pyglossary/plugins/dicformids/writer.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +from __future__ import annotations + +import os +import re +from os.path import join +from typing import TYPE_CHECKING + +from pyglossary.core import log + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +PROP_TEMPLATE = """#DictionaryForMIDs property file +infoText={name}, author: {author} +indexFileMaxSize={indexFileMaxSize}\n +language1IndexNumberOfSourceEntries={wordCount} +language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate +indexCharEncoding=ISO-8859-1 +dictionaryFileSeparationCharacter='\\t' +language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation +language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate +logLevel=0 +language1FilePostfix={directoryPostfix} +dictionaryCharEncoding=UTF-8 +numberOfAvailableLanguages=2 +language1IsSearchable=true +language2GenerateIndex=false +dictionaryFileMaxSize={dicMaxSize} +language2FilePostfix={language2FilePostfix} +searchListFileMaxSize=20000 +language2IsSearchable=false +fileEncodingFormat=plain_format1 +language1HasSeparateDictionaryFile=true +searchListCharEncoding=ISO-8859-1 
+searchListFileSeparationCharacter='\t' +indexFileSeparationCharacter='\t' +language1DisplayText={sourceLang} +language2HasSeparateDictionaryFile=false +dictionaryGenerationInputCharEncoding=UTF-8 +language1GenerateIndex=true +language2DisplayText={targetLang} +language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng +""" + + +class Writer: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.linesPerDirectoryFile = 500 # 200 + self.indexFileMaxSize = 32722 # 30000 + self.directoryPostfix = "" + self.indexPostfix = "" + self._dirname = "" + # looks like we need to remove tabs, because app gives error + # but based on the java code, all punctuations should be removed + # as well, including '|' + self.re_punc = re.compile( + r"""[!"$§%&/()=?´`\\{}\[\]^°+*~#'\-_.:,;<>@|]*""", # noqa: RUF001 + ) + self.re_spaces = re.compile(" +") + self.re_tabs = re.compile("\t+") + + def normateWord(self, word: str) -> str: + word = word.strip() + word = self.re_punc.sub("", word) + word = self.re_spaces.sub(" ", word) + word = self.re_tabs.sub(" ", word) + word = word.lower() + return word # noqa: RET504 + + def writeProbs(self) -> None: + glos = self._glos + probsPath = join( + self._dirname, + "DictionaryForMIDs.properties", + ) + with open(probsPath, mode="w", newline="\n", encoding="utf-8") as fileObj: + fileObj.write( + PROP_TEMPLATE.format( + name=glos.getInfo("name"), + author=glos.author, + indexFileMaxSize=self.indexFileMaxSize, + wordCount=self.wordCount, + directoryPostfix=self.directoryPostfix, + dicMaxSize=self.dicMaxSize + 1, + language2FilePostfix="fa", # FIXME + sourceLang=glos.sourceLangName, + targetLang=glos.targetLangName, + ), + ) + + def nextIndex(self) -> None: + try: + self.indexFp.close() + except AttributeError: + self.indexIndex = 0 + + self.indexIndex += 1 + fname = f"index{self.indexPostfix}{self.indexIndex}.csv" + fpath = join(self._dirname, fname) + self.indexFp = open(fpath, mode="w", encoding="utf-8", newline="\n") + + def finish(self) -> None: + pass + + def open(self, dirname: str) -> None: + self._dirname = dirname + if not os.path.isdir(dirname): + os.mkdir(dirname) + + def write(self) -> Generator[None, EntryType, None]: + self.nextIndex() + + dicMaxSize = 0 + indexData: list[tuple[str, int, int]] = [] + + def writeBucket(dicIndex: int, entryList: list[EntryType]) -> None: + nonlocal dicMaxSize + log.debug( + f"{dicIndex=}, {len(entryList)=}, {dicMaxSize=}", + ) + dicFp = open( + join( + self._dirname, + f"directory{self.directoryPostfix}{dicIndex + 1}.csv", + ), + mode="w", + encoding="utf-8", + newline="\n", + ) + for entry in entryList: + word = entry.s_word + n_word = self.normateWord(word) + defi = entry.defi + dicLine = word + "\t" + defi + "\n" + dicPos = dicFp.tell() + dicFp.write(dicLine) + indexData.append((n_word, dicIndex + 1, dicPos)) + + dicMaxSize = max(dicMaxSize, dicFp.tell()) + dicFp.close() + + bucketSize = self.linesPerDirectoryFile + wordCount = 0 + dicIndex = 0 + entryList: list[EntryType] = [] # aka bucket + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # FIXME + continue + wordCount += 1 + entryList.append(entry) + if len(entryList) >= bucketSize: + writeBucket(dicIndex, entryList) + dicIndex += 1 + entryList = [] + + if entryList: + writeBucket(dicIndex, entryList) + entryList = [] + + self.dicMaxSize = dicMaxSize + self.wordCount = wordCount + + langSearchListFp = open( + join( + self._dirname, + f"searchlist{self.directoryPostfix}.csv", + ), + mode="w", + 
newline="\n", + encoding="utf-8", + ) + + langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n") + + for word, dicIndex, dicPos in indexData: + indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n" + if (self.indexFp.tell() + len(indexLine)) > self.indexFileMaxSize - 10: + self.nextIndex() + langSearchListFp.write(f"{word}\t{self.indexIndex}\n") + self.indexFp.write(indexLine) + + self.indexFp.close() + langSearchListFp.close() + + self.writeProbs() diff --git a/pyglossary/plugins/dict_cc/__init__.py b/pyglossary/plugins/dict_cc/__init__.py index 9105a963e..c75ec3d64 100644 --- a/pyglossary/plugins/dict_cc/__init__.py +++ b/pyglossary/plugins/dict_cc/__init__.py @@ -1,20 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html -from operator import itemgetter -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Callable, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element, T_htmlfile from pyglossary.option import Option -from pyglossary.core import log +from .reader import Reader __all__ = [ "Reader", @@ -45,192 +38,3 @@ "dict.cc dictionary - Google Play", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "select count(distinct term1)+count(distinct term2) from main_ft", - ) - return self._cur.fetchone()[0] - - @staticmethod - def makeList( - hf: T_htmlfile, - input_elements: list[Element], - processor: Callable, - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into
<ol>
        if more than one element.""" - if not input_elements: - return - - if skip_single and len(input_elements) == 1: - hf.write(single_prefix) - processor(hf, input_elements[0]) - return - - with hf.element("ol"): - for el in input_elements: - with hf.element("li"): - processor(hf, el) - - @staticmethod - def makeGroupsList( - hf: T_htmlfile, - groups: list[tuple[str, str]], - processor: Callable[[T_htmlfile, tuple[str, str]], None], - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into
<ol>
          if more than one element.""" - if not groups: - return - - if skip_single and len(groups) == 1: - hf.write(single_prefix) - processor(hf, groups[0]) - return - - with hf.element("ol"): - for el in groups: - with hf.element("li"): - processor(hf, el) - - def writeSense( # noqa: PLR6301 - self, - hf: T_htmlfile, - row: tuple[str, str], - ) -> None: - from lxml import etree as ET - - trans, entry_type = row - if entry_type: - with hf.element("i"): - hf.write(f"{entry_type}") # noqa: FURB183 - hf.write(ET.Element("br")) - try: - hf.write(trans + " ") - except Exception as e: - log.error(f"error in writing {trans!r}, {e}") - hf.write(repr(trans) + " ") - else: - with hf.element("big"): - with hf.element("a", href=f"bword://{trans}"): - hf.write("⏎") - - def iterRows( - self, - column1: str, - column2: str, - ) -> Iterator[tuple[str, str, str]]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - f"select {column1}, {column2}, entry_type from main_ft" - f" order by {column1}", - ) - for row in self._cur.fetchall(): - term1 = row[0] - term2 = row[1] - try: - term1 = html.unescape(term1) - except Exception as e: - log.error(f"html.unescape({term1!r}) -> {e}") - try: - term2 = html.unescape(term2) - except Exception as e: - log.error(f"html.unescape({term2!r}) -> {e}") - yield term1, term2, row[2] - - def parseGender(self, headword: str) -> tuple[str | None, str]: # noqa: PLR6301 - # {m} masc masculine German: maskulin - # {f} fem feminine German: feminin - # {n} neut neutral German: neutral - # { } ???? - i = headword.find(" {") - if i <= 0: - return None, headword - if len(headword) < i + 4: - return None, headword - if headword[i + 3] != "}": - return None, headword - g = headword[i + 2] - gender = None - if g == "m": - gender = "masculine" - elif g == "f": - gender = "feminine" - elif g == "n": - gender = "neutral" - else: - log.warning(f"invalid gender {g!r}") - return None, headword - headword = headword[:i] + headword[i + 4 :] - return gender, headword - - def _iterOneDirection( - self, - column1: str, - column2: str, - ) -> Iterator[EntryType]: - from io import BytesIO - from itertools import groupby - - from lxml import etree as ET - - glos = self._glos - for headwordEscaped, groupsOrig in groupby( - self.iterRows(column1, column2), - key=itemgetter(0), - ): - headword = html.unescape(headwordEscaped) - groups: list[tuple[str, str]] = [ - (term2, entry_type) for _, term2, entry_type in groupsOrig - ] - f = BytesIO() - gender, headword = self.parseGender(headword) - with ET.htmlfile(f, encoding="utf-8") as hf: - with hf.element("div"): - if gender: - with hf.element("i"): - hf.write(gender) - hf.write(ET.Element("br")) - self.makeGroupsList( - cast("T_htmlfile", hf), - groups, - self.writeSense, - ) - defi = f.getvalue().decode("utf-8") - yield glos.newEntry(headword, defi, defiFormat="h") - - def __iter__(self) -> Iterator[EntryType]: - yield from self._iterOneDirection("term1", "term2") - yield from self._iterOneDirection("term2", "term1") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/dict_cc/reader.py b/pyglossary/plugins/dict_cc/reader.py new file mode 100644 index 000000000..e6615604a --- /dev/null +++ b/pyglossary/plugins/dict_cc/reader.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from operator import itemgetter +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import sqlite3 + 
from collections.abc import Callable, Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element, T_htmlfile + + +from pyglossary.core import log + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "select count(distinct term1)+count(distinct term2) from main_ft", + ) + return self._cur.fetchone()[0] + + @staticmethod + def makeList( + hf: T_htmlfile, + input_elements: list[Element], + processor: Callable, + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into
<ol>
            if more than one element.""" + if not input_elements: + return + + if skip_single and len(input_elements) == 1: + hf.write(single_prefix) + processor(hf, input_elements[0]) + return + + with hf.element("ol"): + for el in input_elements: + with hf.element("li"): + processor(hf, el) + + @staticmethod + def makeGroupsList( + hf: T_htmlfile, + groups: list[tuple[str, str]], + processor: Callable[[T_htmlfile, tuple[str, str]], None], + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into
<ol>
              if more than one element.""" + if not groups: + return + + if skip_single and len(groups) == 1: + hf.write(single_prefix) + processor(hf, groups[0]) + return + + with hf.element("ol"): + for el in groups: + with hf.element("li"): + processor(hf, el) + + def writeSense( # noqa: PLR6301 + self, + hf: T_htmlfile, + row: tuple[str, str], + ) -> None: + from lxml import etree as ET + + trans, entry_type = row + if entry_type: + with hf.element("i"): + hf.write(f"{entry_type}") # noqa: FURB183 + hf.write(ET.Element("br")) + try: + hf.write(trans + " ") + except Exception as e: + log.error(f"error in writing {trans!r}, {e}") + hf.write(repr(trans) + " ") + else: + with hf.element("big"): + with hf.element("a", href=f"bword://{trans}"): + hf.write("⏎") + + def iterRows( + self, + column1: str, + column2: str, + ) -> Iterator[tuple[str, str, str]]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + f"select {column1}, {column2}, entry_type from main_ft" + f" order by {column1}", + ) + for row in self._cur.fetchall(): + term1 = row[0] + term2 = row[1] + try: + term1 = html.unescape(term1) + except Exception as e: + log.error(f"html.unescape({term1!r}) -> {e}") + try: + term2 = html.unescape(term2) + except Exception as e: + log.error(f"html.unescape({term2!r}) -> {e}") + yield term1, term2, row[2] + + def parseGender(self, headword: str) -> tuple[str | None, str]: # noqa: PLR6301 + # {m} masc masculine German: maskulin + # {f} fem feminine German: feminin + # {n} neut neutral German: neutral + # { } ???? + i = headword.find(" {") + if i <= 0: + return None, headword + if len(headword) < i + 4: + return None, headword + if headword[i + 3] != "}": + return None, headword + g = headword[i + 2] + gender = None + if g == "m": + gender = "masculine" + elif g == "f": + gender = "feminine" + elif g == "n": + gender = "neutral" + else: + log.warning(f"invalid gender {g!r}") + return None, headword + headword = headword[:i] + headword[i + 4 :] + return gender, headword + + def _iterOneDirection( + self, + column1: str, + column2: str, + ) -> Iterator[EntryType]: + from io import BytesIO + from itertools import groupby + + from lxml import etree as ET + + glos = self._glos + for headwordEscaped, groupsOrig in groupby( + self.iterRows(column1, column2), + key=itemgetter(0), + ): + headword = html.unescape(headwordEscaped) + groups: list[tuple[str, str]] = [ + (term2, entry_type) for _, term2, entry_type in groupsOrig + ] + f = BytesIO() + gender, headword = self.parseGender(headword) + with ET.htmlfile(f, encoding="utf-8") as hf: + with hf.element("div"): + if gender: + with hf.element("i"): + hf.write(gender) + hf.write(ET.Element("br")) + self.makeGroupsList( + cast("T_htmlfile", hf), + groups, + self.writeSense, + ) + defi = f.getvalue().decode("utf-8") + yield glos.newEntry(headword, defi, defiFormat="h") + + def __iter__(self) -> Iterator[EntryType]: + yield from self._iterOneDirection("term1", "term2") + yield from self._iterOneDirection("term2", "term1") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/dict_cc_split/__init__.py b/pyglossary/plugins/dict_cc_split/__init__.py index daa096949..69fbb799c 100644 --- a/pyglossary/plugins/dict_cc_split/__init__.py +++ b/pyglossary/plugins/dict_cc_split/__init__.py @@ -1,17 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 
- from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option -from pyglossary.core import log +from .reader import Reader __all__ = [ "Reader", @@ -42,73 +37,3 @@ "dict.cc dictionary - Google Play", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("m") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) * 2 from main_ft") - return self._cur.fetchone()[0] - - def iterRows( - self, - column1: str, - column2: str, - ) -> Iterator[tuple[str, str, str]]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - f"select {column1}, {column2}, entry_type from main_ft" - f" order by {column1}", - ) - for row in self._cur.fetchall(): - term1 = row[0] - term2 = row[1] - try: - term1 = html.unescape(term1) - except Exception as e: - log.error(f"html.unescape({term1!r}) -> {e}") - try: - term2 = html.unescape(term2) - except Exception as e: - log.error(f"html.unescape({term2!r}) -> {e}") - yield term1, term2, row[2] - - def _iterOneDirection( - self, - column1: str, - column2: str, - ) -> Iterator[EntryType]: - for word, defi, entry_type in self.iterRows(column1, column2): - if entry_type: - word = f"{word} {{{entry_type}}}" # noqa: PLW2901 - yield self._glos.newEntry(word, defi, defiFormat="m") - - def __iter__(self) -> Iterator[EntryType]: - yield from self._iterOneDirection("term1", "term2") - yield from self._iterOneDirection("term2", "term1") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/dict_cc_split/reader.py b/pyglossary/plugins/dict_cc_split/reader.py new file mode 100644 index 000000000..1e5205f28 --- /dev/null +++ b/pyglossary/plugins/dict_cc_split/reader.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import log + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("m") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) * 2 from main_ft") + return self._cur.fetchone()[0] + + def iterRows( + self, + column1: str, + column2: str, + ) -> Iterator[tuple[str, str, str]]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + f"select {column1}, {column2}, entry_type from main_ft" + f" order by {column1}", + ) + for row in self._cur.fetchall(): + term1 = row[0] + 
term2 = row[1] + try: + term1 = html.unescape(term1) + except Exception as e: + log.error(f"html.unescape({term1!r}) -> {e}") + try: + term2 = html.unescape(term2) + except Exception as e: + log.error(f"html.unescape({term2!r}) -> {e}") + yield term1, term2, row[2] + + def _iterOneDirection( + self, + column1: str, + column2: str, + ) -> Iterator[EntryType]: + for word, defi, entry_type in self.iterRows(column1, column2): + if entry_type: + word = f"{word} {{{entry_type}}}" # noqa: PLW2901 + yield self._glos.newEntry(word, defi, defiFormat="m") + + def __iter__(self) -> Iterator[EntryType]: + yield from self._iterOneDirection("term1", "term2") + yield from self._iterOneDirection("term2", "term1") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/dict_org/__init__.py b/pyglossary/plugins/dict_org/__init__.py index 8331d3adb..9af2bf0b3 100644 --- a/pyglossary/plugins/dict_org/__init__.py +++ b/pyglossary/plugins/dict_org/__init__.py @@ -2,20 +2,11 @@ from __future__ import annotations -import os -import re -from os.path import isdir, splitext -from typing import TYPE_CHECKING - -from pyglossary.core import log from pyglossary.flags import DEFAULT_NO from pyglossary.option import BoolOption, Option -from pyglossary.plugin_lib.dictdlib import DictDB - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -51,146 +42,3 @@ "http://dict.org/bin/Dict", "The DICT Development Group", ) - - -def installToDictd(filename: str, dictzip: bool) -> None: - """Filename is without extension (neither .index or .dict or .dict.dz).""" - import shutil - import subprocess - - targetDir = "/usr/share/dictd/" - if filename.startswith(targetDir): - return - - if not isdir(targetDir): - log.warning(f"Directory {targetDir!r} does not exist, skipping install") - return - - log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}") - - if dictzip and os.path.isfile(filename + ".dict.dz"): - dictExt = ".dict.dz" - elif os.path.isfile(filename + ".dict"): - dictExt = ".dict" - else: - log.error(f"No .dict file, could not install dictd file {filename!r}") - return - - if not filename.startswith(targetDir): - shutil.copy(filename + ".index", targetDir) - shutil.copy(filename + dictExt, targetDir) - - # update /var/lib/dictd/db.list - if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0: - log.error( - "failed to update /var/lib/dictd/db.list file" - ", try manually running: sudo /usr/sbin/dictdconfig -w", - ) - - log.info("don't forget to restart dictd server") - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._dictdb: DictDB | None = None - - # regular expression patterns used to prettify definition text - self._re_newline_in_braces = re.compile( - r"\{(?P.*?)\n(?P.*?)?\}", - ) - self._re_words_in_braces = re.compile( - r"\{(?P.+?)\}", - ) - - def open(self, filename: str) -> None: - filename = filename.removesuffix(".index") - self._filename = filename - self._dictdb = DictDB(filename, "read", 1) - - def close(self) -> None: - if self._dictdb is not None: - self._dictdb.close() - # self._dictdb.finish() - self._dictdb = None - - def prettifyDefinitionText(self, defi: str) -> str: - # Handle words in {} - # First, we remove any \n in {} pairs - defi = 
self._re_newline_in_braces.sub(r"{\g\g}", defi) - - # Then, replace any {words} into words, - # so it can be rendered as link correctly - defi = self._re_words_in_braces.sub( - r'\g', - defi, - ) - - # Use <br/>
              so it can be rendered as newline correctly - return defi.replace("\n", "<br/>
              ") - - def __len__(self) -> int: - if self._dictdb is None: - return 0 - return len(self._dictdb) - - def __iter__(self) -> Iterator[EntryType]: - if self._dictdb is None: - raise RuntimeError("iterating over a reader while it's not open") - dictdb = self._dictdb - for word in dictdb.getDefList(): - b_defi = b"\n\n
              \n\n".join(dictdb.getDef(word)) - try: - defi = b_defi.decode("utf_8", "ignore") - defi = self.prettifyDefinitionText(defi) - except Exception as e: - log.error(f"{b_defi = }") - raise e - yield self._glos.newEntry(word, defi) - - -class Writer: - _dictzip: bool = False - _install: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._dictdb: DictDB | None = None - - def finish(self) -> None: - from pyglossary.os_utils import runDictzip - - if self._dictdb is None: - raise RuntimeError("self._dictdb is None") - - self._dictdb.finish(dosort=True) - if self._dictzip: - runDictzip(f"{self._filename}.dict") - if self._install: - installToDictd( - self._filename, - self._dictzip, - ) - self._filename = "" - - def open(self, filename: str) -> None: - filename_nox, ext = splitext(filename) - if ext.lower() == ".index": - filename = filename_nox - self._dictdb = DictDB(filename, "write", 1) - self._filename = filename - - def write(self) -> Generator[None, EntryType, None]: - dictdb = self._dictdb - if dictdb is None: - raise RuntimeError("self._dictdb is None") - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # does dictd support resources? and how? FIXME - continue - dictdb.addEntry(entry.defi, entry.l_word) diff --git a/pyglossary/plugins/dict_org/reader.py b/pyglossary/plugins/dict_org/reader.py new file mode 100644 index 000000000..71a47fc13 --- /dev/null +++ b/pyglossary/plugins/dict_org/reader.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.plugin_lib.dictdlib import DictDB + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._dictdb: DictDB | None = None + + # regular expression patterns used to prettify definition text + self._re_newline_in_braces = re.compile( + r"\{(?P.*?)\n(?P.*?)?\}", + ) + self._re_words_in_braces = re.compile( + r"\{(?P.+?)\}", + ) + + def open(self, filename: str) -> None: + filename = filename.removesuffix(".index") + self._filename = filename + self._dictdb = DictDB(filename, "read", 1) + + def close(self) -> None: + if self._dictdb is not None: + self._dictdb.close() + # self._dictdb.finish() + self._dictdb = None + + def prettifyDefinitionText(self, defi: str) -> str: + # Handle words in {} + # First, we remove any \n in {} pairs + defi = self._re_newline_in_braces.sub(r"{\g\g}", defi) + + # Then, replace any {words} into words, + # so it can be rendered as link correctly + defi = self._re_words_in_braces.sub( + r'\g', + defi, + ) + + # Use
              so it can be rendered as newline correctly + return defi.replace("\n", "<br/>
              ") + + def __len__(self) -> int: + if self._dictdb is None: + return 0 + return len(self._dictdb) + + def __iter__(self) -> Iterator[EntryType]: + if self._dictdb is None: + raise RuntimeError("iterating over a reader while it's not open") + dictdb = self._dictdb + for word in dictdb.getDefList(): + b_defi = b"\n\n
              \n\n".join(dictdb.getDef(word)) + try: + defi = b_defi.decode("utf_8", "ignore") + defi = self.prettifyDefinitionText(defi) + except Exception as e: + log.error(f"{b_defi = }") + raise e + yield self._glos.newEntry(word, defi) diff --git a/pyglossary/plugins/dict_org/writer.py b/pyglossary/plugins/dict_org/writer.py new file mode 100644 index 000000000..5cc2762e7 --- /dev/null +++ b/pyglossary/plugins/dict_org/writer.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from os.path import splitext +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.plugin_lib.dictdlib import DictDB + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def installToDictd(filename: str, dictzip: bool) -> None: + """Filename is without extension (neither .index or .dict or .dict.dz).""" + import shutil + import subprocess + from os.path import isdir, isfile + + targetDir = "/usr/share/dictd/" + if filename.startswith(targetDir): + return + + if not isdir(targetDir): + log.warning(f"Directory {targetDir!r} does not exist, skipping install") + return + + log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}") + + if dictzip and isfile(filename + ".dict.dz"): + dictExt = ".dict.dz" + elif isfile(filename + ".dict"): + dictExt = ".dict" + else: + log.error(f"No .dict file, could not install dictd file {filename!r}") + return + + if not filename.startswith(targetDir): + shutil.copy(filename + ".index", targetDir) + shutil.copy(filename + dictExt, targetDir) + + # update /var/lib/dictd/db.list + if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0: + log.error( + "failed to update /var/lib/dictd/db.list file" + ", try manually running: sudo /usr/sbin/dictdconfig -w", + ) + + log.info("don't forget to restart dictd server") + + +class Writer: + _dictzip: bool = False + _install: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._dictdb: DictDB | None = None + + def finish(self) -> None: + from pyglossary.os_utils import runDictzip + + if self._dictdb is None: + raise RuntimeError("self._dictdb is None") + + self._dictdb.finish(dosort=True) + if self._dictzip: + runDictzip(f"{self._filename}.dict") + if self._install: + installToDictd( + self._filename, + self._dictzip, + ) + self._filename = "" + + def open(self, filename: str) -> None: + filename_nox, ext = splitext(filename) + if ext.lower() == ".index": + filename = filename_nox + self._dictdb = DictDB(filename, "write", 1) + self._filename = filename + + def write(self) -> Generator[None, EntryType, None]: + dictdb = self._dictdb + if dictdb is None: + raise RuntimeError("self._dictdb is None") + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # does dictd support resources? and how? 
FIXME + continue + dictdb.addEntry(entry.defi, entry.l_word) diff --git a/pyglossary/plugins/dict_org_source/__init__.py b/pyglossary/plugins/dict_org_source/__init__.py index 5c899f1fe..9a9d63233 100644 --- a/pyglossary/plugins/dict_org_source/__init__.py +++ b/pyglossary/plugins/dict_org_source/__init__.py @@ -1,14 +1,9 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING - from pyglossary.option import BoolOption, Option -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -41,36 +36,3 @@ optionsProp: dict[str, Option] = { "remove_html_all": BoolOption(comment="Remove all HTML tags"), } - - -class Writer: - _remove_html_all: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - - def finish(self) -> None: - self._filename = "" - - def open(self, filename: str) -> None: - self._filename = filename - if self._remove_html_all: - self._glos.removeHtmlTagsAll() - # TODO: add another bool flag to only remove html tags that are not - # supported by GtkTextView - - @staticmethod - def _defiEscapeFunc(defi: str) -> str: - return defi.replace("\r", "") - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.text_writer import writeTxt - - yield from writeTxt( - self._glos, - entryFmt=":{word}:{defi}\n", - filename=self._filename, - defiEscapeFunc=self._defiEscapeFunc, - ext=".dtxt", - ) diff --git a/pyglossary/plugins/dict_org_source/writer.py b/pyglossary/plugins/dict_org_source/writer.py new file mode 100644 index 000000000..1548f5975 --- /dev/null +++ b/pyglossary/plugins/dict_org_source/writer.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + _remove_html_all: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + + def finish(self) -> None: + self._filename = "" + + def open(self, filename: str) -> None: + self._filename = filename + if self._remove_html_all: + self._glos.removeHtmlTagsAll() + # TODO: add another bool flag to only remove html tags that are not + # supported by GtkTextView + + @staticmethod + def _defiEscapeFunc(defi: str) -> str: + return defi.replace("\r", "") + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.text_writer import writeTxt + + yield from writeTxt( + self._glos, + entryFmt=":{word}:{defi}\n", + filename=self._filename, + defiEscapeFunc=self._defiEscapeFunc, + ext=".dtxt", + ) diff --git a/pyglossary/plugins/dictunformat/__init__.py b/pyglossary/plugins/dictunformat/__init__.py index a05c55459..77e5f8233 100644 --- a/pyglossary/plugins/dictunformat/__init__.py +++ b/pyglossary/plugins/dictunformat/__init__.py @@ -1,8 +1,8 @@ from __future__ import annotations -from pyglossary.core import log from pyglossary.option import EncodingOption, Option, StrOption -from pyglossary.text_reader import TextGlossaryReader + +from .reader import Reader __all__ = [ "Reader", @@ -38,89 +38,3 @@ comment="separator for headword and alternates", ), } - - -def unescapeDefi(defi: str) -> str: - return defi - - -class Reader(TextGlossaryReader): - _headword_separator = "; " - # https://github.com/cheusov/dictd/blob/master/dictfmt/dictunformat.in#L14 - - 
@classmethod - def isInfoWord(cls, word: str) -> bool: - return word.startswith("00-database-") - - @classmethod - def fixInfoWord(cls, word: str) -> str: - return word - - def setInfo(self, word: str, defi: str) -> None: - if word == "00-database-short": - self._glos.setInfo("name", defi) - return - - if word != "00-database-info": - return - - glos = self._glos - - lastKey = "" - for line in defi.split("\n"): - if not line.startswith("##:"): - if lastKey: - glos.setInfo(word, f"{glos.getInfo(lastKey)}\n{line}") - continue - - parts = line[3:].split(":") - if len(parts) < 2: - log.error(f"unexpected line: {line}") - key = lastKey = parts[0] - value = ":".join(parts[1:]) - glos.setInfo(key, value) - - def nextBlock(self) -> tuple[str | list[str], str, None] | None: - if not self._file: - raise StopIteration - word = "" - defiLines: list[str] = [] - - while True: - line = self.readline() - if not line: - break - line = line.rstrip("\n\r") - if not line: - continue - - if not line.strip("_"): - if not word: - continue - if not defiLines: - log.warning(f"no definition/value for {word!r}") - defi = unescapeDefi("\n".join(defiLines)) - words = word.split(self._headword_separator) - return words, defi, None - - if not word: - word = line - continue - - if line == word: - continue - if line.lower() == word: - word = line - continue - - defiLines.append(line) - - if word: - defi = unescapeDefi("\n".join(defiLines)) - if word.startswith("00-database-") and defi == "unknown": - log.info(f"ignoring {word} -> {defi}") - return None - words = word.split(self._headword_separator) - return words, defi, None - - raise StopIteration diff --git a/pyglossary/plugins/dictunformat/reader.py b/pyglossary/plugins/dictunformat/reader.py new file mode 100644 index 000000000..c66a0f937 --- /dev/null +++ b/pyglossary/plugins/dictunformat/reader.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from pyglossary.core import log +from pyglossary.text_reader import TextGlossaryReader + + +def unescapeDefi(defi: str) -> str: + return defi + + +class Reader(TextGlossaryReader): + _headword_separator = "; " + # https://github.com/cheusov/dictd/blob/master/dictfmt/dictunformat.in#L14 + + @classmethod + def isInfoWord(cls, word: str) -> bool: + return word.startswith("00-database-") + + @classmethod + def fixInfoWord(cls, word: str) -> str: + return word + + def setInfo(self, word: str, defi: str) -> None: + if word == "00-database-short": + self._glos.setInfo("name", defi) + return + + if word != "00-database-info": + return + + glos = self._glos + + lastKey = "" + for line in defi.split("\n"): + if not line.startswith("##:"): + if lastKey: + glos.setInfo(word, f"{glos.getInfo(lastKey)}\n{line}") + continue + + parts = line[3:].split(":") + if len(parts) < 2: + log.error(f"unexpected line: {line}") + key = lastKey = parts[0] + value = ":".join(parts[1:]) + glos.setInfo(key, value) + + def nextBlock(self) -> tuple[str | list[str], str, None] | None: + if not self._file: + raise StopIteration + word = "" + defiLines: list[str] = [] + + while True: + line = self.readline() + if not line: + break + line = line.rstrip("\n\r") + if not line: + continue + + if not line.strip("_"): + if not word: + continue + if not defiLines: + log.warning(f"no definition/value for {word!r}") + defi = unescapeDefi("\n".join(defiLines)) + words = word.split(self._headword_separator) + return words, defi, None + + if not word: + word = line + continue + + if line == word: + continue + if line.lower() == word: + word = line + continue + + 
defiLines.append(line) + + if word: + defi = unescapeDefi("\n".join(defiLines)) + if word.startswith("00-database-") and defi == "unknown": + log.info(f"ignoring {word} -> {defi}") + return None + words = word.split(self._headword_separator) + return words, defi, None + + raise StopIteration diff --git a/pyglossary/plugins/digitalnk/__init__.py b/pyglossary/plugins/digitalnk/__init__.py index cf35cef73..08c23d4eb 100644 --- a/pyglossary/plugins/digitalnk/__init__.py +++ b/pyglossary/plugins/digitalnk/__init__.py @@ -1,16 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from .reader import Reader + __all__ = [ "Reader", "description", @@ -40,51 +37,3 @@ "@digitalprk/dicrs", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("m") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) from dictionary") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "select word, definition from dictionary order by word", - ) - # iteration over self._cur stops after one entry - # and self._cur.fetchone() returns None - # no idea why! 
- # https://github.com/ilius/pyglossary/issues/282 - # for row in self._cur: - for row in self._cur.fetchall(): - word = html.unescape(row[0]) - definition = row[1] - yield self._glos.newEntry(word, definition, defiFormat="m") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/digitalnk/reader.py b/pyglossary/plugins/digitalnk/reader.py new file mode 100644 index 000000000..5eb2ba373 --- /dev/null +++ b/pyglossary/plugins/digitalnk/reader.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("m") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) from dictionary") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "select word, definition from dictionary order by word", + ) + # iteration over self._cur stops after one entry + # and self._cur.fetchone() returns None + # no idea why! + # https://github.com/ilius/pyglossary/issues/282 + # for row in self._cur: + for row in self._cur.fetchall(): + word = html.unescape(row[0]) + definition = row[1] + yield self._glos.newEntry(word, definition, defiFormat="m") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/dikt_json/__init__.py b/pyglossary/plugins/dikt_json/__init__.py index e47315cd5..39eeecf74 100644 --- a/pyglossary/plugins/dikt_json/__init__.py +++ b/pyglossary/plugins/dikt_json/__init__.py @@ -4,23 +4,13 @@ from __future__ import annotations -import re -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - # compressionOpen, - stdCompressions, -) from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -55,65 +45,3 @@ comment="add headwords title to beginning of definition", ), } - - -class Writer: - _encoding: str = "utf-8" - _enable_info: bool = True - _resources: bool = True - _word_title: bool = False - - compressions = stdCompressions - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = None - glos.preventDuplicateWords() - - def open(self, filename: str) -> None: - self._filename = filename - - def finish(self) -> None: - self._filename = None - - def write(self) -> Generator[None, EntryType, None]: - from json import dumps - - from pyglossary.text_writer import writeTxt - - glos = self._glos - encoding = self._encoding - enable_info = self._enable_info - resources = self._resources - - ensure_ascii = encoding == "ascii" - - def escape(st: str) -> str: - # 
remove styling from HTML tags - st2 = re.sub(r' style="[^"]*"', "", st) - st2 = re.sub(r' class="[^"]*"', "", st2) - st2 = re.sub(r"]*>", "", st2) - st2 = st2.replace("", "") - st2 = re.sub(r"\n", "", st2) - st2 = st2.replace("
              ", "") - st2 = st2.replace("", "") - # fix russian dictionary issues, - # such as hyphenation in word (e.g. абб{[']}а{[/']}т) - st2 = re.sub(r"\{\['\]\}", "", st2) - st2 = re.sub(r"\{\[/'\]\}", "", st2) - return dumps(st2, ensure_ascii=ensure_ascii) - - yield from writeTxt( - glos, - entryFmt="\t{word}: {defi},\n", - filename=self._filename, - encoding=encoding, - writeInfo=enable_info, - wordEscapeFunc=escape, - defiEscapeFunc=escape, - ext=".json", - head="{\n", - tail='\t"": ""\n}', - resources=resources, - word_title=self._word_title, - ) diff --git a/pyglossary/plugins/dikt_json/writer.py b/pyglossary/plugins/dikt_json/writer.py new file mode 100644 index 000000000..e7827ae4b --- /dev/null +++ b/pyglossary/plugins/dikt_json/writer.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +# from https://github.com/maxim-saplin/pyglossary + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + _encoding: str = "utf-8" + _enable_info: bool = True + _resources: bool = True + _word_title: bool = False + + compressions = stdCompressions + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = None + glos.preventDuplicateWords() + + def open(self, filename: str) -> None: + self._filename = filename + + def finish(self) -> None: + self._filename = None + + def write(self) -> Generator[None, EntryType, None]: + from json import dumps + + from pyglossary.text_writer import writeTxt + + glos = self._glos + encoding = self._encoding + enable_info = self._enable_info + resources = self._resources + + ensure_ascii = encoding == "ascii" + + def escape(st: str) -> str: + # remove styling from HTML tags + st2 = re.sub(r' style="[^"]*"', "", st) + st2 = re.sub(r' class="[^"]*"', "", st2) + st2 = re.sub(r"]*>", "", st2) + st2 = st2.replace("", "") + st2 = re.sub(r"\n", "", st2) + st2 = st2.replace("
              ", "") + st2 = st2.replace("", "") + # fix russian dictionary issues, + # such as hyphenation in word (e.g. абб{[']}а{[/']}т) + st2 = re.sub(r"\{\['\]\}", "", st2) + st2 = re.sub(r"\{\[/'\]\}", "", st2) + return dumps(st2, ensure_ascii=ensure_ascii) + + yield from writeTxt( + glos, + entryFmt="\t{word}: {defi},\n", + filename=self._filename, + encoding=encoding, + writeInfo=enable_info, + wordEscapeFunc=escape, + defiEscapeFunc=escape, + ext=".json", + head="{\n", + tail='\t"": ""\n}', + resources=resources, + word_title=self._word_title, + ) diff --git a/pyglossary/plugins/ebook_epub2/__init__.py b/pyglossary/plugins/ebook_epub2/__init__.py index 8bf34801b..baabf0036 100644 --- a/pyglossary/plugins/ebook_epub2/__init__.py +++ b/pyglossary/plugins/ebook_epub2/__init__.py @@ -1,27 +1,7 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) -# Copyright © 2016-2019 Saeed Rasooli -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-from __future__ import annotations -from typing import TYPE_CHECKING, Any +from __future__ import annotations -from pyglossary.ebook_base import EbookWriter from pyglossary.flags import ALWAYS from pyglossary.option import ( BoolOption, @@ -30,8 +10,7 @@ StrOption, ) -if TYPE_CHECKING: - from pyglossary.glossary_types import GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -85,209 +64,3 @@ comment="Path to cover file", ), } - - -class Writer(EbookWriter): - # these class attrs are only in Epub - # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS - # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE - - MIMETYPE_CONTENTS = "application/epub+zip" - CONTAINER_XML_CONTENTS = """ - - - - -""" - - NCX_TEMPLATE = """ - - - - - - - - - - {title} - - -{ncx_items} - -""" - - NCX_NAVPOINT_TEMPLATE = """\t - - {text} - - - """ - - CSS_CONTENTS = b"""@charset "UTF-8"; -body { - margin: 10px 25px 10px 25px; -} -h1 { - font-size: 200%; -} -h2 { - font-size: 150%; -} -p { - margin-left: 0em; - margin-right: 0em; - margin-top: 0em; - margin-bottom: 0em; - line-height: 2em; - text-align: justify; -} -a, a:focus, a:active, a:visited { - color: black; - text-decoration: none; -} -body.indexPage {} -h1.indexTitle {} -p.indexGroups { - font-size: 150%; -} -span.indexGroup {} -body.groupPage {} -h1.groupTitle {} -div.groupNavigation {} -span.groupHeadword {} -div.groupEntry { - margin-top: 0; - margin-bottom: 1em; -} -h2.groupHeadword { - margin-left: 5%; -} -p.groupDefinition { - margin-left: 10%; - margin-right: 10%; -} -""" - - GROUP_XHTML_TEMPLATE = """ - - - - {title} - - - -

              {group_title}

              -
              - [ Previous ] -{index_link} - [ Next ] -
              -{group_contents} - -""" - GROUP_XHTML_INDEX_LINK = '\t\t[ Index ]' - - GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t
              -

              {headword}

              -

              {definition}

              -
              """ - - OPF_TEMPLATE = """ - - - {identifier} - {sourceLang} - {title} - {creator} - {copyright} - {creationDate} - {cover} - - -{manifest} - - -{spine} - -""" - - COVER_TEMPLATE = '' - - def __init__(self, glos: GlossaryType) -> None: - import uuid - - EbookWriter.__init__( - self, - glos, - ) - glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) - - @classmethod - def cls_get_prefix( - cls: type[EbookWriter], - options: dict[str, Any], - word: str, - ) -> str: - if not word: - return "" - length = options.get("group_by_prefix_length", cls._group_by_prefix_length) - prefix = word[:length].lower() - if prefix[0] < "a": - return "SPECIAL" - return prefix - - def get_prefix(self, word: str) -> str: - if not word: - return "" - length = self._group_by_prefix_length - prefix = word[:length].lower() - if prefix[0] < "a": - return "SPECIAL" - return prefix - - def write_ncx(self, group_labels: list[str]) -> None: - """ - write_ncx - only for epub. - """ - ncx_items: list[str] = [] - index = 1 - if self._include_index_page: - ncx_items.append( - self.NCX_NAVPOINT_TEMPLATE.format( - index=index, - text="Index", - src="index.xhtml", - ), - ) - index += 1 - for group_label in group_labels: - ncx_items.append( - self.NCX_NAVPOINT_TEMPLATE.format( - index=index, - text=group_label, - src=self.get_group_xhtml_file_name_from_index(index), - ), - ) - index += 1 - ncx_items_unicode = "\n".join(ncx_items) - ncx_contents = self.NCX_TEMPLATE.format( - identifier=self._glos.getInfo("uuid"), - title=self._glos.getInfo("name"), - ncx_items=ncx_items_unicode, - ).encode("utf-8") - self.add_file_manifest( - "OEBPS/toc.ncx", - "toc.ncx", - ncx_contents, - "application/x-dtbncx+xml", - ) - - # inherits write from EbookWriter diff --git a/pyglossary/plugins/ebook_epub2/writer.py b/pyglossary/plugins/ebook_epub2/writer.py new file mode 100644 index 000000000..eba888c33 --- /dev/null +++ b/pyglossary/plugins/ebook_epub2/writer.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) +# Copyright © 2016-2019 Saeed Rasooli +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from pyglossary.ebook_base import EbookWriter + +if TYPE_CHECKING: + from pyglossary.glossary_types import GlossaryType + + +class Writer(EbookWriter): + # these class attrs are only in Epub + # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS + # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE + + MIMETYPE_CONTENTS = "application/epub+zip" + CONTAINER_XML_CONTENTS = """ + + + + +""" + + NCX_TEMPLATE = """ + + + + + + + + + + {title} + + +{ncx_items} + +""" + + NCX_NAVPOINT_TEMPLATE = """\t + + {text} + + + """ + + CSS_CONTENTS = b"""@charset "UTF-8"; +body { + margin: 10px 25px 10px 25px; +} +h1 { + font-size: 200%; +} +h2 { + font-size: 150%; +} +p { + margin-left: 0em; + margin-right: 0em; + margin-top: 0em; + margin-bottom: 0em; + line-height: 2em; + text-align: justify; +} +a, a:focus, a:active, a:visited { + color: black; + text-decoration: none; +} +body.indexPage {} +h1.indexTitle {} +p.indexGroups { + font-size: 150%; +} +span.indexGroup {} +body.groupPage {} +h1.groupTitle {} +div.groupNavigation {} +span.groupHeadword {} +div.groupEntry { + margin-top: 0; + margin-bottom: 1em; +} +h2.groupHeadword { + margin-left: 5%; +} +p.groupDefinition { + margin-left: 10%; + margin-right: 10%; +} +""" + + GROUP_XHTML_TEMPLATE = """ + + + + {title} + + + +

              {group_title}

              +
              + [ Previous ] +{index_link} + [ Next ] +
              +{group_contents} + +""" + GROUP_XHTML_INDEX_LINK = '\t\t[ Index ]' + + GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t
              +

              {headword}

              +

              {definition}

              +
              """ + + OPF_TEMPLATE = """ + + + {identifier} + {sourceLang} + {title} + {creator} + {copyright} + {creationDate} + {cover} + + +{manifest} + + +{spine} + +""" + + COVER_TEMPLATE = '' + + def __init__(self, glos: GlossaryType) -> None: + import uuid + + EbookWriter.__init__( + self, + glos, + ) + glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) + + @classmethod + def cls_get_prefix( + cls: type[EbookWriter], + options: dict[str, Any], + word: str, + ) -> str: + if not word: + return "" + length = options.get("group_by_prefix_length", cls._group_by_prefix_length) + prefix = word[:length].lower() + if prefix[0] < "a": + return "SPECIAL" + return prefix + + def get_prefix(self, word: str) -> str: + if not word: + return "" + length = self._group_by_prefix_length + prefix = word[:length].lower() + if prefix[0] < "a": + return "SPECIAL" + return prefix + + def write_ncx(self, group_labels: list[str]) -> None: + """ + write_ncx + only for epub. + """ + ncx_items: list[str] = [] + index = 1 + if self._include_index_page: + ncx_items.append( + self.NCX_NAVPOINT_TEMPLATE.format( + index=index, + text="Index", + src="index.xhtml", + ), + ) + index += 1 + for group_label in group_labels: + ncx_items.append( + self.NCX_NAVPOINT_TEMPLATE.format( + index=index, + text=group_label, + src=self.get_group_xhtml_file_name_from_index(index), + ), + ) + index += 1 + ncx_items_unicode = "\n".join(ncx_items) + ncx_contents = self.NCX_TEMPLATE.format( + identifier=self._glos.getInfo("uuid"), + title=self._glos.getInfo("name"), + ncx_items=ncx_items_unicode, + ).encode("utf-8") + self.add_file_manifest( + "OEBPS/toc.ncx", + "toc.ncx", + ncx_contents, + "application/x-dtbncx+xml", + ) + + # inherits write from EbookWriter diff --git a/pyglossary/plugins/ebook_kobo/__init__.py b/pyglossary/plugins/ebook_kobo/__init__.py index 02a108f88..cbd9b6f90 100644 --- a/pyglossary/plugins/ebook_kobo/__init__.py +++ b/pyglossary/plugins/ebook_kobo/__init__.py @@ -1,41 +1,14 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) -# Copyright © 2022 Saeed Rasooli -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
+ from __future__ import annotations -import re -import unicodedata -from gzip import compress, decompress -from operator import itemgetter -from pathlib import Path -from pickle import dumps, loads from typing import TYPE_CHECKING -from pyglossary import core -from pyglossary.core import exc_note, log, pip from pyglossary.flags import NEVER -from pyglossary.os_utils import indir -if TYPE_CHECKING: - from collections.abc import Generator +from .writer import Writer - from pyglossary.glossary_types import EntryType, GlossaryType +if TYPE_CHECKING: from pyglossary.option import Option __all__ = [ @@ -75,199 +48,3 @@ # Penelope option: marisa_index_size=1000000 - - -def is_cyrillic_char(c: str) -> bool: - # U+0400 - U+04FF: Cyrillic - # U+0500 - U+052F: Cyrillic Supplement - if "\u0400" <= c <= "\u052f": - return True - - # U+2DE0 - U+2DFF: Cyrillic Extended-A - if "\u2de0" <= c <= "\u2dff": - return True - - # U+A640 - U+A69F: Cyrillic Extended-B - if "\ua640" <= c <= "\ua69f": - return True - - # U+1C80 - U+1C8F: Cyrillic Extended-C - if "\u1c80" <= c <= "\u1c8f": - return True - - # U+FE2E, U+FE2F: Combining Half Marks - # U+1D2B, U+1D78: Phonetic Extensions - return c in {"\ufe2e", "\ufe2f", "\u1d2b", "\u1d78"} - - -def fixFilename(fname: str) -> str: - return Path(fname.replace("/", "2F").replace("\\", "5C")).name - - -class Writer: - WORDS_FILE_NAME = "words" - - depends = { - "marisa_trie": "marisa-trie", - } - - @staticmethod - def stripFullHtmlError(entry: EntryType, error: str) -> None: - log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._words: list[str] = [] - self._img_pattern = re.compile( - ']*?)?>', - re.DOTALL, - ) - # img tag has no closing - glos.stripFullHtml(errorHandler=self.stripFullHtmlError) - - def get_prefix(self, word: str) -> str: # noqa: PLR6301 - if not word: - return "11" - wo = word[:2].strip().lower() - if not wo: - return "11" - if wo[0] == "\x00": - return "11" - if len(wo) > 1 and wo[1] == "\x00": - wo = wo[:1] - if is_cyrillic_char(wo[0]): - return wo - # if either of the first 2 chars are not unicode letters, return "11" - for c in wo: - if not unicodedata.category(c).startswith("L"): - return "11" - return wo.ljust(2, "a") - - def fix_defi(self, defi: str) -> str: - # @pgaskin on #219: Kobo supports images in dictionaries, - # but these have a lot of gotchas - # (see https://pgaskin.net/dictutil/dicthtml/format.html). 
- # Basically, The best way to do it is to encode the images as a - # base64 data URL after shrinking it and making it grayscale - # (if it's JPG, this is as simple as only keeping the Y channel) - - # for now we just skip data entries and remove ' Generator[None, EntryType, None]: - import gzip - - dataEntryCount = 0 - - htmlHeader = '\n' - - groupCounter = 0 - htmlContents = htmlHeader - - def writeGroup(lastPrefix: str) -> None: - nonlocal htmlContents - group_fname = fixFilename(lastPrefix) - htmlContents += "" - core.trace( - log, - f"writeGroup: {lastPrefix!r}, " - f"{group_fname!r}, count={groupCounter}", - ) - with gzip.open(group_fname + ".html", mode="wb") as gzipFile: - gzipFile.write(htmlContents.encode("utf-8")) - htmlContents = htmlHeader - - allWords: list[str] = [] - # TODO: switch to SQLite, like StarDict writer - data: list[tuple[str, bytes]] = [] - - while True: - entry = yield - if entry is None: - break - if entry.isData(): - dataEntryCount += 1 - continue - l_word = entry.l_word - allWords += l_word - wordsByPrefix: dict[str, list[str]] = {} - for word in l_word: - prefix = self.get_prefix(word) - if prefix in wordsByPrefix: - wordsByPrefix[prefix].append(word) - else: - wordsByPrefix[prefix] = [word] - defi = self.fix_defi(entry.defi) - mainHeadword = l_word[0] - for prefix, p_words in wordsByPrefix.items(): - headword, *variants = p_words - if headword != mainHeadword: - headword = f"{mainHeadword}, {headword}" - data.append( - ( - prefix, - compress( - dumps( - ( - headword, - variants, - defi, - ), - ), - ), - ), - ) - del entry - - log.info("Kobo: sorting entries...") - data.sort(key=itemgetter(0)) - - log.info("Kobo: writing entries...") - - lastPrefix = "" - for prefix, row in data: - headword, variants, defi = loads(decompress(row)) - if lastPrefix and prefix != lastPrefix: - writeGroup(lastPrefix) - groupCounter = 0 - lastPrefix = prefix - - htmlVariants = "".join( - f'' for v in variants - ) - body = f"
              {headword}{htmlVariants}
              {defi}
              " - htmlContents += f'{body}\n' - groupCounter += 1 - del data - - if groupCounter > 0: - writeGroup(lastPrefix) - - if dataEntryCount > 0: - log.warning( - f"ignored {dataEntryCount} files (data entries)" - " and replaced ' None: - try: - import marisa_trie # type: ignore # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install marisa-trie` to install") - raise - self._filename = filename - - def write(self) -> Generator[None, EntryType, None]: - with indir(self._filename, create=True): - yield from self.write_groups() - - def finish(self) -> None: - import marisa_trie - - with indir(self._filename, create=False): - trie = marisa_trie.Trie(self._words) - trie.save(self.WORDS_FILE_NAME) - self._filename = "" diff --git a/pyglossary/plugins/ebook_kobo/writer.py b/pyglossary/plugins/ebook_kobo/writer.py new file mode 100644 index 000000000..5b26aff01 --- /dev/null +++ b/pyglossary/plugins/ebook_kobo/writer.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) +# Copyright © 2022 Saeed Rasooli +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +import re +import unicodedata +from gzip import compress, decompress +from operator import itemgetter +from pathlib import Path +from pickle import dumps, loads +from typing import TYPE_CHECKING + +from pyglossary import core +from pyglossary.core import exc_note, log, pip +from pyglossary.os_utils import indir + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def is_cyrillic_char(c: str) -> bool: + # U+0400 - U+04FF: Cyrillic + # U+0500 - U+052F: Cyrillic Supplement + if "\u0400" <= c <= "\u052f": + return True + + # U+2DE0 - U+2DFF: Cyrillic Extended-A + if "\u2de0" <= c <= "\u2dff": + return True + + # U+A640 - U+A69F: Cyrillic Extended-B + if "\ua640" <= c <= "\ua69f": + return True + + # U+1C80 - U+1C8F: Cyrillic Extended-C + if "\u1c80" <= c <= "\u1c8f": + return True + + # U+FE2E, U+FE2F: Combining Half Marks + # U+1D2B, U+1D78: Phonetic Extensions + return c in {"\ufe2e", "\ufe2f", "\u1d2b", "\u1d78"} + + +def fixFilename(fname: str) -> str: + return Path(fname.replace("/", "2F").replace("\\", "5C")).name + + +class Writer: + WORDS_FILE_NAME = "words" + + depends = { + "marisa_trie": "marisa-trie", + } + + @staticmethod + def stripFullHtmlError(entry: EntryType, error: str) -> None: + log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._words: list[str] = [] + self._img_pattern = re.compile( + ']*?)?>', + re.DOTALL, + ) + # img tag has no closing + glos.stripFullHtml(errorHandler=self.stripFullHtmlError) + + def get_prefix(self, word: str) -> str: # noqa: PLR6301 + if not word: + return "11" + wo = word[:2].strip().lower() + if not wo: + return "11" + if wo[0] == "\x00": + return "11" + if len(wo) > 1 and wo[1] == "\x00": + wo = wo[:1] + if is_cyrillic_char(wo[0]): + return wo + # if either of the first 2 chars are not unicode letters, return "11" + for c in wo: + if not unicodedata.category(c).startswith("L"): + return "11" + return wo.ljust(2, "a") + + def fix_defi(self, defi: str) -> str: + # @pgaskin on #219: Kobo supports images in dictionaries, + # but these have a lot of gotchas + # (see https://pgaskin.net/dictutil/dicthtml/format.html). 
+ # Basically, The best way to do it is to encode the images as a + # base64 data URL after shrinking it and making it grayscale + # (if it's JPG, this is as simple as only keeping the Y channel) + + # for now we just skip data entries and remove ' Generator[None, EntryType, None]: + import gzip + + dataEntryCount = 0 + + htmlHeader = '\n' + + groupCounter = 0 + htmlContents = htmlHeader + + def writeGroup(lastPrefix: str) -> None: + nonlocal htmlContents + group_fname = fixFilename(lastPrefix) + htmlContents += "" + core.trace( + log, + f"writeGroup: {lastPrefix!r}, " + f"{group_fname!r}, count={groupCounter}", + ) + with gzip.open(group_fname + ".html", mode="wb") as gzipFile: + gzipFile.write(htmlContents.encode("utf-8")) + htmlContents = htmlHeader + + allWords: list[str] = [] + # TODO: switch to SQLite, like StarDict writer + data: list[tuple[str, bytes]] = [] + + while True: + entry = yield + if entry is None: + break + if entry.isData(): + dataEntryCount += 1 + continue + l_word = entry.l_word + allWords += l_word + wordsByPrefix: dict[str, list[str]] = {} + for word in l_word: + prefix = self.get_prefix(word) + if prefix in wordsByPrefix: + wordsByPrefix[prefix].append(word) + else: + wordsByPrefix[prefix] = [word] + defi = self.fix_defi(entry.defi) + mainHeadword = l_word[0] + for prefix, p_words in wordsByPrefix.items(): + headword, *variants = p_words + if headword != mainHeadword: + headword = f"{mainHeadword}, {headword}" + data.append( + ( + prefix, + compress( + dumps( + ( + headword, + variants, + defi, + ), + ), + ), + ), + ) + del entry + + log.info("Kobo: sorting entries...") + data.sort(key=itemgetter(0)) + + log.info("Kobo: writing entries...") + + lastPrefix = "" + for prefix, row in data: + headword, variants, defi = loads(decompress(row)) + if lastPrefix and prefix != lastPrefix: + writeGroup(lastPrefix) + groupCounter = 0 + lastPrefix = prefix + + htmlVariants = "".join( + f'' for v in variants + ) + body = f"
              {headword}{htmlVariants}
              {defi}
              " + htmlContents += f'
              {body}\n' + groupCounter += 1 + del data + + if groupCounter > 0: + writeGroup(lastPrefix) + + if dataEntryCount > 0: + log.warning( + f"ignored {dataEntryCount} files (data entries)" + " and replaced ' None: + try: + import marisa_trie # type: ignore # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install marisa-trie` to install") + raise + self._filename = filename + + def write(self) -> Generator[None, EntryType, None]: + with indir(self._filename, create=True): + yield from self.write_groups() + + def finish(self) -> None: + import marisa_trie + + with indir(self._filename, create=False): + trie = marisa_trie.Trie(self._words) + trie.save(self.WORDS_FILE_NAME) + self._filename = "" diff --git a/pyglossary/plugins/ebook_kobo_dictfile/__init__.py b/pyglossary/plugins/ebook_kobo_dictfile/__init__.py index 7ec327ee3..946b18dfd 100644 --- a/pyglossary/plugins/ebook_kobo_dictfile/__init__.py +++ b/pyglossary/plugins/ebook_kobo_dictfile/__init__.py @@ -1,42 +1,15 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2020-2021 Saeed Rasooli -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-from __future__ import annotations -import os -from os.path import isdir -from typing import TYPE_CHECKING +from __future__ import annotations -from pyglossary.core import exc_note, log, pip -from pyglossary.image_utils import extractInlineHtmlImages -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from pyglossary.text_reader import TextGlossaryReader -if TYPE_CHECKING: - import io - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -73,155 +46,3 @@ "encoding": EncodingOption(), "extract_inline_images": BoolOption(comment="Extract inline images"), } - - -def fixWord(word: str) -> str: - return word.replace("\n", " ") - - -def escapeDefi(defi: str) -> str: - return defi.replace("\n@", "\n @").replace("\n:", "\n :").replace("\n&", "\n &") - - -class Reader(TextGlossaryReader): - depends = { - "mistune": "mistune==3.0.1", - } - - _extract_inline_images: bool = True - - def __init__(self, glos: GlossaryType) -> None: - TextGlossaryReader.__init__(self, glos, hasInfo=False) - - def open(self, filename: str) -> None: - try: - import mistune # type: ignore # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install mistune` to install") - raise - TextGlossaryReader.open(self, filename) - self._glos.setDefaultDefiFormat("h") - - @classmethod - def isInfoWord(cls, _word: str) -> bool: - return False - - @classmethod - def fixInfoWord(cls, _word: str) -> str: - raise NotImplementedError - - def fixDefi( - self, - defi: str, - html: bool, - ) -> tuple[str, list[tuple[str, str]] | None]: - import mistune - - defi = ( - defi.replace("\n @", "\n@") - .replace("\n :", "\n:") - .replace("\n &", "\n&") - .replace("


              ", "

              ") - .replace("


              ", "

              ") - .replace("


              ", "

              ") - ) - defi = defi.strip() - if html: - pass - else: - defi = mistune.html(defi) - images: list[tuple[str, str]] | None = None - if self._extract_inline_images: - defi, images = extractInlineHtmlImages( - defi, - self._glos.tmpDataDir, - fnamePrefix="", # maybe f"{self._pos:06d}-" - ) - return defi, images - - def nextBlock( - self, - ) -> tuple[list[str], str, list[tuple[str, str]] | None]: - words: list[str] = [] - defiLines: list[str] = [] - html = False - - while True: - line = self.readline() - if not line: - break - line = line.rstrip("\n\r") - if line.startswith("@"): - if words: - self._bufferLine = line - defi, images = self.fixDefi("\n".join(defiLines), html=html) - return words, defi, images - words = [line[1:].strip()] - continue - if line.startswith(": "): - defiLines.append(line[2:]) - continue - if line.startswith("::"): - continue - if line.startswith("&"): - words.append(line[1:].strip()) - continue - if line.startswith(""): - line = line[6:] - html = True - defiLines.append(line) - - if words: - defi, images = self.fixDefi("\n".join(defiLines), html=html) - return words, defi, images - - raise StopIteration - - -class Writer: - _encoding: str = "utf-8" - - @staticmethod - def stripFullHtmlError(entry: EntryType, error: str) -> None: - log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._file: io.TextIOBase = nullTextIO - glos.stripFullHtml(errorHandler=self.stripFullHtmlError) - - def finish(self) -> None: - self._file.close() - if not os.listdir(self._resDir): - os.rmdir(self._resDir) - - def open(self, filename: str) -> None: - self._file = open(filename, "w", encoding=self._encoding) - # dictgen's ParseDictFile does not seem to support glossary info / metedata - self._resDir = filename + "_res" - if not isdir(self._resDir): - os.mkdir(self._resDir) - - def write( - self, - ) -> Generator[None, EntryType, None]: - fileObj = self._file - resDir = self._resDir - while True: - entry = yield - if entry is None: - break - if entry.isData(): - entry.save(resDir) - continue - words = entry.l_word - defi = entry.defi - - entry.detectDefiFormat() - if entry.defiFormat == "h": - defi = f"{entry.defi}" - - fileObj.write(f"@ {fixWord(words[0])}\n") - for alt in words[1:]: - fileObj.write(f"& {fixWord(alt)}\n") - fileObj.write(f"{escapeDefi(defi)}\n\n") diff --git a/pyglossary/plugins/ebook_kobo_dictfile/reader.py b/pyglossary/plugins/ebook_kobo_dictfile/reader.py new file mode 100644 index 000000000..131ab6190 --- /dev/null +++ b/pyglossary/plugins/ebook_kobo_dictfile/reader.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2020-2021 Saeed Rasooli +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.core import exc_note, pip +from pyglossary.image_utils import extractInlineHtmlImages +from pyglossary.text_reader import TextGlossaryReader + +if TYPE_CHECKING: + from pyglossary.glossary_types import GlossaryType + + +class Reader(TextGlossaryReader): + depends = { + "mistune": "mistune==3.0.1", + } + + _extract_inline_images: bool = True + + def __init__(self, glos: GlossaryType) -> None: + TextGlossaryReader.__init__(self, glos, hasInfo=False) + + def open(self, filename: str) -> None: + try: + import mistune # type: ignore # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install mistune` to install") + raise + TextGlossaryReader.open(self, filename) + self._glos.setDefaultDefiFormat("h") + + @classmethod + def isInfoWord(cls, _word: str) -> bool: + return False + + @classmethod + def fixInfoWord(cls, _word: str) -> str: + raise NotImplementedError + + def fixDefi( + self, + defi: str, + html: bool, + ) -> tuple[str, list[tuple[str, str]] | None]: + import mistune + + defi = ( + defi.replace("\n @", "\n@") + .replace("\n :", "\n:") + .replace("\n &", "\n&") + .replace("


              ", "

              ") + .replace("


              ", "

              ") + .replace("


              ", "

              ") + ) + defi = defi.strip() + if html: + pass + else: + defi = mistune.html(defi) + images: list[tuple[str, str]] | None = None + if self._extract_inline_images: + defi, images = extractInlineHtmlImages( + defi, + self._glos.tmpDataDir, + fnamePrefix="", # maybe f"{self._pos:06d}-" + ) + return defi, images + + def nextBlock( + self, + ) -> tuple[list[str], str, list[tuple[str, str]] | None]: + words: list[str] = [] + defiLines: list[str] = [] + html = False + + while True: + line = self.readline() + if not line: + break + line = line.rstrip("\n\r") + if line.startswith("@"): + if words: + self._bufferLine = line + defi, images = self.fixDefi("\n".join(defiLines), html=html) + return words, defi, images + words = [line[1:].strip()] + continue + if line.startswith(": "): + defiLines.append(line[2:]) + continue + if line.startswith("::"): + continue + if line.startswith("&"): + words.append(line[1:].strip()) + continue + if line.startswith(""): + line = line[6:] + html = True + defiLines.append(line) + + if words: + defi, images = self.fixDefi("\n".join(defiLines), html=html) + return words, defi, images + + raise StopIteration diff --git a/pyglossary/plugins/ebook_kobo_dictfile/writer.py b/pyglossary/plugins/ebook_kobo_dictfile/writer.py new file mode 100644 index 000000000..60c9c9651 --- /dev/null +++ b/pyglossary/plugins/ebook_kobo_dictfile/writer.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2020-2021 Saeed Rasooli +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +import os +from os.path import isdir +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def fixWord(word: str) -> str: + return word.replace("\n", " ") + + +def escapeDefi(defi: str) -> str: + return defi.replace("\n@", "\n @").replace("\n:", "\n :").replace("\n&", "\n &") + + +class Writer: + _encoding: str = "utf-8" + + @staticmethod + def stripFullHtmlError(entry: EntryType, error: str) -> None: + log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._file: io.TextIOBase = nullTextIO + glos.stripFullHtml(errorHandler=self.stripFullHtmlError) + + def finish(self) -> None: + self._file.close() + if not os.listdir(self._resDir): + os.rmdir(self._resDir) + + def open(self, filename: str) -> None: + self._file = open(filename, "w", encoding=self._encoding) + # dictgen's ParseDictFile does not seem to support glossary info / metedata + self._resDir = filename + "_res" + if not isdir(self._resDir): + os.mkdir(self._resDir) + + def write( + self, + ) -> Generator[None, EntryType, None]: + fileObj = self._file + resDir = self._resDir + while True: + entry = yield + if entry is None: + break + if entry.isData(): + entry.save(resDir) + continue + words = entry.l_word + defi = entry.defi + + entry.detectDefiFormat() + if entry.defiFormat == "h": + defi = f"{entry.defi}" + + fileObj.write(f"@ {fixWord(words[0])}\n") + for alt in words[1:]: + fileObj.write(f"& {fixWord(alt)}\n") + fileObj.write(f"{escapeDefi(defi)}\n\n") diff --git a/pyglossary/plugins/ebook_mobi/__init__.py b/pyglossary/plugins/ebook_mobi/__init__.py index 9ac4e18ec..00da1e1ad 100644 --- a/pyglossary/plugins/ebook_mobi/__init__.py +++ b/pyglossary/plugins/ebook_mobi/__init__.py @@ -1,33 +1,8 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) -# Copyright © 2016-2022 Saeed Rasooli -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-from __future__ import annotations -import os -from datetime import datetime -from os.path import join, split -from typing import TYPE_CHECKING +from __future__ import annotations -from pyglossary.core import log -from pyglossary.ebook_base import EbookWriter from pyglossary.flags import DEFAULT_YES -from pyglossary.langs import Lang from pyglossary.option import ( BoolOption, FileSizeOption, @@ -36,10 +11,7 @@ StrOption, ) -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -121,277 +93,3 @@ " for creating Mobipocket e-books.", ), ] - - -class GroupStateBySize: - def __init__(self, writer: Writer) -> None: - self.writer = writer - self.group_index = -1 - self.reset() - - def reset(self) -> None: - self.group_contents: list[str] = [] - self.group_size = 0 - - def add(self, entry: EntryType) -> None: - defi = entry.defi - content = self.writer.format_group_content( - entry.l_word[0], - defi, - variants=entry.l_word[1:], - ) - self.group_contents.append(content) - self.group_size += len(content.encode("utf-8")) - - -class Writer(EbookWriter): - _compress: bool = False - _keep: bool = False - _kindlegen_path: str = "" - _file_size_approx: int = 271360 - _hide_word_index: bool = False - _spellcheck: bool = True - _exact: bool = False - CSS_CONTENTS = b""""@charset "UTF-8";""" - GROUP_XHTML_TEMPLATE = """ - - - - - - - - -{group_contents} - - -""" - - GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """ -{headword_visible}{infl} - -
              {definition} -
              -
              """ - - GROUP_XHTML_WORD_INFL_TEMPLATE = """ -{iforms_str} -""" - - GROUP_XHTML_WORD_IFORM_TEMPLATE = """""" - - OPF_TEMPLATE = """ - - - -{title} -{sourceLang} -{identifier} -{creator} -{copyright} -{description} -Dictionaries - - - -{sourceLang} -{targetLang} -{cover} - - - -{manifest} - - -{spine} - - - -""" - - def __init__(self, glos: GlossaryType) -> None: - import uuid - - EbookWriter.__init__( - self, - glos, - ) - glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) - # FIXME: check if full html pages/documents as entry do work - # glos.stripFullHtml(errorHandler=None) - - def get_prefix(self, word: str) -> str: - if not word: - return "" - length = self._group_by_prefix_length - prefix = word[:length].lower() - if prefix[0] < "a": - return "SPECIAL" - return prefix - - def format_group_content( - self, - word: str, - defi: str, - variants: list[str] | None = None, - ) -> str: - hide_word_index = self._hide_word_index - infl = "" - if variants: - iforms_list = [ - self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format( - inflword=variant, - exact_str=' exact="yes"' if self._exact else "", - ) - for variant in variants - ] - infl = "\n" + self.GROUP_XHTML_WORD_INFL_TEMPLATE.format( - iforms_str="\n".join(iforms_list), - ) - - headword = self.escape_if_needed(word) - - defi = self.escape_if_needed(defi) - - if hide_word_index: - headword_visible = "" - value_headword = f' value="{headword}"' - else: - headword_visible = "\n" + self._glos.wordTitleStr(headword) - value_headword = "" - - return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format( - spellcheck_str=' spell="yes"' if self._spellcheck else "", - headword_visible=headword_visible, - value_headword=value_headword, - definition=defi, - infl=infl, - ) - - @staticmethod - def getLangCode(lang: Lang | None) -> str: - return lang.code if isinstance(lang, Lang) else "" - - def get_opf_contents( - self, - manifest_contents: str, - spine_contents: str, - ) -> bytes: - cover = "" - if self.cover: - cover = self.COVER_TEMPLATE.format(cover=self.cover) - creationDate = datetime.now().strftime("%Y-%m-%d") - - return self.OPF_TEMPLATE.format( - identifier=self._glos.getInfo("uuid"), - # use Language code instead name for kindlegen - sourceLang=self.getLangCode(self._glos.sourceLang), - targetLang=self.getLangCode(self._glos.targetLang), - title=self._glos.getInfo("name"), - creator=self._glos.author, - copyright=self._glos.getInfo("copyright"), - description=self._glos.getInfo("description"), - creationDate=creationDate, - cover=cover, - manifest=manifest_contents, - spine=spine_contents, - ).encode("utf-8") - - def write_groups(self) -> Generator[None, EntryType, None]: - def add_group(state: GroupStateBySize) -> None: - if state.group_size <= 0: - return - state.group_index += 1 - index = state.group_index + self.GROUP_START_INDEX - group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) - self.add_file_manifest( - "OEBPS/" + group_xhtml_path, - group_xhtml_path, - self.GROUP_XHTML_TEMPLATE.format( - group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join( - state.group_contents, - ), - ).encode("utf-8"), - "application/xhtml+xml", - ) - - state = GroupStateBySize(self) - while True: - entry = yield - if entry is None: - break - if entry.isData(): - continue - - if state.group_size >= self._file_size_approx: - add_group(state) - state.reset() - - state.add(entry) - - add_group(state) - - def write(self) -> Generator[None, EntryType, None]: - import shutil - import subprocess - - filename = self._filename - 
kindlegen_path = self._kindlegen_path - - yield from EbookWriter.write(self) - - # download kindlegen from this page: - # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211 - - # run kindlegen - if not kindlegen_path: - kindlegen_path = shutil.which("kindlegen") or "" - if not kindlegen_path: - log.warning( - f"Not running kindlegen, the raw files are located in {filename}", - ) - log.warning( - "Provide KindleGen path with: --write-options 'kindlegen_path=...'", - ) - return - - # name = self._glos.getInfo("name") - log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}") - direc, filename = split(filename) - cmd = [ - kindlegen_path, - join(filename, "OEBPS", "content.opf"), - "-gen_ff_mobi7", - "-o", - "content.mobi", - ] - proc = subprocess.Popen( - cmd, - cwd=direc, - stdout=subprocess.PIPE, - stdin=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - output = proc.communicate() - log.info(output[0].decode("utf-8")) - mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi") - log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}") diff --git a/pyglossary/plugins/ebook_mobi/writer.py b/pyglossary/plugins/ebook_mobi/writer.py new file mode 100644 index 000000000..36484ff8e --- /dev/null +++ b/pyglossary/plugins/ebook_mobi/writer.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) +# Copyright © 2016-2022 Saeed Rasooli +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +import os +from datetime import datetime +from os.path import join, split +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.ebook_base import EbookWriter +from pyglossary.langs import Lang + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class GroupStateBySize: + def __init__(self, writer: Writer) -> None: + self.writer = writer + self.group_index = -1 + self.reset() + + def reset(self) -> None: + self.group_contents: list[str] = [] + self.group_size = 0 + + def add(self, entry: EntryType) -> None: + defi = entry.defi + content = self.writer.format_group_content( + entry.l_word[0], + defi, + variants=entry.l_word[1:], + ) + self.group_contents.append(content) + self.group_size += len(content.encode("utf-8")) + + +class Writer(EbookWriter): + _compress: bool = False + _keep: bool = False + _kindlegen_path: str = "" + _file_size_approx: int = 271360 + _hide_word_index: bool = False + _spellcheck: bool = True + _exact: bool = False + CSS_CONTENTS = b""""@charset "UTF-8";""" + GROUP_XHTML_TEMPLATE = """ + + + + + + + + +{group_contents} + + +""" + + GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """ +{headword_visible}{infl} + +
              {definition} +
              +
              """ + + GROUP_XHTML_WORD_INFL_TEMPLATE = """ +{iforms_str} +""" + + GROUP_XHTML_WORD_IFORM_TEMPLATE = """""" + + OPF_TEMPLATE = """ + + + +{title} +{sourceLang} +{identifier} +{creator} +{copyright} +{description} +Dictionaries + + + +{sourceLang} +{targetLang} +{cover} + + + +{manifest} + + +{spine} + + + +""" + + def __init__(self, glos: GlossaryType) -> None: + import uuid + + EbookWriter.__init__( + self, + glos, + ) + glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) + # FIXME: check if full html pages/documents as entry do work + # glos.stripFullHtml(errorHandler=None) + + def get_prefix(self, word: str) -> str: + if not word: + return "" + length = self._group_by_prefix_length + prefix = word[:length].lower() + if prefix[0] < "a": + return "SPECIAL" + return prefix + + def format_group_content( + self, + word: str, + defi: str, + variants: list[str] | None = None, + ) -> str: + hide_word_index = self._hide_word_index + infl = "" + if variants: + iforms_list = [ + self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format( + inflword=variant, + exact_str=' exact="yes"' if self._exact else "", + ) + for variant in variants + ] + infl = "\n" + self.GROUP_XHTML_WORD_INFL_TEMPLATE.format( + iforms_str="\n".join(iforms_list), + ) + + headword = self.escape_if_needed(word) + + defi = self.escape_if_needed(defi) + + if hide_word_index: + headword_visible = "" + value_headword = f' value="{headword}"' + else: + headword_visible = "\n" + self._glos.wordTitleStr(headword) + value_headword = "" + + return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format( + spellcheck_str=' spell="yes"' if self._spellcheck else "", + headword_visible=headword_visible, + value_headword=value_headword, + definition=defi, + infl=infl, + ) + + @staticmethod + def getLangCode(lang: Lang | None) -> str: + return lang.code if isinstance(lang, Lang) else "" + + def get_opf_contents( + self, + manifest_contents: str, + spine_contents: str, + ) -> bytes: + cover = "" + if self.cover: + cover = self.COVER_TEMPLATE.format(cover=self.cover) + creationDate = datetime.now().strftime("%Y-%m-%d") + + return self.OPF_TEMPLATE.format( + identifier=self._glos.getInfo("uuid"), + # use Language code instead name for kindlegen + sourceLang=self.getLangCode(self._glos.sourceLang), + targetLang=self.getLangCode(self._glos.targetLang), + title=self._glos.getInfo("name"), + creator=self._glos.author, + copyright=self._glos.getInfo("copyright"), + description=self._glos.getInfo("description"), + creationDate=creationDate, + cover=cover, + manifest=manifest_contents, + spine=spine_contents, + ).encode("utf-8") + + def write_groups(self) -> Generator[None, EntryType, None]: + def add_group(state: GroupStateBySize) -> None: + if state.group_size <= 0: + return + state.group_index += 1 + index = state.group_index + self.GROUP_START_INDEX + group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) + self.add_file_manifest( + "OEBPS/" + group_xhtml_path, + group_xhtml_path, + self.GROUP_XHTML_TEMPLATE.format( + group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join( + state.group_contents, + ), + ).encode("utf-8"), + "application/xhtml+xml", + ) + + state = GroupStateBySize(self) + while True: + entry = yield + if entry is None: + break + if entry.isData(): + continue + + if state.group_size >= self._file_size_approx: + add_group(state) + state.reset() + + state.add(entry) + + add_group(state) + + def write(self) -> Generator[None, EntryType, None]: + import shutil + import subprocess + + filename = self._filename + 
kindlegen_path = self._kindlegen_path + + yield from EbookWriter.write(self) + + # download kindlegen from this page: + # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211 + + # run kindlegen + if not kindlegen_path: + kindlegen_path = shutil.which("kindlegen") or "" + if not kindlegen_path: + log.warning( + f"Not running kindlegen, the raw files are located in {filename}", + ) + log.warning( + "Provide KindleGen path with: --write-options 'kindlegen_path=...'", + ) + return + + # name = self._glos.getInfo("name") + log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}") + direc, filename = split(filename) + cmd = [ + kindlegen_path, + join(filename, "OEBPS", "content.opf"), + "-gen_ff_mobi7", + "-o", + "content.mobi", + ] + proc = subprocess.Popen( + cmd, + cwd=direc, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + output = proc.communicate() + log.info(output[0].decode("utf-8")) + mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi") + log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}") diff --git a/pyglossary/plugins/edict2/__init__.py b/pyglossary/plugins/edict2/__init__.py index f0cb45408..50b9a2466 100644 --- a/pyglossary/plugins/edict2/__init__.py +++ b/pyglossary/plugins/edict2/__init__.py @@ -1,23 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from pyglossary.core import log -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from . import conv - -if TYPE_CHECKING: - import io - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - +from .reader import Reader __all__ = [ "Reader", @@ -71,78 +60,3 @@ comment="Set to false to disable tones coloring", ), } - - -class Reader: - depends = { - "lxml": "lxml", - } - - _encoding: str = "utf-8" - _traditional_title: bool = False - _colorize_tones: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self.file: io.TextIOBase = nullTextIO - self._fileSize = 0 - - def open(self, filename: str) -> None: - # self._glos.sourceLangName = "Chinese" - # self._glos.targetLangName = "English" - - cfile = self.file = open(filename, encoding=self._encoding) - - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - # self._glos.setInfo("input_file_size", f"{self._fileSize}") - else: - log.warning("EDICT2 Reader: file is not seekable") - - def close(self) -> None: - self.file.close() - self.file = nullTextIO - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType]: - file = self.file - fileSize = self._fileSize - glos = self._glos - - render_syllables = ( - conv.render_syllables_color - if self._colorize_tones - else conv.render_syllables_no_color - ) - parse_line = ( - conv.parse_line_trad if self._traditional_title else conv.parse_line_simp - ) - - while True: - line = file.readline() - if not line: - break - line = line.rstrip("\n") - if not line: - continue - if line.startswith("#"): - continue - parts = parse_line(line) - if parts is None: - log.warning(f"bad line: {line!r}") - continue - names, article_text = conv.render_article( - render_syllables, - conv.Article(*parts), - ) - entry = glos.newEntry( - names, - article_text, - defiFormat="h", - byteProgress=(file.tell(), fileSize) if fileSize else None, - ) - yield entry diff --git a/pyglossary/plugins/edict2/reader.py b/pyglossary/plugins/edict2/reader.py new file 
mode 100644 index 000000000..378cc0251 --- /dev/null +++ b/pyglossary/plugins/edict2/reader.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.io_utils import nullTextIO + +from . import conv + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + depends = { + "lxml": "lxml", + } + + _encoding: str = "utf-8" + _traditional_title: bool = False + _colorize_tones: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.file: io.TextIOBase = nullTextIO + self._fileSize = 0 + + def open(self, filename: str) -> None: + # self._glos.sourceLangName = "Chinese" + # self._glos.targetLangName = "English" + + cfile = self.file = open(filename, encoding=self._encoding) + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + # self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("EDICT2 Reader: file is not seekable") + + def close(self) -> None: + self.file.close() + self.file = nullTextIO + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType]: + file = self.file + fileSize = self._fileSize + glos = self._glos + + render_syllables = ( + conv.render_syllables_color + if self._colorize_tones + else conv.render_syllables_no_color + ) + parse_line = ( + conv.parse_line_trad if self._traditional_title else conv.parse_line_simp + ) + + while True: + line = file.readline() + if not line: + break + line = line.rstrip("\n") + if not line: + continue + if line.startswith("#"): + continue + parts = parse_line(line) + if parts is None: + log.warning(f"bad line: {line!r}") + continue + names, article_text = conv.render_article( + render_syllables, + conv.Article(*parts), + ) + entry = glos.newEntry( + names, + article_text, + defiFormat="h", + byteProgress=(file.tell(), fileSize) if fileSize else None, + ) + yield entry diff --git a/pyglossary/plugins/edlin/__init__.py b/pyglossary/plugins/edlin/__init__.py index fc5e428f8..6f6664762 100644 --- a/pyglossary/plugins/edlin/__init__.py +++ b/pyglossary/plugins/edlin/__init__.py @@ -1,45 +1,15 @@ # -*- coding: utf-8 -*- -# edlin.py -# -# Copyright © 2016-2019 Saeed Rasooli (ilius) -# This file is part of PyGlossary project, https://github.com/ilius/pyglossary -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL -# If not, see . 
from __future__ import annotations -import os -from os.path import dirname, isdir, isfile, join -from typing import TYPE_CHECKING - -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from pyglossary.text_utils import ( - escapeNTB, - splitByBarUnescapeNTB, - unescapeNTB, -) - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -72,241 +42,3 @@ "encoding": EncodingOption(), "prev_link": BoolOption(comment="Enable link to previous entry"), } - - -def makeDir(direc: str) -> None: - if not isdir(direc): - os.makedirs(direc) - - -class Reader: - _encoding: str = "utf-8" - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def close(self) -> None: - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._prev_link = True - self._wordCount = None - self._rootPath = None - self._resDir = "" - self._resFileNames: list[str] = [] - - def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToData - - if isdir(filename): - infoFname = join(filename, "info.json") - elif isfile(filename): - infoFname = filename - filename = dirname(filename) - else: - raise ValueError( - f"error while opening {filename!r}: no such file or directory", - ) - self._filename = filename - - with open(infoFname, encoding=self._encoding) as infoFp: - info = jsonToData(infoFp.read()) - self._wordCount = info.pop("wordCount") - self._prev_link = info.pop("prev_link") - self._rootPath = info.pop("root") - for key, value in info.items(): - self._glos.setInfo(key, value) - - self._resDir = join(filename, "res") - if isdir(self._resDir): - self._resFileNames = os.listdir(self._resDir) - else: - self._resDir = "" - self._resFileNames = [] - - def __len__(self) -> int: - if self._wordCount is None: - log.error("called len() on a reader which is not open") - return 0 - return self._wordCount + len(self._resFileNames) - - def __iter__(self) -> Iterator[EntryType]: - if not self._rootPath: - raise RuntimeError("iterating over a reader while it's not open") - - wordCount = 0 - nextPath = self._rootPath - while nextPath != "END": - wordCount += 1 - # before or after reading word and defi - # (and skipping empty entry)? 
FIXME - - with open( - join(self._filename, nextPath), - encoding=self._encoding, - ) as _file: - header = _file.readline().rstrip() - if self._prev_link: - _prevPath, nextPath = header.split(" ") - else: - nextPath = header - word = _file.readline() - if not word: - yield None # update progressbar - continue - defi = _file.read() - if not defi: - log.warning( - f"Edlin Reader: no definition for word {word!r}, skipping", - ) - yield None # update progressbar - continue - word = word.rstrip() - defi = defi.rstrip() - - if self._glos.alts: - word = splitByBarUnescapeNTB(word) - if len(word) == 1: - word = word[0] - else: - word = unescapeNTB(word, bar=False) - - # defi = unescapeNTB(defi) - yield self._glos.newEntry(word, defi) - - if wordCount != self._wordCount: - log.warning( - f"{wordCount} words found, " - f"wordCount in info.json was {self._wordCount}", - ) - self._wordCount = wordCount - - resDir = self._resDir - for fname in self._resFileNames: - with open(join(resDir, fname), "rb") as _file: - yield self._glos.newDataEntry( - fname, - _file.read(), - ) - - -class Writer: - _encoding: str = "utf-8" - _prev_link: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def finish(self) -> None: - self._clear() - - def open(self, filename: str) -> None: - self._filename = filename - self._resDir = join(filename, "res") - os.makedirs(filename) - os.mkdir(self._resDir) - - def _clear(self) -> None: - self._filename = "" - self._resDir = "" - self._encoding = "utf-8" - self._hashSet: set[str] = set() - # self._wordCount = None - - @staticmethod - def hashToPath(h: str) -> str: - return h[:2] + "/" + h[2:] - - def getEntryHash(self, entry: EntryType) -> str: - """ - Return hash string for given entry - don't call it twice for one entry, if you do you will get a - different hash string. 
- """ - from hashlib import sha1 - - hash_ = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] # noqa: S324 - if hash_ not in self._hashSet: - self._hashSet.add(hash_) - return hash_ - index = 0 - while True: - tmp_hash = hash_ + f"{index:x}" - if tmp_hash not in self._hashSet: - self._hashSet.add(tmp_hash) - return tmp_hash - index += 1 - - def saveEntry( - self, - thisEntry: EntryType, - thisHash: str, - prevHash: str | None, - nextHash: str | None, - ) -> None: - dpath = join(self._filename, thisHash[:2]) - makeDir(dpath) - with open( - join(dpath, thisHash[2:]), - "w", - encoding=self._encoding, - ) as toFile: - nextPath = self.hashToPath(nextHash) if nextHash else "END" - if self._prev_link: - prevPath = self.hashToPath(prevHash) if prevHash else "START" - header = prevPath + " " + nextPath - else: - header = nextPath - toFile.write( - "\n".join( - [ - header, - escapeNTB(thisEntry.s_word, bar=False), - thisEntry.defi, - ], - ), - ) - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.json_utils import dataToPrettyJson - - thisEntry = yield - if thisEntry is None: - raise ValueError("glossary is empty") - - count = 1 - rootHash = thisHash = self.getEntryHash(thisEntry) - prevHash = None - - while True: - nextEntry = yield - if nextEntry is None: - break - if nextEntry.isData(): - nextEntry.save(self._resDir) - continue - nextHash = self.getEntryHash(nextEntry) - self.saveEntry(thisEntry, thisHash, prevHash, nextHash) - thisEntry = nextEntry - prevHash, thisHash = thisHash, nextHash - count += 1 - self.saveEntry(thisEntry, thisHash, prevHash, None) - - with open( - join(self._filename, "info.json"), - "w", - encoding=self._encoding, - ) as toFile: - info = {} - info["name"] = self._glos.getInfo("name") - info["root"] = self.hashToPath(rootHash) - info["prev_link"] = self._prev_link - info["wordCount"] = count - # info["modified"] = - - info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"]) - - toFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/edlin/reader.py b/pyglossary/plugins/edlin/reader.py new file mode 100644 index 000000000..8fcdf4007 --- /dev/null +++ b/pyglossary/plugins/edlin/reader.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from os.path import dirname, isdir, isfile, join +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.text_utils import ( + splitByBarUnescapeNTB, + unescapeNTB, +) + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + _encoding: str = "utf-8" + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def close(self) -> None: + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._prev_link = True + self._wordCount = None + self._rootPath = None + self._resDir = "" + self._resFileNames: list[str] = [] + + def open(self, filename: str) -> None: + from pyglossary.json_utils import jsonToData + + if isdir(filename): + infoFname = join(filename, "info.json") + elif isfile(filename): + infoFname = filename + filename = dirname(filename) + else: + raise ValueError( + f"error while opening {filename!r}: no such file or directory", + ) + self._filename = filename + + with open(infoFname, encoding=self._encoding) as infoFp: + info = jsonToData(infoFp.read()) + self._wordCount = info.pop("wordCount") + self._prev_link = info.pop("prev_link") + self._rootPath = 
info.pop("root") + for key, value in info.items(): + self._glos.setInfo(key, value) + + self._resDir = join(filename, "res") + if isdir(self._resDir): + self._resFileNames = os.listdir(self._resDir) + else: + self._resDir = "" + self._resFileNames = [] + + def __len__(self) -> int: + if self._wordCount is None: + log.error("called len() on a reader which is not open") + return 0 + return self._wordCount + len(self._resFileNames) + + def __iter__(self) -> Iterator[EntryType]: + if not self._rootPath: + raise RuntimeError("iterating over a reader while it's not open") + + wordCount = 0 + nextPath = self._rootPath + while nextPath != "END": + wordCount += 1 + # before or after reading word and defi + # (and skipping empty entry)? FIXME + + with open( + join(self._filename, nextPath), + encoding=self._encoding, + ) as _file: + header = _file.readline().rstrip() + if self._prev_link: + _prevPath, nextPath = header.split(" ") + else: + nextPath = header + word = _file.readline() + if not word: + yield None # update progressbar + continue + defi = _file.read() + if not defi: + log.warning( + f"Edlin Reader: no definition for word {word!r}, skipping", + ) + yield None # update progressbar + continue + word = word.rstrip() + defi = defi.rstrip() + + if self._glos.alts: + word = splitByBarUnescapeNTB(word) + if len(word) == 1: + word = word[0] + else: + word = unescapeNTB(word, bar=False) + + # defi = unescapeNTB(defi) + yield self._glos.newEntry(word, defi) + + if wordCount != self._wordCount: + log.warning( + f"{wordCount} words found, " + f"wordCount in info.json was {self._wordCount}", + ) + self._wordCount = wordCount + + resDir = self._resDir + for fname in self._resFileNames: + with open(join(resDir, fname), "rb") as _file: + yield self._glos.newDataEntry( + fname, + _file.read(), + ) diff --git a/pyglossary/plugins/edlin/writer.py b/pyglossary/plugins/edlin/writer.py new file mode 100644 index 000000000..10b77b85a --- /dev/null +++ b/pyglossary/plugins/edlin/writer.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from os.path import isdir, join +from typing import TYPE_CHECKING + +from pyglossary.text_utils import ( + escapeNTB, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def makeDir(direc: str) -> None: + if not isdir(direc): + os.makedirs(direc) + + +class Writer: + _encoding: str = "utf-8" + _prev_link: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def finish(self) -> None: + self._clear() + + def open(self, filename: str) -> None: + self._filename = filename + self._resDir = join(filename, "res") + os.makedirs(filename) + os.mkdir(self._resDir) + + def _clear(self) -> None: + self._filename = "" + self._resDir = "" + self._encoding = "utf-8" + self._hashSet: set[str] = set() + # self._wordCount = None + + @staticmethod + def hashToPath(h: str) -> str: + return h[:2] + "/" + h[2:] + + def getEntryHash(self, entry: EntryType) -> str: + """ + Return hash string for given entry + don't call it twice for one entry, if you do you will get a + different hash string. 
+ """ + from hashlib import sha1 + + hash_ = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] # noqa: S324 + if hash_ not in self._hashSet: + self._hashSet.add(hash_) + return hash_ + index = 0 + while True: + tmp_hash = hash_ + f"{index:x}" + if tmp_hash not in self._hashSet: + self._hashSet.add(tmp_hash) + return tmp_hash + index += 1 + + def saveEntry( + self, + thisEntry: EntryType, + thisHash: str, + prevHash: str | None, + nextHash: str | None, + ) -> None: + dpath = join(self._filename, thisHash[:2]) + makeDir(dpath) + with open( + join(dpath, thisHash[2:]), + "w", + encoding=self._encoding, + ) as toFile: + nextPath = self.hashToPath(nextHash) if nextHash else "END" + if self._prev_link: + prevPath = self.hashToPath(prevHash) if prevHash else "START" + header = prevPath + " " + nextPath + else: + header = nextPath + toFile.write( + "\n".join( + [ + header, + escapeNTB(thisEntry.s_word, bar=False), + thisEntry.defi, + ], + ), + ) + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.json_utils import dataToPrettyJson + + thisEntry = yield + if thisEntry is None: + raise ValueError("glossary is empty") + + count = 1 + rootHash = thisHash = self.getEntryHash(thisEntry) + prevHash = None + + while True: + nextEntry = yield + if nextEntry is None: + break + if nextEntry.isData(): + nextEntry.save(self._resDir) + continue + nextHash = self.getEntryHash(nextEntry) + self.saveEntry(thisEntry, thisHash, prevHash, nextHash) + thisEntry = nextEntry + prevHash, thisHash = thisHash, nextHash + count += 1 + self.saveEntry(thisEntry, thisHash, prevHash, None) + + with open( + join(self._filename, "info.json"), + "w", + encoding=self._encoding, + ) as toFile: + info = {} + info["name"] = self._glos.getInfo("name") + info["root"] = self.hashToPath(rootHash) + info["prev_link"] = self._prev_link + info["wordCount"] = count + # info["modified"] = + + info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"]) + + toFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/gettext_po/__init__.py b/pyglossary/plugins/gettext_po/__init__.py index 978b7c455..cd6dd9887 100644 --- a/pyglossary/plugins/gettext_po/__init__.py +++ b/pyglossary/plugins/gettext_po/__init__.py @@ -2,23 +2,13 @@ from __future__ import annotations -import os -from os.path import isdir -from typing import TYPE_CHECKING - -from pyglossary.core import exc_note, log, pip -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, Option, ) -from pyglossary.text_utils import splitByBar - -if TYPE_CHECKING: - import io - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -52,166 +42,3 @@ optionsProp: dict[str, Option] = { "resources": BoolOption(comment="Enable resources / data files"), } - - -class Reader: - depends = { - "polib": "polib", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._alts = glos.alts - self.clear() - - def clear(self) -> None: - self._filename = "" - self._file: io.TextIOBase = nullTextIO - self._wordCount: int | None = None - self._resDir = "" - self._resFileNames: list[str] = [] - - def open(self, filename: str) -> None: - self._filename = filename - self._file = open(filename, encoding="utf-8") - self._resDir = filename + "_res" - if isdir(self._resDir): - self._resFileNames = os.listdir(self._resDir) - else: - self._resDir = "" - self._resFileNames = [] - - def 
close(self) -> None: - self._file.close() - self._file = nullTextIO - self.clear() - - def __len__(self) -> int: - from pyglossary.file_utils import fileCountLines - - if self._wordCount is None: - log.debug("Try not to use len(reader) as it takes extra time") - self._wordCount = fileCountLines( - self._filename, - newline=b"\nmsgid", - ) - return self._wordCount - - def makeEntry(self, word: str, defi: str) -> EntryType: - if self._alts: - return self._glos.newEntry(splitByBar(word), defi) - return self._glos.newEntry(word, defi) - - def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 - try: - from polib import unescape as po_unescape - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install polib` to install") - raise - - file = self._file - - word = "" - defi = "" - msgstr = False - wordCount = 0 - for line_ in file: - line = line_.strip() # noqa: PLW2901 - if not line: - continue - if line.startswith("#"): - continue - if line.startswith("msgid "): - if word: - yield self.makeEntry(word, defi) - wordCount += 1 - word = "" - defi = "" - else: - pass - # TODO: parse defi and set glos info? - # but this should be done in self.open - word = po_unescape(line[6:]) - if word.startswith('"'): - if len(word) < 2 or word[-1] != '"': - raise ValueError("invalid po line: line") - word = word[1:-1] - msgstr = False - continue - if line.startswith("msgstr "): - if msgstr: - log.error("msgid omitted!") - defi = po_unescape(line[7:]) - if defi.startswith('"'): - if len(defi) < 2 or defi[-1] != '"': - raise ValueError("invalid po line: line") - defi = defi[1:-1] - msgstr = True - continue - - line = po_unescape(line) - if line.startswith('"'): - if len(line) < 2 or line[-1] != '"': - raise ValueError("invalid po line: line") - line = line[1:-1] - - if msgstr: - defi += line - else: - word += line - if word: - yield self.makeEntry(word, defi) - wordCount += 1 - self._wordCount = wordCount - - -class Writer: - depends = { - "polib": "polib", - } - - _resources: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.TextIOBase = nullTextIO - glos.preventDuplicateWords() - - def open(self, filename: str) -> None: - try: - from polib import escape as po_escape - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install polib` to install") - raise - - self._filename = filename - self._file = file = open(filename, mode="w", encoding="utf-8") - file.write('#\nmsgid ""\nmsgstr ""\n') - for key, value in self._glos.iterInfo(): - file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n') - - def finish(self) -> None: - self._filename = "" - self._file.close() - self._file = nullTextIO - - def write(self) -> Generator[None, EntryType, None]: - from polib import escape as po_escape - - file = self._file - - resources = self._resources - filename = self._filename - while True: - entry = yield - if entry is None: - break - if entry.isData(): - if resources: - entry.save(filename + "_res") - continue - file.write( - f'msgid "{po_escape(entry.s_word)}"\n' - f'msgstr "{po_escape(entry.defi)}"\n\n', - ) diff --git a/pyglossary/plugins/gettext_po/reader.py b/pyglossary/plugins/gettext_po/reader.py new file mode 100644 index 000000000..126288488 --- /dev/null +++ b/pyglossary/plugins/gettext_po/reader.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from os.path import isdir +from typing import TYPE_CHECKING + +from pyglossary.core import exc_note, log, pip +from 
pyglossary.io_utils import nullTextIO +from pyglossary.text_utils import splitByBar + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + depends = { + "polib": "polib", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._alts = glos.alts + self.clear() + + def clear(self) -> None: + self._filename = "" + self._file: io.TextIOBase = nullTextIO + self._wordCount: int | None = None + self._resDir = "" + self._resFileNames: list[str] = [] + + def open(self, filename: str) -> None: + self._filename = filename + self._file = open(filename, encoding="utf-8") + self._resDir = filename + "_res" + if isdir(self._resDir): + self._resFileNames = os.listdir(self._resDir) + else: + self._resDir = "" + self._resFileNames = [] + + def close(self) -> None: + self._file.close() + self._file = nullTextIO + self.clear() + + def __len__(self) -> int: + from pyglossary.file_utils import fileCountLines + + if self._wordCount is None: + log.debug("Try not to use len(reader) as it takes extra time") + self._wordCount = fileCountLines( + self._filename, + newline=b"\nmsgid", + ) + return self._wordCount + + def makeEntry(self, word: str, defi: str) -> EntryType: + if self._alts: + return self._glos.newEntry(splitByBar(word), defi) + return self._glos.newEntry(word, defi) + + def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 + try: + from polib import unescape as po_unescape + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install polib` to install") + raise + + file = self._file + + word = "" + defi = "" + msgstr = False + wordCount = 0 + for line_ in file: + line = line_.strip() # noqa: PLW2901 + if not line: + continue + if line.startswith("#"): + continue + if line.startswith("msgid "): + if word: + yield self.makeEntry(word, defi) + wordCount += 1 + word = "" + defi = "" + else: + pass + # TODO: parse defi and set glos info? 
+ # but this should be done in self.open + word = po_unescape(line[6:]) + if word.startswith('"'): + if len(word) < 2 or word[-1] != '"': + raise ValueError("invalid po line: line") + word = word[1:-1] + msgstr = False + continue + if line.startswith("msgstr "): + if msgstr: + log.error("msgid omitted!") + defi = po_unescape(line[7:]) + if defi.startswith('"'): + if len(defi) < 2 or defi[-1] != '"': + raise ValueError("invalid po line: line") + defi = defi[1:-1] + msgstr = True + continue + + line = po_unescape(line) + if line.startswith('"'): + if len(line) < 2 or line[-1] != '"': + raise ValueError("invalid po line: line") + line = line[1:-1] + + if msgstr: + defi += line + else: + word += line + if word: + yield self.makeEntry(word, defi) + wordCount += 1 + self._wordCount = wordCount diff --git a/pyglossary/plugins/gettext_po/writer.py b/pyglossary/plugins/gettext_po/writer.py new file mode 100644 index 000000000..685a447ee --- /dev/null +++ b/pyglossary/plugins/gettext_po/writer.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.core import exc_note, pip +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + depends = { + "polib": "polib", + } + + _resources: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.TextIOBase = nullTextIO + glos.preventDuplicateWords() + + def open(self, filename: str) -> None: + try: + from polib import escape as po_escape + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install polib` to install") + raise + + self._filename = filename + self._file = file = open(filename, mode="w", encoding="utf-8") + file.write('#\nmsgid ""\nmsgstr ""\n') + for key, value in self._glos.iterInfo(): + file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n') + + def finish(self) -> None: + self._filename = "" + self._file.close() + self._file = nullTextIO + + def write(self) -> Generator[None, EntryType, None]: + from polib import escape as po_escape + + file = self._file + + resources = self._resources + filename = self._filename + while True: + entry = yield + if entry is None: + break + if entry.isData(): + if resources: + entry.save(filename + "_res") + continue + file.write( + f'msgid "{po_escape(entry.s_word)}"\n' + f'msgstr "{po_escape(entry.defi)}"\n\n', + ) diff --git a/pyglossary/plugins/html_dir/__init__.py b/pyglossary/plugins/html_dir/__init__.py index 8931a0697..d47850759 100644 --- a/pyglossary/plugins/html_dir/__init__.py +++ b/pyglossary/plugins/html_dir/__init__.py @@ -1,24 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html -import os -import re -import time -from functools import lru_cache -from os.path import isdir, isfile, join -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import io - from collections.abc import Generator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, @@ -26,10 +8,8 @@ Option, StrOption, ) -from pyglossary.text_utils import ( - escapeNTB, - unescapeNTB, -) + +from .writer import Writer __all__ = [ "Writer", @@ -80,469 +60,3 @@ comment="Add headwords title to beginning of definition", ), } - -nbsp = "\xa0" -# nbsp = " " - -darkStyle = """ -body {{ - 
background-color: #373737; - color: #eee; -}} -a {{ color: #aaaaff; }} -a.broken {{ color: #e0c0c0; }} -a.no_ul {{ text-decoration: none; }} -b.headword {{ font-size: 1.5em; color: #c7ffb9; }} -h1 {{ font-size: 1.5em; color: #c7ffb9;}} -h2 {{ font-size: 1.3em;}} -h3 {{ font-size: 1.0em;}} -h4 {{ font-size: 1.0em;}} -h5 {{ font-size: 1.0em;}} -h6 {{ font-size: 1.0em;}} -""" - - -class Writer: - _encoding: str = "utf-8" - _resources: bool = True - _max_file_size: int = 102400 - _filename_format: str = "{n:05d}.html" - _escape_defi: bool = False - _dark: bool = True - _css: str = "" - _word_title: bool = True - - @staticmethod - def stripFullHtmlError(entry: EntryType, error: str) -> None: - log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._fileObj: io.IOBase | None = None - self._encoding = "utf-8" - self._filename_format = "{n:05d}.html" - self._tail = "" - self._filenameList: list[str] = [] - glos.stripFullHtml(errorHandler=self.stripFullHtmlError) - - self._resSrcPattern = re.compile(' src="([^"]*)"') - - def open(self, filename: str) -> None: - self._filename = filename - self._resDir = resDir = join(filename, "res") - if not isdir(filename): - os.mkdir(filename) - if not isdir(resDir): - os.mkdir(resDir) - if self._css: - self.copyCSS(self._css) - - def copyCSS(self, cssPath: str) -> None: - import shutil - - shutil.copy(cssPath, join(self._filename, "style.css")) - - def finish(self) -> None: - pass - - def getNextFilename(self) -> str: - return self._filename_format.format( - n=len(self._filenameList), - ) - - def nextFile(self) -> io.TextIOBase: - if self._fileObj: - self._fileObj.write(self._tail) - self._fileObj.close() - filename = self.getNextFilename() - self._filenameList.append(filename) - self._fileObj = open( - join( - self._filename, - filename, - ), - mode="w", - encoding=self._encoding, - ) - return self._fileObj - - def fixLinks(self, linkTargetSet: set[str]) -> None: # noqa: PLR0912 - import gc - - gc.collect() - dirn = self._filename - - filenameList = self._filenameList - - fileByWord: dict[str, list[tuple[str, int]]] = {} - for line in open(join(dirn, "index.txt"), encoding="utf-8"): - line = line.rstrip("\n") # noqa: PLW2901 - if not line: - continue - entryIndexStr, wordEsc, filename, _ = line.split("\t") - entryIndex = int(entryIndexStr) - # entryId = f"entry{entryIndex}" - word = unescapeNTB(wordEsc) - if word not in linkTargetSet: - continue - if word in fileByWord: - fileByWord[word].append((filename, entryIndex)) - else: - fileByWord[word] = [(filename, entryIndex)] - - # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile: - # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t") - - @lru_cache(maxsize=10) - def getLinksByFile(fileIndex: int) -> io.TextIOBase: - return open( - join(dirn, f"links{fileIndex}"), - mode="a", - encoding="utf-8", - ) - - log.info("") - for line in open(join(dirn, "links.txt"), encoding="utf-8"): - line = line.rstrip("\n") # noqa: PLW2901 - if not line: - continue - target, fileIndexStr, x_start, x_size = line.split("\t") - target = unescapeNTB(target) - if target not in fileByWord: - targetNew = "" - else: - targetFilename, targetEntryIndex = fileByWord[target][0] - if targetFilename == filename: - continue - targetNew = f"{targetFilename}#entry{targetEntryIndex}" - file = getLinksByFile(int(fileIndexStr)) - file.write( - f"{x_start}\t{x_size}\t{targetNew}\n", - ) - file.flush() - - 
linkTargetSet.clear() - del fileByWord, linkTargetSet - gc.collect() - - if os.sep == "\\": - time.sleep(0.1) - - entry_url_fmt = self._glos.getInfo("entry_url") - - re_href = re.compile( - b' href="[^<>"]*?"', - re.IGNORECASE, - ) - - for fileIndex, filename in enumerate(filenameList): - if not isfile(join(dirn, f"links{fileIndex}")): - continue - with open(join(dirn, filename), mode="rb") as inFile: - with open(join(dirn, f"{filename}.new"), mode="wb") as outFile: - for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"): - outFile.flush() - ( - b_x_start, - b_x_size, - b_target, - ) = linkLine.rstrip(b"\n").split(b"\t") - outFile.write( - inFile.read( - int(b_x_start, 16) - inFile.tell(), - ), - ) - curLink = inFile.read(int(b_x_size, 16)) - - if b_target: - outFile.write( - re_href.sub( - b' href="./' + b_target + b'"', - curLink, - ), - ) - continue - - if not entry_url_fmt: - outFile.write( - curLink.replace( - b' href="#', - b' class="broken" href="#', - ), - ) - continue - - st = curLink.decode("utf-8") - i = st.find('href="#') - j = st.find('"', i + 7) - word = st[i + 7 : j] - url = entry_url_fmt.format(word=word) - outFile.write( - ( - st[:i] + f'class="broken" href="{url}"' + st[j + 1 :] - ).encode("utf-8"), - ) - - outFile.write(inFile.read()) - - os.remove(join(dirn, filename)) - os.rename(join(dirn, f"{filename}.new"), join(dirn, filename)) - os.remove(join(dirn, f"links{fileIndex}")) - - def writeInfo(self, filename: str, header: str) -> None: - glos = self._glos - title = glos.getInfo("name") - customStyle = ( - "table, th, td {border: 1px solid black; " - "border-collapse: collapse; padding: 5px;}" - ) - infoHeader = header.format( - pageTitle=f"Info: {title}", - customStyle=customStyle, - ) - with open( - join(filename, "info.html"), - mode="w", - encoding=self._encoding, - ) as _file: - _file.write( - infoHeader + "" - "" - '' - '' - "\n", - ) - for key, value in glos.iterInfo(): - _file.write( - f"\n", - ) - _file.write("
              KeyValue
              {key}{value}
              ") - - @staticmethod - def _subResSrc(m: re.Match) -> str: - url = m.group(1) - if "://" in url: - return m.group(0) - url = "res/" + url - return f' src="{url}"' - - def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912 - encoding = self._encoding - resources = self._resources - max_file_size = self._max_file_size - filename_format = self._filename_format - escape_defi = self._escape_defi - - wordSep = ' | ' - - initFileSizeMax = 100 - - glos = self._glos - - filename = self._filename - self._encoding = encoding - self._filename_format = filename_format - - entry_url_fmt = glos.getInfo("entry_url") - - def getEntryWebLink(entry: EntryType) -> str: - if not entry_url_fmt: - return "" - url = entry_url_fmt.format(word=html.escape(entry.l_word[0])) - return f'{nbsp}
              🌏' - - # from math import log2, ceil - # maxPosHexLen = int(ceil(log2(max_file_size) / 4)) - - indexTxtFileObj = open( - join(filename, "index.txt"), - mode="w", - encoding="utf-8", - ) - linksTxtFileObj = open( - join(filename, "links.txt"), - mode="w", - encoding="utf-8", - ) - - title = glos.getInfo("name") - style = "" - if self._dark: - style = darkStyle - - cssLink = '' if self._css else "" - - header = ( - "\n" - "" - "{pageTitle}" - f'' - f'{cssLink}' - "\n" - ) - - def pageHeader(n: int) -> str: - return header.format( - pageTitle=f"Page {n} of {title}", - customStyle="", - ) - - def navBar() -> str: - links: list[str] = [] - if len(self._filenameList) > 1: - links.append(f'') - links.extend( - [ - f'', - 'ℹ️', # noqa: RUF001 - ], - ) - return ( - '" - ) - - tailSize = len(self._tail.encode(encoding)) - - if max_file_size < len(header) + tailSize: - raise ValueError(f"{max_file_size=} is too small") - - max_file_size -= tailSize - - if not isdir(self._filename): - os.mkdir(self._filename) - - fileObj = self.nextFile() - fileObj.write(pageHeader(0)) - fileObj.write(navBar()) - - re_fixed_link = re.compile( - r']*? )?href="#([^<>"]+?)">[^<>]+?', - re.IGNORECASE, - ) - - linkTargetSet = set() - - def replaceBword(text: str) -> str: - return text.replace( - ' href="bword://', - ' href="#', - ) - - def addLinks(text: str, pos: int) -> None: - for m in re_fixed_link.finditer(text): - if ' class="entry_link"' in m.group(0): - continue - if m.group(0).count("href=") != 1: - log.error(f"unexpected match: {m.group(0)}") - target = html.unescape(m.group(1)) - linkTargetSet.add(target) - start = m.start() - b_start = len(text[:start].encode(encoding)) - b_size = len(text[start : m.end()].encode(encoding)) - linksTxtFileObj.write( - f"{escapeNTB(target)}\t" - f"{len(self._filenameList) - 1}\t" - f"{pos + b_start:x}\t" - f"{b_size:x}\n", - ) - linksTxtFileObj.flush() - - self.writeInfo(filename, header) - - word_title = self._word_title - - resDir = self._resDir - entryIndex = -1 - while True: - entryIndex += 1 - entry = yield - if entry is None: - break - if entry.isData(): - if resources: - entry.save(resDir) - continue - - entry.detectDefiFormat() - defi = entry.defi - defiFormat = entry.defiFormat - - if defi.startswith("") and defiFormat != "h": - log.error(f"bad {defiFormat=}") - defiFormat = "h" - - if defiFormat == "m": - defi = html.escape(defi) - if "\n" in defi: - # could be markdown or unformatted plaintext - # FIXME: this changes the font to a monospace - defi = f"
<pre>{defi}</pre>
              " - elif defiFormat == "h": - defi = self._resSrcPattern.sub(self._subResSrc, defi) - if escape_defi: - defi = html.escape(defi) - - entryId = f"entry{entryIndex}" - - if word_title: - words = [html.escape(word) for word in entry.l_word] - title = glos.wordTitleStr( - wordSep.join(words), - sample=entry.l_word[0], - class_="headword", - ) - - if not title: - title = f"Entry {entryIndex}" - - # entry_link_sym = "¶" - entry_link_sym = "🔗" - text = ( - f'
              {title}{nbsp}{nbsp}' - f'' - f"{entry_link_sym}" - f"{getEntryWebLink(entry)}" - f"
              \n{defi}" - "
              \n" - "
              \n" - ) - pos = fileObj.tell() - if pos > initFileSizeMax and pos > max_file_size - len( - text.encode(encoding), - ): - fileObj = self.nextFile() - fileObj.write( - pageHeader( - len(self._filenameList) - 1, - ), - ) - fileObj.write(navBar()) - pos = fileObj.tell() - tmpFilename = escapeNTB(self._filenameList[-1]) - for word in entry.l_word: - indexTxtFileObj.write( - f"{entryIndex}\t" - f"{escapeNTB(word)}\t" - f"{tmpFilename}\t" - f"{pos}\n", - ) - del tmpFilename - text = replaceBword(text) - addLinks(text, pos) - fileObj.write(text) - - fileObj.close() - self._fileObj = None - indexTxtFileObj.close() - - linksTxtFileObj.close() - - if linkTargetSet: - log.info(f"{len(linkTargetSet)} link targets found") - log.info("Fixing links, please wait...") - self.fixLinks(linkTargetSet) - - os.remove(join(filename, "links.txt")) diff --git a/pyglossary/plugins/html_dir/writer.py b/pyglossary/plugins/html_dir/writer.py new file mode 100644 index 000000000..6451f09ce --- /dev/null +++ b/pyglossary/plugins/html_dir/writer.py @@ -0,0 +1,491 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +import os +import re +import time +from functools import lru_cache +from os.path import isdir, isfile, join +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + +from pyglossary.core import log +from pyglossary.text_utils import ( + escapeNTB, + unescapeNTB, +) + +nbsp = "\xa0" +# nbsp = " " + +darkStyle = """ +body {{ + background-color: #373737; + color: #eee; +}} +a {{ color: #aaaaff; }} +a.broken {{ color: #e0c0c0; }} +a.no_ul {{ text-decoration: none; }} +b.headword {{ font-size: 1.5em; color: #c7ffb9; }} +h1 {{ font-size: 1.5em; color: #c7ffb9;}} +h2 {{ font-size: 1.3em;}} +h3 {{ font-size: 1.0em;}} +h4 {{ font-size: 1.0em;}} +h5 {{ font-size: 1.0em;}} +h6 {{ font-size: 1.0em;}} +""" + + +class Writer: + _encoding: str = "utf-8" + _resources: bool = True + _max_file_size: int = 102400 + _filename_format: str = "{n:05d}.html" + _escape_defi: bool = False + _dark: bool = True + _css: str = "" + _word_title: bool = True + + @staticmethod + def stripFullHtmlError(entry: EntryType, error: str) -> None: + log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._fileObj: io.IOBase | None = None + self._encoding = "utf-8" + self._filename_format = "{n:05d}.html" + self._tail = "" + self._filenameList: list[str] = [] + glos.stripFullHtml(errorHandler=self.stripFullHtmlError) + + self._resSrcPattern = re.compile(' src="([^"]*)"') + + def open(self, filename: str) -> None: + self._filename = filename + self._resDir = resDir = join(filename, "res") + if not isdir(filename): + os.mkdir(filename) + if not isdir(resDir): + os.mkdir(resDir) + if self._css: + self.copyCSS(self._css) + + def copyCSS(self, cssPath: str) -> None: + import shutil + + shutil.copy(cssPath, join(self._filename, "style.css")) + + def finish(self) -> None: + pass + + def getNextFilename(self) -> str: + return self._filename_format.format( + n=len(self._filenameList), + ) + + def nextFile(self) -> io.TextIOBase: + if self._fileObj: + self._fileObj.write(self._tail) + self._fileObj.close() + filename = self.getNextFilename() + self._filenameList.append(filename) + self._fileObj = open( + join( + self._filename, + filename, + ), + mode="w", + encoding=self._encoding, 
+ ) + return self._fileObj + + def fixLinks(self, linkTargetSet: set[str]) -> None: # noqa: PLR0912 + import gc + + gc.collect() + dirn = self._filename + + filenameList = self._filenameList + + fileByWord: dict[str, list[tuple[str, int]]] = {} + for line in open(join(dirn, "index.txt"), encoding="utf-8"): + line = line.rstrip("\n") # noqa: PLW2901 + if not line: + continue + entryIndexStr, wordEsc, filename, _ = line.split("\t") + entryIndex = int(entryIndexStr) + # entryId = f"entry{entryIndex}" + word = unescapeNTB(wordEsc) + if word not in linkTargetSet: + continue + if word in fileByWord: + fileByWord[word].append((filename, entryIndex)) + else: + fileByWord[word] = [(filename, entryIndex)] + + # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile: + # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t") + + @lru_cache(maxsize=10) + def getLinksByFile(fileIndex: int) -> io.TextIOBase: + return open( + join(dirn, f"links{fileIndex}"), + mode="a", + encoding="utf-8", + ) + + log.info("") + for line in open(join(dirn, "links.txt"), encoding="utf-8"): + line = line.rstrip("\n") # noqa: PLW2901 + if not line: + continue + target, fileIndexStr, x_start, x_size = line.split("\t") + target = unescapeNTB(target) + if target not in fileByWord: + targetNew = "" + else: + targetFilename, targetEntryIndex = fileByWord[target][0] + if targetFilename == filename: + continue + targetNew = f"{targetFilename}#entry{targetEntryIndex}" + file = getLinksByFile(int(fileIndexStr)) + file.write( + f"{x_start}\t{x_size}\t{targetNew}\n", + ) + file.flush() + + linkTargetSet.clear() + del fileByWord, linkTargetSet + gc.collect() + + if os.sep == "\\": + time.sleep(0.1) + + entry_url_fmt = self._glos.getInfo("entry_url") + + re_href = re.compile( + b' href="[^<>"]*?"', + re.IGNORECASE, + ) + + for fileIndex, filename in enumerate(filenameList): + if not isfile(join(dirn, f"links{fileIndex}")): + continue + with open(join(dirn, filename), mode="rb") as inFile: + with open(join(dirn, f"{filename}.new"), mode="wb") as outFile: + for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"): + outFile.flush() + ( + b_x_start, + b_x_size, + b_target, + ) = linkLine.rstrip(b"\n").split(b"\t") + outFile.write( + inFile.read( + int(b_x_start, 16) - inFile.tell(), + ), + ) + curLink = inFile.read(int(b_x_size, 16)) + + if b_target: + outFile.write( + re_href.sub( + b' href="./' + b_target + b'"', + curLink, + ), + ) + continue + + if not entry_url_fmt: + outFile.write( + curLink.replace( + b' href="#', + b' class="broken" href="#', + ), + ) + continue + + st = curLink.decode("utf-8") + i = st.find('href="#') + j = st.find('"', i + 7) + word = st[i + 7 : j] + url = entry_url_fmt.format(word=word) + outFile.write( + ( + st[:i] + f'class="broken" href="{url}"' + st[j + 1 :] + ).encode("utf-8"), + ) + + outFile.write(inFile.read()) + + os.remove(join(dirn, filename)) + os.rename(join(dirn, f"{filename}.new"), join(dirn, filename)) + os.remove(join(dirn, f"links{fileIndex}")) + + def writeInfo(self, filename: str, header: str) -> None: + glos = self._glos + title = glos.getInfo("name") + customStyle = ( + "table, th, td {border: 1px solid black; " + "border-collapse: collapse; padding: 5px;}" + ) + infoHeader = header.format( + pageTitle=f"Info: {title}", + customStyle=customStyle, + ) + with open( + join(filename, "info.html"), + mode="w", + encoding=self._encoding, + ) as _file: + _file.write( + infoHeader + "" + "" + '' + '' + "\n", + ) + for key, value in glos.iterInfo(): + _file.write( + 
f"\n", + ) + _file.write("
              KeyValue
              {key}{value}
              ") + + @staticmethod + def _subResSrc(m: re.Match) -> str: + url = m.group(1) + if "://" in url: + return m.group(0) + url = "res/" + url + return f' src="{url}"' + + def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912 + encoding = self._encoding + resources = self._resources + max_file_size = self._max_file_size + filename_format = self._filename_format + escape_defi = self._escape_defi + + wordSep = ' | ' + + initFileSizeMax = 100 + + glos = self._glos + + filename = self._filename + self._encoding = encoding + self._filename_format = filename_format + + entry_url_fmt = glos.getInfo("entry_url") + + def getEntryWebLink(entry: EntryType) -> str: + if not entry_url_fmt: + return "" + url = entry_url_fmt.format(word=html.escape(entry.l_word[0])) + return f'{nbsp}🌏' + + # from math import log2, ceil + # maxPosHexLen = int(ceil(log2(max_file_size) / 4)) + + indexTxtFileObj = open( + join(filename, "index.txt"), + mode="w", + encoding="utf-8", + ) + linksTxtFileObj = open( + join(filename, "links.txt"), + mode="w", + encoding="utf-8", + ) + + title = glos.getInfo("name") + style = "" + if self._dark: + style = darkStyle + + cssLink = '' if self._css else "" + + header = ( + "\n" + "" + "{pageTitle}" + f'' + f'{cssLink}' + "\n" + ) + + def pageHeader(n: int) -> str: + return header.format( + pageTitle=f"Page {n} of {title}", + customStyle="", + ) + + def navBar() -> str: + links: list[str] = [] + if len(self._filenameList) > 1: + links.append(f'') + links.extend( + [ + f'', + 'ℹ️', # noqa: RUF001 + ], + ) + return ( + '" + ) + + tailSize = len(self._tail.encode(encoding)) + + if max_file_size < len(header) + tailSize: + raise ValueError(f"{max_file_size=} is too small") + + max_file_size -= tailSize + + if not isdir(self._filename): + os.mkdir(self._filename) + + fileObj = self.nextFile() + fileObj.write(pageHeader(0)) + fileObj.write(navBar()) + + re_fixed_link = re.compile( + r']*? )?href="#([^<>"]+?)">[^<>]+?', + re.IGNORECASE, + ) + + linkTargetSet = set() + + def replaceBword(text: str) -> str: + return text.replace( + ' href="bword://', + ' href="#', + ) + + def addLinks(text: str, pos: int) -> None: + for m in re_fixed_link.finditer(text): + if ' class="entry_link"' in m.group(0): + continue + if m.group(0).count("href=") != 1: + log.error(f"unexpected match: {m.group(0)}") + target = html.unescape(m.group(1)) + linkTargetSet.add(target) + start = m.start() + b_start = len(text[:start].encode(encoding)) + b_size = len(text[start : m.end()].encode(encoding)) + linksTxtFileObj.write( + f"{escapeNTB(target)}\t" + f"{len(self._filenameList) - 1}\t" + f"{pos + b_start:x}\t" + f"{b_size:x}\n", + ) + linksTxtFileObj.flush() + + self.writeInfo(filename, header) + + word_title = self._word_title + + resDir = self._resDir + entryIndex = -1 + while True: + entryIndex += 1 + entry = yield + if entry is None: + break + if entry.isData(): + if resources: + entry.save(resDir) + continue + + entry.detectDefiFormat() + defi = entry.defi + defiFormat = entry.defiFormat + + if defi.startswith("") and defiFormat != "h": + log.error(f"bad {defiFormat=}") + defiFormat = "h" + + if defiFormat == "m": + defi = html.escape(defi) + if "\n" in defi: + # could be markdown or unformatted plaintext + # FIXME: this changes the font to a monospace + defi = f"
<pre>{defi}</pre>
              " + elif defiFormat == "h": + defi = self._resSrcPattern.sub(self._subResSrc, defi) + if escape_defi: + defi = html.escape(defi) + + entryId = f"entry{entryIndex}" + + if word_title: + words = [html.escape(word) for word in entry.l_word] + title = glos.wordTitleStr( + wordSep.join(words), + sample=entry.l_word[0], + class_="headword", + ) + + if not title: + title = f"Entry {entryIndex}" + + # entry_link_sym = "¶" + entry_link_sym = "🔗" + text = ( + f'
<div id="{entryId}">{title}{nbsp}{nbsp}' + f'<a class="entry_link" href="#{entryId}">' + f"{entry_link_sym}</a>" + f"{getEntryWebLink(entry)}" + f"<br>\n{defi}" + "</div>\n" + "<hr>
              \n" + ) + pos = fileObj.tell() + if pos > initFileSizeMax and pos > max_file_size - len( + text.encode(encoding), + ): + fileObj = self.nextFile() + fileObj.write( + pageHeader( + len(self._filenameList) - 1, + ), + ) + fileObj.write(navBar()) + pos = fileObj.tell() + tmpFilename = escapeNTB(self._filenameList[-1]) + for word in entry.l_word: + indexTxtFileObj.write( + f"{entryIndex}\t" + f"{escapeNTB(word)}\t" + f"{tmpFilename}\t" + f"{pos}\n", + ) + del tmpFilename + text = replaceBword(text) + addLinks(text, pos) + fileObj.write(text) + + fileObj.close() + self._fileObj = None + indexTxtFileObj.close() + + linksTxtFileObj.close() + + if linkTargetSet: + log.info(f"{len(linkTargetSet)} link targets found") + log.info("Fixing links, please wait...") + self.fixLinks(linkTargetSet) + + os.remove(join(filename, "links.txt")) diff --git a/pyglossary/plugins/info_plugin/__init__.py b/pyglossary/plugins/info_plugin/__init__.py index 8c4852ae0..57f4cc719 100644 --- a/pyglossary/plugins/info_plugin/__init__.py +++ b/pyglossary/plugins/info_plugin/__init__.py @@ -6,13 +6,9 @@ from pyglossary.info_writer import InfoWriter as Writer -if TYPE_CHECKING: - from collections.abc import Iterator +from .reader import Reader - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) +if TYPE_CHECKING: from pyglossary.option import Option __all__ = [ @@ -44,25 +40,3 @@ # key is option/argument name, value is instance of Option optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - - def close(self) -> None: - pass - - def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToData - - with open(filename, encoding="utf-8") as infoFp: - info = jsonToData(infoFp.read()) - for key, value in info.items(): - self._glos.setInfo(key, value) - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType | None]: - yield None diff --git a/pyglossary/plugins/info_plugin/reader.py b/pyglossary/plugins/info_plugin/reader.py new file mode 100644 index 000000000..f8c212230 --- /dev/null +++ b/pyglossary/plugins/info_plugin/reader.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + + def close(self) -> None: + pass + + def open(self, filename: str) -> None: + from pyglossary.json_utils import jsonToData + + with open(filename, encoding="utf-8") as infoFp: + info = jsonToData(infoFp.read()) + assert isinstance(info, dict) + for key, value in info.items(): + self._glos.setInfo(key, value) + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType | None]: + yield None diff --git a/pyglossary/plugins/jmdict/__init__.py b/pyglossary/plugins/jmdict/__init__.py index e5f88f31c..de0297912 100644 --- a/pyglossary/plugins/jmdict/__init__.py +++ b/pyglossary/plugins/jmdict/__init__.py @@ -1,28 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -import re -import unicodedata -from io import BytesIO -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - import io - from collections.abc import Callable, Iterator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - from pyglossary.lxml_types import Element, T_htmlfile 
- -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, pip -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, IntOption, @@ -30,6 +8,8 @@ StrOption, ) +from .reader import Reader + __all__ = [ "Reader", "description", @@ -69,395 +49,3 @@ comment="Add translitation (romaji) of keywords", ), } - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - _example_padding: int = 10 - _example_color: str = "" - # _example_color: str = "#008FE1" - _translitation: bool = False - - tagStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" - re_inf_mapping = { - gikun_key: "gikun/jukujikun", - "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete - "word containing irregular kana usage": "irregular", - } - - @staticmethod - def makeList( - hf: T_htmlfile, - input_objects: list[Element], - processor: Callable, - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into
                if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) == 1: - hf.write(single_prefix) - processor(hf, input_objects[0]) - return - - with hf.element("ol"): - for el in input_objects: - with hf.element("li"): - processor(hf, el) - - # TODO: break it down - # PLR0912 Too many branches (23 > 12) - def writeSense( # noqa: PLR0912 - self, - hf: T_htmlfile, - sense: Element, - ) -> None: - from lxml import etree as ET - - def br() -> Element: - return ET.Element("br") - - for elem in sense.findall("pos"): - if not elem.text: - continue - desc = elem.text - if desc == "unclassified": - continue - with hf.element("i"): - hf.write(desc.capitalize()) - hf.write(br()) - - glossList = [elem.text.strip() for elem in sense.findall("gloss") if elem.text] - if glossList: - for i, gloss in enumerate(glossList): - if i > 0: - hf.write(", ") - hf.write(gloss) - hf.write(br()) - - relatedWords: list[str] = [] - for elem in sense.findall("xref"): - if not elem.text: - continue - word = elem.text.strip() - word = self._link_number_postfix.sub("", word) - relatedWords.append(word) - - if relatedWords: - hf.write("Related: ") - for i, word in enumerate(relatedWords): - if i > 0: - with hf.element("big"): - hf.write(" | ") - with hf.element("a", href=f"bword://{word}"): - hf.write(word) - hf.write(br()) - - antonymWords: list[str] = [] - for elem in sense.findall("ant"): - if not elem.text: - continue - word = elem.text.strip() - word = self._link_number_postfix.sub("", word) - antonymWords.append(word) - if antonymWords: - hf.write("Antonym: ") - for i, word in enumerate(antonymWords): - if i > 0: - with hf.element("big"): - hf.write(" | ") - with hf.element( - "a", - href=f"bword://{word}", - attrib={"class": "antonym"}, - ): - hf.write(word) - hf.write(br()) - - for i, elem in enumerate(sense.findall("field")): - if not elem.text: - continue - if i > 0: - hf.write(" ") - desc = elem.text - with hf.element("span", style=self.tagStyle): - hf.write(desc) - hf.write(br()) - - for i, elem in enumerate(sense.findall("misc")): - if not elem.text: - continue - if i > 0: - hf.write(" ") - desc = elem.text - with hf.element("small"): - with hf.element("span", style=self.tagStyle): - hf.write(desc) - hf.write(br()) - - examples = sense.findall("example") - # TODO: move to a method - if examples: # noqa: PLR1702 - with hf.element( - "div", - attrib={ - "class": "example", - "style": f"padding: {self._example_padding}px 0px;", - }, - ): - hf.write("Examples:") - with hf.element("ul"): - for i, elem in enumerate(examples): - if not elem.text: - continue - if i > 0: - hf.write(" ") - # one ex_srce (id?), one ex_text, and two ex_sent tags - textElem = elem.find("ex_text") - if textElem is None: - continue - if not textElem.text: - continue - text = textElem.text - sentList: list[str] = [] - for sentElem in elem.findall("ex_sent"): - if not sentElem.text: - continue - sentList.append(sentElem.text) - with hf.element("li"): - style: dict[str, str] = {} - if self._example_color: - style["color"] = self._example_color - with hf.element("font", attrib=style): - hf.write(text) - for sent in sentList: - hf.write(br()) - hf.write(sent) - - # TODO: break it down - def getEntryByElem( # noqa: PLR0912 - self, - entry: Element, - ) -> EntryType: - from lxml import etree as ET - - glos = self._glos - keywords: list[str] = [] - f = BytesIO() - translit = self._translitation - - def br() -> Element: - return ET.Element("br") - - with ET.htmlfile(f, encoding="utf-8") as hf: # 
noqa: PLR1702 - kebList: list[str] = [] - rebList: list[str] = [] - kebDisplayList: list[str] = [] - rebDisplayList: list[tuple[str, list[str]]] = [] - with hf.element("div"): - for k_ele in entry.findall("k_ele"): - keb = k_ele.find("keb") - if keb is None: - continue - if not keb.text: - continue - keb_text = keb.text - keb_text_norm = unicodedata.normalize("NFKC", keb_text) - keywords.append(keb_text_norm) - if keb_text != keb_text_norm: - keywords.append(keb_text) - kebList.append(keb_text) - keb_display = keb_text - if translit: - import romkan # type: ignore - - t_keb = romkan.to_roma(keb_text) - if t_keb and t_keb.isascii(): - keywords.append(t_keb) - keb_display += f" ({t_keb})" - kebDisplayList.append(keb_display) - # for elem in k_ele.findall("ke_pri"): - # log.info(elem.text) - - for r_ele in entry.findall("r_ele"): - reb = r_ele.find("reb") - if reb is None: - continue - if not reb.text: - continue - props: list[str] = [] - if r_ele.find("re_nokanji") is not None: - props.append("no kanji") - inf = r_ele.find("re_inf") - if inf is not None and inf.text: - props.append( - self.re_inf_mapping.get(inf.text, inf.text), - ) - keywords.append(reb.text) - reb_text = reb.text - rebList.append(reb_text) - reb_display = reb_text - if translit: - import romkan - - t_reb = romkan.to_roma(reb.text) - if t_reb and t_reb.isascii(): - keywords.append(t_reb) - reb_display += f" ({t_reb})" - rebDisplayList.append((reb_display, props)) - # for elem in r_ele.findall("re_pri"): - # log.info(elem.text) - - # this is for making internal links valid - # this makes too many alternates! - # but we don't seem to have a choice - # except for scanning and indexing all words once - # and then starting over and fixing/optimizing links - for s_keb in kebList: - for s_reb in rebList: - keywords.append(f"{s_keb}・{s_reb}") # noqa: PERF401 - - if kebDisplayList: - with hf.element(glos.titleTag(kebDisplayList[0])): - for i, s_keb in enumerate(kebDisplayList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - hf.write(s_keb) - hf.write(br()) - - if rebDisplayList: - for i, (s_reb, props) in enumerate(rebDisplayList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - with hf.element("font", color="green"): - hf.write(s_reb) - for prop in props: - hf.write(" ") - with hf.element("small"): - with hf.element("span", style=self.tagStyle): - hf.write(prop) - hf.write(br()) - - hf_ = cast("T_htmlfile", hf) - self.makeList( - hf_, - entry.findall("sense"), - self.writeSense, - ) - - defi = f.getvalue().decode("utf-8") - file = self._file - byteProgress = (file.tell(), self._fileSize) - return self._glos.newEntry( - keywords, - defi, - defiFormat="h", - byteProgress=byteProgress, - ) - - @staticmethod - def tostring(elem: Element) -> str: - from lxml import etree as ET - - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def setCreationTime(self, header: str) -> None: - m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) - if m is None: - return - self._glos.setInfo("creationTime", m.group(1)) - - def setMetadata(self, header: str) -> None: - # TODO: self.set_info("edition", ...) 
- self.setCreationTime(header) - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._wordCount = 0 - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._fileSize = 0 - self._link_number_postfix = re.compile("・[0-9]+$") - - def __len__(self) -> int: - return self._wordCount - - def close(self) -> None: - if self._file: - self._file.close() - self._file = nullBinaryIO - - def open( - self, - filename: str, - ) -> None: - try: - from lxml import etree as ET # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - self._fileSize = os.path.getsize(filename) - - self._glos.sourceLangName = "Japanese" - - self._glos.setDefaultDefiFormat("h") - self._glos.setInfo("definition_has_headwords", "True") - self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") - # also good: f"https://sakuradict.com/search?q={{word}}" - - header = "" - with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: - text_file = cast("io.TextIOBase", text_file) - for line in text_file: - if "" in line: - break - header += line - self.setMetadata(header) - - self._file = compressionOpen(filename, mode="rb") - - def __iter__(self) -> Iterator[EntryType]: - from lxml import etree as ET - - context = ET.iterparse( # type: ignore # noqa: PGH003 - self._file, - events=("end",), - tag="entry", - ) - for _, _elem in context: - elem = cast("Element", _elem) - yield self.getEntryByElem(elem) - # clean up preceding siblings to save memory - # this reduces memory usage from ~64 MB to ~30 MB - parent = elem.getparent() - if parent is None: - continue - while elem.getprevious() is not None: - del parent[0] diff --git a/pyglossary/plugins/jmdict/reader.py b/pyglossary/plugins/jmdict/reader.py new file mode 100644 index 000000000..16de72ffc --- /dev/null +++ b/pyglossary/plugins/jmdict/reader.py @@ -0,0 +1,417 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import os +import re +import unicodedata +from io import BytesIO +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Callable, Iterator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + from pyglossary.lxml_types import Element, T_htmlfile + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, pip +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _example_padding: int = 10 + _example_color: str = "" + # _example_color: str = "#008FE1" + _translitation: bool = False + + tagStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" + re_inf_mapping = { + gikun_key: "gikun/jukujikun", + "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete + "word containing irregular kana usage": "irregular", + } + + @staticmethod + def makeList( + hf: T_htmlfile, + input_objects: list[Element], + processor: Callable, + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into
                  if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + hf.write(single_prefix) + processor(hf, input_objects[0]) + return + + with hf.element("ol"): + for el in input_objects: + with hf.element("li"): + processor(hf, el) + + # TODO: break it down + # PLR0912 Too many branches (23 > 12) + def writeSense( # noqa: PLR0912 + self, + hf: T_htmlfile, + sense: Element, + ) -> None: + from lxml import etree as ET + + def br() -> Element: + return ET.Element("br") + + for elem in sense.findall("pos"): + if not elem.text: + continue + desc = elem.text + if desc == "unclassified": + continue + with hf.element("i"): + hf.write(desc.capitalize()) + hf.write(br()) + + glossList = [elem.text.strip() for elem in sense.findall("gloss") if elem.text] + if glossList: + for i, gloss in enumerate(glossList): + if i > 0: + hf.write(", ") + hf.write(gloss) + hf.write(br()) + + relatedWords: list[str] = [] + for elem in sense.findall("xref"): + if not elem.text: + continue + word = elem.text.strip() + word = self._link_number_postfix.sub("", word) + relatedWords.append(word) + + if relatedWords: + hf.write("Related: ") + for i, word in enumerate(relatedWords): + if i > 0: + with hf.element("big"): + hf.write(" | ") + with hf.element("a", href=f"bword://{word}"): + hf.write(word) + hf.write(br()) + + antonymWords: list[str] = [] + for elem in sense.findall("ant"): + if not elem.text: + continue + word = elem.text.strip() + word = self._link_number_postfix.sub("", word) + antonymWords.append(word) + if antonymWords: + hf.write("Antonym: ") + for i, word in enumerate(antonymWords): + if i > 0: + with hf.element("big"): + hf.write(" | ") + with hf.element( + "a", + href=f"bword://{word}", + attrib={"class": "antonym"}, + ): + hf.write(word) + hf.write(br()) + + for i, elem in enumerate(sense.findall("field")): + if not elem.text: + continue + if i > 0: + hf.write(" ") + desc = elem.text + with hf.element("span", style=self.tagStyle): + hf.write(desc) + hf.write(br()) + + for i, elem in enumerate(sense.findall("misc")): + if not elem.text: + continue + if i > 0: + hf.write(" ") + desc = elem.text + with hf.element("small"): + with hf.element("span", style=self.tagStyle): + hf.write(desc) + hf.write(br()) + + examples = sense.findall("example") + # TODO: move to a method + if examples: # noqa: PLR1702 + with hf.element( + "div", + attrib={ + "class": "example", + "style": f"padding: {self._example_padding}px 0px;", + }, + ): + hf.write("Examples:") + with hf.element("ul"): + for i, elem in enumerate(examples): + if not elem.text: + continue + if i > 0: + hf.write(" ") + # one ex_srce (id?), one ex_text, and two ex_sent tags + textElem = elem.find("ex_text") + if textElem is None: + continue + if not textElem.text: + continue + text = textElem.text + sentList: list[str] = [] + for sentElem in elem.findall("ex_sent"): + if not sentElem.text: + continue + sentList.append(sentElem.text) + with hf.element("li"): + style: dict[str, str] = {} + if self._example_color: + style["color"] = self._example_color + with hf.element("font", attrib=style): + hf.write(text) + for sent in sentList: + hf.write(br()) + hf.write(sent) + + # TODO: break it down + def getEntryByElem( # noqa: PLR0912 + self, + entry: Element, + ) -> EntryType: + from lxml import etree as ET + + glos = self._glos + keywords: list[str] = [] + f = BytesIO() + translit = self._translitation + + def br() -> Element: + return ET.Element("br") + + with ET.htmlfile(f, encoding="utf-8") as hf: # 
noqa: PLR1702 + kebList: list[str] = [] + rebList: list[str] = [] + kebDisplayList: list[str] = [] + rebDisplayList: list[tuple[str, list[str]]] = [] + with hf.element("div"): + for k_ele in entry.findall("k_ele"): + keb = k_ele.find("keb") + if keb is None: + continue + if not keb.text: + continue + keb_text = keb.text + keb_text_norm = unicodedata.normalize("NFKC", keb_text) + keywords.append(keb_text_norm) + if keb_text != keb_text_norm: + keywords.append(keb_text) + kebList.append(keb_text) + keb_display = keb_text + if translit: + import romkan # type: ignore + + t_keb = romkan.to_roma(keb_text) + if t_keb and t_keb.isascii(): + keywords.append(t_keb) + keb_display += f" ({t_keb})" + kebDisplayList.append(keb_display) + # for elem in k_ele.findall("ke_pri"): + # log.info(elem.text) + + for r_ele in entry.findall("r_ele"): + reb = r_ele.find("reb") + if reb is None: + continue + if not reb.text: + continue + props: list[str] = [] + if r_ele.find("re_nokanji") is not None: + props.append("no kanji") + inf = r_ele.find("re_inf") + if inf is not None and inf.text: + props.append( + self.re_inf_mapping.get(inf.text, inf.text), + ) + keywords.append(reb.text) + reb_text = reb.text + rebList.append(reb_text) + reb_display = reb_text + if translit: + import romkan + + t_reb = romkan.to_roma(reb.text) + if t_reb and t_reb.isascii(): + keywords.append(t_reb) + reb_display += f" ({t_reb})" + rebDisplayList.append((reb_display, props)) + # for elem in r_ele.findall("re_pri"): + # log.info(elem.text) + + # this is for making internal links valid + # this makes too many alternates! + # but we don't seem to have a choice + # except for scanning and indexing all words once + # and then starting over and fixing/optimizing links + for s_keb in kebList: + for s_reb in rebList: + keywords.append(f"{s_keb}・{s_reb}") # noqa: PERF401 + + if kebDisplayList: + with hf.element(glos.titleTag(kebDisplayList[0])): + for i, s_keb in enumerate(kebDisplayList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + hf.write(s_keb) + hf.write(br()) + + if rebDisplayList: + for i, (s_reb, props) in enumerate(rebDisplayList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + with hf.element("font", color="green"): + hf.write(s_reb) + for prop in props: + hf.write(" ") + with hf.element("small"): + with hf.element("span", style=self.tagStyle): + hf.write(prop) + hf.write(br()) + + hf_ = cast("T_htmlfile", hf) + self.makeList( + hf_, + entry.findall("sense"), + self.writeSense, + ) + + defi = f.getvalue().decode("utf-8") + file = self._file + byteProgress = (file.tell(), self._fileSize) + return self._glos.newEntry( + keywords, + defi, + defiFormat="h", + byteProgress=byteProgress, + ) + + @staticmethod + def tostring(elem: Element) -> str: + from lxml import etree as ET + + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def setCreationTime(self, header: str) -> None: + m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) + if m is None: + return + self._glos.setInfo("creationTime", m.group(1)) + + def setMetadata(self, header: str) -> None: + # TODO: self.set_info("edition", ...) 
+ self.setCreationTime(header) + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._wordCount = 0 + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._fileSize = 0 + self._link_number_postfix = re.compile("・[0-9]+$") + + def __len__(self) -> int: + return self._wordCount + + def close(self) -> None: + if self._file: + self._file.close() + self._file = nullBinaryIO + + def open( + self, + filename: str, + ) -> None: + try: + from lxml import etree as ET # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + self._fileSize = os.path.getsize(filename) + + self._glos.sourceLangName = "Japanese" + + self._glos.setDefaultDefiFormat("h") + self._glos.setInfo("definition_has_headwords", "True") + self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") + # also good: f"https://sakuradict.com/search?q={{word}}" + + header = "" + with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: + text_file = cast("io.TextIOBase", text_file) + for line in text_file: + if "" in line: + break + header += line + self.setMetadata(header) + + self._file = compressionOpen(filename, mode="rb") + + def __iter__(self) -> Iterator[EntryType]: + from lxml import etree as ET + + context = ET.iterparse( # type: ignore # noqa: PGH003 + self._file, + events=("end",), + tag="entry", + ) + for _, _elem in context: + elem = cast("Element", _elem) + yield self.getEntryByElem(elem) + # clean up preceding siblings to save memory + # this reduces memory usage from ~64 MB to ~30 MB + parent = elem.getparent() + if parent is None: + continue + while elem.getprevious() is not None: + del parent[0] diff --git a/pyglossary/plugins/jmnedict/__init__.py b/pyglossary/plugins/jmnedict/__init__.py index 59582d936..ba4213465 100644 --- a/pyglossary/plugins/jmnedict/__init__.py +++ b/pyglossary/plugins/jmnedict/__init__.py @@ -1,28 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -import re -from io import BytesIO -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING -if TYPE_CHECKING: - import io - from collections.abc import Callable, Iterator +from .reader import Reader - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - from pyglossary.lxml_types import Element, T_htmlfile +if TYPE_CHECKING: from pyglossary.option import Option -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, pip -from pyglossary.io_utils import nullBinaryIO __all__ = [ "Reader", @@ -53,277 +38,3 @@ "EDRDG Wiki", ) optionsProp: dict[str, Option] = {} - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - tagStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" - re_inf_mapping = { - gikun_key: "gikun/jukujikun", - "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete - "word containing irregular kana usage": "irregular", - } - - @staticmethod - def makeList( - hf: T_htmlfile, - input_objects: list[Element], - processor: Callable, - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into
                    if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) == 1: - hf.write(single_prefix) - processor(hf, input_objects[0]) - return - - with hf.element("ol"): - for el in input_objects: - with hf.element("li"): - processor(hf, el) - - def writeTrans( - self, - hf: T_htmlfile, - trans: Element, - ) -> None: - from lxml import etree as ET - - def br() -> Element: - return ET.Element("br") - - for elem in trans.findall("name_type"): - if not elem.text: - continue - desc = elem.text - with hf.element("i"): - hf.write(desc.capitalize()) - hf.write(br()) - - for elem in trans.findall("trans_det"): - if not elem.text: - continue - desc = elem.text - hf.write(desc) - hf.write(br()) - - relatedWords: list[str] = [] - for elem in trans.findall("xref"): - if not elem.text: - continue - word = elem.text.strip() - word = self._link_number_postfix.sub("", word) - relatedWords.append(word) - - if relatedWords: - hf.write("Related: ") - for i, word in enumerate(relatedWords): - if i > 0: - with hf.element("big"): - hf.write(" | ") - with hf.element("a", href=f"bword://{word}"): - hf.write(word) - hf.write(br()) - - def getEntryByElem( # noqa: PLR0912 - self, - entry: Element, - ) -> EntryType: - from lxml import etree as ET - - glos = self._glos - keywords: list[str] = [] - f = BytesIO() - - def br() -> Element: - return ET.Element("br") - - with ET.htmlfile(f, encoding="utf-8") as hf: # noqa: PLR1702 - kebList: list[str] = [] - rebList: list[tuple[str, list[str]]] = [] - with hf.element("div"): - for k_ele in entry.findall("k_ele"): - keb = k_ele.find("keb") - if keb is None: - continue - if not keb.text: - continue - kebList.append(keb.text) - keywords.append(keb.text) - # for elem in k_ele.findall("ke_pri"): - # log.info(elem.text) - - for r_ele in entry.findall("r_ele"): - reb = r_ele.find("reb") - if reb is None: - continue - if not reb.text: - continue - props: list[str] = [] - if r_ele.find("re_nokanji") is not None: - props.append("no kanji") - inf = r_ele.find("re_inf") - if inf is not None and inf.text: - props.append( - self.re_inf_mapping.get(inf.text, inf.text), - ) - rebList.append((reb.text, props)) - keywords.append(reb.text) - # for elem in r_ele.findall("re_pri"): - # log.info(elem.text) - - # this is for making internal links valid - # this makes too many alternates! 
- # but we don't seem to have a choice - # except for scanning and indexing all words once - # and then starting over and fixing/optimizing links - for s_keb in kebList: - for s_reb, _ in rebList: - keywords.append(f"{s_keb}・{s_reb}") - - if kebList: - with hf.element(glos.titleTag(kebList[0])): - for i, s_keb in enumerate(kebList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - hf.write(s_keb) - hf.write(br()) - - if rebList: - for i, (s_reb, props) in enumerate(rebList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - with hf.element("font", color="green"): - hf.write(s_reb) - for prop in props: - hf.write(" ") - with hf.element("small"): - with hf.element("span", style=self.tagStyle): - hf.write(prop) - hf.write(br()) - - hf_ = cast("T_htmlfile", hf) - self.makeList( - hf_, - entry.findall("trans"), - self.writeTrans, - ) - - defi = f.getvalue().decode("utf-8") - file = self._file - byteProgress = (file.tell(), self._fileSize) - return self._glos.newEntry( - keywords, - defi, - defiFormat="h", - byteProgress=byteProgress, - ) - - @staticmethod - def tostring(elem: Element) -> str: - from lxml import etree as ET - - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def setCreationTime(self, header: str) -> None: - m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) - if m is None: - return - self._glos.setInfo("creationTime", m.group(1)) - - def setMetadata(self, header: str) -> None: - # TODO: self.set_info("edition", ...) - self.setCreationTime(header) - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._wordCount = 0 - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._fileSize = 0 - self._link_number_postfix = re.compile("・[0-9]+$") - - def __len__(self) -> int: - return self._wordCount - - def close(self) -> None: - if self._file: - self._file.close() - self._file = nullBinaryIO - - def open( - self, - filename: str, - ) -> None: - try: - from lxml import etree as ET # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - self._fileSize = os.path.getsize(filename) - - self._glos.sourceLangName = "Japanese" - - self._glos.setDefaultDefiFormat("h") - self._glos.setInfo("definition_has_headwords", "True") - self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") - # also good: f"https://sakuradict.com/search?q={{word}}" - - header = "" - with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: - text_file = cast("io.TextIOBase", text_file) - for line in text_file: - if "" in line: - break - header += line - self.setMetadata(header) - - self._file = compressionOpen(filename, mode="rb") - - def __iter__(self) -> Iterator[EntryType]: - from lxml import etree as ET - - context = ET.iterparse( # type: ignore # noqa: PGH003 - self._file, - events=("end",), - tag="entry", - ) - for _, _elem in context: - elem = cast("Element", _elem) - yield self.getEntryByElem(elem) - # clean up preceding siblings to save memory - # this reduces memory usage from ~64 MB to ~30 MB - parent = elem.getparent() - if parent is None: - continue - while elem.getprevious() is not None: - del parent[0] diff --git a/pyglossary/plugins/jmnedict/reader.py b/pyglossary/plugins/jmnedict/reader.py new file mode 100644 index 000000000..8d25b8ce1 --- /dev/null +++ b/pyglossary/plugins/jmnedict/reader.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 
-*- +from __future__ import annotations + +import os +import re +from io import BytesIO +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Callable, Iterator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + from pyglossary.lxml_types import Element, T_htmlfile + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, pip +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + tagStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" + re_inf_mapping = { + gikun_key: "gikun/jukujikun", + "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete + "word containing irregular kana usage": "irregular", + } + + @staticmethod + def makeList( + hf: T_htmlfile, + input_objects: list[Element], + processor: Callable, + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into
                      if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + hf.write(single_prefix) + processor(hf, input_objects[0]) + return + + with hf.element("ol"): + for el in input_objects: + with hf.element("li"): + processor(hf, el) + + def writeTrans( + self, + hf: T_htmlfile, + trans: Element, + ) -> None: + from lxml import etree as ET + + def br() -> Element: + return ET.Element("br") + + for elem in trans.findall("name_type"): + if not elem.text: + continue + desc = elem.text + with hf.element("i"): + hf.write(desc.capitalize()) + hf.write(br()) + + for elem in trans.findall("trans_det"): + if not elem.text: + continue + desc = elem.text + hf.write(desc) + hf.write(br()) + + relatedWords: list[str] = [] + for elem in trans.findall("xref"): + if not elem.text: + continue + word = elem.text.strip() + word = self._link_number_postfix.sub("", word) + relatedWords.append(word) + + if relatedWords: + hf.write("Related: ") + for i, word in enumerate(relatedWords): + if i > 0: + with hf.element("big"): + hf.write(" | ") + with hf.element("a", href=f"bword://{word}"): + hf.write(word) + hf.write(br()) + + def getEntryByElem( # noqa: PLR0912 + self, + entry: Element, + ) -> EntryType: + from lxml import etree as ET + + glos = self._glos + keywords: list[str] = [] + f = BytesIO() + + def br() -> Element: + return ET.Element("br") + + with ET.htmlfile(f, encoding="utf-8") as hf: # noqa: PLR1702 + kebList: list[str] = [] + rebList: list[tuple[str, list[str]]] = [] + with hf.element("div"): + for k_ele in entry.findall("k_ele"): + keb = k_ele.find("keb") + if keb is None: + continue + if not keb.text: + continue + kebList.append(keb.text) + keywords.append(keb.text) + # for elem in k_ele.findall("ke_pri"): + # log.info(elem.text) + + for r_ele in entry.findall("r_ele"): + reb = r_ele.find("reb") + if reb is None: + continue + if not reb.text: + continue + props: list[str] = [] + if r_ele.find("re_nokanji") is not None: + props.append("no kanji") + inf = r_ele.find("re_inf") + if inf is not None and inf.text: + props.append( + self.re_inf_mapping.get(inf.text, inf.text), + ) + rebList.append((reb.text, props)) + keywords.append(reb.text) + # for elem in r_ele.findall("re_pri"): + # log.info(elem.text) + + # this is for making internal links valid + # this makes too many alternates! 
+ # but we don't seem to have a choice + # except for scanning and indexing all words once + # and then starting over and fixing/optimizing links + for s_keb in kebList: + for s_reb, _ in rebList: + keywords.append(f"{s_keb}・{s_reb}") + + if kebList: + with hf.element(glos.titleTag(kebList[0])): + for i, s_keb in enumerate(kebList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + hf.write(s_keb) + hf.write(br()) + + if rebList: + for i, (s_reb, props) in enumerate(rebList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + with hf.element("font", color="green"): + hf.write(s_reb) + for prop in props: + hf.write(" ") + with hf.element("small"): + with hf.element("span", style=self.tagStyle): + hf.write(prop) + hf.write(br()) + + hf_ = cast("T_htmlfile", hf) + self.makeList( + hf_, + entry.findall("trans"), + self.writeTrans, + ) + + defi = f.getvalue().decode("utf-8") + file = self._file + byteProgress = (file.tell(), self._fileSize) + return self._glos.newEntry( + keywords, + defi, + defiFormat="h", + byteProgress=byteProgress, + ) + + @staticmethod + def tostring(elem: Element) -> str: + from lxml import etree as ET + + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def setCreationTime(self, header: str) -> None: + m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) + if m is None: + return + self._glos.setInfo("creationTime", m.group(1)) + + def setMetadata(self, header: str) -> None: + # TODO: self.set_info("edition", ...) + self.setCreationTime(header) + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._wordCount = 0 + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._fileSize = 0 + self._link_number_postfix = re.compile("・[0-9]+$") + + def __len__(self) -> int: + return self._wordCount + + def close(self) -> None: + if self._file: + self._file.close() + self._file = nullBinaryIO + + def open( + self, + filename: str, + ) -> None: + try: + from lxml import etree as ET # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + self._fileSize = os.path.getsize(filename) + + self._glos.sourceLangName = "Japanese" + + self._glos.setDefaultDefiFormat("h") + self._glos.setInfo("definition_has_headwords", "True") + self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") + # also good: f"https://sakuradict.com/search?q={{word}}" + + header = "" + with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: + text_file = cast("io.TextIOBase", text_file) + for line in text_file: + if "" in line: + break + header += line + self.setMetadata(header) + + self._file = compressionOpen(filename, mode="rb") + + def __iter__(self) -> Iterator[EntryType]: + from lxml import etree as ET + + context = ET.iterparse( # type: ignore # noqa: PGH003 + self._file, + events=("end",), + tag="entry", + ) + for _, _elem in context: + elem = cast("Element", _elem) + yield self.getEntryByElem(elem) + # clean up preceding siblings to save memory + # this reduces memory usage from ~64 MB to ~30 MB + parent = elem.getparent() + if parent is None: + continue + while elem.getprevious() is not None: + del parent[0] diff --git a/pyglossary/plugins/json_plugin/__init__.py b/pyglossary/plugins/json_plugin/__init__.py index 83fdbbb10..a21b50f69 100644 --- a/pyglossary/plugins/json_plugin/__init__.py +++ b/pyglossary/plugins/json_plugin/__init__.py @@ 
-2,25 +2,13 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - # compressionOpen, - stdCompressions, -) from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) +from .writer import Writer __all__ = [ "Writer", @@ -58,53 +46,3 @@ comment="add headwords title to beginning of definition", ), } - - -class Writer: - _encoding: str = "utf-8" - _enable_info: bool = True - _resources: bool = True - _word_title: bool = False - - compressions = stdCompressions - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - glos.preventDuplicateWords() - - def open(self, filename: str) -> None: - self._filename = filename - - def finish(self) -> None: - self._filename = "" - - def write(self) -> Generator[None, EntryType, None]: - from json import dumps - - from pyglossary.text_writer import writeTxt - - glos = self._glos - encoding = self._encoding - enable_info = self._enable_info - resources = self._resources - - ensure_ascii = encoding == "ascii" - - def escape(st: str) -> str: - return dumps(st, ensure_ascii=ensure_ascii) - - yield from writeTxt( - glos, - entryFmt="\t{word}: {defi},\n", - filename=self._filename, - encoding=encoding, - writeInfo=enable_info, - wordEscapeFunc=escape, - defiEscapeFunc=escape, - ext=".json", - head="{\n", - tail='\t"": ""\n}', - resources=resources, - word_title=self._word_title, - ) diff --git a/pyglossary/plugins/json_plugin/writer.py b/pyglossary/plugins/json_plugin/writer.py new file mode 100644 index 000000000..f43b88a7e --- /dev/null +++ b/pyglossary/plugins/json_plugin/writer.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + + +class Writer: + _encoding: str = "utf-8" + _enable_info: bool = True + _resources: bool = True + _word_title: bool = False + + compressions = stdCompressions + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + glos.preventDuplicateWords() + + def open(self, filename: str) -> None: + self._filename = filename + + def finish(self) -> None: + self._filename = "" + + def write(self) -> Generator[None, EntryType, None]: + from json import dumps + + from pyglossary.text_writer import writeTxt + + glos = self._glos + encoding = self._encoding + enable_info = self._enable_info + resources = self._resources + + ensure_ascii = encoding == "ascii" + + def escape(st: str) -> str: + return dumps(st, ensure_ascii=ensure_ascii) + + yield from writeTxt( + glos, + entryFmt="\t{word}: {defi},\n", + filename=self._filename, + encoding=encoding, + writeInfo=enable_info, + wordEscapeFunc=escape, + defiEscapeFunc=escape, + ext=".json", + head="{\n", + tail='\t"": ""\n}', + resources=resources, + word_title=self._word_title, + ) diff --git a/pyglossary/plugins/lingoes_ldf/__init__.py b/pyglossary/plugins/lingoes_ldf/__init__.py index 41f9c3269..e63e43e93 100644 --- a/pyglossary/plugins/lingoes_ldf/__init__.py +++ b/pyglossary/plugins/lingoes_ldf/__init__.py @@ -1,27 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING - -from 
pyglossary.compression import ( - # compressionOpen, - stdCompressions, -) -from pyglossary.core import log -from pyglossary.file_utils import fileCountLines from pyglossary.option import ( BoolOption, EncodingOption, NewlineOption, Option, ) -from pyglossary.text_reader import TextGlossaryReader, nextBlockResultType -from pyglossary.text_utils import splitByBar -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -57,121 +45,3 @@ "resources": BoolOption(comment="Enable resources / data files"), "encoding": EncodingOption(), } - - -class Reader(TextGlossaryReader): - compressions = stdCompressions - - def __len__(self) -> int: - if self._wordCount is None: - log.debug("Try not to use len(reader) as it takes extra time") - self._wordCount = ( - fileCountLines( - self._filename, - newline=b"\n\n", - ) - - self._leadingLinesCount - ) - return self._wordCount - - @classmethod - def isInfoWord(cls, word: str) -> bool: - if isinstance(word, str): - return word.startswith("#") - - return False - - @classmethod - def fixInfoWord(cls, word: str) -> str: - if isinstance(word, str): - return word.lstrip("#").lower() - - return word - - def nextBlock(self) -> nextBlockResultType: - if not self._file: - raise StopIteration - entryLines: list[str] = [] - while True: - line = self.readline() - if not line: - raise StopIteration - line = line.rstrip("\n\r") # FIXME - if line.startswith("###"): - parts = line.split(":") - key = parts[0].strip() - value = ":".join(parts[1:]).strip() - return key, value, None - - if line: - entryLines.append(line) - continue - - # now `line` is empty, process `entryLines` - if not entryLines: - return None - if len(entryLines) < 2: - log.error( - f"invalid block near pos {self._file.tell()}" - f" in file {self._filename}", - ) - return None - word = entryLines[0] - defi = "\n".join(entryLines[1:]) - defi = defi.replace("
                      ", "\n") # FIXME - - words = splitByBar(word) - - return words, defi, None - - -class Writer: - compressions = stdCompressions - - _newline: str = "\n" - _resources: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - - def getInfo(self, key: str) -> str: - return self._glos.getInfo(key).replace("\n", "
                      ") - - def getAuthor(self) -> str: - return self._glos.author.replace("\n", "
                      ") - - def finish(self) -> None: - self._filename = "" - - def open(self, filename: str) -> None: - self._filename = filename - - @staticmethod - def _defiEscapeFunc(defi: str) -> str: - return defi.replace("\n", "
                      ") - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.text_writer import writeTxt - - newline = self._newline - resources = self._resources - head = ( - f"###Title: {self.getInfo('title')}\n" - f"###Description: {self.getInfo('description')}\n" - f"###Author: {self.getAuthor()}\n" - f"###Email: {self.getInfo('email')}\n" - f"###Website: {self.getInfo('website')}\n" - f"###Copyright: {self.getInfo('copyright')}\n" - ) - yield from writeTxt( - self._glos, - entryFmt="{word}\n{defi}\n\n", - filename=self._filename, - writeInfo=False, - defiEscapeFunc=self._defiEscapeFunc, - ext=".ldf", - head=head, - newline=newline, - resources=resources, - ) diff --git a/pyglossary/plugins/lingoes_ldf/reader.py b/pyglossary/plugins/lingoes_ldf/reader.py new file mode 100644 index 000000000..211056bfe --- /dev/null +++ b/pyglossary/plugins/lingoes_ldf/reader.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) +from pyglossary.core import log +from pyglossary.file_utils import fileCountLines +from pyglossary.text_reader import TextGlossaryReader, nextBlockResultType +from pyglossary.text_utils import splitByBar + + +class Reader(TextGlossaryReader): + compressions = stdCompressions + + def __len__(self) -> int: + if self._wordCount is None: + log.debug("Try not to use len(reader) as it takes extra time") + self._wordCount = ( + fileCountLines( + self._filename, + newline=b"\n\n", + ) + - self._leadingLinesCount + ) + return self._wordCount + + @classmethod + def isInfoWord(cls, word: str) -> bool: + if isinstance(word, str): + return word.startswith("#") + + return False + + @classmethod + def fixInfoWord(cls, word: str) -> str: + if isinstance(word, str): + return word.lstrip("#").lower() + + return word + + def nextBlock(self) -> nextBlockResultType: + if not self._file: + raise StopIteration + entryLines: list[str] = [] + while True: + line = self.readline() + if not line: + raise StopIteration + line = line.rstrip("\n\r") # FIXME + if line.startswith("###"): + parts = line.split(":") + key = parts[0].strip() + value = ":".join(parts[1:]).strip() + return key, value, None + + if line: + entryLines.append(line) + continue + + # now `line` is empty, process `entryLines` + if not entryLines: + return None + if len(entryLines) < 2: + log.error( + f"invalid block near pos {self._file.tell()}" + f" in file {self._filename}", + ) + return None + word = entryLines[0] + defi = "\n".join(entryLines[1:]) + defi = defi.replace("
                      ", "\n") # FIXME + + words = splitByBar(word) + + return words, defi, None diff --git a/pyglossary/plugins/lingoes_ldf/writer.py b/pyglossary/plugins/lingoes_ldf/writer.py new file mode 100644 index 000000000..93004246f --- /dev/null +++ b/pyglossary/plugins/lingoes_ldf/writer.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + compressions = stdCompressions + + _newline: str = "\n" + _resources: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + + def getInfo(self, key: str) -> str: + return self._glos.getInfo(key).replace("\n", "
                      ") + + def getAuthor(self) -> str: + return self._glos.author.replace("\n", "
                      ") + + def finish(self) -> None: + self._filename = "" + + def open(self, filename: str) -> None: + self._filename = filename + + @staticmethod + def _defiEscapeFunc(defi: str) -> str: + return defi.replace("\n", "
                      ") + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.text_writer import writeTxt + + newline = self._newline + resources = self._resources + head = ( + f"###Title: {self.getInfo('title')}\n" + f"###Description: {self.getInfo('description')}\n" + f"###Author: {self.getAuthor()}\n" + f"###Email: {self.getInfo('email')}\n" + f"###Website: {self.getInfo('website')}\n" + f"###Copyright: {self.getInfo('copyright')}\n" + ) + yield from writeTxt( + self._glos, + entryFmt="{word}\n{defi}\n\n", + filename=self._filename, + writeInfo=False, + defiEscapeFunc=self._defiEscapeFunc, + ext=".ldf", + head=head, + newline=newline, + resources=resources, + ) diff --git a/pyglossary/plugins/makindo_medical/__init__.py b/pyglossary/plugins/makindo_medical/__init__.py index 2e2f5f579..07f783113 100644 --- a/pyglossary/plugins/makindo_medical/__init__.py +++ b/pyglossary/plugins/makindo_medical/__init__.py @@ -1,14 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING -if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator +from .reader import Reader - from pyglossary.glossary_types import EntryType, GlossaryType +if TYPE_CHECKING: from pyglossary.option import Option __all__ = [ @@ -40,50 +37,3 @@ "Makindo.co.uk Comprehensive Medical Encyclopedia", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) from NEW_TABLE") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "select _id, contents from NEW_TABLE where _id is not null", - ) - # FIXME: iteration over self._cur stops after one entry - # and self._cur.fetchone() returns None - # for row in self._cur: - for row in self._cur.fetchall(): - word = html.unescape(row[0]) - definition = row[1].decode("utf-8", errors="ignore") - # print(f"{word!r}, {definition!r}") - yield self._glos.newEntry(word, definition, defiFormat="h") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/makindo_medical/reader.py b/pyglossary/plugins/makindo_medical/reader.py new file mode 100644 index 000000000..14bb29cd2 --- /dev/null +++ b/pyglossary/plugins/makindo_medical/reader.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = 
connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) from NEW_TABLE") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "select _id, contents from NEW_TABLE where _id is not null", + ) + # FIXME: iteration over self._cur stops after one entry + # and self._cur.fetchone() returns None + # for row in self._cur: + for row in self._cur.fetchall(): + word = html.unescape(row[0]) + definition = row[1].decode("utf-8", errors="ignore") + # print(f"{word!r}, {definition!r}") + yield self._glos.newEntry(word, definition, defiFormat="h") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/octopus_mdict_new/__init__.py b/pyglossary/plugins/octopus_mdict_new/__init__.py index 244609819..bdd3aa239 100644 --- a/pyglossary/plugins/octopus_mdict_new/__init__.py +++ b/pyglossary/plugins/octopus_mdict_new/__init__.py @@ -1,43 +1,13 @@ # -*- coding: utf-8 -*- -# Read Octopus MDict dictionary format, mdx(dictionary)/mdd(data) -# -# Copyright © 2013 Xiaoqiang Wang -# Copyright © 2013-2021 Saeed Rasooli -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# You can get a copy of GNU General Public License along this program -# But you can always get it from http://www.gnu.org/licenses/gpl.txt -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. from __future__ import annotations -import gc -import os -import re -import sys -from os.path import dirname, extsep, isfile, join, splitext -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.plugin_lib.readmdict import MDD, MDX - - -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from pyglossary.text_utils import toStr + +from .reader import Reader __all__ = [ "Reader", @@ -87,189 +57,3 @@ then try to install [LZO library and Python binding](./doc/lzo.md).""", ), ] - - -class Reader: - _encoding: str = "" - _substyle: bool = True - _same_dir_data_files: bool = False - _audio: bool = False - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self.clear() - self._re_internal_link = re.compile("href=([\"'])(entry://|[dx]:)") - self._re_audio_link = re.compile( - ']*? 
)?href="sound://([^<>"]+)"( .*?)?>(.*?)', - ) - - def clear(self) -> None: - self._filename = "" - self._mdx: MDX | None = None - self._mdd: list[MDD] = [] - self._wordCount = 0 - self._dataEntryCount = 0 - - # dict of mainWord -> newline-separated alternatives - self._linksDict: dict[str, str] = {} - - def open(self, filename: str) -> None: - from pyglossary.plugin_lib.readmdict import MDD, MDX - - self._filename = filename - self._mdx = MDX(filename, self._encoding, self._substyle) - - """ - multiple MDD files are supported with this naming schema: - FILE.mdx - FILE.mdd - FILE.1.mdd - FILE.2.mdd - FILE.3.mdd - """ - - filenameNoExt, _ext = splitext(self._filename) - mddBase = filenameNoExt + extsep - for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"): - if isfile(fname): - self._mdd.append(MDD(fname)) - mddN = 2 - while isfile(f"{mddBase}{mddN}.mdd"): - self._mdd.append(MDD(f"{mddBase}{mddN}.mdd")) - mddN += 1 - - dataEntryCount = 0 - for mdd in self._mdd: - dataEntryCount += len(mdd) - self._dataEntryCount = dataEntryCount - log.info(f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries") - - # from pprint import pformat - # log.debug("mdx.header = " + pformat(self._mdx.header)) - # for key, value in self._mdx.header.items(): - # key = key.lower() - # self._glos.setInfo(key, value) - try: - title = toStr(self._mdx.header[b"Title"]) - except KeyError: - pass - else: - title = title.strip() - if title == "Title (No HTML code allowed)": - # TODO: how to avoid this? - title = "" - if title: - self._glos.setInfo("name", title) - desc = toStr(self._mdx.header.get(b"Description", "")) - if desc: - self._glos.setInfo("description", desc) - - self.loadLinks() - - def loadLinks(self) -> None: - from pyglossary.plugin_lib.readmdict import MDX - - mdx = self._mdx - if mdx is None: - raise ValueError("mdx is None") - - log.info("extracting links...") - linksDict: dict[str, str] = {} - word = "" - wordCount = 0 - for b_word, b_defi in mdx.items(): - word = b_word.decode("utf-8") - defi = b_defi.decode("utf-8").strip() - if defi.startswith("@@@LINK="): - if not word: - log.warning(f"unexpected defi: {defi}") - continue - mainWord = defi[8:] - if mainWord in linksDict: - linksDict[mainWord] += "\n" + word - else: - linksDict[mainWord] = word - continue - wordCount += 1 - - log.info( - f"extracting links done, sizeof(linksDict)={sys.getsizeof(linksDict)}", - ) - log.info(f"{wordCount = }") - self._linksDict = linksDict - self._wordCount = wordCount - self._mdx = MDX(self._filename, self._encoding, self._substyle) - - def fixDefi(self, defi: str) -> str: - defi = self._re_internal_link.sub(r"href=\1bword://", defi) - defi = defi.replace(' src="file://', ' src=".') - - if self._audio: - # \5 is the possible elements between and - # but anything between and is completely - # ignored by Aaard2 Web and browser - # and there is no point adding it after - # which makes it shown after audio controls - - # GoldenDict acts completely different, so must use - # audio_goldendict=True option in StarDict writer instead. 
- - defi = self._re_audio_link.sub( - r'', - defi, - ) - - return defi - - def __iter__(self) -> Iterator[EntryType]: - if self._mdx is None: - log.error("trying to iterate on a closed MDX file") - return - - glos = self._glos - linksDict = self._linksDict - for b_word, b_defi in self._mdx.items(): - word = b_word.decode("utf-8") - defi = b_defi.decode("utf-8").strip() - if defi.startswith("@@@LINK="): - continue - defi = self.fixDefi(defi) - words = word - altsStr = linksDict.get(word, "") - if altsStr: - words = [word] + altsStr.split("\n") - yield glos.newEntry(words, defi) - - self._mdx = None - del linksDict - self._linksDict = {} - gc.collect() - - if self._same_dir_data_files: - dirPath = dirname(self._filename) - for fname in os.listdir(dirPath): - ext = splitext(fname)[1].lower() - if ext in {".mdx", ".mdd"}: - continue - fpath = join(dirPath, fname) - if not isfile(fpath): - continue - with open(fpath, mode="rb") as _file: - b_data = _file.read() - yield glos.newDataEntry(fname, b_data) - - for mdd in self._mdd: - try: - for b_fname, b_data in mdd.items(): - fname = toStr(b_fname) - fname = fname.replace("\\", os.sep).lstrip(os.sep) - yield glos.newDataEntry(fname, b_data) - except Exception: # noqa: PERF203 - log.exception(f"Error reading {mdd.filename}") - self._mdd = [] - - def __len__(self) -> int: - return self._wordCount + self._dataEntryCount - - def close(self) -> None: - self.clear() diff --git a/pyglossary/plugins/octopus_mdict_new/reader.py b/pyglossary/plugins/octopus_mdict_new/reader.py new file mode 100644 index 000000000..f154200dc --- /dev/null +++ b/pyglossary/plugins/octopus_mdict_new/reader.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- +# Read Octopus MDict dictionary format, mdx(dictionary)/mdd(data) +# +# Copyright © 2013 Xiaoqiang Wang +# Copyright © 2013-2021 Saeed Rasooli +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# You can get a copy of GNU General Public License along this program +# But you can always get it from http://www.gnu.org/licenses/gpl.txt +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +from __future__ import annotations + +import gc +import os +import re +import sys +from os.path import dirname, extsep, isfile, join, splitext +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.plugin_lib.readmdict import MDD, MDX + + +from pyglossary.core import log +from pyglossary.text_utils import toStr + + +class Reader: + _encoding: str = "" + _substyle: bool = True + _same_dir_data_files: bool = False + _audio: bool = False + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.clear() + self._re_internal_link = re.compile("href=([\"'])(entry://|[dx]:)") + self._re_audio_link = re.compile( + ']*? 
)?href="sound://([^<>"]+)"( .*?)?>(.*?)', + ) + + def clear(self) -> None: + self._filename = "" + self._mdx: MDX | None = None + self._mdd: list[MDD] = [] + self._wordCount = 0 + self._dataEntryCount = 0 + + # dict of mainWord -> newline-separated alternatives + self._linksDict: dict[str, str] = {} + + def open(self, filename: str) -> None: + from pyglossary.plugin_lib.readmdict import MDD, MDX + + self._filename = filename + self._mdx = MDX(filename, self._encoding, self._substyle) + + """ + multiple MDD files are supported with this naming schema: + FILE.mdx + FILE.mdd + FILE.1.mdd + FILE.2.mdd + FILE.3.mdd + """ + + filenameNoExt, _ext = splitext(self._filename) + mddBase = filenameNoExt + extsep + for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"): + if isfile(fname): + self._mdd.append(MDD(fname)) + mddN = 2 + while isfile(f"{mddBase}{mddN}.mdd"): + self._mdd.append(MDD(f"{mddBase}{mddN}.mdd")) + mddN += 1 + + dataEntryCount = 0 + for mdd in self._mdd: + dataEntryCount += len(mdd) + self._dataEntryCount = dataEntryCount + log.info(f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries") + + # from pprint import pformat + # log.debug("mdx.header = " + pformat(self._mdx.header)) + # for key, value in self._mdx.header.items(): + # key = key.lower() + # self._glos.setInfo(key, value) + try: + title = toStr(self._mdx.header[b"Title"]) + except KeyError: + pass + else: + title = title.strip() + if title == "Title (No HTML code allowed)": + # TODO: how to avoid this? + title = "" + if title: + self._glos.setInfo("name", title) + desc = toStr(self._mdx.header.get(b"Description", "")) + if desc: + self._glos.setInfo("description", desc) + + self.loadLinks() + + def loadLinks(self) -> None: + from pyglossary.plugin_lib.readmdict import MDX + + mdx = self._mdx + if mdx is None: + raise ValueError("mdx is None") + + log.info("extracting links...") + linksDict: dict[str, str] = {} + word = "" + wordCount = 0 + for b_word, b_defi in mdx.items(): + word = b_word.decode("utf-8") + defi = b_defi.decode("utf-8").strip() + if defi.startswith("@@@LINK="): + if not word: + log.warning(f"unexpected defi: {defi}") + continue + mainWord = defi[8:] + if mainWord in linksDict: + linksDict[mainWord] += "\n" + word + else: + linksDict[mainWord] = word + continue + wordCount += 1 + + log.info( + f"extracting links done, sizeof(linksDict)={sys.getsizeof(linksDict)}", + ) + log.info(f"{wordCount = }") + self._linksDict = linksDict + self._wordCount = wordCount + self._mdx = MDX(self._filename, self._encoding, self._substyle) + + def fixDefi(self, defi: str) -> str: + defi = self._re_internal_link.sub(r"href=\1bword://", defi) + defi = defi.replace(' src="file://', ' src=".') + + if self._audio: + # \5 is the possible elements between and + # but anything between and is completely + # ignored by Aaard2 Web and browser + # and there is no point adding it after + # which makes it shown after audio controls + + # GoldenDict acts completely different, so must use + # audio_goldendict=True option in StarDict writer instead. 
+ + defi = self._re_audio_link.sub( + r'', + defi, + ) + + return defi + + def __iter__(self) -> Iterator[EntryType]: + if self._mdx is None: + log.error("trying to iterate on a closed MDX file") + return + + glos = self._glos + linksDict = self._linksDict + for b_word, b_defi in self._mdx.items(): + word = b_word.decode("utf-8") + defi = b_defi.decode("utf-8").strip() + if defi.startswith("@@@LINK="): + continue + defi = self.fixDefi(defi) + words = word + altsStr = linksDict.get(word, "") + if altsStr: + words = [word] + altsStr.split("\n") + yield glos.newEntry(words, defi) + + self._mdx = None + del linksDict + self._linksDict = {} + gc.collect() + + if self._same_dir_data_files: + dirPath = dirname(self._filename) + for fname in os.listdir(dirPath): + ext = splitext(fname)[1].lower() + if ext in {".mdx", ".mdd"}: + continue + fpath = join(dirPath, fname) + if not isfile(fpath): + continue + with open(fpath, mode="rb") as _file: + b_data = _file.read() + yield glos.newDataEntry(fname, b_data) + + for mdd in self._mdd: + try: + for b_fname, b_data in mdd.items(): + fname = toStr(b_fname) + fname = fname.replace("\\", os.sep).lstrip(os.sep) + yield glos.newDataEntry(fname, b_data) + except Exception: # noqa: PERF203 + log.exception(f"Error reading {mdd.filename}") + self._mdd = [] + + def __len__(self) -> int: + return self._wordCount + self._dataEntryCount + + def close(self) -> None: + self.clear() diff --git a/pyglossary/plugins/sql/__init__.py b/pyglossary/plugins/sql/__init__.py index fce4cfb56..c0629c979 100644 --- a/pyglossary/plugins/sql/__init__.py +++ b/pyglossary/plugins/sql/__init__.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING - from pyglossary.option import ( BoolOption, EncodingOption, @@ -11,11 +9,7 @@ Option, ) -if TYPE_CHECKING: - import io - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -49,133 +43,3 @@ "newline": NewlineOption(), "transaction": BoolOption(comment="Use TRANSACTION"), } - - -class Writer: - _encoding: str = "utf-8" - _info_keys: list | None = None - _add_extra_info: bool = True - _newline: str = "
                      " - _transaction: bool = False - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase | None = None - - def finish(self) -> None: - self._filename = "" - if self._file: - self._file.close() - self._file = None - - def open(self, filename: str) -> None: - self._filename = filename - self._file = open(filename, "w", encoding=self._encoding) - self._writeInfo() - - def _writeInfo(self) -> None: - fileObj = self._file - if fileObj is None: - raise ValueError("fileObj is None") - newline = self._newline - info_keys = self._getInfoKeys() - infoDefLine = "CREATE TABLE dbinfo (" - infoValues: list[str] = [] - glos = self._glos - - for key in info_keys: - value = glos.getInfo(key) - value = ( - value.replace("'", "''") - .replace("\x00", "") - .replace("\r", "") - .replace("\n", newline) - ) - infoValues.append(f"'{value}'") - infoDefLine += f"{key} char({len(value)}), " - - infoDefLine = infoDefLine[:-2] + ");" - fileObj.write(infoDefLine + "\n") - - if self._add_extra_info: - fileObj.write( - "CREATE TABLE dbinfo_extra (" - "'id' INTEGER PRIMARY KEY NOT NULL, " - "'name' TEXT UNIQUE, 'value' TEXT);\n", - ) - - fileObj.write( - "CREATE TABLE word ('id' INTEGER PRIMARY KEY NOT NULL, " - "'w' TEXT, 'm' TEXT);\n", - ) - fileObj.write( - "CREATE TABLE alt ('id' INTEGER NOT NULL, 'w' TEXT);\n", - ) - - if self._transaction: - fileObj.write("BEGIN TRANSACTION;\n") - fileObj.write(f"INSERT INTO dbinfo VALUES({','.join(infoValues)});\n") - - if self._add_extra_info: - extraInfo = glos.getExtraInfos(info_keys) - for index, (key, value) in enumerate(extraInfo.items()): - key2 = key.replace("'", "''") - value2 = value.replace("'", "''") - fileObj.write( - f"INSERT INTO dbinfo_extra VALUES({index + 1}, " - f"'{key2}', '{value2}');\n", - ) - - def _getInfoKeys(self) -> list[str]: - info_keys = self._info_keys - if info_keys: - return info_keys - return [ - "dbname", - "author", - "version", - "direction", - "origLang", - "destLang", - "license", - "category", - "description", - ] - - def write(self) -> Generator[None, EntryType, None]: - newline = self._newline - - fileObj = self._file - if fileObj is None: - raise ValueError("fileObj is None") - - def fixStr(word: str) -> str: - return word.replace("'", "''").replace("\r", "").replace("\n", newline) - - id_ = 1 - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # FIXME - continue - words = entry.l_word - word = fixStr(words[0]) - defi = fixStr(entry.defi) - fileObj.write( - f"INSERT INTO word VALUES({id_}, '{word}', '{defi}');\n", - ) - for alt in words[1:]: - fileObj.write( - f"INSERT INTO alt VALUES({id_}, '{fixStr(alt)}');\n", - ) - id_ += 1 - - if self._transaction: - fileObj.write("END TRANSACTION;\n") - - fileObj.write("CREATE INDEX ix_word_w ON word(w COLLATE NOCASE);\n") - fileObj.write("CREATE INDEX ix_alt_id ON alt(id COLLATE NOCASE);\n") - fileObj.write("CREATE INDEX ix_alt_w ON alt(w COLLATE NOCASE);\n") diff --git a/pyglossary/plugins/sql/writer.py b/pyglossary/plugins/sql/writer.py new file mode 100644 index 000000000..64350fc5a --- /dev/null +++ b/pyglossary/plugins/sql/writer.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + _encoding: str = "utf-8" + _info_keys: list | None = None + _add_extra_info: bool = 
True + _newline: str = "
                      " + _transaction: bool = False + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase | None = None + + def finish(self) -> None: + self._filename = "" + if self._file: + self._file.close() + self._file = None + + def open(self, filename: str) -> None: + self._filename = filename + self._file = open(filename, "w", encoding=self._encoding) + self._writeInfo() + + def _writeInfo(self) -> None: + fileObj = self._file + if fileObj is None: + raise ValueError("fileObj is None") + newline = self._newline + info_keys = self._getInfoKeys() + infoDefLine = "CREATE TABLE dbinfo (" + infoValues: list[str] = [] + glos = self._glos + + for key in info_keys: + value = glos.getInfo(key) + value = ( + value.replace("'", "''") + .replace("\x00", "") + .replace("\r", "") + .replace("\n", newline) + ) + infoValues.append(f"'{value}'") + infoDefLine += f"{key} char({len(value)}), " + + infoDefLine = infoDefLine[:-2] + ");" + fileObj.write(infoDefLine + "\n") + + if self._add_extra_info: + fileObj.write( + "CREATE TABLE dbinfo_extra (" + "'id' INTEGER PRIMARY KEY NOT NULL, " + "'name' TEXT UNIQUE, 'value' TEXT);\n", + ) + + fileObj.write( + "CREATE TABLE word ('id' INTEGER PRIMARY KEY NOT NULL, " + "'w' TEXT, 'm' TEXT);\n", + ) + fileObj.write( + "CREATE TABLE alt ('id' INTEGER NOT NULL, 'w' TEXT);\n", + ) + + if self._transaction: + fileObj.write("BEGIN TRANSACTION;\n") + fileObj.write(f"INSERT INTO dbinfo VALUES({','.join(infoValues)});\n") + + if self._add_extra_info: + extraInfo = glos.getExtraInfos(info_keys) + for index, (key, value) in enumerate(extraInfo.items()): + key2 = key.replace("'", "''") + value2 = value.replace("'", "''") + fileObj.write( + f"INSERT INTO dbinfo_extra VALUES({index + 1}, " + f"'{key2}', '{value2}');\n", + ) + + def _getInfoKeys(self) -> list[str]: + info_keys = self._info_keys + if info_keys: + return info_keys + return [ + "dbname", + "author", + "version", + "direction", + "origLang", + "destLang", + "license", + "category", + "description", + ] + + def write(self) -> Generator[None, EntryType, None]: + newline = self._newline + + fileObj = self._file + if fileObj is None: + raise ValueError("fileObj is None") + + def fixStr(word: str) -> str: + return word.replace("'", "''").replace("\r", "").replace("\n", newline) + + id_ = 1 + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # FIXME + continue + words = entry.l_word + word = fixStr(words[0]) + defi = fixStr(entry.defi) + fileObj.write( + f"INSERT INTO word VALUES({id_}, '{word}', '{defi}');\n", + ) + for alt in words[1:]: + fileObj.write( + f"INSERT INTO alt VALUES({id_}, '{fixStr(alt)}');\n", + ) + id_ += 1 + + if self._transaction: + fileObj.write("END TRANSACTION;\n") + + fileObj.write("CREATE INDEX ix_word_w ON word(w COLLATE NOCASE);\n") + fileObj.write("CREATE INDEX ix_alt_id ON alt(id COLLATE NOCASE);\n") + fileObj.write("CREATE INDEX ix_alt_w ON alt(w COLLATE NOCASE);\n") diff --git a/pyglossary/plugins/stardict_merge_syns/__init__.py b/pyglossary/plugins/stardict_merge_syns/__init__.py index b13cb423f..d1ef62fc7 100644 --- a/pyglossary/plugins/stardict_merge_syns/__init__.py +++ b/pyglossary/plugins/stardict_merge_syns/__init__.py @@ -2,10 +2,6 @@ from __future__ import annotations import os -from time import perf_counter as now -from typing import ( - TYPE_CHECKING, -) from pyglossary.flags import ALWAYS, DEFAULT_YES from pyglossary.option import ( @@ -13,17 +9,8 @@ Option, StrOption, ) -from 
pyglossary.plugins.stardict import Writer as StdWriter - -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType - -from pyglossary.core import log -from pyglossary.glossary_utils import Error -from pyglossary.text_utils import uint32ToBytes +from .writer import Writer __all__ = [ "Writer", @@ -97,121 +84,3 @@ if os.getenv("PYGLOSSARY_STARDICT_NO_FORCE_SORT") == "1": sortOnWrite = DEFAULT_YES - - -class Writer(StdWriter): - dictzipSynFile = False - - def fixDefi(self, defi: str, defiFormat: str) -> bytes: # noqa: ARG002, PLR6301 - return defi.encode("utf-8") - - def writeCompact( - self, - defiFormat: str, - ) -> Generator[None, EntryType, None]: - """ - Build StarDict dictionary with sametypesequence option specified. - Every item definition consists of a single article. - All articles have the same format, specified in defiFormat parameter. - - defiFormat - format of article definition: h - html, m - plain text - """ - log.debug(f"writeCompact: {defiFormat=}") - - idxBlockList = self.newIdxList() - altIndexList = self.newSynList() - - dictFile = open(self._filename + ".dict", "wb") - - t0 = now() - - dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() - - dictMark, entryIndex = 0, -1 - while True: - entry = yield - if entry is None: - break - if entry.isData(): - entry.save(self._resDir) - continue - entryIndex += 1 - - b_dictBlock = self.fixDefi(entry.defi, defiFormat) - dictFile.write(b_dictBlock) - - b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) - for b_word in entry.lb_word: - idxBlockList.append((b_word, b_idxBlock)) - - dictMark += len(b_dictBlock) - - if dictMark > dictMarkMax: - raise Error( - f"StarDict: {dictMark = } is too big, set option large_file=true", - ) - - dictFile.close() - log.info(f"Writing dict file took {now() - t0:.2f} seconds") - - self.writeIdxFile(idxBlockList) - - self.writeIfoFile( - len(idxBlockList), - len(altIndexList), - ) - - def writeGeneral(self) -> Generator[None, EntryType, None]: - """ - Build StarDict dictionary in general case. - Every item definition may consist of an arbitrary number of articles. - sametypesequence option is not used. 
- """ - log.debug("writeGeneral") - idxBlockList = self.newIdxList() - altIndexList = self.newSynList() - - dictFile = open(self._filename + ".dict", "wb") - - t0 = now() - - dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() - - dictMark, entryIndex = 0, -1 - while True: - entry = yield - if entry is None: - break - if entry.isData(): - entry.save(self._resDir) - continue - entryIndex += 1 - - defiFormat = entry.detectDefiFormat("m") # call no more than once - - b_defi = self.fixDefi(entry.defi, defiFormat) - b_dictBlock = defiFormat.encode("ascii") + b_defi + b"\x00" - dictFile.write(b_dictBlock) - - b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) - for b_word in entry.lb_word: - idxBlockList.append((b_word, b_idxBlock)) - - dictMark += len(b_dictBlock) - - if dictMark > dictMarkMax: - raise Error( - f"StarDict: {dictMark = } is too big, set option large_file=true", - ) - - dictFile.close() - log.info(f"Writing dict file took {now() - t0:.2f} seconds") - - self.writeIdxFile(idxBlockList) - - self.writeIfoFile( - len(idxBlockList), - len(altIndexList), - ) - - # TODO: override getDescription to indicate merge_syns diff --git a/pyglossary/plugins/stardict_merge_syns/writer.py b/pyglossary/plugins/stardict_merge_syns/writer.py new file mode 100644 index 000000000..ba0349d04 --- /dev/null +++ b/pyglossary/plugins/stardict_merge_syns/writer.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from time import perf_counter as now +from typing import ( + TYPE_CHECKING, +) + +from pyglossary.plugins.stardict import Writer as StdWriter + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType + + +from pyglossary.core import log +from pyglossary.glossary_utils import Error +from pyglossary.text_utils import uint32ToBytes + + +class Writer(StdWriter): + dictzipSynFile = False + + def fixDefi(self, defi: str, defiFormat: str) -> bytes: # noqa: ARG002, PLR6301 + return defi.encode("utf-8") + + def writeCompact( + self, + defiFormat: str, + ) -> Generator[None, EntryType, None]: + """ + Build StarDict dictionary with sametypesequence option specified. + Every item definition consists of a single article. + All articles have the same format, specified in defiFormat parameter. + + defiFormat - format of article definition: h - html, m - plain text + """ + log.debug(f"writeCompact: {defiFormat=}") + + idxBlockList = self.newIdxList() + altIndexList = self.newSynList() + + dictFile = open(self._filename + ".dict", "wb") + + t0 = now() + + dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() + + dictMark, entryIndex = 0, -1 + while True: + entry = yield + if entry is None: + break + if entry.isData(): + entry.save(self._resDir) + continue + entryIndex += 1 + + b_dictBlock = self.fixDefi(entry.defi, defiFormat) + dictFile.write(b_dictBlock) + + b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) + for b_word in entry.lb_word: + idxBlockList.append((b_word, b_idxBlock)) + + dictMark += len(b_dictBlock) + + if dictMark > dictMarkMax: + raise Error( + f"StarDict: {dictMark = } is too big, set option large_file=true", + ) + + dictFile.close() + log.info(f"Writing dict file took {now() - t0:.2f} seconds") + + self.writeIdxFile(idxBlockList) + + self.writeIfoFile( + len(idxBlockList), + len(altIndexList), + ) + + def writeGeneral(self) -> Generator[None, EntryType, None]: + """ + Build StarDict dictionary in general case. 
+ Every item definition may consist of an arbitrary number of articles. + sametypesequence option is not used. + """ + log.debug("writeGeneral") + idxBlockList = self.newIdxList() + altIndexList = self.newSynList() + + dictFile = open(self._filename + ".dict", "wb") + + t0 = now() + + dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() + + dictMark, entryIndex = 0, -1 + while True: + entry = yield + if entry is None: + break + if entry.isData(): + entry.save(self._resDir) + continue + entryIndex += 1 + + defiFormat = entry.detectDefiFormat("m") # call no more than once + + b_defi = self.fixDefi(entry.defi, defiFormat) + b_dictBlock = defiFormat.encode("ascii") + b_defi + b"\x00" + dictFile.write(b_dictBlock) + + b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) + for b_word in entry.lb_word: + idxBlockList.append((b_word, b_idxBlock)) + + dictMark += len(b_dictBlock) + + if dictMark > dictMarkMax: + raise Error( + f"StarDict: {dictMark = } is too big, set option large_file=true", + ) + + dictFile.close() + log.info(f"Writing dict file took {now() - t0:.2f} seconds") + + self.writeIdxFile(idxBlockList) + + self.writeIfoFile( + len(idxBlockList), + len(altIndexList), + ) + + # TODO: override getDescription to indicate merge_syns diff --git a/pyglossary/plugins/stardict_textual/__init__.py b/pyglossary/plugins/stardict_textual/__init__.py index a54d04266..80dc78d69 100644 --- a/pyglossary/plugins/stardict_textual/__init__.py +++ b/pyglossary/plugins/stardict_textual/__init__.py @@ -1,34 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -from os.path import dirname, isdir, join -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - import io - from collections.abc import Generator, Iterator - - from lxml import builder - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element - from pyglossary.xdxf.transform import XdxfTransformer - - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, log, pip -from pyglossary.html_utils import unescape_unicode -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, EncodingOption, Option, ) +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -66,337 +47,3 @@ comment="Convert XDXF entries to HTML", ), } - - -class Reader: - _encoding: str = "utf-8" - _xdxf_to_html: bool = True - - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._fileSize = 0 - self._xdxfTr: XdxfTransformer | None = None - - def xdxf_setup(self) -> XdxfTransformer: - from pyglossary.xdxf.transform import XdxfTransformer - - self._xdxfTr = tr = XdxfTransformer(encoding="utf-8") - return tr - - def xdxf_transform(self, text: str) -> str: - tr = self._xdxfTr - if tr is None: - tr = self.xdxf_setup() - return tr.transformByInnerString(text) - - def __len__(self) -> int: - return 0 - - def close(self) -> None: - self._file.close() - self._file = nullBinaryIO - self._filename = "" - self._fileSize = 0 - - def open(self, filename: str) -> None: - try: - from lxml import etree as ET - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - cfile = compressionOpen(filename, mode="rb") - - if cfile.seekable(): - 
cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - # self._glos.setInfo("input_file_size", f"{self._fileSize}") - else: - log.warning("StarDict Textual File Reader: file is not seekable") - - context = ET.iterparse( # type: ignore # noqa: PGH003 - cfile, - events=("end",), - tag="info", - ) - for _, elem in context: - self.setMetadata(elem) # type: ignore - break - - cfile.close() - - def setGlosInfo(self, key: str, value: str) -> None: - if value is None: - return - self._glos.setInfo(key, unescape_unicode(value)) - - def setMetadata(self, header: Element) -> None: - if (elem := header.find("./bookname")) is not None and elem.text: - self.setGlosInfo("name", elem.text) - - if (elem := header.find("./author")) is not None and elem.text: - self.setGlosInfo("author", elem.text) - - if (elem := header.find("./email")) is not None and elem.text: - self.setGlosInfo("email", elem.text) - - if (elem := header.find("./website")) is not None and elem.text: - self.setGlosInfo("website", elem.text) - - if (elem := header.find("./description")) is not None and elem.text: - self.setGlosInfo("description", elem.text) - - if (elem := header.find("./bookname")) is not None and elem.text: - self.setGlosInfo("name", elem.text) - - if (elem := header.find("./bookname")) is not None and elem.text: - self.setGlosInfo("name", elem.text) - - if (elem := header.find("./date")) is not None and elem.text: - self.setGlosInfo("creationTime", elem.text) - - # if (elem := header.find("./dicttype")) is not None and elem.text: - # self.setGlosInfo("dicttype", elem.text) - - def renderDefiList( - self, - defisWithFormat: list[tuple[str, str]], - ) -> tuple[str, str]: - if not defisWithFormat: - return "", "" - if len(defisWithFormat) == 1: - return defisWithFormat[0] - - defiFormatSet: set[str] = set() - defiFormatSet.update(_type for _, _type in defisWithFormat) - - if len(defiFormatSet) == 1: - format_ = defiFormatSet.pop() - if format_ == "h": - return "\n
                      ".join([defi for defi, _ in defisWithFormat]), format_ - return "\n".join([defi for defi, _ in defisWithFormat]), format_ - - # convert plaintext or xdxf to html - defis: list[str] = [] - for defi_, format_ in defisWithFormat: - if format_ == "m": - defis.append("
                      " + defi_.replace("\n", "
                      ") + "
                      ") - elif format_ == "x": - defis.append(self.xdxf_transform(defi_)) - else: - defis.append(defi_) - return "\n
                      \n".join(defis), "h" - - def __iter__(self) -> Iterator[EntryType]: - from lxml import etree as ET - - glos = self._glos - fileSize = self._fileSize - self._file = file = compressionOpen(self._filename, mode="rb") - context = ET.iterparse( # type: ignore # noqa: PGH003 - self._file, - events=("end",), - tag="article", - ) - for _, _elem in context: - elem = cast("Element", _elem) - words: list[str] = [] - defisWithFormat: list[tuple[str, str]] = [] - for child in elem.iterchildren(): - if not child.text: - continue - if child.tag in {"key", "synonym"}: - words.append(child.text) - elif child.tag == "definition": - type_ = child.attrib.get("type", "") - if type_: - new_type = { - "m": "m", - "t": "m", - "y": "m", - "g": "h", - "h": "h", - "x": "x", - }.get(type_, "") - if not new_type: - log.warning(f"unsupported definition type {type_}") - type_ = new_type - if not type_: - type_ = "m" - defi_ = child.text.strip() - if type_ == "x" and self._xdxf_to_html: - defi_ = self.xdxf_transform(defi_) - type_ = "h" - defisWithFormat.append((defi_, type_)) - # TODO: child.tag == "definition-r" - else: - log.warning(f"unknown tag {child.tag}") - - defi, defiFormat = self.renderDefiList(defisWithFormat) - - yield glos.newEntry( - words, - defi, - defiFormat=defiFormat, - byteProgress=(file.tell(), fileSize), - ) - - # clean up preceding siblings to save memory - # this can reduce memory usage from >300 MB to ~25 MB - while elem.getprevious() is not None: - parent = elem.getparent() - if parent is None: - break - del parent[0] - - -class Writer: - _encoding: str = "utf-8" - - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._resDir = "" - - def open( - self, - filename: str, - ) -> None: - self._filename = filename - self._resDir = join(dirname(self._filename), "res") - self._file = compressionOpen( - self._filename, - mode="w", - encoding=self._encoding, - ) - - def finish(self) -> None: - self._file.close() - - def writeInfo( - self, - maker: builder.ElementMaker, - pretty: bool, - ) -> None: - from lxml import etree as ET - - glos = self._glos - - desc = glos.getInfo("description") - copyright_ = glos.getInfo("copyright") - if copyright_: - desc = f"{copyright_}\n{desc}" - publisher = glos.getInfo("publisher") - if publisher: - desc = f"Publisher: {publisher}\n{desc}" - - info = maker.info( - maker.version("3.0.0"), - maker.bookname(glos.getInfo("name")), - maker.author(glos.getInfo("author")), - maker.email(glos.getInfo("email")), - maker.website(glos.getInfo("website")), - maker.description(desc), - maker.date(glos.getInfo("creationTime")), - maker.dicttype(""), - ) - file = self._file - file.write( - cast( - "bytes", - ET.tostring( - info, - encoding=self._encoding, - pretty_print=pretty, - ), - ).decode(self._encoding) - + "\n", - ) - - def writeDataEntry( - self, - maker: builder.ElementMaker, # noqa: ARG002 - entry: EntryType, - ) -> None: - entry.save(self._resDir) - # TODO: create article tag with "definition-r" in it? - # or just save the file to res/ directory? or both? 
- # article = maker.article( - # maker.key(entry.s_word), - # maker.definition_r( - # ET.CDATA(entry.defi), - # **{"type": ext}) - # ) - # ) - - def write(self) -> Generator[None, EntryType, None]: - from lxml import builder - from lxml import etree as ET - - file = self._file - encoding = self._encoding - maker = builder.ElementMaker() - - file.write( - """ - -""", - ) - - self.writeInfo(maker, pretty=True) - - if not isdir(self._resDir): - os.mkdir(self._resDir) - - pretty = True - while True: - entry = yield - if entry is None: - break - if entry.isData(): - self.writeDataEntry(maker, entry) - continue - entry.detectDefiFormat() - article = maker.article( - maker.key(entry.l_word[0]), - ) - for alt in entry.l_word[1:]: - article.append(maker.synonym(alt)) - article.append( - maker.definition( - ET.CDATA(entry.defi), - type=entry.defiFormat, - ), - ) - ET.indent(article, space="") - articleStr = cast( - "bytes", - ET.tostring( - article, - pretty_print=pretty, - encoding=encoding, - ), - ).decode(encoding) - # for some reason, "´k" becomes " ́k" (for example) # noqa: RUF003 - # stardict-text2bin tool also does this. - # https://en.wiktionary.org/wiki/%CB%88#Translingual - self._file.write(articleStr + "\n") - - file.write("") - - if not os.listdir(self._resDir): - os.rmdir(self._resDir) diff --git a/pyglossary/plugins/stardict_textual/reader.py b/pyglossary/plugins/stardict_textual/reader.py new file mode 100644 index 000000000..91fea26c8 --- /dev/null +++ b/pyglossary/plugins/stardict_textual/reader.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element + from pyglossary.xdxf.transform import XdxfTransformer + + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, log, pip +from pyglossary.html_utils import unescape_unicode +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + _encoding: str = "utf-8" + _xdxf_to_html: bool = True + + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._fileSize = 0 + self._xdxfTr: XdxfTransformer | None = None + + def xdxf_setup(self) -> XdxfTransformer: + from pyglossary.xdxf.transform import XdxfTransformer + + self._xdxfTr = tr = XdxfTransformer(encoding="utf-8") + return tr + + def xdxf_transform(self, text: str) -> str: + tr = self._xdxfTr + if tr is None: + tr = self.xdxf_setup() + return tr.transformByInnerString(text) + + def __len__(self) -> int: + return 0 + + def close(self) -> None: + self._file.close() + self._file = nullBinaryIO + self._filename = "" + self._fileSize = 0 + + def open(self, filename: str) -> None: + try: + from lxml import etree as ET + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + cfile = compressionOpen(filename, mode="rb") + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + # self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("StarDict Textual File Reader: file is not seekable") + + context = ET.iterparse( # type: ignore # noqa: PGH003 + cfile, + events=("end",), + tag="info", + ) + for _, elem 
in context: + self.setMetadata(elem) # type: ignore + break + + cfile.close() + + def setGlosInfo(self, key: str, value: str) -> None: + if value is None: + return + self._glos.setInfo(key, unescape_unicode(value)) + + def setMetadata(self, header: Element) -> None: + if (elem := header.find("./bookname")) is not None and elem.text: + self.setGlosInfo("name", elem.text) + + if (elem := header.find("./author")) is not None and elem.text: + self.setGlosInfo("author", elem.text) + + if (elem := header.find("./email")) is not None and elem.text: + self.setGlosInfo("email", elem.text) + + if (elem := header.find("./website")) is not None and elem.text: + self.setGlosInfo("website", elem.text) + + if (elem := header.find("./description")) is not None and elem.text: + self.setGlosInfo("description", elem.text) + + if (elem := header.find("./bookname")) is not None and elem.text: + self.setGlosInfo("name", elem.text) + + if (elem := header.find("./bookname")) is not None and elem.text: + self.setGlosInfo("name", elem.text) + + if (elem := header.find("./date")) is not None and elem.text: + self.setGlosInfo("creationTime", elem.text) + + # if (elem := header.find("./dicttype")) is not None and elem.text: + # self.setGlosInfo("dicttype", elem.text) + + def renderDefiList( + self, + defisWithFormat: list[tuple[str, str]], + ) -> tuple[str, str]: + if not defisWithFormat: + return "", "" + if len(defisWithFormat) == 1: + return defisWithFormat[0] + + defiFormatSet: set[str] = set() + defiFormatSet.update(_type for _, _type in defisWithFormat) + + if len(defiFormatSet) == 1: + format_ = defiFormatSet.pop() + if format_ == "h": + return "\n
                      ".join([defi for defi, _ in defisWithFormat]), format_ + return "\n".join([defi for defi, _ in defisWithFormat]), format_ + + # convert plaintext or xdxf to html + defis: list[str] = [] + for defi_, format_ in defisWithFormat: + if format_ == "m": + defis.append("
                      " + defi_.replace("\n", "
                      ") + "
                      ") + elif format_ == "x": + defis.append(self.xdxf_transform(defi_)) + else: + defis.append(defi_) + return "\n
                      \n".join(defis), "h" + + def __iter__(self) -> Iterator[EntryType]: + from lxml import etree as ET + + glos = self._glos + fileSize = self._fileSize + self._file = file = compressionOpen(self._filename, mode="rb") + context = ET.iterparse( # type: ignore # noqa: PGH003 + self._file, + events=("end",), + tag="article", + ) + for _, _elem in context: + elem = cast("Element", _elem) + words: list[str] = [] + defisWithFormat: list[tuple[str, str]] = [] + for child in elem.iterchildren(): + if not child.text: + continue + if child.tag in {"key", "synonym"}: + words.append(child.text) + elif child.tag == "definition": + type_ = child.attrib.get("type", "") + if type_: + new_type = { + "m": "m", + "t": "m", + "y": "m", + "g": "h", + "h": "h", + "x": "x", + }.get(type_, "") + if not new_type: + log.warning(f"unsupported definition type {type_}") + type_ = new_type + if not type_: + type_ = "m" + defi_ = child.text.strip() + if type_ == "x" and self._xdxf_to_html: + defi_ = self.xdxf_transform(defi_) + type_ = "h" + defisWithFormat.append((defi_, type_)) + # TODO: child.tag == "definition-r" + else: + log.warning(f"unknown tag {child.tag}") + + defi, defiFormat = self.renderDefiList(defisWithFormat) + + yield glos.newEntry( + words, + defi, + defiFormat=defiFormat, + byteProgress=(file.tell(), fileSize), + ) + + # clean up preceding siblings to save memory + # this can reduce memory usage from >300 MB to ~25 MB + while elem.getprevious() is not None: + parent = elem.getparent() + if parent is None: + break + del parent[0] diff --git a/pyglossary/plugins/stardict_textual/writer.py b/pyglossary/plugins/stardict_textual/writer.py new file mode 100644 index 000000000..c7681d839 --- /dev/null +++ b/pyglossary/plugins/stardict_textual/writer.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import os +from os.path import dirname, isdir, join +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from collections.abc import Generator + + from lxml import builder + + from pyglossary.glossary_types import EntryType, GlossaryType + + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) + + +class Writer: + _encoding: str = "utf-8" + + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._resDir = "" + + def open( + self, + filename: str, + ) -> None: + self._filename = filename + self._resDir = join(dirname(self._filename), "res") + self._file = compressionOpen( + self._filename, + mode="w", + encoding=self._encoding, + ) + + def finish(self) -> None: + self._file.close() + + def writeInfo( + self, + maker: builder.ElementMaker, + pretty: bool, + ) -> None: + from lxml import etree as ET + + glos = self._glos + + desc = glos.getInfo("description") + copyright_ = glos.getInfo("copyright") + if copyright_: + desc = f"{copyright_}\n{desc}" + publisher = glos.getInfo("publisher") + if publisher: + desc = f"Publisher: {publisher}\n{desc}" + + info = maker.info( + maker.version("3.0.0"), + maker.bookname(glos.getInfo("name")), + maker.author(glos.getInfo("author")), + maker.email(glos.getInfo("email")), + maker.website(glos.getInfo("website")), + maker.description(desc), + maker.date(glos.getInfo("creationTime")), + maker.dicttype(""), + ) + file = self._file + file.write( + cast( + "bytes", + ET.tostring( + info, + encoding=self._encoding, + pretty_print=pretty, + ), + ).decode(self._encoding) + + "\n", + ) + + 
def writeDataEntry( + self, + maker: builder.ElementMaker, # noqa: ARG002 + entry: EntryType, + ) -> None: + entry.save(self._resDir) + # TODO: create article tag with "definition-r" in it? + # or just save the file to res/ directory? or both? + # article = maker.article( + # maker.key(entry.s_word), + # maker.definition_r( + # ET.CDATA(entry.defi), + # **{"type": ext}) + # ) + # ) + + def write(self) -> Generator[None, EntryType, None]: + from lxml import builder + from lxml import etree as ET + + file = self._file + encoding = self._encoding + maker = builder.ElementMaker() + + file.write( + """ + +""", + ) + + self.writeInfo(maker, pretty=True) + + if not isdir(self._resDir): + os.mkdir(self._resDir) + + pretty = True + while True: + entry = yield + if entry is None: + break + if entry.isData(): + self.writeDataEntry(maker, entry) + continue + entry.detectDefiFormat() + article = maker.article( + maker.key(entry.l_word[0]), + ) + for alt in entry.l_word[1:]: + article.append(maker.synonym(alt)) + article.append( + maker.definition( + ET.CDATA(entry.defi), + type=entry.defiFormat, + ), + ) + ET.indent(article, space="") + articleStr = cast( + "bytes", + ET.tostring( + article, + pretty_print=pretty, + encoding=encoding, + ), + ).decode(encoding) + # for some reason, "´k" becomes " ́k" (for example) # noqa: RUF003 + # stardict-text2bin tool also does this. + # https://en.wiktionary.org/wiki/%CB%88#Translingual + self._file.write(articleStr + "\n") + + file.write("") + + if not os.listdir(self._resDir): + os.rmdir(self._resDir) diff --git a/pyglossary/plugins/tabfile/__init__.py b/pyglossary/plugins/tabfile/__init__.py index f6324e1e5..a0939400b 100644 --- a/pyglossary/plugins/tabfile/__init__.py +++ b/pyglossary/plugins/tabfile/__init__.py @@ -2,28 +2,31 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from pyglossary.compression import stdCompressions -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, FileSizeOption, Option, ) -from pyglossary.text_reader import TextGlossaryReader -from pyglossary.text_utils import ( - splitByBarUnescapeNTB, - unescapeNTB, -) - -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType -__all__ = ["Reader"] +from .reader import Reader +from .writer import Writer + +__all__ = [ + "Reader", + "Writer", + "description", + "enable", + "extensionCreate", + "extensions", + "kind", + "lname", + "name", + "optionsProp", + "singleFile", + "website", + "wiki", +] enable = True lname = "tabfile" @@ -50,89 +53,3 @@ comment="Add headwords title to beginning of definition", ), } - - -class Reader(TextGlossaryReader): - @classmethod - def isInfoWord(cls, word: str) -> bool: - return word.startswith("#") - - @classmethod - def fixInfoWord(cls, word: str) -> str: - return word.lstrip("#") - - def nextBlock(self) -> tuple[str | list[str], str, None] | None: - if not self._file: - raise StopIteration - line = self.readline() - if not line: - raise StopIteration - line = line.rstrip("\n") - if not line: - return None - ### - word: str | list[str] - word, tab, defi = line.partition("\t") - if not tab: - log.warning( - f"Warning: line starting with {line[:10]!r} has no tab!", - ) - return None - ### - if self._glos.alts: - word = splitByBarUnescapeNTB(word) - if len(word) == 1: - word = word[0] - else: - word = unescapeNTB(word, bar=False) - ### - defi = unescapeNTB(defi) - ### - return word, defi, None - - -class Writer: - _encoding: str = 
"utf-8" - _enable_info: bool = True - _resources: bool = True - _file_size_approx: int = 0 - _word_title: bool = False - - compressions = stdCompressions - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - - def open( - self, - filename: str, - ) -> None: - self._filename = filename - - def finish(self) -> None: - pass - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.text_utils import escapeNTB, joinByBar - from pyglossary.text_writer import TextGlossaryWriter - - writer = TextGlossaryWriter( - self._glos, - entryFmt="{word}\t{defi}\n", - writeInfo=self._enable_info, - outInfoKeysAliasDict=None, - ) - writer.setAttrs( - encoding=self._encoding, - wordListEncodeFunc=joinByBar, - wordEscapeFunc=escapeNTB, - defiEscapeFunc=escapeNTB, - ext=".txt", - resources=self._resources, - word_title=self._word_title, - file_size_approx=self._file_size_approx, - ) - writer.open(self._filename) - yield from writer.write() - writer.finish() diff --git a/pyglossary/plugins/tabfile/reader.py b/pyglossary/plugins/tabfile/reader.py new file mode 100644 index 000000000..c834b288c --- /dev/null +++ b/pyglossary/plugins/tabfile/reader.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from pyglossary.core import log +from pyglossary.text_reader import TextGlossaryReader +from pyglossary.text_utils import ( + splitByBarUnescapeNTB, + unescapeNTB, +) + + +class Reader(TextGlossaryReader): + @classmethod + def isInfoWord(cls, word: str) -> bool: + return word.startswith("#") + + @classmethod + def fixInfoWord(cls, word: str) -> str: + return word.lstrip("#") + + def nextBlock(self) -> tuple[str | list[str], str, None] | None: + if not self._file: + raise StopIteration + line = self.readline() + if not line: + raise StopIteration + line = line.rstrip("\n") + if not line: + return None + ### + word: str | list[str] + word, tab, defi = line.partition("\t") + if not tab: + log.warning( + f"Warning: line starting with {line[:10]!r} has no tab!", + ) + return None + ### + if self._glos.alts: + word = splitByBarUnescapeNTB(word) + if len(word) == 1: + word = word[0] + else: + word = unescapeNTB(word, bar=False) + ### + defi = unescapeNTB(defi) + ### + return word, defi, None diff --git a/pyglossary/plugins/tabfile/writer.py b/pyglossary/plugins/tabfile/writer.py new file mode 100644 index 000000000..cbdf42fe8 --- /dev/null +++ b/pyglossary/plugins/tabfile/writer.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.compression import stdCompressions + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + _encoding: str = "utf-8" + _enable_info: bool = True + _resources: bool = True + _file_size_approx: int = 0 + _word_title: bool = False + + compressions = stdCompressions + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + + def open( + self, + filename: str, + ) -> None: + self._filename = filename + + def finish(self) -> None: + pass + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.text_utils import escapeNTB, joinByBar + from pyglossary.text_writer import TextGlossaryWriter + + writer = TextGlossaryWriter( + self._glos, + entryFmt="{word}\t{defi}\n", + writeInfo=self._enable_info, + outInfoKeysAliasDict=None, + ) + writer.setAttrs( + encoding=self._encoding, + 
wordListEncodeFunc=joinByBar, + wordEscapeFunc=escapeNTB, + defiEscapeFunc=escapeNTB, + ext=".txt", + resources=self._resources, + word_title=self._word_title, + file_size_approx=self._file_size_approx, + ) + writer.open(self._filename) + yield from writer.write() + writer.finish() diff --git a/pyglossary/plugins/testformat/__init__.py b/pyglossary/plugins/testformat/__init__.py index be780e5fc..a38dbd2e8 100644 --- a/pyglossary/plugins/testformat/__init__.py +++ b/pyglossary/plugins/testformat/__init__.py @@ -1,13 +1,11 @@ - - -from __future__ import annotations - # -*- coding: utf-8 -*- -from collections.abc import Generator, Iterator +from __future__ import annotations -from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -37,87 +35,3 @@ # key is option/argument name, value is instance of Option optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._wordCount = 0 - - def __len__(self) -> int: - # return the number of entries if you have it - # if you don't, return 0 and progressbar will be disabled - # self._wordCount can be set in self.open function - # but if you want to set it, you should set it before - # iteration begins and __iter__ method is called - return self._wordCount - - def open(self, filename: str) -> None: - # open the file, read headers / info and set info to self._glos - # and set self._wordCount if you can - # read-options should be keyword arguments in this method - self._wordCount = 100 - # log.info(f"some useful message") - # here read info from file and set to Glossary object - self._glos.setInfo("name", "Test") - desc = "Test glossary created by a PyGlossary plugin" - self._glos.setInfo("description", desc) - self._glos.setInfo("author", "Me") - self._glos.setInfo("copyright", "GPL") - - def close(self) -> None: - # this is called after reading/conversion is finished - # if you have an open file object, close it here - # if you need to clean up temp files, do it here - pass - - def __iter__(self) -> Iterator[EntryType]: - # the easiest and simplest way to implement an Iterator is - # by writing a generator, by calling: yield glos.newEntry(word, defi) - # inside a loop (typically iterating over a file object for text file) - # another way (which is harder) is by implementing __next__ method - # and returning self in __iter__ - # that forces you to keep the state manually because __next__ is called - # repeatedly, but __iter__ is only called once - glos = self._glos - for i in range(self._wordCount): - # here get word and definition from file(depending on your format) - word = f"word_{i}" - defi = f"definition {i}" - yield glos.newEntry(word, defi) - - -class Writer: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - - def open(self, filename: str) -> None: - self._filename = filename - - def write(self) -> Generator[None, EntryType, None]: - glos = self._glos - filename = self._filename # noqa - # log.info(f"some useful message") - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # can save it with entry.save(directory) - continue - word = entry.s_word # noqa - defi = entry.defi # noqa - # here write word and defi to the output file (depending on - # your format) - # here read info from Glossaey object - name = glos.getInfo("name") # noqa - desc = 
glos.getInfo("description") # noqa - author = glos.author # noqa - copyright = glos.getInfo("copyright") # noqa - # if an info key doesn't exist, getInfo returns empty string - # now write info to the output file (depending on your output format) - - def finish(self) -> None: - self._filename = "" diff --git a/pyglossary/plugins/testformat/reader.py b/pyglossary/plugins/testformat/reader.py new file mode 100644 index 000000000..7f59bcff3 --- /dev/null +++ b/pyglossary/plugins/testformat/reader.py @@ -0,0 +1,57 @@ + + +from __future__ import annotations + +# -*- coding: utf-8 -*- +from collections.abc import Iterator + +from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._wordCount = 0 + + def __len__(self) -> int: + # return the number of entries if you have it + # if you don't, return 0 and progressbar will be disabled + # self._wordCount can be set in self.open function + # but if you want to set it, you should set it before + # iteration begins and __iter__ method is called + return self._wordCount + + def open(self, filename: str) -> None: + # open the file, read headers / info and set info to self._glos + # and set self._wordCount if you can + # read-options should be keyword arguments in this method + self._wordCount = 100 + # log.info(f"some useful message") + # here read info from file and set to Glossary object + self._glos.setInfo("name", "Test") + desc = "Test glossary created by a PyGlossary plugin" + self._glos.setInfo("description", desc) + self._glos.setInfo("author", "Me") + self._glos.setInfo("copyright", "GPL") + + def close(self) -> None: + # this is called after reading/conversion is finished + # if you have an open file object, close it here + # if you need to clean up temp files, do it here + pass + + def __iter__(self) -> Iterator[EntryType]: + # the easiest and simplest way to implement an Iterator is + # by writing a generator, by calling: yield glos.newEntry(word, defi) + # inside a loop (typically iterating over a file object for text file) + # another way (which is harder) is by implementing __next__ method + # and returning self in __iter__ + # that forces you to keep the state manually because __next__ is called + # repeatedly, but __iter__ is only called once + glos = self._glos + for i in range(self._wordCount): + # here get word and definition from file(depending on your format) + word = f"word_{i}" + defi = f"definition {i}" + yield glos.newEntry(word, defi) diff --git a/pyglossary/plugins/testformat/writer.py b/pyglossary/plugins/testformat/writer.py new file mode 100644 index 000000000..48f18b227 --- /dev/null +++ b/pyglossary/plugins/testformat/writer.py @@ -0,0 +1,43 @@ + + +from __future__ import annotations + +# -*- coding: utf-8 -*- +from collections.abc import Generator + +from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + + def open(self, filename: str) -> None: + self._filename = filename + + def write(self) -> Generator[None, EntryType, None]: + glos = self._glos + filename = self._filename # noqa + # log.info(f"some useful message") + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # can save it with entry.save(directory) + continue + word = entry.s_word # noqa + defi = entry.defi # noqa + # here write word and defi to the output file (depending on + # your format) + 
# here read info from Glossaey object + name = glos.getInfo("name") # noqa + desc = glos.getInfo("description") # noqa + author = glos.author # noqa + copyright = glos.getInfo("copyright") # noqa + # if an info key doesn't exist, getInfo returns empty string + # now write info to the output file (depending on your output format) + + def finish(self) -> None: + self._filename = "" diff --git a/pyglossary/plugins/wiktextract/__init__.py b/pyglossary/plugins/wiktextract/__init__.py index 9fa987bb6..8c3ab13d1 100644 --- a/pyglossary/plugins/wiktextract/__init__.py +++ b/pyglossary/plugins/wiktextract/__init__.py @@ -1,26 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import collections -from collections import Counter -from io import BytesIO, IOBase -from json import loads as json_loads -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - from collections.abc import Callable, Iterator - from typing import Any - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element, T_htmlfile - - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, log, pip -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, ListOption, @@ -28,6 +8,8 @@ StrOption, ) +from .reader import Reader + __all__ = [ "Reader", "description", @@ -82,636 +64,3 @@ comment="Enable categories", ), } - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - _word_title: bool = False - _pron_color: str = "gray" - _gram_color: str = "green" - - # 'top right' or 'top right bottom left' - _example_padding: str = "10px 20px" - - _audio: bool = True - - _audio_formats: list[str] = ["ogg", "mp3"] - - _categories: bool = False - - topicStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: IOBase = nullBinaryIO - self._fileSize = 0 - self._wordCount = 0 - - def open( - self, - filename: str, - ) -> None: - try: - pass - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - cfile = compressionOpen(filename, mode="rt", encoding="utf-8") - - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - self._glos.setInfo("input_file_size", str(self._fileSize)) - else: - self.warning("Wiktextract Reader: file is not seekable") - - self._glos.setDefaultDefiFormat("h") - - if self._word_title: - self._glos.setInfo("definition_has_headwords", "True") - - self._file = cfile - self._warnings: Counter[str] = collections.Counter() - - def close(self) -> None: - self._file.close() - self._file = nullBinaryIO - self._filename = "" - self._fileSize = 0 - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType]: - while line := self._file.readline(): - line = line.strip() - if not line: - continue - yield self.makeEntry(json_loads(line)) - for _msg, count in self._warnings.most_common(): - msg = _msg - if count > 1: - msg = f"[{count} times] {msg}" - log.warning(msg) - - def warning(self, msg: str) -> None: - self._warnings[msg] += 1 - - def makeEntry(self, data: dict[str, Any]) -> EntryType: # noqa: PLR0912 - from lxml import etree as ET - - glos = self._glos - f = BytesIO() - - def br() -> Element: - 
return ET.Element("br") - - keywords: list[str] = [] - inflectedKeywords: list[str] = [] - - word = data.get("word") - if word: - keywords.append(word) - - for formDict in data.get("forms", []): - form: str = formDict.get("form", "") - if not form: - continue - if len(form) > 80: - self.warning(f"'form' too long: {form}") - continue - source: str = formDict.get("source", "") - # tags = formDict.get("tags", []) - if source == "Inflection": - inflectedKeywords.append(form) - else: - keywords.append(form) - - keywords += inflectedKeywords - - with ET.htmlfile(f, encoding="utf-8") as hf: - with hf.element("div"): - if self._word_title: - for keyword in keywords: - with hf.element(glos.titleTag(keyword)): - hf.write(keyword) - hf.write(br()) - - hf_ = cast("T_htmlfile", hf) - - self.writeSoundList(hf_, data.get("sounds")) - - pos: str | None = data.get("pos") - if pos: - with hf.element("div", attrib={"class": "pos"}): - with hf.element("font", color=self._gram_color): - hf.write(pos) - - senses = data.get("senses") or [] - - self.writeSenseList(hf_, senses) # type: ignore - - self.writeSynonyms(hf_, data.get("synonyms")) # type: ignore - - self.writeAntonyms(hf_, data.get("antonyms")) # type: ignore - - # TODO: data.get("translations") - # list[dict[str, str]] - # dict keys: code, "lang", "sense", "word" - - etymology: str = data.get("etymology_text", "") - if etymology: - hf.write(br()) - with hf.element("div"): - hf.write(f"Etymology: {etymology}") - - if self._categories: - categories = [] - for sense in senses: - senseCats = sense.get("categories") - if senseCats: - categories += senseCats - self.writeSenseCategories(hf_, categories) - - defi = f.getvalue().decode("utf-8") - # defi = defi.replace("\xa0", " ") # do we need to do this? - file = self._file - return self._glos.newEntry( - keywords, - defi, - defiFormat="h", - byteProgress=(file.tell(), self._fileSize), - ) - - def writeSoundPron( - self, - hf: T_htmlfile, - sound: dict[str, Any], - ) -> None: - # "homophone" key found in Dutch and Arabic dictionaries - # (similar-sounding words for Arabic) - for key in ("ipa", "other", "rhymes", "homophone"): - value = sound.get(key) - if not value: - continue - with hf.element("font", color=self._pron_color): - hf.write(str(value)) - hf.write(f" ({key})") - - def writeSoundAudio( - self, - hf: T_htmlfile, - sound: dict[str, Any], - ) -> None: - # TODO: add a read-option for audio - # keys for audio: - # "audio" (file name), "text" (link text), "ogg_url", "mp3_url" - # possible "tags" (list[str]) - - text = sound.get("text") - if text: - hf.write(f"{text}: ") - with hf.element("audio", attrib={"controls": ""}): - for _format in self._audio_formats: - url = sound.get(f"{_format}_url") - if not url: - continue - with hf.element( - "source", - attrib={ - "src": url, - "type": f"audio/{_format}", - }, - ): - pass - - def writeSoundList( - self, - hf: T_htmlfile, - soundList: list[dict[str, Any]] | None, - ) -> None: - if not soundList: - return - - pronList: list[dict[str, Any]] = [] - audioList: list[dict[str, Any]] = [] - - for sound in soundList: - if "audio" in sound: - if self._audio: - audioList.append(sound) - continue - pronList.append(sound) - # can it contain both audio and pronunciation? 
- - if pronList: - with hf.element("div", attrib={"class": "pronunciations"}): - for i, sound in enumerate(pronList): - if i > 0: - hf.write(", ") - self.writeSoundPron(hf, sound) - - for sound in audioList: - with hf.element("div", attrib={"class": "audio"}): - self.writeSoundAudio(hf, sound) - - def writeSenseList( - self, - hf: T_htmlfile, - senseList: list[dict[str, Any]], - ) -> None: - if not senseList: - return - - self.makeList( - hf, - senseList, - self.writeSense, - ) - - def writeSenseGloss( # noqa: PLR6301 - self, - hf: T_htmlfile, - text: str | None, - ) -> None: - hf.write(text or "") - - def writeSenseCategory( # noqa: PLR6301 - self, - hf: T_htmlfile, - category: dict[str, Any], - ) -> None: - # keys: name: str, kind: str, parents: list, source: str - # values for "source" (that I found): "w", "w+disamb" - name = category.get("name") - if not name: - self.warning(f"{category = }") - return - desc = name - source = category.get("source") - if source: - desc = f"{desc} (source: {source})" - hf.write(desc) - - def writeSenseCategories( - self, - hf: T_htmlfile, - categories: list[dict[str, Any]] | None, - ) -> None: - if not categories: - return - # long names, mostly about grammar? - with hf.element("div", attrib={"class": "categories"}): - hf.write("Categories: ") - self.makeList(hf, categories, self.writeSenseCategory) - - def writeSenseExample( # noqa: PLR6301, PLR0912 - self, - hf: T_htmlfile, - example: dict[str, str | list], - ) -> None: - # example keys: text, "english", "ref", "type" - textList: list[tuple[str | None, str]] = [] - text_: str | list = example.pop("example", "") - if text_: - assert isinstance(text_, str) - textList.append((None, text_)) - - example.pop("ref", "") - example.pop("type", "") - - for key, value in example.items(): - if not value: - continue - prefix: str | None = key - if prefix in ("text",): # noqa: PLR6201, FURB171 - prefix = None - if isinstance(value, str): - textList.append((prefix, value)) - elif isinstance(value, list): - for item in value: - if isinstance(item, str): - textList.append((prefix, item)) - elif isinstance(item, list): - textList += [(prefix, item2) for item2 in item] - else: - log.error(f"writeSenseExample: invalid type for {value=}") - - if not textList: - return - - def writePair(prefix: str | None, text: str) -> None: - if prefix: - with hf.element("b"): - hf.write(prefix) - hf.write(": ") - hf.write(text) - - if len(textList) == 1: - prefix, text = textList[0] - writePair(prefix, text) - return - - with hf.element("ul"): - for prefix, text in textList: - with hf.element("li"): - writePair(prefix, text) - - def writeSenseExamples( - self, - hf: T_htmlfile, - examples: list[dict[str, str | list]] | None, - ) -> None: - from lxml import etree as ET - - if not examples: - return - hf.write(ET.Element("br")) - with hf.element("div", attrib={"class": "examples"}): - hf.write("Examples:") - hf.write(ET.Element("br")) - for example in examples: - with hf.element( - "div", - attrib={ - "class": "example", - "style": f"padding: {self._example_padding};", - }, - ): - self.writeSenseExample(hf, example) - - def writeSenseFormOf( # noqa: PLR6301 - self, - hf: T_htmlfile, - form_of: dict[str, str], - ) -> None: - from lxml import etree as ET - - # {"word": ..., "extra": ...} - word = form_of.get("word") - if not word: - return - hf.write(word) - extra = form_of.get("extra") - if extra: - hf.write(ET.Element("br")) - hf.write(extra) - - def writeSenseFormOfList( - self, - hf: T_htmlfile, - form_of_list: list[dict[str, str]] | 
None, - ) -> None: - if not form_of_list: - return - with hf.element("div", attrib={"class": "form_of"}): - hf.write("Form of: ") - self.makeList(hf, form_of_list, self.writeSenseFormOf) - - def writeTags( - self, - hf: T_htmlfile, - tags: list[str] | None, - toRemove: list[str] | None, - ) -> None: - if not tags: - return - - if toRemove: - for tag in toRemove: - if tag in tags: - tags.remove(tag) - if not tags: - return - - with hf.element("div", attrib={"class": "tags"}): - for i, tag in enumerate(tags): - if i > 0: - hf.write(", ") - with hf.element("font", color=self._gram_color): - hf.write(tag) - - def writeTopics( - self, - hf: T_htmlfile, - topics: list[str] | None, - ) -> None: - if not topics: - return - - with hf.element("div", attrib={"class": "tags"}): - for i, topic in enumerate(topics): - if i > 0: - hf.write(" ") - with hf.element("span", style=self.topicStyle): - hf.write(topic) - - def addWordLink( # noqa: PLR6301 - self, - hf: T_htmlfile, - word: str, - wordClass: str = "", - ) -> None: - i = word.find(" [") - if i >= 0: - word = word[:i] - if not word: - return - attrib = {"href": f"bword://{word}"} - if wordClass: - attrib["class"] = wordClass - with hf.element( - "a", - attrib=attrib, - ): - hf.write(word) - - def writeSynonyms( - self, - hf: T_htmlfile, - synonyms: list[dict[str, Any]] | None, - ) -> None: - if not synonyms: - return - - # "word": "str", - # "sense": "str", - # "_dis1": "str", - # "tags": list[str] - # "extra": "str", - # "english": "str" - - with hf.element("div"): - hf.write("Synonyms: ") - for i, item in enumerate(synonyms): - if i > 0: - hf.write(", ") - word = item.get("word") - if not word: - continue - self.addWordLink(hf, word) - - def writeAntonyms( - self, - hf: T_htmlfile, - antonyms: list[dict[str, str]] | None, - ) -> None: - if not antonyms: - return - # dict keys: word - with hf.element("div"): - hf.write("Antonyms: ") - for i, item in enumerate(antonyms): - if i > 0: - hf.write(", ") - word = item.get("word") - if not word: - continue - self.addWordLink(hf, word, wordClass="antonym") - - def writeRelated( - self, - hf: T_htmlfile, - relatedList: list[dict[str, str]] | None, - ) -> None: - if not relatedList: - return - # dict keys: sense, "word", "english" - with hf.element("div"): - hf.write("Related: ") - for i, item in enumerate(relatedList): - if i > 0: - hf.write(", ") - word = item.get("word") - if not word: - continue - self.addWordLink(hf, word) - - def writeSenseLinks( - self, - hf: T_htmlfile, - linkList: list[list[str]] | None, - ) -> None: - if not linkList: - return - with hf.element("div"): - hf.write("Links: ") - for i, link in enumerate(linkList): - if len(link) != 2: - self.warning(f"unexpected {link =}") - continue - text, ref = link - sq = ref.find("#") - if sq == 0: - ref = text - elif sq > 0: - ref = ref[:sq] - if i > 0: - hf.write(", ") - self.addWordLink(hf, ref) - - def writeSense( - self, - hf: T_htmlfile, - sense: dict[str, Any], - ) -> None: - from lxml import etree as ET - - # tags seem to be mostly about grammar, so with format it like grammar - self.writeTags( - hf, - sense.get("tags"), - toRemove=["form-of"], - ) - - # for key in ("english",): - # text: "str | None" = sense.get("english") - # if not text: - # continue - # keyCap = key.capitalize() - # with hf.element("div"): - # with hf.element("b"): - # hf.write(keyCap) - # hf.write(f": {text}") - - # sense["glosses"] and sense["english"] seems to be unreliable - # for example: - # "raw_glosses": ["(short) story, fable, play"], - # "english": "short", 
- # "glosses": ["story, fable, play"], - - glosses: list[str] | None = sense.get("raw_glosses") - if not glosses: - glosses = sense.get("glosses") - if glosses: - self.makeList(hf, glosses, self.writeSenseGloss) - - self.writeTopics(hf, sense.get("topics")) - - self.writeSenseFormOfList(hf, sense.get("form_of")) - - self.writeSynonyms(hf, sense.get("synonyms")) - - self.writeAntonyms(hf, sense.get("antonyms")) - - self.writeRelated(hf, sense.get("related")) - - self.writeSenseLinks(hf, sense.get("links")) - - self.writeSenseExamples(hf, sense.get("examples")) - - # alt_of[i]["word"] seem to point to a word that is - # mentioned in sense["raw_glosses"] - # so we could try to find that word and turn it into a link - # sense.get("alt_of"): list[dict[str, str]] | None - - # sense.get("wikipedia", []): list[str] - # sense.get("wikidata", []): list[str] - # sense.get("id", ""): str # not useful - # sense.get("senseid", []): list[str] # not useful - - hf.write(ET.Element("br")) - - @staticmethod - def makeList( # noqa: PLR0913 - hf: T_htmlfile, - input_objects: list[Any], - processor: Callable, - ordered: bool = True, - skip_single: bool = True, - # single_prefix: str = "", - # list_type: str = "", - ) -> None: - """Wrap elements into
                        if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) == 1: - # if single_prefix: - # hf.write(single_prefix) - processor(hf, input_objects[0]) - return - - attrib: dict[str, str] = {} - # if list_type: - # attrib["type"] = list_type - - with hf.element("ol" if ordered else "ul", attrib=attrib): - for el in input_objects: - with hf.element("li"): - processor(hf, el) diff --git a/pyglossary/plugins/wiktextract/reader.py b/pyglossary/plugins/wiktextract/reader.py new file mode 100644 index 000000000..a029edd19 --- /dev/null +++ b/pyglossary/plugins/wiktextract/reader.py @@ -0,0 +1,656 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import collections +from collections import Counter +from io import BytesIO, IOBase +from json import loads as json_loads +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from collections.abc import Callable, Iterator + from typing import Any + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element, T_htmlfile + + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, log, pip +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _word_title: bool = False + _pron_color: str = "gray" + _gram_color: str = "green" + + # 'top right' or 'top right bottom left' + _example_padding: str = "10px 20px" + + _audio: bool = True + + _audio_formats: list[str] = ["ogg", "mp3"] + + _categories: bool = False + + topicStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: IOBase = nullBinaryIO + self._fileSize = 0 + self._wordCount = 0 + + def open( + self, + filename: str, + ) -> None: + try: + pass + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + cfile = compressionOpen(filename, mode="rt", encoding="utf-8") + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + self._glos.setInfo("input_file_size", str(self._fileSize)) + else: + self.warning("Wiktextract Reader: file is not seekable") + + self._glos.setDefaultDefiFormat("h") + + if self._word_title: + self._glos.setInfo("definition_has_headwords", "True") + + self._file = cfile + self._warnings: Counter[str] = collections.Counter() + + def close(self) -> None: + self._file.close() + self._file = nullBinaryIO + self._filename = "" + self._fileSize = 0 + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType]: + while line := self._file.readline(): + line = line.strip() + if not line: + continue + yield self.makeEntry(json_loads(line)) + for _msg, count in self._warnings.most_common(): + msg = _msg + if count > 1: + msg = f"[{count} times] {msg}" + log.warning(msg) + + def warning(self, msg: str) -> None: + self._warnings[msg] += 1 + + def makeEntry(self, data: dict[str, Any]) -> EntryType: # noqa: PLR0912 + from lxml import etree as ET + + glos = self._glos + f = BytesIO() + + def br() -> Element: + return ET.Element("br") + + keywords: list[str] = [] + inflectedKeywords: list[str] = [] + + word = data.get("word") + if word: + keywords.append(word) + + for formDict in 
data.get("forms", []): + form: str = formDict.get("form", "") + if not form: + continue + if len(form) > 80: + self.warning(f"'form' too long: {form}") + continue + source: str = formDict.get("source", "") + # tags = formDict.get("tags", []) + if source == "Inflection": + inflectedKeywords.append(form) + else: + keywords.append(form) + + keywords += inflectedKeywords + + with ET.htmlfile(f, encoding="utf-8") as hf: + with hf.element("div"): + if self._word_title: + for keyword in keywords: + with hf.element(glos.titleTag(keyword)): + hf.write(keyword) + hf.write(br()) + + hf_ = cast("T_htmlfile", hf) + + self.writeSoundList(hf_, data.get("sounds")) + + pos: str | None = data.get("pos") + if pos: + with hf.element("div", attrib={"class": "pos"}): + with hf.element("font", color=self._gram_color): + hf.write(pos) + + senses = data.get("senses") or [] + + self.writeSenseList(hf_, senses) # type: ignore + + self.writeSynonyms(hf_, data.get("synonyms")) # type: ignore + + self.writeAntonyms(hf_, data.get("antonyms")) # type: ignore + + # TODO: data.get("translations") + # list[dict[str, str]] + # dict keys: code, "lang", "sense", "word" + + etymology: str = data.get("etymology_text", "") + if etymology: + hf.write(br()) + with hf.element("div"): + hf.write(f"Etymology: {etymology}") + + if self._categories: + categories = [] + for sense in senses: + senseCats = sense.get("categories") + if senseCats: + categories += senseCats + self.writeSenseCategories(hf_, categories) + + defi = f.getvalue().decode("utf-8") + # defi = defi.replace("\xa0", " ") # do we need to do this? + file = self._file + return self._glos.newEntry( + keywords, + defi, + defiFormat="h", + byteProgress=(file.tell(), self._fileSize), + ) + + def writeSoundPron( + self, + hf: T_htmlfile, + sound: dict[str, Any], + ) -> None: + # "homophone" key found in Dutch and Arabic dictionaries + # (similar-sounding words for Arabic) + for key in ("ipa", "other", "rhymes", "homophone"): + value = sound.get(key) + if not value: + continue + with hf.element("font", color=self._pron_color): + hf.write(str(value)) + hf.write(f" ({key})") + + def writeSoundAudio( + self, + hf: T_htmlfile, + sound: dict[str, Any], + ) -> None: + # TODO: add a read-option for audio + # keys for audio: + # "audio" (file name), "text" (link text), "ogg_url", "mp3_url" + # possible "tags" (list[str]) + + text = sound.get("text") + if text: + hf.write(f"{text}: ") + with hf.element("audio", attrib={"controls": ""}): + for _format in self._audio_formats: + url = sound.get(f"{_format}_url") + if not url: + continue + with hf.element( + "source", + attrib={ + "src": url, + "type": f"audio/{_format}", + }, + ): + pass + + def writeSoundList( + self, + hf: T_htmlfile, + soundList: list[dict[str, Any]] | None, + ) -> None: + if not soundList: + return + + pronList: list[dict[str, Any]] = [] + audioList: list[dict[str, Any]] = [] + + for sound in soundList: + if "audio" in sound: + if self._audio: + audioList.append(sound) + continue + pronList.append(sound) + # can it contain both audio and pronunciation? 
+ + if pronList: + with hf.element("div", attrib={"class": "pronunciations"}): + for i, sound in enumerate(pronList): + if i > 0: + hf.write(", ") + self.writeSoundPron(hf, sound) + + for sound in audioList: + with hf.element("div", attrib={"class": "audio"}): + self.writeSoundAudio(hf, sound) + + def writeSenseList( + self, + hf: T_htmlfile, + senseList: list[dict[str, Any]], + ) -> None: + if not senseList: + return + + self.makeList( + hf, + senseList, + self.writeSense, + ) + + def writeSenseGloss( # noqa: PLR6301 + self, + hf: T_htmlfile, + text: str | None, + ) -> None: + hf.write(text or "") + + def writeSenseCategory( # noqa: PLR6301 + self, + hf: T_htmlfile, + category: dict[str, Any], + ) -> None: + # keys: name: str, kind: str, parents: list, source: str + # values for "source" (that I found): "w", "w+disamb" + name = category.get("name") + if not name: + self.warning(f"{category = }") + return + desc = name + source = category.get("source") + if source: + desc = f"{desc} (source: {source})" + hf.write(desc) + + def writeSenseCategories( + self, + hf: T_htmlfile, + categories: list[dict[str, Any]] | None, + ) -> None: + if not categories: + return + # long names, mostly about grammar? + with hf.element("div", attrib={"class": "categories"}): + hf.write("Categories: ") + self.makeList(hf, categories, self.writeSenseCategory) + + def writeSenseExample( # noqa: PLR6301, PLR0912 + self, + hf: T_htmlfile, + example: dict[str, str | list], + ) -> None: + # example keys: text, "english", "ref", "type" + textList: list[tuple[str | None, str]] = [] + text_: str | list = example.pop("example", "") + if text_: + assert isinstance(text_, str) + textList.append((None, text_)) + + example.pop("ref", "") + example.pop("type", "") + + for key, value in example.items(): + if not value: + continue + prefix: str | None = key + if prefix in ("text",): # noqa: PLR6201, FURB171 + prefix = None + if isinstance(value, str): + textList.append((prefix, value)) + elif isinstance(value, list): + for item in value: + if isinstance(item, str): + textList.append((prefix, item)) + elif isinstance(item, list): + textList += [(prefix, item2) for item2 in item] + else: + log.error(f"writeSenseExample: invalid type for {value=}") + + if not textList: + return + + def writePair(prefix: str | None, text: str) -> None: + if prefix: + with hf.element("b"): + hf.write(prefix) + hf.write(": ") + hf.write(text) + + if len(textList) == 1: + prefix, text = textList[0] + writePair(prefix, text) + return + + with hf.element("ul"): + for prefix, text in textList: + with hf.element("li"): + writePair(prefix, text) + + def writeSenseExamples( + self, + hf: T_htmlfile, + examples: list[dict[str, str | list]] | None, + ) -> None: + from lxml import etree as ET + + if not examples: + return + hf.write(ET.Element("br")) + with hf.element("div", attrib={"class": "examples"}): + hf.write("Examples:") + hf.write(ET.Element("br")) + for example in examples: + with hf.element( + "div", + attrib={ + "class": "example", + "style": f"padding: {self._example_padding};", + }, + ): + self.writeSenseExample(hf, example) + + def writeSenseFormOf( # noqa: PLR6301 + self, + hf: T_htmlfile, + form_of: dict[str, str], + ) -> None: + from lxml import etree as ET + + # {"word": ..., "extra": ...} + word = form_of.get("word") + if not word: + return + hf.write(word) + extra = form_of.get("extra") + if extra: + hf.write(ET.Element("br")) + hf.write(extra) + + def writeSenseFormOfList( + self, + hf: T_htmlfile, + form_of_list: list[dict[str, str]] | 
None, + ) -> None: + if not form_of_list: + return + with hf.element("div", attrib={"class": "form_of"}): + hf.write("Form of: ") + self.makeList(hf, form_of_list, self.writeSenseFormOf) + + def writeTags( + self, + hf: T_htmlfile, + tags: list[str] | None, + toRemove: list[str] | None, + ) -> None: + if not tags: + return + + if toRemove: + for tag in toRemove: + if tag in tags: + tags.remove(tag) + if not tags: + return + + with hf.element("div", attrib={"class": "tags"}): + for i, tag in enumerate(tags): + if i > 0: + hf.write(", ") + with hf.element("font", color=self._gram_color): + hf.write(tag) + + def writeTopics( + self, + hf: T_htmlfile, + topics: list[str] | None, + ) -> None: + if not topics: + return + + with hf.element("div", attrib={"class": "tags"}): + for i, topic in enumerate(topics): + if i > 0: + hf.write(" ") + with hf.element("span", style=self.topicStyle): + hf.write(topic) + + def addWordLink( # noqa: PLR6301 + self, + hf: T_htmlfile, + word: str, + wordClass: str = "", + ) -> None: + i = word.find(" [") + if i >= 0: + word = word[:i] + if not word: + return + attrib = {"href": f"bword://{word}"} + if wordClass: + attrib["class"] = wordClass + with hf.element( + "a", + attrib=attrib, + ): + hf.write(word) + + def writeSynonyms( + self, + hf: T_htmlfile, + synonyms: list[dict[str, Any]] | None, + ) -> None: + if not synonyms: + return + + # "word": "str", + # "sense": "str", + # "_dis1": "str", + # "tags": list[str] + # "extra": "str", + # "english": "str" + + with hf.element("div"): + hf.write("Synonyms: ") + for i, item in enumerate(synonyms): + if i > 0: + hf.write(", ") + word = item.get("word") + if not word: + continue + self.addWordLink(hf, word) + + def writeAntonyms( + self, + hf: T_htmlfile, + antonyms: list[dict[str, str]] | None, + ) -> None: + if not antonyms: + return + # dict keys: word + with hf.element("div"): + hf.write("Antonyms: ") + for i, item in enumerate(antonyms): + if i > 0: + hf.write(", ") + word = item.get("word") + if not word: + continue + self.addWordLink(hf, word, wordClass="antonym") + + def writeRelated( + self, + hf: T_htmlfile, + relatedList: list[dict[str, str]] | None, + ) -> None: + if not relatedList: + return + # dict keys: sense, "word", "english" + with hf.element("div"): + hf.write("Related: ") + for i, item in enumerate(relatedList): + if i > 0: + hf.write(", ") + word = item.get("word") + if not word: + continue + self.addWordLink(hf, word) + + def writeSenseLinks( + self, + hf: T_htmlfile, + linkList: list[list[str]] | None, + ) -> None: + if not linkList: + return + with hf.element("div"): + hf.write("Links: ") + for i, link in enumerate(linkList): + if len(link) != 2: + self.warning(f"unexpected {link =}") + continue + text, ref = link + sq = ref.find("#") + if sq == 0: + ref = text + elif sq > 0: + ref = ref[:sq] + if i > 0: + hf.write(", ") + self.addWordLink(hf, ref) + + def writeSense( + self, + hf: T_htmlfile, + sense: dict[str, Any], + ) -> None: + from lxml import etree as ET + + # tags seem to be mostly about grammar, so with format it like grammar + self.writeTags( + hf, + sense.get("tags"), + toRemove=["form-of"], + ) + + # for key in ("english",): + # text: "str | None" = sense.get("english") + # if not text: + # continue + # keyCap = key.capitalize() + # with hf.element("div"): + # with hf.element("b"): + # hf.write(keyCap) + # hf.write(f": {text}") + + # sense["glosses"] and sense["english"] seems to be unreliable + # for example: + # "raw_glosses": ["(short) story, fable, play"], + # "english": "short", 
+ # "glosses": ["story, fable, play"], + + glosses: list[str] | None = sense.get("raw_glosses") + if not glosses: + glosses = sense.get("glosses") + if glosses: + self.makeList(hf, glosses, self.writeSenseGloss) + + self.writeTopics(hf, sense.get("topics")) + + self.writeSenseFormOfList(hf, sense.get("form_of")) + + self.writeSynonyms(hf, sense.get("synonyms")) + + self.writeAntonyms(hf, sense.get("antonyms")) + + self.writeRelated(hf, sense.get("related")) + + self.writeSenseLinks(hf, sense.get("links")) + + self.writeSenseExamples(hf, sense.get("examples")) + + # alt_of[i]["word"] seem to point to a word that is + # mentioned in sense["raw_glosses"] + # so we could try to find that word and turn it into a link + # sense.get("alt_of"): list[dict[str, str]] | None + + # sense.get("wikipedia", []): list[str] + # sense.get("wikidata", []): list[str] + # sense.get("id", ""): str # not useful + # sense.get("senseid", []): list[str] # not useful + + hf.write(ET.Element("br")) + + @staticmethod + def makeList( # noqa: PLR0913 + hf: T_htmlfile, + input_objects: list[Any], + processor: Callable, + ordered: bool = True, + skip_single: bool = True, + # single_prefix: str = "", + # list_type: str = "", + ) -> None: + """Wrap elements into
                          if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + # if single_prefix: + # hf.write(single_prefix) + processor(hf, input_objects[0]) + return + + attrib: dict[str, str] = {} + # if list_type: + # attrib["type"] = list_type + + with hf.element("ol" if ordered else "ul", attrib=attrib): + for el in input_objects: + with hf.element("li"): + processor(hf, el) diff --git a/pyglossary/plugins/wordnet/__init__.py b/pyglossary/plugins/wordnet/__init__.py index b9e9007e4..ad237d80f 100644 --- a/pyglossary/plugins/wordnet/__init__.py +++ b/pyglossary/plugins/wordnet/__init__.py @@ -1,33 +1,11 @@ # -*- coding: utf-8 -*- -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 3 -# as published by the Free Software Foundation. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License -# for more details. -# -# Copyright (C) 2023 Saeed Rasooli -# Copyright (C) 2015 Igor Tkach -# -# This plugin is based on https://github.com/itkach/wordnet2slob from __future__ import annotations -import os -import re -import sys -from collections import defaultdict from typing import TYPE_CHECKING -from pyglossary.core import log +from .reader import Reader if TYPE_CHECKING: - import io - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option __all__ = [ @@ -61,303 +39,3 @@ # key is option/argument name, value is instance of Option optionsProp: dict[str, Option] = {} - -# original expression from -# http://stackoverflow.com/questions/694344/regular-expression-that-matches-between-quotes-containing-escaped-quotes -# "(?:[^\\"]+|\\.)*" -# some examples don't have closing quote which -# make the subn with this expression hang -# quotedTextPattern = re.compile(r'"(?:[^"]+|\.)*["|\n]') - -# make it a capturing group so that we can get rid of quotes -quotedTextPattern = re.compile(r'"([^"]+)"') - -refPattern = re.compile(r"`(\w+)'") - - -class SynSet: - def __init__(self, line: str | bytes) -> None: - self.line = line - if isinstance(line, bytes): - line = line.decode("utf-8") - meta, self.gloss = line.split("|") - self.meta_parts = meta.split() - - @property - def offset(self) -> int: - return int(self.meta_parts[0]) - - @property - def lex_filenum(self) -> str: - return self.meta_parts[1] - - @property - def ss_type(self) -> str: - return self.meta_parts[2] - - @property - def w_cnt(self) -> int: - return int(self.meta_parts[3], 16) - - @property - def words(self) -> list[str]: - return [self.meta_parts[4 + 2 * i].replace("_", " ") for i in range(self.w_cnt)] - - @property - def pointers(self) -> list[Pointer]: - p_cnt_index = 4 + 2 * self.w_cnt - p_cnt = self.meta_parts[p_cnt_index] - pointer_count = int(p_cnt) - start = p_cnt_index + 1 - return [ - Pointer(*self.meta_parts[start + i * 4 : start + (i + 1) * 4]) # type: ignore - for i in range(pointer_count) - ] - - def __repr__(self) -> str: - return f"SynSet({self.line!r})" - - -class PointerSymbols: - n = { - "!": "Antonyms", - "@": "Hypernyms", - "@i": "Instance hypernyms", - "~": "Hyponyms", - "~i": "Instance hyponyms", - "#m": "Member holonyms", - "#s": "Substance holonyms", - "#p": "Part holonyms", - "%m": "Member meronyms", - "%s": "Substance 
meronyms", - "%p": "Part meronyms", - "=": "Attributes", - "+": "Derivationally related forms", - ";c": "Domain of synset - TOPIC", - "-c": "Member of this domain - TOPIC", - ";r": "Domain of synset - REGION", - "-r": "Member of this domain - REGION", - ";u": "Domain of synset - USAGE", - "-u": "Member of this domain - USAGE", - "^": "Also see", - } - - v = { - "!": "Antonyms", - "@": "Hypernyms", - "~": "Hyponyms", - "*": "Entailments", - ">": "Cause", - "^": "Also see", - "$": "Verb group", - "+": "Derivationally related forms", - ";c": "Domain of synset - TOPIC", - ";r": "Domain of synset - REGION", - ";u": "Domain of synset - USAGE", - } - - a = s = { - "!": "Antonyms", - "+": "Derivationally related forms", - "&": "Similar to", - "<": "Participle of verb", - "\\": "Pertainyms", - "=": "Attributes", - "^": "Also see", - ";c": "Domain of synset - TOPIC", - ";r": "Domain of synset - REGION", - ";u": "Domain of synset - USAGE", - } - - r = { - "!": "Antonyms", - "\\": "Derived from adjective", - "+": "Derivationally related forms", - ";c": "Domain of synset - TOPIC", - ";r": "Domain of synset - REGION", - ";u": "Domain of synset - USAGE", - "^": "Also see", - } - - -class Pointer: - def __init__(self, symbol: str, offset: str, pos: str, source_target: str) -> None: - self.symbol = symbol - self.offset = int(offset) - self.pos = pos - self.source_target = source_target - self.source = int(source_target[:2], 16) - self.target = int(source_target[2:], 16) - - def __repr__(self) -> str: - return ( - f"Pointer({self.symbol!r}, {self.offset!r}, " - f"{self.pos!r}, {self.source_target!r})" - ) - - -class WordNet: - article_template = "

                          %s

                          %s" - synSetTypes = { - "n": "n.", - "v": "v.", - "a": "adj.", - "s": "adj. satellite", - "r": "adv.", - } - - file2pos = { - "data.adj": ["a", "s"], - "data.adv": ["r"], - "data.noun": ["n"], - "data.verb": ["v"], - } - - def __init__(self, wordnetdir: str) -> None: - self.wordnetdir = wordnetdir - self.collector: dict[str, list[str]] = defaultdict(list) - - @staticmethod - def iterlines(dict_dir: str) -> Iterator[str]: - for name in os.listdir(dict_dir): - if not name.startswith("data."): - continue - with open(os.path.join(dict_dir, name), encoding="utf-8") as f: - for line in f: - if not line.startswith(" "): - yield line - - # PLR0912 Too many branches (16 > 12) - def prepare(self) -> None: # noqa: PLR0912 - synSetTypes = self.synSetTypes - file2pos = self.file2pos - - dict_dir = self.wordnetdir - - files: dict[str, io.TextIOWrapper] = {} - for name in os.listdir(dict_dir): - if name.startswith("data.") and name in file2pos: - f = open(os.path.join(dict_dir, name), encoding="utf-8") # noqa: SIM115 - for key in file2pos[name]: - files[key] = f - - def a(word: str) -> str: - return f'{word}' - - for index, line in enumerate(self.iterlines(dict_dir)): - if index % 100 == 0 and index > 0: - sys.stdout.write(".") - sys.stdout.flush() - if index % 5000 == 0 and index > 0: - sys.stdout.write("\n") - sys.stdout.flush() - if not line or not line.strip(): - continue - synset = SynSet(line) - gloss_with_examples, _ = quotedTextPattern.subn( - lambda x: f'{x.group(1)}', - synset.gloss, - ) - gloss_with_examples, _ = refPattern.subn( - lambda x: a(x.group(1)), - gloss_with_examples, - ) - - words = synset.words - for index2, word in enumerate(words): - # TODO: move this block to a func - synonyms = ", ".join(a(w) for w in words if w != word) - synonyms_str = ( - f'
                          Synonyms: {synonyms}' - if synonyms - else "" - ) - pointers = defaultdict(list) - for pointer in synset.pointers: - if ( - pointer.source - and pointer.target - and pointer.source - 1 != index2 - ): - continue - symbol = pointer.symbol - if symbol and symbol[:1] in {";", "-"}: - continue - try: - symbol_desc = getattr(PointerSymbols, synset.ss_type)[symbol] - except KeyError: - log.warning( - f"unknown pointer symbol {symbol} for {synset.ss_type} ", - ) - symbol_desc = symbol - - data_file = files[pointer.pos] - data_file.seek(pointer.offset) - referenced_synset = SynSet(data_file.readline()) - if pointer.source == pointer.target == 0: - pointers[symbol_desc] = [ - w for w in referenced_synset.words if w not in words - ] - else: - referenced_word = referenced_synset.words[pointer.target - 1] - if referenced_word not in pointers[symbol_desc]: - pointers[symbol_desc].append(referenced_word) - - pointers_str = "".join( - [ - f'
                          {symbol_desc}: ' - + ", ".join(a(w) for w in referenced_words) - for symbol_desc, referenced_words in pointers.items() - if referenced_words - ], - ) - self.collector[word].append( - f'{synSetTypes[synset.ss_type]}' - f" {gloss_with_examples}{synonyms_str}{pointers_str}", - ) - sys.stdout.write("\n") - sys.stdout.flush() - - def process(self) -> Iterator[tuple[str, str]]: - article_template = self.article_template - - for title in self.collector: - article_pieces = self.collector[title] - article_pieces_count = len(article_pieces) - text = None - if article_pieces_count > 1: - ol = ["
                            "] + [f"
                          1. {ap}
                          2. " for ap in article_pieces] + ["
                          "] - text = article_template % (title, "".join(ol)) - elif article_pieces_count == 1: - text = article_template % (title, article_pieces[0]) - - if text: - yield title, text - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._wordCount = 0 - self.wordnet: WordNet | None = None - - def __len__(self) -> int: - return self._wordCount - - def open(self, filename: str) -> None: - self.wordnet = WordNet(filename) - log.info("Running wordnet.prepare()") - self.wordnet.prepare() - - # TODO: metadata - - def close(self) -> None: - self.wordnet = None - - def __iter__(self) -> Iterator[EntryType]: - if self.wordnet is None: - raise ValueError("self.wordnet is None") - glos = self._glos - for word, defi in self.wordnet.process(): - yield glos.newEntry(word, defi) diff --git a/pyglossary/plugins/wordnet/reader.py b/pyglossary/plugins/wordnet/reader.py new file mode 100644 index 000000000..434cafe2f --- /dev/null +++ b/pyglossary/plugins/wordnet/reader.py @@ -0,0 +1,330 @@ +# -*- coding: utf-8 -*- +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3 +# as published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License +# for more details. +# +# Copyright (C) 2023 Saeed Rasooli +# Copyright (C) 2015 Igor Tkach +# +# This plugin is based on https://github.com/itkach/wordnet2slob +from __future__ import annotations + +import os +import re +import sys +from collections import defaultdict +from typing import TYPE_CHECKING + +from pyglossary.core import log + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + +# original expression from +# http://stackoverflow.com/questions/694344/regular-expression-that-matches-between-quotes-containing-escaped-quotes +# "(?:[^\\"]+|\\.)*" +# some examples don't have closing quote which +# make the subn with this expression hang +# quotedTextPattern = re.compile(r'"(?:[^"]+|\.)*["|\n]') + +# make it a capturing group so that we can get rid of quotes +quotedTextPattern = re.compile(r'"([^"]+)"') + +refPattern = re.compile(r"`(\w+)'") + + +class SynSet: + def __init__(self, line: str | bytes) -> None: + self.line = line + if isinstance(line, bytes): + line = line.decode("utf-8") + meta, self.gloss = line.split("|") + self.meta_parts = meta.split() + + @property + def offset(self) -> int: + return int(self.meta_parts[0]) + + @property + def lex_filenum(self) -> str: + return self.meta_parts[1] + + @property + def ss_type(self) -> str: + return self.meta_parts[2] + + @property + def w_cnt(self) -> int: + return int(self.meta_parts[3], 16) + + @property + def words(self) -> list[str]: + return [self.meta_parts[4 + 2 * i].replace("_", " ") for i in range(self.w_cnt)] + + @property + def pointers(self) -> list[Pointer]: + p_cnt_index = 4 + 2 * self.w_cnt + p_cnt = self.meta_parts[p_cnt_index] + pointer_count = int(p_cnt) + start = p_cnt_index + 1 + return [ + Pointer(*self.meta_parts[start + i * 4 : start + (i + 1) * 4]) # type: ignore + for i in range(pointer_count) + ] + + def __repr__(self) -> str: + return f"SynSet({self.line!r})" + + +class PointerSymbols: + n = { + "!": "Antonyms", + "@": "Hypernyms", + "@i": 
"Instance hypernyms", + "~": "Hyponyms", + "~i": "Instance hyponyms", + "#m": "Member holonyms", + "#s": "Substance holonyms", + "#p": "Part holonyms", + "%m": "Member meronyms", + "%s": "Substance meronyms", + "%p": "Part meronyms", + "=": "Attributes", + "+": "Derivationally related forms", + ";c": "Domain of synset - TOPIC", + "-c": "Member of this domain - TOPIC", + ";r": "Domain of synset - REGION", + "-r": "Member of this domain - REGION", + ";u": "Domain of synset - USAGE", + "-u": "Member of this domain - USAGE", + "^": "Also see", + } + + v = { + "!": "Antonyms", + "@": "Hypernyms", + "~": "Hyponyms", + "*": "Entailments", + ">": "Cause", + "^": "Also see", + "$": "Verb group", + "+": "Derivationally related forms", + ";c": "Domain of synset - TOPIC", + ";r": "Domain of synset - REGION", + ";u": "Domain of synset - USAGE", + } + + a = s = { + "!": "Antonyms", + "+": "Derivationally related forms", + "&": "Similar to", + "<": "Participle of verb", + "\\": "Pertainyms", + "=": "Attributes", + "^": "Also see", + ";c": "Domain of synset - TOPIC", + ";r": "Domain of synset - REGION", + ";u": "Domain of synset - USAGE", + } + + r = { + "!": "Antonyms", + "\\": "Derived from adjective", + "+": "Derivationally related forms", + ";c": "Domain of synset - TOPIC", + ";r": "Domain of synset - REGION", + ";u": "Domain of synset - USAGE", + "^": "Also see", + } + + +class Pointer: + def __init__(self, symbol: str, offset: str, pos: str, source_target: str) -> None: + self.symbol = symbol + self.offset = int(offset) + self.pos = pos + self.source_target = source_target + self.source = int(source_target[:2], 16) + self.target = int(source_target[2:], 16) + + def __repr__(self) -> str: + return ( + f"Pointer({self.symbol!r}, {self.offset!r}, " + f"{self.pos!r}, {self.source_target!r})" + ) + + +class WordNet: + article_template = "

                          %s

                          %s" + synSetTypes = { + "n": "n.", + "v": "v.", + "a": "adj.", + "s": "adj. satellite", + "r": "adv.", + } + + file2pos = { + "data.adj": ["a", "s"], + "data.adv": ["r"], + "data.noun": ["n"], + "data.verb": ["v"], + } + + def __init__(self, wordnetdir: str) -> None: + self.wordnetdir = wordnetdir + self.collector: dict[str, list[str]] = defaultdict(list) + + @staticmethod + def iterlines(dict_dir: str) -> Iterator[str]: + for name in os.listdir(dict_dir): + if not name.startswith("data."): + continue + with open(os.path.join(dict_dir, name), encoding="utf-8") as f: + for line in f: + if not line.startswith(" "): + yield line + + # PLR0912 Too many branches (16 > 12) + def prepare(self) -> None: # noqa: PLR0912 + synSetTypes = self.synSetTypes + file2pos = self.file2pos + + dict_dir = self.wordnetdir + + files: dict[str, io.TextIOWrapper] = {} + for name in os.listdir(dict_dir): + if name.startswith("data.") and name in file2pos: + f = open(os.path.join(dict_dir, name), encoding="utf-8") # noqa: SIM115 + for key in file2pos[name]: + files[key] = f + + def a(word: str) -> str: + return f'{word}' + + for index, line in enumerate(self.iterlines(dict_dir)): + if index % 100 == 0 and index > 0: + sys.stdout.write(".") + sys.stdout.flush() + if index % 5000 == 0 and index > 0: + sys.stdout.write("\n") + sys.stdout.flush() + if not line or not line.strip(): + continue + synset = SynSet(line) + gloss_with_examples, _ = quotedTextPattern.subn( + lambda x: f'{x.group(1)}', + synset.gloss, + ) + gloss_with_examples, _ = refPattern.subn( + lambda x: a(x.group(1)), + gloss_with_examples, + ) + + words = synset.words + for index2, word in enumerate(words): + # TODO: move this block to a func + synonyms = ", ".join(a(w) for w in words if w != word) + synonyms_str = ( + f'
                          Synonyms: {synonyms}' + if synonyms + else "" + ) + pointers = defaultdict(list) + for pointer in synset.pointers: + if ( + pointer.source + and pointer.target + and pointer.source - 1 != index2 + ): + continue + symbol = pointer.symbol + if symbol and symbol[:1] in {";", "-"}: + continue + try: + symbol_desc = getattr(PointerSymbols, synset.ss_type)[symbol] + except KeyError: + log.warning( + f"unknown pointer symbol {symbol} for {synset.ss_type} ", + ) + symbol_desc = symbol + + data_file = files[pointer.pos] + data_file.seek(pointer.offset) + referenced_synset = SynSet(data_file.readline()) + if pointer.source == pointer.target == 0: + pointers[symbol_desc] = [ + w for w in referenced_synset.words if w not in words + ] + else: + referenced_word = referenced_synset.words[pointer.target - 1] + if referenced_word not in pointers[symbol_desc]: + pointers[symbol_desc].append(referenced_word) + + pointers_str = "".join( + [ + f'
                          {symbol_desc}: ' + + ", ".join(a(w) for w in referenced_words) + for symbol_desc, referenced_words in pointers.items() + if referenced_words + ], + ) + self.collector[word].append( + f'{synSetTypes[synset.ss_type]}' + f" {gloss_with_examples}{synonyms_str}{pointers_str}", + ) + sys.stdout.write("\n") + sys.stdout.flush() + + def process(self) -> Iterator[tuple[str, str]]: + article_template = self.article_template + + for title in self.collector: + article_pieces = self.collector[title] + article_pieces_count = len(article_pieces) + text = None + if article_pieces_count > 1: + ol = ["
                            "] + [f"
                          1. {ap}
                          2. " for ap in article_pieces] + ["
                          "] + text = article_template % (title, "".join(ol)) + elif article_pieces_count == 1: + text = article_template % (title, article_pieces[0]) + + if text: + yield title, text + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._wordCount = 0 + self.wordnet: WordNet | None = None + + def __len__(self) -> int: + return self._wordCount + + def open(self, filename: str) -> None: + self.wordnet = WordNet(filename) + log.info("Running wordnet.prepare()") + self.wordnet.prepare() + + # TODO: metadata + + def close(self) -> None: + self.wordnet = None + + def __iter__(self) -> Iterator[EntryType]: + if self.wordnet is None: + raise ValueError("self.wordnet is None") + glos = self._glos + for word, defi in self.wordnet.process(): + yield glos.newEntry(word, defi) diff --git a/pyglossary/plugins/wordset/__init__.py b/pyglossary/plugins/wordset/__init__.py index f2a0ce8b8..2cf426303 100644 --- a/pyglossary/plugins/wordset/__init__.py +++ b/pyglossary/plugins/wordset/__init__.py @@ -1,22 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from json import load -from os import listdir -from os.path import isfile, join, splitext -from typing import TYPE_CHECKING - -from pyglossary.core import log from pyglossary.option import ( EncodingOption, Option, ) -if TYPE_CHECKING: - from collections.abc import Iterator - from typing import Any - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader __all__ = [ "Reader", @@ -49,85 +39,3 @@ optionsProp: dict[str, Option] = { "encoding": EncodingOption(), } - - -class Reader: - _encoding: str = "utf-8" - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - self.defiTemplate = ( - "

                          " - '{speech_part}' - "
                          " - "{def}" - "
                          " - "{example}" - "

                          " - ) - """ - { - "id": "492099d426", - "def": "without musical accompaniment", - "example": "they performed a cappella", - "speech_part": "adverb" - }, - """ - - def close(self) -> None: - self._clear() - - def _clear(self) -> None: - self._filename = "" - - def open(self, filename: str) -> None: - self._filename = filename - name = self._glos.getInfo("name") - if not name or name == "data": - self._glos.setInfo("name", "Wordset.org") - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - return 0 - - @staticmethod - def fileNameSortKey(fname: str) -> str: - fname = splitext(fname)[0] - if fname == "misc": - return "\x80" - return fname - - @staticmethod - def sortKey(word: str) -> Any: - return word.lower().encode("utf-8", errors="replace") - - def __iter__(self) -> Iterator[EntryType]: - if not self._filename: - raise RuntimeError("iterating over a reader while it's not open") - - direc = self._filename - encoding = self._encoding - glos = self._glos - - for fname in sorted(listdir(direc), key=self.fileNameSortKey): - fpath = join(direc, fname) - if not (fname.endswith(".json") and isfile(fpath)): - continue - with open(fpath, encoding=encoding) as fileObj: - data: dict[str, dict[str, Any]] = load(fileObj) - for word in sorted(data, key=self.sortKey): - entryDict = data[word] - defi = "".join( - self.defiTemplate.format( - **{ - "word": word, - "def": meaning.get("def", ""), - "example": meaning.get("example", ""), - "speech_part": meaning.get("speech_part", ""), - }, - ) - for meaning in entryDict.get("meanings", []) - ) - yield glos.newEntry(word, defi, defiFormat="h") - log.info(f"finished reading {fname}") diff --git a/pyglossary/plugins/wordset/reader.py b/pyglossary/plugins/wordset/reader.py new file mode 100644 index 000000000..4c19b8b42 --- /dev/null +++ b/pyglossary/plugins/wordset/reader.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from json import load +from os import listdir +from os.path import isfile, join, splitext +from typing import TYPE_CHECKING + +from pyglossary.core import log + +if TYPE_CHECKING: + from collections.abc import Iterator + from typing import Any + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + _encoding: str = "utf-8" + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + self.defiTemplate = ( + "

                          " + '{speech_part}' + "
                          " + "{def}" + "
                          " + "{example}" + "

                          " + ) + """ + { + "id": "492099d426", + "def": "without musical accompaniment", + "example": "they performed a cappella", + "speech_part": "adverb" + }, + """ + + def close(self) -> None: + self._clear() + + def _clear(self) -> None: + self._filename = "" + + def open(self, filename: str) -> None: + self._filename = filename + name = self._glos.getInfo("name") + if not name or name == "data": + self._glos.setInfo("name", "Wordset.org") + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + return 0 + + @staticmethod + def fileNameSortKey(fname: str) -> str: + fname = splitext(fname)[0] + if fname == "misc": + return "\x80" + return fname + + @staticmethod + def sortKey(word: str) -> Any: + return word.lower().encode("utf-8", errors="replace") + + def __iter__(self) -> Iterator[EntryType]: + if not self._filename: + raise RuntimeError("iterating over a reader while it's not open") + + direc = self._filename + encoding = self._encoding + glos = self._glos + + for fname in sorted(listdir(direc), key=self.fileNameSortKey): + fpath = join(direc, fname) + if not (fname.endswith(".json") and isfile(fpath)): + continue + with open(fpath, encoding=encoding) as fileObj: + data: dict[str, dict[str, Any]] = load(fileObj) + for word in sorted(data, key=self.sortKey): + entryDict = data[word] + defi = "".join( + self.defiTemplate.format( + **{ + "word": word, + "def": meaning.get("def", ""), + "example": meaning.get("example", ""), + "speech_part": meaning.get("speech_part", ""), + }, + ) + for meaning in entryDict.get("meanings", []) + ) + yield glos.newEntry(word, defi, defiFormat="h") + log.info(f"finished reading {fname}") diff --git a/pyglossary/plugins/xdxf/__init__.py b/pyglossary/plugins/xdxf/__init__.py index 0df1b3a51..e3e32a2e1 100644 --- a/pyglossary/plugins/xdxf/__init__.py +++ b/pyglossary/plugins/xdxf/__init__.py @@ -1,52 +1,12 @@ # -*- coding: utf-8 -*- -# xdxf/__init__.py from __future__ import annotations -"""xdxf file format reader and utils to convert xdxf to html.""" -# -# Copyright © 2023 Saeed Rasooli -# Copyright © 2016 ivan tkachenko me@ratijas.tk -# -# some parts of this file include code from: -# Aard Dictionary Tools . -# Copyright © 2008-2009 Igor Tkach -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# You can get a copy of GNU General Public License along this program -# But you can always get it from http://www.gnu.org/licenses/gpl.txt -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -import re -import typing -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - import io - from collections.abc import Iterator, Sequence - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element - -from lxml import etree as ET - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import log -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, Option, ) -from pyglossary.text_utils import toStr + +from .reader import Reader __all__ = [ "Reader", @@ -112,212 +72,3 @@ ... 
""" - -if TYPE_CHECKING: - - class TransformerType(typing.Protocol): - def transform(self, article: Element) -> str: ... - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - _html: bool = True - _xsl: bool = False - - infoKeyMap = { - "full_name": "name", - "full_title": "name", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._encoding = "utf-8" - self._htmlTr: TransformerType | None = None - self._re_span_k = re.compile( - '[^<>]*(
                          )?', - ) - - def makeTransformer(self) -> None: - if self._xsl: - from pyglossary.xdxf.xsl_transform import XslXdxfTransformer - - self._htmlTr = XslXdxfTransformer(encoding=self._encoding) - return - - from pyglossary.xdxf.transform import XdxfTransformer - - self._htmlTr = XdxfTransformer(encoding=self._encoding) - - def open(self, filename: str) -> None: # noqa: PLR0912 - # - - self._filename = filename - if self._html: - self.makeTransformer() - self._glos.setDefaultDefiFormat("h") - else: - self._glos.setDefaultDefiFormat("x") - - cfile = self._file = cast( - "io.IOBase", - compressionOpen( - self._filename, - mode="rb", - ), - ) - - context = ET.iterparse( # type: ignore - cfile, - events=("end",), - ) - for _, _elem in context: - elem = cast("Element", _elem) - if elem.tag in {"meta_info", "ar", "k", "abr", "dtrn"}: - break - # every other tag before or is considered info - if elem.tag == "abbr_def": - continue - # in case of multiple or multiple tags, the last one - # will be stored. - # Very few formats support more than one language pair in their - # metadata, so it's not very useful to have multiple - if elem.tag == "from": - for key, value in elem.attrib.items(): - if key.endswith("}lang"): - self._glos.sourceLangName = value.split("-")[0] - break - continue - if elem.tag == "to": - for key, value in elem.attrib.items(): - if key.endswith("}lang"): - self._glos.targetLangName = value.split("-")[0] - break - continue - if not elem.text: - if elem.tag != "br": - log.warning(f"empty tag <{elem.tag}>") - continue - key = self.infoKeyMap.get(elem.tag, elem.tag) - self._glos.setInfo(key, elem.text) - - del context - - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - self._glos.setInfo("input_file_size", str(self._fileSize)) - else: - log.warning("XDXF Reader: file is not seekable") - self._file.close() - self._file = compressionOpen(self._filename, mode="rb") - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType]: - context = ET.iterparse( # type: ignore - self._file, - events=("end",), - tag="ar", - ) - for _, _article in context: - article = cast("Element", _article) - article.tail = None - words = [toStr(w) for w in self.titles(article)] - if self._htmlTr: - defi = self._htmlTr.transform(article) - defiFormat = "h" - if len(words) == 1: - defi = self._re_span_k.sub("", defi) - else: - b_defi = cast("bytes", ET.tostring(article, encoding=self._encoding)) - defi = b_defi[4:-5].decode(self._encoding).strip() - defiFormat = "x" - - # log.info(f"{defi=}, {words=}") - yield self._glos.newEntry( - words, - defi, - defiFormat=defiFormat, - byteProgress=(self._file.tell(), self._fileSize), - ) - # clean up preceding siblings to save memory - # this can reduce memory usage from 1 GB to ~25 MB - parent = article.getparent() - if parent is None: - continue - while article.getprevious() is not None: - del parent[0] - - def close(self) -> None: - self._file.close() - self._file = nullBinaryIO - - @staticmethod - def tostring( - elem: Element, - ) -> str: - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def titles(self, article: Element) -> list[str]: - """ - :param article: tag - :return: (title (str) | None, alternative titles (set)) - """ - from itertools import combinations - - titles: list[str] = [] - for title_element in article.findall("k"): - if title_element.text is None: - # TODO: look for tag? 
- log.warning(f"empty title element: {self.tostring(title_element)}") - continue - n_opts = len([c for c in title_element if c.tag == "opt"]) - if n_opts: - titles += [ - self._mktitle(title_element, comb) - for j in range(n_opts + 1) - for comb in combinations(list(range(n_opts)), j) - ] - else: - titles.append(self._mktitle(title_element)) - - return titles - - def _mktitle( # noqa: PLR6301 - self, - title_element: Element, - include_opts: Sequence | None = None, - ) -> str: - if include_opts is None: - include_opts = () - title = title_element.text or "" - opt_i = -1 - for c in title_element: - if c.tag == "nu" and c.tail: - if title: - title += c.tail - else: - title = c.tail - if c.tag == "opt" and c.text is not None: - opt_i += 1 - if opt_i in include_opts: - title += c.text - if c.tail: - title += c.tail - return title.strip() diff --git a/pyglossary/plugins/xdxf/reader.py b/pyglossary/plugins/xdxf/reader.py new file mode 100644 index 000000000..194d26d73 --- /dev/null +++ b/pyglossary/plugins/xdxf/reader.py @@ -0,0 +1,252 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2023 Saeed Rasooli +# Copyright © 2016 ivan tkachenko me@ratijas.tk +# +# some parts of this file include code from: +# Aard Dictionary Tools . +# Copyright © 2008-2009 Igor Tkach +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# You can get a copy of GNU General Public License along this program +# But you can always get it from http://www.gnu.org/licenses/gpl.txt +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +from __future__ import annotations + +import re +import typing +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Iterator, Sequence + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element + +from lxml import etree as ET + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import log +from pyglossary.io_utils import nullBinaryIO +from pyglossary.text_utils import toStr + +if TYPE_CHECKING: + + class TransformerType(typing.Protocol): + def transform(self, article: Element) -> str: ... + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _html: bool = True + _xsl: bool = False + + infoKeyMap = { + "full_name": "name", + "full_title": "name", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._encoding = "utf-8" + self._htmlTr: TransformerType | None = None + self._re_span_k = re.compile( + '[^<>]*(
                          )?', + ) + + def makeTransformer(self) -> None: + if self._xsl: + from pyglossary.xdxf.xsl_transform import XslXdxfTransformer + + self._htmlTr = XslXdxfTransformer(encoding=self._encoding) + return + + from pyglossary.xdxf.transform import XdxfTransformer + + self._htmlTr = XdxfTransformer(encoding=self._encoding) + + def open(self, filename: str) -> None: # noqa: PLR0912 + # + + self._filename = filename + if self._html: + self.makeTransformer() + self._glos.setDefaultDefiFormat("h") + else: + self._glos.setDefaultDefiFormat("x") + + cfile = self._file = cast( + "io.IOBase", + compressionOpen( + self._filename, + mode="rb", + ), + ) + + context = ET.iterparse( # type: ignore + cfile, + events=("end",), + ) + for _, _elem in context: + elem = cast("Element", _elem) + if elem.tag in {"meta_info", "ar", "k", "abr", "dtrn"}: + break + # every other tag before or
                          is considered info + if elem.tag == "abbr_def": + continue + # in case of multiple or multiple tags, the last one + # will be stored. + # Very few formats support more than one language pair in their + # metadata, so it's not very useful to have multiple + if elem.tag == "from": + for key, value in elem.attrib.items(): + if key.endswith("}lang"): + self._glos.sourceLangName = value.split("-")[0] + break + continue + if elem.tag == "to": + for key, value in elem.attrib.items(): + if key.endswith("}lang"): + self._glos.targetLangName = value.split("-")[0] + break + continue + if not elem.text: + if elem.tag != "br": + log.warning(f"empty tag <{elem.tag}>") + continue + key = self.infoKeyMap.get(elem.tag, elem.tag) + self._glos.setInfo(key, elem.text) + + del context + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + self._glos.setInfo("input_file_size", str(self._fileSize)) + else: + log.warning("XDXF Reader: file is not seekable") + self._file.close() + self._file = compressionOpen(self._filename, mode="rb") + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType]: + context = ET.iterparse( # type: ignore + self._file, + events=("end",), + tag="ar", + ) + for _, _article in context: + article = cast("Element", _article) + article.tail = None + words = [toStr(w) for w in self.titles(article)] + if self._htmlTr: + defi = self._htmlTr.transform(article) + defiFormat = "h" + if len(words) == 1: + defi = self._re_span_k.sub("", defi) + else: + b_defi = cast("bytes", ET.tostring(article, encoding=self._encoding)) + defi = b_defi[4:-5].decode(self._encoding).strip() + defiFormat = "x" + + # log.info(f"{defi=}, {words=}") + yield self._glos.newEntry( + words, + defi, + defiFormat=defiFormat, + byteProgress=(self._file.tell(), self._fileSize), + ) + # clean up preceding siblings to save memory + # this can reduce memory usage from 1 GB to ~25 MB + parent = article.getparent() + if parent is None: + continue + while article.getprevious() is not None: + del parent[0] + + def close(self) -> None: + self._file.close() + self._file = nullBinaryIO + + @staticmethod + def tostring( + elem: Element, + ) -> str: + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def titles(self, article: Element) -> list[str]: + """ + :param article: tag + :return: (title (str) | None, alternative titles (set)) + """ + from itertools import combinations + + titles: list[str] = [] + for title_element in article.findall("k"): + if title_element.text is None: + # TODO: look for tag? 
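# [editor's illustration, not part of the patch] The __iter__ method above drops
# already-processed preceding siblings after each article ("del parent[0]"),
# which is what keeps lxml's iterparse memory roughly constant on large files.
# A self-contained sketch of that pattern on a tiny in-memory document:
import io
from lxml import etree as ET

xml = b"<xdxf>" + b"".join(
	f"<ar><k>w{i}</k> def {i}</ar>".encode() for i in range(3)
) + b"</xdxf>"

for _, ar in ET.iterparse(io.BytesIO(xml), events=("end",), tag="ar"):
	print(ar.findtext("k"))  # w0, w1, w2
	parent = ar.getparent()
	if parent is not None:
		while ar.getprevious() is not None:
			del parent[0]  # drop consumed siblings to bound memory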
+ log.warning(f"empty title element: {self.tostring(title_element)}") + continue + n_opts = len([c for c in title_element if c.tag == "opt"]) + if n_opts: + titles += [ + self._mktitle(title_element, comb) + for j in range(n_opts + 1) + for comb in combinations(list(range(n_opts)), j) + ] + else: + titles.append(self._mktitle(title_element)) + + return titles + + def _mktitle( # noqa: PLR6301 + self, + title_element: Element, + include_opts: Sequence | None = None, + ) -> str: + if include_opts is None: + include_opts = () + title = title_element.text or "" + opt_i = -1 + for c in title_element: + if c.tag == "nu" and c.tail: + if title: + title += c.tail + else: + title = c.tail + if c.tag == "opt" and c.text is not None: + opt_i += 1 + if opt_i in include_opts: + title += c.text + if c.tail: + title += c.tail + return title.strip() diff --git a/pyglossary/plugins/xdxf_css/__init__.py b/pyglossary/plugins/xdxf_css/__init__.py index df7d27333..a0972364f 100644 --- a/pyglossary/plugins/xdxf_css/__init__.py +++ b/pyglossary/plugins/xdxf_css/__init__.py @@ -1,54 +1,16 @@ # -*- coding: utf-8 -*- -# xdxf/__init__.py from __future__ import annotations -"""xdxf file format reader and utils to convert xdxf to html.""" -# -# Copyright © 2023 Saeed Rasooli -# Copyright © 2016 ivan tkachenko me@ratijas.tk -# -# some parts of this file include code from: -# Aard Dictionary Tools . -# Copyright © 2008-2009 Igor Tkach -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# You can get a copy of GNU General Public License along this program -# But you can always get it from http://www.gnu.org/licenses/gpl.txt -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -import re -import typing -from os.path import join -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING from pyglossary.option import BoolOption -if TYPE_CHECKING: - import io - from collections.abc import Iterator, Sequence +from .reader import Reader - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element +if TYPE_CHECKING: from pyglossary.option import Option -from lxml import etree as ET - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import log, rootDir -from pyglossary.io_utils import nullBinaryIO -from pyglossary.text_utils import toStr - __all__ = [ "Reader", "description", @@ -110,241 +72,3 @@ ... """ - -if TYPE_CHECKING: - - class TransformerType(typing.Protocol): - def transform(self, article: Element) -> str: ... - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - _html: bool = True - - infoKeyMap = { - "full_name": "name", - "full_title": "name", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._encoding = "utf-8" - self._htmlTr: TransformerType | None = None - self._re_span_k = re.compile( - '[^<>]*(
                          )?', - ) - self._has_added_css: bool = False - self._has_added_js: bool = False - self._abbr_defs_js: bytes - - def makeTransformer(self) -> None: - from pyglossary.xdxf.css_js_transform import XdxfTransformer - - self._htmlTr = XdxfTransformer(encoding=self._encoding) - - def open(self, filename: str) -> None: # noqa: PLR0912 - # - - self._filename = filename - self.makeTransformer() - self._glos.setDefaultDefiFormat("h") - - cfile = self._file = cast( - "io.IOBase", - compressionOpen( - self._filename, - mode="rb", - ), - ) - - context = ET.iterparse( # type: ignore - cfile, - events=("end",), - ) - abbr_defs: list[Element] = [] - for _, _elem in context: - elem = cast("Element", _elem) - if elem.tag in {"meta_info", "ar", "k", "abr", "dtrn"}: - break - # every other tag before or
                          is considered info - if elem.tag == "abbr_def": - abbr_defs.append(elem) - continue - # in case of multiple or multiple tags, the last one - # will be stored. - # Very few formats support more than one language pair in their - # metadata, so it's not very useful to have multiple - if elem.tag == "from": - for key, value in elem.attrib.items(): - if key.endswith("}lang"): - self._glos.sourceLangName = value.split("-")[0] - break - continue - if elem.tag == "to": - for key, value in elem.attrib.items(): - if key.endswith("}lang"): - self._glos.targetLangName = value.split("-")[0] - break - continue - if not elem.text: - if elem.tag != "br": - log.warning(f"empty tag <{elem.tag}>") - continue - key = self.infoKeyMap.get(elem.tag, elem.tag) - self._glos.setInfo(key, elem.text) - self._abbr_defs_js = self.generate_abbr_js(abbr_defs) - del context - - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - self._glos.setInfo("input_file_size", str(self._fileSize)) - else: - log.warning("XDXF Reader: file is not seekable") - self._file.close() - self._file = compressionOpen(self._filename, mode="rb") - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType]: - context = ET.iterparse( # type: ignore - self._file, - events=("end",), - tag="ar", - ) - - if not self._has_added_css: - self._has_added_css = True - cssPath = join(rootDir, "pyglossary", "xdxf", "xdxf.css") - with open(cssPath, "rb") as css_file: - yield self._glos.newDataEntry("css/xdxf.css", css_file.read()) - - if self._abbr_defs_js is not None and not self._has_added_js: - self._has_added_js = True - yield self._glos.newDataEntry("js/xdxf.js", self._abbr_defs_js) - - for _, _article in context: - article = cast("Element", _article) - article.tail = None - words = [toStr(w) for w in self.titles(article)] - - defi = self._htmlTr.transform(article) - defiFormat = "h" - if len(words) == 1: - defi = self._re_span_k.sub("", defi) - - defi = f""" - - - - - - {defi} - - -""" - # log.info(f"{defi=}, {words=}") - yield self._glos.newEntry( - words, - defi, - defiFormat=defiFormat, - byteProgress=(self._file.tell(), self._fileSize), - ) - # clean up preceding siblings to save memory - # this can reduce memory usage from 1 GB to ~25 MB - parent = article.getparent() - if parent is None: - continue - while article.getprevious() is not None: - del parent[0] - - def close(self) -> None: - self._file.close() - self._file = nullBinaryIO - - def generate_abbr_js(self, abbr_defs: list[Element]) -> bytes: - abbr_map_js = """const abbr_map = new Map();\n""" - for abbr_def in abbr_defs: - abbr_k_list: list[str] = [] - abbr_v_text = "" - for child in abbr_def.xpath("child::node()"): - if child.tag == "abbr_k": - abbr_k_list.append(self._htmlTr.stringify_children(child)) - if child.tag == "abbr_v": - abbr_v_text = self._htmlTr.stringify_children(child) - # TODO escape apostrophes - for abbr_k in abbr_k_list: - if abbr_k and abbr_v_text: - abbr_map_js += f"abbr_map.set('{abbr_k}', '{abbr_v_text}');\n" - with open(join(rootDir, "pyglossary", "xdxf", "xdxf.js"), "rb") as js_file: - return abbr_map_js.encode(encoding="utf-8") + js_file.read() - - @staticmethod - def tostring( - elem: Element, - ) -> str: - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def titles(self, article: Element) -> list[str]: - """ - :param article: tag - :return: (title (str) | None, alternative titles (set)) - """ - from itertools 
import combinations - - titles: list[str] = [] - for title_element in article.findall("k"): - if title_element.text is None: - # TODO: look for tag? - log.warning(f"empty title element: {self.tostring(title_element)}") - continue - n_opts = len([c for c in title_element if c.tag == "opt"]) - if n_opts: - titles += [ - self._mktitle(title_element, comb) - for j in range(n_opts + 1) - for comb in combinations(list(range(n_opts)), j) - ] - else: - titles.append(self._mktitle(title_element)) - - return titles - - def _mktitle( # noqa: PLR6301 - self, - title_element: Element, - include_opts: Sequence | None = None, - ) -> str: - if include_opts is None: - include_opts = () - title = title_element.text or "" - opt_i = -1 - for c in title_element: - if c.tag == "nu" and c.tail: - if title: - title += c.tail - else: - title = c.tail - if c.tag == "opt" and c.text is not None: - opt_i += 1 - if opt_i in include_opts: - title += c.text - if c.tail: - title += c.tail - return title.strip() diff --git a/pyglossary/plugins/xdxf_css/reader.py b/pyglossary/plugins/xdxf_css/reader.py new file mode 100644 index 000000000..1a1f0a076 --- /dev/null +++ b/pyglossary/plugins/xdxf_css/reader.py @@ -0,0 +1,284 @@ +# -*- coding: utf-8 -*- +# xdxf file format reader and utils to convert xdxf to html. +# +# Copyright © 2023 Saeed Rasooli +# Copyright © 2016 ivan tkachenko me@ratijas.tk +# +# some parts of this file include code from: +# Aard Dictionary Tools . +# Copyright © 2008-2009 Igor Tkach +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# You can get a copy of GNU General Public License along this program +# But you can always get it from http://www.gnu.org/licenses/gpl.txt +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +from __future__ import annotations + +import re +import typing +from os.path import join +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Iterator, Sequence + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element + + +from lxml import etree as ET + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import log, rootDir +from pyglossary.io_utils import nullBinaryIO +from pyglossary.text_utils import toStr + +if TYPE_CHECKING: + + class TransformerType(typing.Protocol): + def transform(self, article: Element) -> str: ... + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _html: bool = True + + infoKeyMap = { + "full_name": "name", + "full_title": "name", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._encoding = "utf-8" + self._htmlTr: TransformerType | None = None + self._re_span_k = re.compile( + '[^<>]*(
                          )?', + ) + self._has_added_css: bool = False + self._has_added_js: bool = False + self._abbr_defs_js: bytes + + def makeTransformer(self) -> None: + from pyglossary.xdxf.css_js_transform import XdxfTransformer + + self._htmlTr = XdxfTransformer(encoding=self._encoding) + + def open(self, filename: str) -> None: # noqa: PLR0912 + # + + self._filename = filename + self.makeTransformer() + self._glos.setDefaultDefiFormat("h") + + cfile = self._file = cast( + "io.IOBase", + compressionOpen( + self._filename, + mode="rb", + ), + ) + + context = ET.iterparse( # type: ignore + cfile, + events=("end",), + ) + abbr_defs: list[Element] = [] + for _, _elem in context: + elem = cast("Element", _elem) + if elem.tag in {"meta_info", "ar", "k", "abr", "dtrn"}: + break + # every other tag before or
                          is considered info + if elem.tag == "abbr_def": + abbr_defs.append(elem) + continue + # in case of multiple or multiple tags, the last one + # will be stored. + # Very few formats support more than one language pair in their + # metadata, so it's not very useful to have multiple + if elem.tag == "from": + for key, value in elem.attrib.items(): + if key.endswith("}lang"): + self._glos.sourceLangName = value.split("-")[0] + break + continue + if elem.tag == "to": + for key, value in elem.attrib.items(): + if key.endswith("}lang"): + self._glos.targetLangName = value.split("-")[0] + break + continue + if not elem.text: + if elem.tag != "br": + log.warning(f"empty tag <{elem.tag}>") + continue + key = self.infoKeyMap.get(elem.tag, elem.tag) + self._glos.setInfo(key, elem.text) + self._abbr_defs_js = self.generate_abbr_js(abbr_defs) + del context + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + self._glos.setInfo("input_file_size", str(self._fileSize)) + else: + log.warning("XDXF Reader: file is not seekable") + self._file.close() + self._file = compressionOpen(self._filename, mode="rb") + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType]: + context = ET.iterparse( # type: ignore + self._file, + events=("end",), + tag="ar", + ) + + if not self._has_added_css: + self._has_added_css = True + cssPath = join(rootDir, "pyglossary", "xdxf", "xdxf.css") + with open(cssPath, "rb") as css_file: + yield self._glos.newDataEntry("css/xdxf.css", css_file.read()) + + if self._abbr_defs_js is not None and not self._has_added_js: + self._has_added_js = True + yield self._glos.newDataEntry("js/xdxf.js", self._abbr_defs_js) + + for _, _article in context: + article = cast("Element", _article) + article.tail = None + words = [toStr(w) for w in self.titles(article)] + + defi = self._htmlTr.transform(article) + defiFormat = "h" + if len(words) == 1: + defi = self._re_span_k.sub("", defi) + + defi = f""" + + + + + + {defi} + + +""" + # log.info(f"{defi=}, {words=}") + yield self._glos.newEntry( + words, + defi, + defiFormat=defiFormat, + byteProgress=(self._file.tell(), self._fileSize), + ) + # clean up preceding siblings to save memory + # this can reduce memory usage from 1 GB to ~25 MB + parent = article.getparent() + if parent is None: + continue + while article.getprevious() is not None: + del parent[0] + + def close(self) -> None: + self._file.close() + self._file = nullBinaryIO + + def generate_abbr_js(self, abbr_defs: list[Element]) -> bytes: + abbr_map_js = """const abbr_map = new Map();\n""" + for abbr_def in abbr_defs: + abbr_k_list: list[str] = [] + abbr_v_text = "" + for child in abbr_def.xpath("child::node()"): + if child.tag == "abbr_k": + abbr_k_list.append(self._htmlTr.stringify_children(child)) + if child.tag == "abbr_v": + abbr_v_text = self._htmlTr.stringify_children(child) + # TODO escape apostrophes + for abbr_k in abbr_k_list: + if abbr_k and abbr_v_text: + abbr_map_js += f"abbr_map.set('{abbr_k}', '{abbr_v_text}');\n" + with open(join(rootDir, "pyglossary", "xdxf", "xdxf.js"), "rb") as js_file: + return abbr_map_js.encode(encoding="utf-8") + js_file.read() + + @staticmethod + def tostring( + elem: Element, + ) -> str: + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def titles(self, article: Element) -> list[str]: + """ + :param article: tag + :return: (title (str) | None, alternative titles (set)) + """ + from itertools 
import combinations + + titles: list[str] = [] + for title_element in article.findall("k"): + if title_element.text is None: + # TODO: look for tag? + log.warning(f"empty title element: {self.tostring(title_element)}") + continue + n_opts = len([c for c in title_element if c.tag == "opt"]) + if n_opts: + titles += [ + self._mktitle(title_element, comb) + for j in range(n_opts + 1) + for comb in combinations(list(range(n_opts)), j) + ] + else: + titles.append(self._mktitle(title_element)) + + return titles + + def _mktitle( # noqa: PLR6301 + self, + title_element: Element, + include_opts: Sequence | None = None, + ) -> str: + if include_opts is None: + include_opts = () + title = title_element.text or "" + opt_i = -1 + for c in title_element: + if c.tag == "nu" and c.tail: + if title: + title += c.tail + else: + title = c.tail + if c.tag == "opt" and c.text is not None: + opt_i += 1 + if opt_i in include_opts: + title += c.text + if c.tail: + title += c.tail + return title.strip() diff --git a/pyglossary/plugins/xdxf_lax/__init__.py b/pyglossary/plugins/xdxf_lax/__init__.py index f6fac25bb..521c1597f 100644 --- a/pyglossary/plugins/xdxf_lax/__init__.py +++ b/pyglossary/plugins/xdxf_lax/__init__.py @@ -1,53 +1,12 @@ # -*- coding: utf-8 -*- -# from __future__ import annotations -"""Lax implementation of xdxf reader.""" -# -# Copyright © 2023 Saeed Rasooli -# Copyright © 2016 ivan tkachenko me@ratijas.tk -# -# some parts of this file include code from: -# Aard Dictionary Tools . -# Copyright © 2008-2009 Igor Tkach -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# You can get a copy of GNU General Public License along this program -# But you can always get it from http://www.gnu.org/licenses/gpl.txt -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -import re -import typing -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - import io - from collections.abc import Iterator, Sequence - - from lxml.html import HtmlElement as Element - - from pyglossary.glossary_types import EntryType, GlossaryType - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import log -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, Option, ) -from pyglossary.text_utils import toStr -from pyglossary.xdxf.transform import XdxfTransformer -from pyglossary.xdxf.xsl_transform import XslXdxfTransformer + +from .reader import Reader __all__ = [ "Reader", @@ -83,204 +42,3 @@ comment="Use XSL transformation", ), } - - -if TYPE_CHECKING: - - class TransformerType(typing.Protocol): - def transform(self, article: Element) -> str: ... - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - _html: bool = True - _xsl: bool = False - - infoKeyMap = { - "full_name": "name", - "full_title": "name", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._encoding = "utf-8" - self._htmlTr: TransformerType | None = None - self._re_span_k = re.compile( - '[^<>]*(
                          )?', - ) - - def readUntil(self, untilByte: bytes) -> tuple[int, bytes]: - file = self._file - buf = b"" - while True: - tmp = file.read(100) - if not tmp: - break - buf += tmp - index = buf.find(untilByte) - if index < 0: - continue - file.seek(file.tell() - len(buf) + index) - return index, buf[:index] - return -1, buf - - def _readOneMetadata(self, tag: str, infoKey: str) -> None: - from lxml.etree import XML - - endTag = f"".encode("ascii") - descStart, _ = self.readUntil(f"<{tag}>".encode("ascii")) - if descStart < 0: - log.warning(f"did not find {tag} open") - return - - descEnd, desc = self.readUntil(endTag) - if descEnd < 0: - log.warning(f"did not find {tag} close") - return - - desc += endTag - elem = XML(desc) - if elem.text: - self._glos.setInfo(infoKey, elem.text) - - def readMetadata(self) -> None: - file = self._file - pos = file.tell() - self._readOneMetadata("full_name", "title") - file.seek(pos) - self._readOneMetadata("description", "description") - - def open(self, filename: str) -> None: - # - self._filename = filename - if self._html: - if self._xsl: - self._htmlTr = XslXdxfTransformer(encoding=self._encoding) - else: - self._htmlTr = XdxfTransformer(encoding=self._encoding) - self._glos.setDefaultDefiFormat("h") - else: - self._glos.setDefaultDefiFormat("x") - - cfile = self._file = compressionOpen(self._filename, mode="rb") - - self.readMetadata() - - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - self._glos.setInfo("input_file_size", str(self._fileSize)) - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType]: - from lxml.html import fromstring, tostring - - while True: - start, _ = self.readUntil(b"") - if end < 0: - break - b_article += b"
                          " - s_article = b_article.decode("utf-8") - try: - article = cast("Element", fromstring(s_article)) - except Exception as e: - log.exception(s_article) - raise e from None - words = [toStr(w) for w in self.titles(article)] - if self._htmlTr: - defi = self._htmlTr.transform(article) - defiFormat = "h" - if len(words) == 1: - defi = self._re_span_k.sub("", defi) - else: - b_defi = cast("bytes", tostring(article, encoding=self._encoding)) - defi = b_defi[4:-5].decode(self._encoding).strip() - defiFormat = "x" - - # log.info(f"{defi=}, {words=}") - yield self._glos.newEntry( - words, - defi, - defiFormat=defiFormat, - byteProgress=(self._file.tell(), self._fileSize), - ) - - def close(self) -> None: - if self._file: - self._file.close() - self._file = nullBinaryIO - - @staticmethod - def tostring( - elem: Element, - ) -> str: - from lxml.html import tostring - - return ( - tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def titles(self, article: Element) -> list[str]: - """ - :param article: tag - :return: (title (str) | None, alternative titles (set)) - """ - from itertools import combinations - - titles: list[str] = [] - for title_element in article.findall("k"): - if title_element.text is None: - # TODO: look for tag? - log.warning(f"empty title element: {self.tostring(title_element)}") - continue - n_opts = len([c for c in title_element if c.tag == "opt"]) - if n_opts: - titles += [ - self._mktitle(title_element, comb) - for j in range(n_opts + 1) - for comb in combinations(list(range(n_opts)), j) - ] - else: - titles.append(self._mktitle(title_element)) - - return titles - - def _mktitle( # noqa: PLR6301 - self, - title_element: Element, - include_opts: Sequence | None = None, - ) -> str: - if include_opts is None: - include_opts = () - title = title_element.text or "" - opt_i = -1 - for c in title_element: - if c.tag == "nu" and c.tail: - if title: - title += c.tail - else: - title = c.tail - if c.tag == "opt" and c.text is not None: - opt_i += 1 - if opt_i in include_opts: - title += c.text - if c.tail: - title += c.tail - return title.strip() diff --git a/pyglossary/plugins/xdxf_lax/reader.py b/pyglossary/plugins/xdxf_lax/reader.py new file mode 100644 index 000000000..806787246 --- /dev/null +++ b/pyglossary/plugins/xdxf_lax/reader.py @@ -0,0 +1,246 @@ +# -*- coding: utf-8 -*- +# +# Lax implementation of xdxf reader. +# +# Copyright © 2023 Saeed Rasooli +# Copyright © 2016 ivan tkachenko me@ratijas.tk +# +# some parts of this file include code from: +# Aard Dictionary Tools . +# Copyright © 2008-2009 Igor Tkach +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# You can get a copy of GNU General Public License along this program +# But you can always get it from http://www.gnu.org/licenses/gpl.txt +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +from __future__ import annotations + +import re +import typing +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Iterator, Sequence + + from lxml.html import HtmlElement as Element + + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import log +from pyglossary.io_utils import nullBinaryIO +from pyglossary.text_utils import toStr +from pyglossary.xdxf.transform import XdxfTransformer +from pyglossary.xdxf.xsl_transform import XslXdxfTransformer + +if TYPE_CHECKING: + + class TransformerType(typing.Protocol): + def transform(self, article: Element) -> str: ... + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _html: bool = True + _xsl: bool = False + + infoKeyMap = { + "full_name": "name", + "full_title": "name", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._encoding = "utf-8" + self._htmlTr: TransformerType | None = None + self._re_span_k = re.compile( + '[^<>]*(
                          )?', + ) + + def readUntil(self, untilByte: bytes) -> tuple[int, bytes]: + file = self._file + buf = b"" + while True: + tmp = file.read(100) + if not tmp: + break + buf += tmp + index = buf.find(untilByte) + if index < 0: + continue + file.seek(file.tell() - len(buf) + index) + return index, buf[:index] + return -1, buf + + def _readOneMetadata(self, tag: str, infoKey: str) -> None: + from lxml.etree import XML + + endTag = f"".encode("ascii") + descStart, _ = self.readUntil(f"<{tag}>".encode("ascii")) + if descStart < 0: + log.warning(f"did not find {tag} open") + return + + descEnd, desc = self.readUntil(endTag) + if descEnd < 0: + log.warning(f"did not find {tag} close") + return + + desc += endTag + elem = XML(desc) + if elem.text: + self._glos.setInfo(infoKey, elem.text) + + def readMetadata(self) -> None: + file = self._file + pos = file.tell() + self._readOneMetadata("full_name", "title") + file.seek(pos) + self._readOneMetadata("description", "description") + + def open(self, filename: str) -> None: + # + self._filename = filename + if self._html: + if self._xsl: + self._htmlTr = XslXdxfTransformer(encoding=self._encoding) + else: + self._htmlTr = XdxfTransformer(encoding=self._encoding) + self._glos.setDefaultDefiFormat("h") + else: + self._glos.setDefaultDefiFormat("x") + + cfile = self._file = compressionOpen(self._filename, mode="rb") + + self.readMetadata() + + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + self._glos.setInfo("input_file_size", str(self._fileSize)) + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType]: + from lxml.html import fromstring, tostring + + while True: + start, _ = self.readUntil(b"") + if end < 0: + break + b_article += b"
                          " + s_article = b_article.decode("utf-8") + try: + article = cast("Element", fromstring(s_article)) + except Exception as e: + log.exception(s_article) + raise e from None + words = [toStr(w) for w in self.titles(article)] + if self._htmlTr: + defi = self._htmlTr.transform(article) + defiFormat = "h" + if len(words) == 1: + defi = self._re_span_k.sub("", defi) + else: + b_defi = cast("bytes", tostring(article, encoding=self._encoding)) + defi = b_defi[4:-5].decode(self._encoding).strip() + defiFormat = "x" + + # log.info(f"{defi=}, {words=}") + yield self._glos.newEntry( + words, + defi, + defiFormat=defiFormat, + byteProgress=(self._file.tell(), self._fileSize), + ) + + def close(self) -> None: + if self._file: + self._file.close() + self._file = nullBinaryIO + + @staticmethod + def tostring( + elem: Element, + ) -> str: + from lxml.html import tostring + + return ( + tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def titles(self, article: Element) -> list[str]: + """ + :param article: tag + :return: (title (str) | None, alternative titles (set)) + """ + from itertools import combinations + + titles: list[str] = [] + for title_element in article.findall("k"): + if title_element.text is None: + # TODO: look for tag? + log.warning(f"empty title element: {self.tostring(title_element)}") + continue + n_opts = len([c for c in title_element if c.tag == "opt"]) + if n_opts: + titles += [ + self._mktitle(title_element, comb) + for j in range(n_opts + 1) + for comb in combinations(list(range(n_opts)), j) + ] + else: + titles.append(self._mktitle(title_element)) + + return titles + + def _mktitle( # noqa: PLR6301 + self, + title_element: Element, + include_opts: Sequence | None = None, + ) -> str: + if include_opts is None: + include_opts = () + title = title_element.text or "" + opt_i = -1 + for c in title_element: + if c.tag == "nu" and c.tail: + if title: + title += c.tail + else: + title = c.tail + if c.tag == "opt" and c.text is not None: + opt_i += 1 + if opt_i in include_opts: + title += c.text + if c.tail: + title += c.tail + return title.strip() diff --git a/pyglossary/plugins/yomichan/__init__.py b/pyglossary/plugins/yomichan/__init__.py index 428766582..2fa262e3c 100644 --- a/pyglossary/plugins/yomichan/__init__.py +++ b/pyglossary/plugins/yomichan/__init__.py @@ -1,12 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import json -import os -import re -from os.path import join -from typing import TYPE_CHECKING, Any - from pyglossary.flags import ALWAYS from pyglossary.option import ( BoolOption, @@ -15,10 +9,7 @@ StrOption, ) -if TYPE_CHECKING: - from collections.abc import Generator, Sequence - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -180,239 +171,3 @@ ), ), } - - -def _isKana(char: str) -> bool: - assert len(char) == 1 - val = ord(char) - return ( - 0x3040 <= val <= 0x309F # Hiragana - or 0x30A0 <= val <= 0x30FF # Katakana (incl. center dot) - or 0xFF65 <= val <= 0xFF9F # Half-width Katakana (incl. 
center dot) - ) - - -def _isKanji(char: str) -> bool: - assert len(char) == 1 - val = ord(char) - return ( - 0x3400 <= val <= 0x4DBF # CJK Unified Ideographs Extension A - or 0x4E00 <= val <= 0x9FFF # CJK Unified Ideographs - or 0xF900 <= val <= 0xFAFF # CJK Compatibility Ideographs - or 0x20000 <= val <= 0x2A6DF # CJK Unified Ideographs Extension B - or 0x2A700 <= val <= 0x2B73F # CJK Unified Ideographs Extension C - or 0x2B740 <= val <= 0x2B81F # CJK Unified Ideographs Extension D - or 0x2F800 <= val <= 0x2FA1F # CJK Compatibility Ideographs Supplement - ) - - -def _uniqueList(lst: Sequence[str]) -> list[str]: - seen: set[str] = set() - result: list[str] = [] - for elem in lst: - if elem not in seen: - seen.add(elem) - result.append(elem) - - return result - - -def _compilePat(pattern: str) -> re.Pattern | None: - if not pattern: - return None - return re.compile(pattern) - - -class Writer: - depends = { - "bs4": "beautifulsoup4", - } - - _term_bank_size = 10_000 - _term_from_headword_only = True - _no_term_from_reading = True - _delete_word_pattern = "" - _ignore_word_with_pattern = "" - _alternates_from_word_pattern = "" - _alternates_from_defi_pattern = "" - _rule_v1_defi_pattern = "" - _rule_v5_defi_pattern = "" - _rule_vs_defi_pattern = "" - _rule_vk_defi_pattern = "" - _rule_adji_defi_pattern = "" - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - # Yomichan technically supports "structured content" that renders to - # HTML, but it doesn't seem widely used. So here we also strip HTML - # formatting for simplicity. - glos.removeHtmlTagsAll() - self.delete_word_pattern = _compilePat(self._delete_word_pattern) - self.ignore_word_with_pattern = _compilePat(self._ignore_word_with_pattern) - self.alternates_from_word_pattern = _compilePat( - self._alternates_from_word_pattern - ) - self.alternates_from_defi_pattern = _compilePat( - self._alternates_from_defi_pattern - ) - self.rules = [ - (_compilePat(self._rule_v1_defi_pattern), "v1"), - (_compilePat(self._rule_v5_defi_pattern), "v5"), - (_compilePat(self._rule_vs_defi_pattern), "vs"), - (_compilePat(self._rule_vk_defi_pattern), "vk"), - (_compilePat(self._rule_adji_defi_pattern), "adj-i"), - ] - - def _getInfo(self, key: str) -> str: - info = self._glos.getInfo(key) - return info.replace("\n", "
                          ") - - def _getAuthor(self) -> str: - return self._glos.author.replace("\n", "
                          ") - - def _getDictionaryIndex(self) -> dict[str, Any]: - # Schema: https://github.com/FooSoft/yomichan/ - # blob/master/ext/data/schemas/dictionary-index-schema.json - return { - "title": self._getInfo("title"), - "revision": "PyGlossary export", - "sequenced": True, - "format": 3, - "author": self._getAuthor(), - "url": self._getInfo("website"), - "description": self._getInfo("description"), - } - - def _getExpressionsAndReadingFromEntry( - self, - entry: EntryType, - ) -> tuple[list[str], str]: - term_expressions = entry.l_word - - alternates_from_word_pattern = self.alternates_from_word_pattern - if alternates_from_word_pattern: - for word in entry.l_word: - term_expressions += alternates_from_word_pattern.findall(word) - - if self.alternates_from_defi_pattern: - term_expressions += self.alternates_from_defi_pattern.findall( - entry.defi, - re.MULTILINE, - ) - - delete_word_pattern = self.delete_word_pattern - if delete_word_pattern: - term_expressions = [ - delete_word_pattern.sub("", expression) - for expression in term_expressions - ] - - ignore_word_with_pattern = self.ignore_word_with_pattern - if ignore_word_with_pattern: - term_expressions = [ - expression - for expression in term_expressions - if not ignore_word_with_pattern.search(expression) - ] - - term_expressions = _uniqueList(term_expressions) - - try: - reading = next( - expression - for expression in entry.l_word + term_expressions - if all(map(_isKana, expression)) - ) - except StopIteration: - reading = "" - - if self._no_term_from_reading and len(term_expressions) > 1: - term_expressions = [ - expression for expression in term_expressions if expression != reading - ] - - if self._term_from_headword_only: - term_expressions = term_expressions[:1] - - return term_expressions, reading - - def _getRuleIdentifiersFromEntry(self, entry: EntryType) -> list[str]: - return [ - rule - for pattern, rule in self.rules - if pattern and pattern.search(entry.defi, re.MULTILINE) - ] - - def _getTermsFromEntry( - self, - entry: EntryType, - sequenceNumber: int, - ) -> list[list[Any]]: - termExpressions, reading = self._getExpressionsAndReadingFromEntry(entry) - ruleIdentifiers = self._getRuleIdentifiersFromEntry(entry) - - # Schema: https://github.com/FooSoft/yomichan/ - # blob/master/ext/data/schemas/dictionary-term-bank-v3-schema.json - return [ - [ - expression, - # reading only added if expression contains kanji - reading if any(map(_isKanji, expression)) else "", - "", # definition tags - " ".join(ruleIdentifiers), - 0, # score - [entry.defi], - sequenceNumber, - "", # term tags - ] - for expression in termExpressions - ] - - def open(self, filename: str) -> None: - self._filename = filename - self._glos.mergeEntriesWithSameHeadwordPlaintext() - - def finish(self) -> None: - self._filename = "" - - def write(self) -> Generator[None, EntryType, None]: - direc = self._filename - - os.makedirs(direc, exist_ok=True) - - with open(join(direc, "index.json"), "w", encoding="utf-8") as f: - json.dump(self._getDictionaryIndex(), f, ensure_ascii=False) - - entryCount = 0 - termBankIndex = 0 - terms: list[list[Any]] = [] - - def flushTerms() -> None: - nonlocal termBankIndex - if not terms: - return - with open( - join(direc, f"term_bank_{termBankIndex + 1}.json"), - mode="w", - encoding="utf-8", - ) as _file: - json.dump(terms, _file, ensure_ascii=False) - terms.clear() - termBankIndex += 1 - - while True: - entry: EntryType - entry = yield - if entry is None: - break - - if entry.isData(): - continue - - 
terms.extend(self._getTermsFromEntry(entry, entryCount)) - entryCount += 1 - if len(terms) >= self._term_bank_size: - flushTerms() - - flushTerms() diff --git a/pyglossary/plugins/yomichan/writer.py b/pyglossary/plugins/yomichan/writer.py new file mode 100644 index 000000000..94b6a2a31 --- /dev/null +++ b/pyglossary/plugins/yomichan/writer.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import json +import os +import re +from os.path import join +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Generator, Sequence + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def _isKana(char: str) -> bool: + assert len(char) == 1 + val = ord(char) + return ( + 0x3040 <= val <= 0x309F # Hiragana + or 0x30A0 <= val <= 0x30FF # Katakana (incl. center dot) + or 0xFF65 <= val <= 0xFF9F # Half-width Katakana (incl. center dot) + ) + + +def _isKanji(char: str) -> bool: + assert len(char) == 1 + val = ord(char) + return ( + 0x3400 <= val <= 0x4DBF # CJK Unified Ideographs Extension A + or 0x4E00 <= val <= 0x9FFF # CJK Unified Ideographs + or 0xF900 <= val <= 0xFAFF # CJK Compatibility Ideographs + or 0x20000 <= val <= 0x2A6DF # CJK Unified Ideographs Extension B + or 0x2A700 <= val <= 0x2B73F # CJK Unified Ideographs Extension C + or 0x2B740 <= val <= 0x2B81F # CJK Unified Ideographs Extension D + or 0x2F800 <= val <= 0x2FA1F # CJK Compatibility Ideographs Supplement + ) + + +def _uniqueList(lst: Sequence[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for elem in lst: + if elem not in seen: + seen.add(elem) + result.append(elem) + + return result + + +def _compilePat(pattern: str) -> re.Pattern | None: + if not pattern: + return None + return re.compile(pattern) + + +class Writer: + depends = { + "bs4": "beautifulsoup4", + } + + _term_bank_size = 10_000 + _term_from_headword_only = True + _no_term_from_reading = True + _delete_word_pattern = "" + _ignore_word_with_pattern = "" + _alternates_from_word_pattern = "" + _alternates_from_defi_pattern = "" + _rule_v1_defi_pattern = "" + _rule_v5_defi_pattern = "" + _rule_vs_defi_pattern = "" + _rule_vk_defi_pattern = "" + _rule_adji_defi_pattern = "" + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + # Yomichan technically supports "structured content" that renders to + # HTML, but it doesn't seem widely used. So here we also strip HTML + # formatting for simplicity. + glos.removeHtmlTagsAll() + self.delete_word_pattern = _compilePat(self._delete_word_pattern) + self.ignore_word_with_pattern = _compilePat(self._ignore_word_with_pattern) + self.alternates_from_word_pattern = _compilePat( + self._alternates_from_word_pattern + ) + self.alternates_from_defi_pattern = _compilePat( + self._alternates_from_defi_pattern + ) + self.rules = [ + (_compilePat(self._rule_v1_defi_pattern), "v1"), + (_compilePat(self._rule_v5_defi_pattern), "v5"), + (_compilePat(self._rule_vs_defi_pattern), "vs"), + (_compilePat(self._rule_vk_defi_pattern), "vk"), + (_compilePat(self._rule_adji_defi_pattern), "adj-i"), + ] + + def _getInfo(self, key: str) -> str: + info = self._glos.getInfo(key) + return info.replace("\n", "
                          ") + + def _getAuthor(self) -> str: + return self._glos.author.replace("\n", "
                          ") + + def _getDictionaryIndex(self) -> dict[str, Any]: + # Schema: https://github.com/FooSoft/yomichan/ + # blob/master/ext/data/schemas/dictionary-index-schema.json + return { + "title": self._getInfo("title"), + "revision": "PyGlossary export", + "sequenced": True, + "format": 3, + "author": self._getAuthor(), + "url": self._getInfo("website"), + "description": self._getInfo("description"), + } + + def _getExpressionsAndReadingFromEntry( + self, + entry: EntryType, + ) -> tuple[list[str], str]: + term_expressions = entry.l_word + + alternates_from_word_pattern = self.alternates_from_word_pattern + if alternates_from_word_pattern: + for word in entry.l_word: + term_expressions += alternates_from_word_pattern.findall(word) + + if self.alternates_from_defi_pattern: + term_expressions += self.alternates_from_defi_pattern.findall( + entry.defi, + re.MULTILINE, + ) + + delete_word_pattern = self.delete_word_pattern + if delete_word_pattern: + term_expressions = [ + delete_word_pattern.sub("", expression) + for expression in term_expressions + ] + + ignore_word_with_pattern = self.ignore_word_with_pattern + if ignore_word_with_pattern: + term_expressions = [ + expression + for expression in term_expressions + if not ignore_word_with_pattern.search(expression) + ] + + term_expressions = _uniqueList(term_expressions) + + try: + reading = next( + expression + for expression in entry.l_word + term_expressions + if all(map(_isKana, expression)) + ) + except StopIteration: + reading = "" + + if self._no_term_from_reading and len(term_expressions) > 1: + term_expressions = [ + expression for expression in term_expressions if expression != reading + ] + + if self._term_from_headword_only: + term_expressions = term_expressions[:1] + + return term_expressions, reading + + def _getRuleIdentifiersFromEntry(self, entry: EntryType) -> list[str]: + return [ + rule + for pattern, rule in self.rules + if pattern and pattern.search(entry.defi, re.MULTILINE) + ] + + def _getTermsFromEntry( + self, + entry: EntryType, + sequenceNumber: int, + ) -> list[list[Any]]: + termExpressions, reading = self._getExpressionsAndReadingFromEntry(entry) + ruleIdentifiers = self._getRuleIdentifiersFromEntry(entry) + + # Schema: https://github.com/FooSoft/yomichan/ + # blob/master/ext/data/schemas/dictionary-term-bank-v3-schema.json + return [ + [ + expression, + # reading only added if expression contains kanji + reading if any(map(_isKanji, expression)) else "", + "", # definition tags + " ".join(ruleIdentifiers), + 0, # score + [entry.defi], + sequenceNumber, + "", # term tags + ] + for expression in termExpressions + ] + + def open(self, filename: str) -> None: + self._filename = filename + self._glos.mergeEntriesWithSameHeadwordPlaintext() + + def finish(self) -> None: + self._filename = "" + + def write(self) -> Generator[None, EntryType, None]: + direc = self._filename + + os.makedirs(direc, exist_ok=True) + + with open(join(direc, "index.json"), "w", encoding="utf-8") as f: + json.dump(self._getDictionaryIndex(), f, ensure_ascii=False) + + entryCount = 0 + termBankIndex = 0 + terms: list[list[Any]] = [] + + def flushTerms() -> None: + nonlocal termBankIndex + if not terms: + return + with open( + join(direc, f"term_bank_{termBankIndex + 1}.json"), + mode="w", + encoding="utf-8", + ) as _file: + json.dump(terms, _file, ensure_ascii=False) + terms.clear() + termBankIndex += 1 + + while True: + entry: EntryType + entry = yield + if entry is None: + break + + if entry.isData(): + continue + + 
terms.extend(self._getTermsFromEntry(entry, entryCount)) + entryCount += 1 + if len(terms) >= self._term_bank_size: + flushTerms() + + flushTerms() diff --git a/pyglossary/plugins/zimfile/__init__.py b/pyglossary/plugins/zimfile/__init__.py index f8951c0a2..8fb211e65 100644 --- a/pyglossary/plugins/zimfile/__init__.py +++ b/pyglossary/plugins/zimfile/__init__.py @@ -1,20 +1,9 @@ # -*- coding: utf-8 -*- - from __future__ import annotations -import os -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Iterator - - from libzim.reader import Archive # type: ignore +from pyglossary.option import Option, UnicodeErrorsOption - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.option import Option - -from pyglossary.core import cacheDir, exc_note, log, pip -from pyglossary.option import UnicodeErrorsOption +from .reader import Reader __all__ = [ "Reader", @@ -61,172 +50,3 @@ # I can't find any way to download zim files from https://library.kiwix.org/ # which wiki.openzim.org points at for downloaing zim files - - -class Reader: - _text_unicode_errors = "replace" - _html_unicode_errors = "replace" - depends = { - "libzim": "libzim>=1.0", - } - - resourceMimeTypes = { - "image/png", - "image/jpeg", - "image/gif", - "image/svg+xml", - "image/webp", - "image/x-icon", - "text/css", - "text/javascript", - "application/javascript", - "application/json", - "application/octet-stream", - "application/octet-stream+xapian", - "application/x-chrome-extension", - "application/warc-headers", - "application/font-woff", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._zimfile: Archive | None = None - - def open(self, filename: str) -> None: - try: - from libzim.reader import Archive - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install libzim` to install") - raise - - self._filename = filename - self._zimfile = Archive(filename) - - def close(self) -> None: - self._filename = "" - self._zimfile = None - - def __len__(self) -> int: - if self._zimfile is None: - log.error("len(reader) called before reader.open()") - return 0 - return self._zimfile.entry_count - - def __iter__(self) -> Iterator[EntryType | None]: # noqa: PLR0912 - glos = self._glos - zimfile = self._zimfile - if zimfile is None: - return - emptyContentCount = 0 - invalidMimeTypeCount = 0 - undefinedMimeTypeCount = 0 - entryCount = zimfile.entry_count - - redirectCount = 0 - - windows = os.sep == "\\" - - try: - f_namemax = os.statvfs(cacheDir).f_namemax # type: ignore - except AttributeError: - log.warning("Unsupported operating system (no os.statvfs)") - # Windows: CreateFileA has a limit of 260 characters. - # CreateFileW supports names up to about 32760 characters (64kB). 
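# [editor's illustration, not part of the patch] The block around this point
# (carried over unchanged into the new zimfile/reader.py) probes the
# filesystem's maximum file-name length so over-long resource titles can be
# skipped. A standalone sketch of the same probe with the same fallback value:
import os

def max_name_length(path: str = ".", fallback: int = 200) -> int:
	try:
		return os.statvfs(path).f_namemax  # POSIX only
	except AttributeError:  # e.g. Windows has no os.statvfs
		return fallback

print(max_name_length())  # typically 255 on common filesystems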
- f_namemax = 200 - - fileNameTooLong: list[str] = [] - - text_unicode_errors = self._text_unicode_errors - html_unicode_errors = self._html_unicode_errors - - for entryIndex in range(entryCount): - zEntry = zimfile._get_entry_by_id(entryIndex) - word = zEntry.title - - if zEntry.is_redirect: - redirectCount += 1 - targetWord = zEntry.get_redirect_entry().title - yield glos.newEntry( - word, - f'Redirect: {targetWord}', - defiFormat="h", - ) - continue - - zItem = zEntry.get_item() - b_content = zItem.content.tobytes() - - if not b_content: - emptyContentCount += 1 - yield None - # TODO: test with more zim files - # Looks like: zItem.path == zEntry.path == "-" + word - # print(f"b_content empty, {word=}, {zEntry.path=}, {zItem.path=}") - # if zEntry.path == "-" + word: - # yield None - # else: - # defi = f"Path: {zEntry.path}" - # yield glos.newEntry(word, defi, defiFormat="m") - continue - - try: - mimetype = zItem.mimetype - except RuntimeError: - invalidMimeTypeCount += 1 - mimetype = "" - yield glos.newDataEntry(word, b_content) - - if mimetype == "undefined": - undefinedMimeTypeCount += 1 - continue - - mimetype = mimetype.split(";")[0] - - if mimetype.startswith("text/html"): - # can be "text/html;raw=true" - defi = b_content.decode("utf-8", errors=html_unicode_errors) - defi = defi.replace(' src="../I/', ' src="./') - yield glos.newEntry(word, defi, defiFormat="h") - continue - - if mimetype == "text/plain": - yield glos.newEntry( - word, - b_content.decode("utf-8", errors=text_unicode_errors), - defiFormat="m", - ) - continue - - if mimetype not in self.resourceMimeTypes: - log.warning(f"Unrecognized {mimetype=}") - - if len(word) > f_namemax: - fileNameTooLong.append(word) - continue - - if "|" in word: - log.warning(f"resource title: {word}") - if windows: - continue - - try: - entry = glos.newDataEntry(word, b_content) - except Exception as e: - log.error(f"error creating file: {e}") - continue - yield entry - - log.info(f"ZIM Entry Count: {entryCount}") - - if fileNameTooLong: - log.warning(f"Files with name too long: {len(fileNameTooLong)}") - - if emptyContentCount > 0: - log.info(f"Empty Content Count: {emptyContentCount}") - if invalidMimeTypeCount > 0: - log.info(f"Invalid MIME-Type Count: {invalidMimeTypeCount}") - if undefinedMimeTypeCount > 0: - log.info(f"MIME-Type 'undefined' Count: {invalidMimeTypeCount}") - if redirectCount > 0: - log.info(f"Redirect Count: {redirectCount}") diff --git a/pyglossary/plugins/zimfile/reader.py b/pyglossary/plugins/zimfile/reader.py new file mode 100644 index 000000000..3ed362f03 --- /dev/null +++ b/pyglossary/plugins/zimfile/reader.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from libzim.reader import Archive # type: ignore + + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import cacheDir, exc_note, log, pip + + +class Reader: + _text_unicode_errors = "replace" + _html_unicode_errors = "replace" + depends = { + "libzim": "libzim>=1.0", + } + + resourceMimeTypes = { + "image/png", + "image/jpeg", + "image/gif", + "image/svg+xml", + "image/webp", + "image/x-icon", + "text/css", + "text/javascript", + "application/javascript", + "application/json", + "application/octet-stream", + "application/octet-stream+xapian", + "application/x-chrome-extension", + "application/warc-headers", + "application/font-woff", + } + + def __init__(self, glos: GlossaryType) 
-> None: + self._glos = glos + self._filename = "" + self._zimfile: Archive | None = None + + def open(self, filename: str) -> None: + try: + from libzim.reader import Archive + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install libzim` to install") + raise + + self._filename = filename + self._zimfile = Archive(filename) + + def close(self) -> None: + self._filename = "" + self._zimfile = None + + def __len__(self) -> int: + if self._zimfile is None: + log.error("len(reader) called before reader.open()") + return 0 + return self._zimfile.entry_count + + def __iter__(self) -> Iterator[EntryType | None]: # noqa: PLR0912 + glos = self._glos + zimfile = self._zimfile + if zimfile is None: + return + emptyContentCount = 0 + invalidMimeTypeCount = 0 + undefinedMimeTypeCount = 0 + entryCount = zimfile.entry_count + + redirectCount = 0 + + windows = os.sep == "\\" + + try: + f_namemax = os.statvfs(cacheDir).f_namemax # type: ignore + except AttributeError: + log.warning("Unsupported operating system (no os.statvfs)") + # Windows: CreateFileA has a limit of 260 characters. + # CreateFileW supports names up to about 32760 characters (64kB). + f_namemax = 200 + + fileNameTooLong: list[str] = [] + + text_unicode_errors = self._text_unicode_errors + html_unicode_errors = self._html_unicode_errors + + for entryIndex in range(entryCount): + zEntry = zimfile._get_entry_by_id(entryIndex) + word = zEntry.title + + if zEntry.is_redirect: + redirectCount += 1 + targetWord = zEntry.get_redirect_entry().title + yield glos.newEntry( + word, + f'Redirect: {targetWord}', + defiFormat="h", + ) + continue + + zItem = zEntry.get_item() + b_content = zItem.content.tobytes() + + if not b_content: + emptyContentCount += 1 + yield None + # TODO: test with more zim files + # Looks like: zItem.path == zEntry.path == "-" + word + # print(f"b_content empty, {word=}, {zEntry.path=}, {zItem.path=}") + # if zEntry.path == "-" + word: + # yield None + # else: + # defi = f"Path: {zEntry.path}" + # yield glos.newEntry(word, defi, defiFormat="m") + continue + + try: + mimetype = zItem.mimetype + except RuntimeError: + invalidMimeTypeCount += 1 + mimetype = "" + yield glos.newDataEntry(word, b_content) + + if mimetype == "undefined": + undefinedMimeTypeCount += 1 + continue + + mimetype = mimetype.split(";")[0] + + if mimetype.startswith("text/html"): + # can be "text/html;raw=true" + defi = b_content.decode("utf-8", errors=html_unicode_errors) + defi = defi.replace(' src="../I/', ' src="./') + yield glos.newEntry(word, defi, defiFormat="h") + continue + + if mimetype == "text/plain": + yield glos.newEntry( + word, + b_content.decode("utf-8", errors=text_unicode_errors), + defiFormat="m", + ) + continue + + if mimetype not in self.resourceMimeTypes: + log.warning(f"Unrecognized {mimetype=}") + + if len(word) > f_namemax: + fileNameTooLong.append(word) + continue + + if "|" in word: + log.warning(f"resource title: {word}") + if windows: + continue + + try: + entry = glos.newDataEntry(word, b_content) + except Exception as e: + log.error(f"error creating file: {e}") + continue + yield entry + + log.info(f"ZIM Entry Count: {entryCount}") + + if fileNameTooLong: + log.warning(f"Files with name too long: {len(fileNameTooLong)}") + + if emptyContentCount > 0: + log.info(f"Empty Content Count: {emptyContentCount}") + if invalidMimeTypeCount > 0: + log.info(f"Invalid MIME-Type Count: {invalidMimeTypeCount}") + if undefinedMimeTypeCount > 0: + log.info(f"MIME-Type 'undefined' Count: {invalidMimeTypeCount}") + if 
redirectCount > 0:
+		log.info(f"Redirect Count: {redirectCount}")
diff --git a/tests/deprecated/glossary_security_test.py b/tests/deprecated/glossary_security_test.py
index 78f55f060..81fd531d8 100644
--- a/tests/deprecated/glossary_security_test.py
+++ b/tests/deprecated/glossary_security_test.py
@@ -62,5 +62,6 @@ def test_convert_4(self):
 		self.assertIsNone(res)
 		self.assertLogCritical("Unable to detect output format!")
+
 if __name__ == "__main__":
 	unittest.main()
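# [editor's note, not part of the patch] Every plugin split by this patch keeps
# its public interface: each package __init__.py re-exports the class from the
# new module (e.g. "from .reader import Reader"), so existing imports stay
# valid. A hedged sanity check, assuming pyglossary is importable (libzim is
# only imported lazily inside Reader.open, so it is not needed for this):
from pyglossary.plugins.zimfile import Reader as init_Reader
from pyglossary.plugins.zimfile.reader import Reader as module_Reader

assert init_Reader is module_Reader
print("zimfile plugin still exposes Reader from its package root")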