diff --git a/pyglossary/plugins/aard2_slob/__init__.py b/pyglossary/plugins/aard2_slob/__init__.py index 8d75434ff..6e63ead7a 100644 --- a/pyglossary/plugins/aard2_slob/__init__.py +++ b/pyglossary/plugins/aard2_slob/__init__.py @@ -1,19 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -import re -import shutil -from os.path import isfile, splitext -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - - from pyglossary import slob - from pyglossary.glossary_types import EntryType, GlossaryType - -from pyglossary.core import cacheDir, exc_note, log, pip from pyglossary.option import ( BoolOption, FileSizeOption, @@ -22,6 +9,9 @@ StrOption, ) +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -92,380 +82,3 @@ " instructions on how to install PyICU.", ), ] - -t_created_at = "created.at" -t_label = "label" -t_created_by = "created.by" -t_copyright = "copyright" -t_license_name = "license.name" -t_license_url = "license.url" -t_uri = "uri" -t_edition = "edition" - -supported_tags = { - t_label, - t_created_at, - t_created_by, - t_copyright, - t_uri, - t_edition, -} - - -class Reader: - depends = { - "icu": "PyICU", # >=1.5 - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - self._re_bword = re.compile( - "(<a href=[^<>]+?>)", - re.IGNORECASE, - ) - - def close(self) -> None: - if self._slobObj is not None: - self._slobObj.close() - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._slobObj: slob.Slob | None = None - - # TODO: PLR0912 Too many branches (13 > 12) - def open(self, filename: str) -> None: # noqa: PLR0912 - try: - import icu # type: ignore # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install PyICU` to install") - raise - from pyglossary import slob - - self._filename = filename - self._slobObj = slob.open(filename) - tags = dict(self._slobObj.tags.items()) - - if t_label in tags: - self._glos.setInfo("name", tags[t_label]) - - if t_created_at in tags: - self._glos.setInfo("creationTime", tags[t_created_at]) - - if t_created_by in tags: - self._glos.setInfo("author", tags[t_created_by]) - - copyrightLines: list[str] = [] - for key in (t_copyright, t_license_name, t_license_url): - try: - value = tags.pop(key) - except KeyError: - continue - copyrightLines.append(value) - if copyrightLines: - self._glos.setInfo("copyright", "\n".join(copyrightLines)) - - if t_uri in tags: - self._glos.setInfo("website", tags[t_uri]) - - if t_edition in tags: - self._glos.setInfo("edition", tags[t_edition]) - - for key, value in tags.items(): - if key in supported_tags: - continue - self._glos.setInfo(f"slob.{key}", value) - - def __len__(self) -> int: - if self._slobObj is None: - log.error("called len() on a reader which is not open") - return 0 - return len(self._slobObj) - - @staticmethod - def _href_sub(m: re.Match) -> str: - st = m.group(0) - if "//" in st: - return st - return st.replace('href="', 'href="bword://').replace( - "href='", - "href='bword://", - ) - - def __iter__(self) -> Iterator[EntryType | None]: - from pyglossary.slob import MIME_HTML, MIME_TEXT - - if self._slobObj is None: - raise RuntimeError("iterating over a reader while it's not open") - - slobObj = self._slobObj - blobSet = set() - - # slob library gives duplicate blobs when iterating over slobObj - # even keeping the last id is not enough, since duplicate blobs - # are not all consecutive. 
so we have to keep a set of blob IDs - - for blob in slobObj: - id_ = blob.identity - if id_ in blobSet: - yield None # update progressbar - continue - blobSet.add(id_) - - # blob.key is str, blob.content is bytes - word = blob.key - - ctype = blob.content_type.split(";")[0] - if ctype not in {MIME_HTML, MIME_TEXT}: - log.debug(f"unknown {blob.content_type=} in {word=}") - word = word.removeprefix("~/") - yield self._glos.newDataEntry(word, blob.content) - continue - defiFormat = "" - if ctype == MIME_HTML: - defiFormat = "h" - elif ctype == MIME_TEXT: - defiFormat = "m" - - defi = blob.content.decode("utf-8") - defi = self._re_bword.sub(self._href_sub, defi) - yield self._glos.newEntry(word, defi, defiFormat=defiFormat) - - -class Writer: - depends = { - "icu": "PyICU", - } - - _compression: str = "zlib" - _content_type: str = "" - _file_size_approx: int = 0 - _file_size_approx_check_num_entries = 100 - _separate_alternates: bool = False - _word_title: bool = False - _version_info: bool = False - - _audio_goldendict: bool = False - - resourceMimeTypes = { - "png": "image/png", - "jpeg": "image/jpeg", - "jpg": "image/jpeg", - "gif": "image/gif", - "svg": "image/svg+xml", - "webp": "image/webp", - "tiff": "image/tiff", - "tif": "image/tiff", - "bmp": "image/bmp", - "css": "text/css", - "js": "application/javascript", - "json": "application/json", - "woff": "application/font-woff", - "woff2": "application/font-woff2", - "ttf": "application/x-font-ttf", - "otf": "application/x-font-opentype", - "mp3": "audio/mpeg", - "ogg": "audio/ogg", - "spx": "audio/x-speex", - "wav": "audio/wav", - "ini": "text/plain", - # "application/octet-stream+xapian", - "eot": "application/vnd.ms-fontobject", - "pdf": "application/pdf", - "mp4": "video/mp4", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._resPrefix = "" - self._slobWriter: slob.Writer | None = None - - @staticmethod - def _slobObserver( - event: slob.WriterEvent, # noqa: F401, F821 - ) -> None: - log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}") - - def _open(self, filepath: str, namePostfix: str) -> slob.Writer: - from pyglossary import slob - - if isfile(filepath): - shutil.move(filepath, f"{filepath}.bak") - log.warning(f"renamed existing {filepath!r} to {filepath + '.bak'!r}") - self._slobWriter = slobWriter = slob.Writer( - filepath, - observer=self._slobObserver, - workdir=cacheDir, - compression=self._compression, - version_info=self._version_info, - ) - - # "label" tag is a dictionary name shown in UI - slobWriter.tag(t_label, self._glos.getInfo("name") + namePostfix) - - createdAt = self._glos.getInfo("creationTime") - if createdAt is not None: - slobWriter.tag(t_created_at, createdAt) - createdBy = self._glos.getInfo("author") - if createdBy is not None: - slobWriter.tag(t_created_by, createdBy) - - filename = os.path.basename(filepath) - dic_uri = re.sub(r"[^A-Za-z0-9_-]+", "_", filename) - # "uri" tag is not web url, it's a part of gloss addressing ID: uri + article ID - # setting the tag allows bookmark & history migration, if dict file is updated - # we use source filename as "uri", since it is stable (most likely) - slobWriter.tag(t_uri, dic_uri) - - return slobWriter - - def open(self, filename: str) -> None: - try: - import icu # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install PyICU` to install") - raise - if isfile(filename): - raise OSError(f"File '{filename}' already exists") - namePostfix = "" - if 
self._file_size_approx > 0: - namePostfix = " (part 1)" - self._open(filename, namePostfix) - self._filename = filename - - def finish(self) -> None: - from time import perf_counter - - self._filename = "" - if self._slobWriter is None: - return - log.info("Finalizing slob file...") - t0 = perf_counter() - self._slobWriter.finalize() - log.info(f"Finalizing slob file took {perf_counter() - t0:.1f} seconds") - self._slobWriter = None - - def addDataEntry(self, entry: EntryType) -> None: - slobWriter = self._slobWriter - if slobWriter is None: - raise ValueError("slobWriter is None") - rel_path = entry.s_word - _, ext = splitext(rel_path) - ext = ext.lstrip(os.path.extsep).lower() - content_type = self.resourceMimeTypes.get(ext) - if not content_type: - log.error(f"Aard2 slob: unknown content type for {rel_path!r}") - return - content = entry.data - key = self._resPrefix + rel_path - try: - key.encode(slobWriter.encoding) - except UnicodeEncodeError: - log.error(f"Failed to add, broken unicode in key: {key!a}") - return - slobWriter.add(content, key, content_type=content_type) - - def addEntry(self, entry: EntryType) -> None: - words = entry.l_word - b_defi = entry.defi.encode("utf-8") - ctype = self._content_type - writer = self._slobWriter - if writer is None: - raise ValueError("slobWriter is None") - - entry.detectDefiFormat() - defiFormat = entry.defiFormat - - if self._word_title and defiFormat in {"h", "m"}: - if defiFormat == "m": - defiFormat = "h" - title = self._glos.wordTitleStr( - words[0], - ) - b_defi = title.encode("utf-8") + b_defi - - if defiFormat == "h": - b_defi = b_defi.replace(b'"bword://', b'"') - b_defi = b_defi.replace(b"'bword://", b"'") - - if not self._audio_goldendict: - b_defi = b_defi.replace( - b"""href="sound://""", - b'''onclick="new Audio(this.href).play(); return false;" href="''', - ) - b_defi = b_defi.replace( - b"""href='sound://""", - b"""onclick="new Audio(this.href).play(); return false;" href='""", - ) - b_defi = b_defi.replace(b"""<img src="/""", b'''<img src="''') - b_defi = b_defi.replace(b"""<img src='""", b"""<img src='""") - b_defi = b_defi.replace(b"""<img src="file:///""", b'''<img src="''') - b_defi = b_defi.replace(b"""<img src='file:///""", b"""<img src='""") - - if not ctype: - if defiFormat == "h": - ctype = "text/html; charset=utf-8" - elif defiFormat == "m": - ctype = "text/plain; charset=utf-8" - else: - ctype = "text/plain; charset=utf-8" - - if not self._separate_alternates: - writer.add( - b_defi, - *tuple(words), - content_type=ctype, - ) - return - - headword, *alts = words - writer.add( - b_defi, - headword, - content_type=ctype, - ) - for alt in alts: - writer.add( - b_defi, - f"{alt}, {headword}", - content_type=ctype, - ) - - def write(self) -> Generator[None, EntryType, None]: - slobWriter = self._slobWriter - if slobWriter is None: - raise ValueError("slobWriter is None") - file_size_approx = int(self._file_size_approx * 0.95) - entryCount = 0 - sumBlobSize = 0 - fileIndex = 0 - filenameNoExt, _ = splitext(self._filename) - while True: - entry = yield - if entry is None: - break - - if entry.isData(): - self.addDataEntry(entry) - else: - self.addEntry(entry) - - if file_size_approx <= 0: - continue - - # handle file_size_approx - check_every = self._file_size_approx_check_num_entries - entryCount += 1 - if entryCount % check_every == 0: - sumBlobSize = slobWriter.size_data() - if sumBlobSize >= file_size_approx: - slobWriter.finalize() - fileIndex += 1 - slobWriter = self._open( - f"{filenameNoExt}.{fileIndex}.slob", - f" 
(part {fileIndex + 1})", - ) - sumBlobSize = 0 - entryCount = 0 diff --git a/pyglossary/plugins/aard2_slob/reader.py b/pyglossary/plugins/aard2_slob/reader.py new file mode 100644 index 000000000..c80fdffb8 --- /dev/null +++ b/pyglossary/plugins/aard2_slob/reader.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary import slob + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import exc_note, log, pip +from pyglossary.plugins.aard2_slob.tags import ( + supported_tags, + t_copyright, + t_created_at, + t_created_by, + t_edition, + t_label, + t_license_name, + t_license_url, + t_uri, +) + + +class Reader: + depends = { + "icu": "PyICU", # >=1.5 + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + self._re_bword = re.compile( + "(<a href=[^<>]+?>)", + re.IGNORECASE, + ) + + def close(self) -> None: + if self._slobObj is not None: + self._slobObj.close() + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._slobObj: slob.Slob | None = None + + # TODO: PLR0912 Too many branches (13 > 12) + def open(self, filename: str) -> None: # noqa: PLR0912 + try: + import icu # type: ignore # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install PyICU` to install") + raise + from pyglossary import slob + + self._filename = filename + self._slobObj = slob.open(filename) + tags = dict(self._slobObj.tags.items()) + + if t_label in tags: + self._glos.setInfo("name", tags[t_label]) + + if t_created_at in tags: + self._glos.setInfo("creationTime", tags[t_created_at]) + + if t_created_by in tags: + self._glos.setInfo("author", tags[t_created_by]) + + copyrightLines: list[str] = [] + for key in (t_copyright, t_license_name, t_license_url): + try: + value = tags.pop(key) + except KeyError: + continue + copyrightLines.append(value) + if copyrightLines: + self._glos.setInfo("copyright", "\n".join(copyrightLines)) + + if t_uri in tags: + self._glos.setInfo("website", tags[t_uri]) + + if t_edition in tags: + self._glos.setInfo("edition", tags[t_edition]) + + for key, value in tags.items(): + if key in supported_tags: + continue + self._glos.setInfo(f"slob.{key}", value) + + def __len__(self) -> int: + if self._slobObj is None: + log.error("called len() on a reader which is not open") + return 0 + return len(self._slobObj) + + @staticmethod + def _href_sub(m: re.Match) -> str: + st = m.group(0) + if "//" in st: + return st + return st.replace('href="', 'href="bword://').replace( + "href='", + "href='bword://", + ) + + def __iter__(self) -> Iterator[EntryType | None]: + from pyglossary.slob import MIME_HTML, MIME_TEXT + + if self._slobObj is None: + raise RuntimeError("iterating over a reader while it's not open") + + slobObj = self._slobObj + blobSet = set() + + # slob library gives duplicate blobs when iterating over slobObj + # even keeping the last id is not enough, since duplicate blobs + # are not all consecutive. 
so we have to keep a set of blob IDs + + for blob in slobObj: + id_ = blob.identity + if id_ in blobSet: + yield None # update progressbar + continue + blobSet.add(id_) + + # blob.key is str, blob.content is bytes + word = blob.key + + ctype = blob.content_type.split(";")[0] + if ctype not in {MIME_HTML, MIME_TEXT}: + log.debug(f"unknown {blob.content_type=} in {word=}") + word = word.removeprefix("~/") + yield self._glos.newDataEntry(word, blob.content) + continue + defiFormat = "" + if ctype == MIME_HTML: + defiFormat = "h" + elif ctype == MIME_TEXT: + defiFormat = "m" + + defi = blob.content.decode("utf-8") + defi = self._re_bword.sub(self._href_sub, defi) + yield self._glos.newEntry(word, defi, defiFormat=defiFormat) diff --git a/pyglossary/plugins/aard2_slob/tags.py b/pyglossary/plugins/aard2_slob/tags.py new file mode 100644 index 000000000..e4336a02e --- /dev/null +++ b/pyglossary/plugins/aard2_slob/tags.py @@ -0,0 +1,29 @@ +t_created_at = "created.at" +t_label = "label" +t_created_by = "created.by" +t_copyright = "copyright" +t_license_name = "license.name" +t_license_url = "license.url" +t_uri = "uri" +t_edition = "edition" + +supported_tags = { + t_label, + t_created_at, + t_created_by, + t_copyright, + t_uri, + t_edition, +} + +__all__ = [ + "supported_tags", + "t_copyright", + "t_created_at", + "t_created_by", + "t_edition", + "t_label", + "t_license_name", + "t_license_url", + "t_uri", +] diff --git a/pyglossary/plugins/aard2_slob/writer.py b/pyglossary/plugins/aard2_slob/writer.py new file mode 100644 index 000000000..c8519f987 --- /dev/null +++ b/pyglossary/plugins/aard2_slob/writer.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import os +import re +import shutil +from os.path import isfile, splitext +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary import slob + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import cacheDir, exc_note, log, pip +from pyglossary.plugins.aard2_slob.tags import ( + t_created_at, + t_created_by, + t_label, + t_uri, +) + + +class Writer: + depends = { + "icu": "PyICU", + } + + _compression: str = "zlib" + _content_type: str = "" + _file_size_approx: int = 0 + _file_size_approx_check_num_entries = 100 + _separate_alternates: bool = False + _word_title: bool = False + _version_info: bool = False + + _audio_goldendict: bool = False + + resourceMimeTypes = { + "png": "image/png", + "jpeg": "image/jpeg", + "jpg": "image/jpeg", + "gif": "image/gif", + "svg": "image/svg+xml", + "webp": "image/webp", + "tiff": "image/tiff", + "tif": "image/tiff", + "bmp": "image/bmp", + "css": "text/css", + "js": "application/javascript", + "json": "application/json", + "woff": "application/font-woff", + "woff2": "application/font-woff2", + "ttf": "application/x-font-ttf", + "otf": "application/x-font-opentype", + "mp3": "audio/mpeg", + "ogg": "audio/ogg", + "spx": "audio/x-speex", + "wav": "audio/wav", + "ini": "text/plain", + # "application/octet-stream+xapian", + "eot": "application/vnd.ms-fontobject", + "pdf": "application/pdf", + "mp4": "video/mp4", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._resPrefix = "" + self._slobWriter: slob.Writer | None = None + + @staticmethod + def _slobObserver( + event: slob.WriterEvent, # noqa: F401, F821 + ) -> None: + log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}") + + def _open(self, filepath: str, 
namePostfix: str) -> slob.Writer: + from pyglossary import slob + + if isfile(filepath): + shutil.move(filepath, f"{filepath}.bak") + log.warning(f"renamed existing {filepath!r} to {filepath + '.bak'!r}") + self._slobWriter = slobWriter = slob.Writer( + filepath, + observer=self._slobObserver, + workdir=cacheDir, + compression=self._compression, + version_info=self._version_info, + ) + + # "label" tag is a dictionary name shown in UI + slobWriter.tag(t_label, self._glos.getInfo("name") + namePostfix) + + createdAt = self._glos.getInfo("creationTime") + if createdAt is not None: + slobWriter.tag(t_created_at, createdAt) + createdBy = self._glos.getInfo("author") + if createdBy is not None: + slobWriter.tag(t_created_by, createdBy) + + filename = os.path.basename(filepath) + dic_uri = re.sub(r"[^A-Za-z0-9_-]+", "_", filename) + # "uri" tag is not web url, it's a part of gloss addressing ID: uri + article ID + # setting the tag allows bookmark & history migration, if dict file is updated + # we use source filename as "uri", since it is stable (most likely) + slobWriter.tag(t_uri, dic_uri) + + return slobWriter + + def open(self, filename: str) -> None: + try: + import icu # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install PyICU` to install") + raise + if isfile(filename): + raise OSError(f"File '{filename}' already exists") + namePostfix = "" + if self._file_size_approx > 0: + namePostfix = " (part 1)" + self._open(filename, namePostfix) + self._filename = filename + + def finish(self) -> None: + from time import perf_counter + + self._filename = "" + if self._slobWriter is None: + return + log.info("Finalizing slob file...") + t0 = perf_counter() + self._slobWriter.finalize() + log.info(f"Finalizing slob file took {perf_counter() - t0:.1f} seconds") + self._slobWriter = None + + def addDataEntry(self, entry: EntryType) -> None: + slobWriter = self._slobWriter + if slobWriter is None: + raise ValueError("slobWriter is None") + rel_path = entry.s_word + _, ext = splitext(rel_path) + ext = ext.lstrip(os.path.extsep).lower() + content_type = self.resourceMimeTypes.get(ext) + if not content_type: + log.error(f"Aard2 slob: unknown content type for {rel_path!r}") + return + content = entry.data + key = self._resPrefix + rel_path + try: + key.encode(slobWriter.encoding) + except UnicodeEncodeError: + log.error(f"Failed to add, broken unicode in key: {key!a}") + return + slobWriter.add(content, key, content_type=content_type) + + def addEntry(self, entry: EntryType) -> None: + words = entry.l_word + b_defi = entry.defi.encode("utf-8") + ctype = self._content_type + writer = self._slobWriter + if writer is None: + raise ValueError("slobWriter is None") + + entry.detectDefiFormat() + defiFormat = entry.defiFormat + + if self._word_title and defiFormat in {"h", "m"}: + if defiFormat == "m": + defiFormat = "h" + title = self._glos.wordTitleStr( + words[0], + ) + b_defi = title.encode("utf-8") + b_defi + + if defiFormat == "h": + b_defi = b_defi.replace(b'"bword://', b'"') + b_defi = b_defi.replace(b"'bword://", b"'") + + if not self._audio_goldendict: + b_defi = b_defi.replace( + b"""href="sound://""", + b'''onclick="new Audio(this.href).play(); return false;" href="''', + ) + b_defi = b_defi.replace( + b"""href='sound://""", + b"""onclick="new Audio(this.href).play(); return false;" href='""", + ) + b_defi = b_defi.replace(b"""<img src="/""", b'''<img src="''') + b_defi = b_defi.replace(b"""<img src='""", b"""<img src='""") + b_defi = b_defi.replace(b"""<img 
src="file:///""", b'''<img src="''') + b_defi = b_defi.replace(b"""<img src='file:///""", b"""<img src='""") + + if not ctype: + if defiFormat == "h": + ctype = "text/html; charset=utf-8" + elif defiFormat == "m": + ctype = "text/plain; charset=utf-8" + else: + ctype = "text/plain; charset=utf-8" + + if not self._separate_alternates: + writer.add( + b_defi, + *tuple(words), + content_type=ctype, + ) + return + + headword, *alts = words + writer.add( + b_defi, + headword, + content_type=ctype, + ) + for alt in alts: + writer.add( + b_defi, + f"{alt}, {headword}", + content_type=ctype, + ) + + def write(self) -> Generator[None, EntryType, None]: + slobWriter = self._slobWriter + if slobWriter is None: + raise ValueError("slobWriter is None") + file_size_approx = int(self._file_size_approx * 0.95) + entryCount = 0 + sumBlobSize = 0 + fileIndex = 0 + filenameNoExt, _ = splitext(self._filename) + while True: + entry = yield + if entry is None: + break + + if entry.isData(): + self.addDataEntry(entry) + else: + self.addEntry(entry) + + if file_size_approx <= 0: + continue + + # handle file_size_approx + check_every = self._file_size_approx_check_num_entries + entryCount += 1 + if entryCount % check_every == 0: + sumBlobSize = slobWriter.size_data() + if sumBlobSize >= file_size_approx: + slobWriter.finalize() + fileIndex += 1 + slobWriter = self._open( + f"{filenameNoExt}.{fileIndex}.slob", + f" (part {fileIndex + 1})", + ) + sumBlobSize = 0 + entryCount = 0 diff --git a/pyglossary/plugins/almaany/__init__.py b/pyglossary/plugins/almaany/__init__.py index 9a49bb167..8838cfd62 100644 --- a/pyglossary/plugins/almaany/__init__.py +++ b/pyglossary/plugins/almaany/__init__.py @@ -1,16 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from .reader import Reader + __all__ = [ "Reader", "description", @@ -40,80 +37,3 @@ "Almaany.com Arabic Dictionary - Google Play", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) from WordsTable") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - if self._cur is None: - raise ValueError("cur is None") - from pyglossary.langs.writing_system import getWritingSystemFromText - - alternateDict: dict[str, list[str]] = {} - self._cur.execute("select wordkey, searchwordkey from Keys") - for row in self._cur.fetchall(): - if row[0] in alternateDict: - alternateDict[row[0]].append(row[1]) - else: - alternateDict[row[0]] = [row[1]] - - self._cur.execute( - "select word, searchword, root, meaning from WordsTable order by id", - ) - # FIXME: iteration over self._cur stops after one entry - # and self._cur.fetchone() returns None - # for row in self._cur: - for row in self._cur.fetchall(): - word = row[0] - searchword = row[1] - root = row[2] - meaning = 
row[3] - definition = meaning - definition = definition.replace("|", "<br>") - - if root: - definition += ( - f'<br>Root: <a href="bword://{html.escape(root)}">{root}</a>' - ) - - ws = getWritingSystemFromText(meaning) - if ws and ws.direction == "rtl": - definition = f'<div dir="rtl">{definition}</div>' - - words = [word, searchword] - if word in alternateDict: - words += alternateDict[word] - yield self._glos.newEntry( - words, - definition, - defiFormat="h", - ) - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/almaany/reader.py b/pyglossary/plugins/almaany/reader.py new file mode 100644 index 000000000..3447c1010 --- /dev/null +++ b/pyglossary/plugins/almaany/reader.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) from WordsTable") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + if self._cur is None: + raise ValueError("cur is None") + from pyglossary.langs.writing_system import getWritingSystemFromText + + alternateDict: dict[str, list[str]] = {} + self._cur.execute("select wordkey, searchwordkey from Keys") + for row in self._cur.fetchall(): + if row[0] in alternateDict: + alternateDict[row[0]].append(row[1]) + else: + alternateDict[row[0]] = [row[1]] + + self._cur.execute( + "select word, searchword, root, meaning from WordsTable order by id", + ) + # FIXME: iteration over self._cur stops after one entry + # and self._cur.fetchone() returns None + # for row in self._cur: + for row in self._cur.fetchall(): + word = row[0] + searchword = row[1] + root = row[2] + meaning = row[3] + definition = meaning + definition = definition.replace("|", "<br>") + + if root: + definition += ( + f'<br>Root: <a href="bword://{html.escape(root)}">{root}</a>' + ) + + ws = getWritingSystemFromText(meaning) + if ws and ws.direction == "rtl": + definition = f'<div dir="rtl">{definition}</div>' + + words = [word, searchword] + if word in alternateDict: + words += alternateDict[word] + yield self._glos.newEntry( + words, + definition, + defiFormat="h", + ) + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/ayandict_sqlite/__init__.py b/pyglossary/plugins/ayandict_sqlite/__init__.py index 5ac40b37b..a86e83029 100644 --- a/pyglossary/plugins/ayandict_sqlite/__init__.py +++ b/pyglossary/plugins/ayandict_sqlite/__init__.py @@ -1,20 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import ( - TYPE_CHECKING, -) - -if TYPE_CHECKING: - import sqlite3 - from collections.abc import Generator, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - from 
pyglossary.xdxf.transform import XdxfTransformer - -from pyglossary.core import log from pyglossary.option import BoolOption, Option +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -49,194 +40,3 @@ comment="Create fuzzy search data", ), } - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - self._cur.execute("SELECT key, value FROM meta;") - for row in self._cur.fetchall(): - if row[0] == "hash": - continue - self._glos.setInfo(row[0], row[1]) - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(id) from entry") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - from json import loads - - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "SELECT entry.term, entry.article, " - "json_group_array(alt.term)" - "FROM entry LEFT JOIN alt ON entry.id=alt.id " - "GROUP BY entry.id;", - ) - for row in self._cur.fetchall(): - terms = [row[0]] + [alt for alt in loads(row[2]) if alt] - article = row[1] - yield self._glos.newEntry(terms, article, defiFormat="h") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() - - -class Writer: - _fuzzy: int = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - self._xdxfTr: XdxfTransformer | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - con = self._con = connect(filename) - self._cur = self._con.cursor() - - for query in ( - "CREATE TABLE meta ('key' TEXT PRIMARY KEY NOT NULL, 'value' TEXT);", - ( - "CREATE TABLE entry ('id' INTEGER PRIMARY KEY NOT NULL, " - "'term' TEXT, 'article' TEXT);" - ), - "CREATE TABLE alt ('id' INTEGER NOT NULL, 'term' TEXT);", - "CREATE INDEX idx_meta ON meta(key);", - "CREATE INDEX idx_entry_term ON entry(term COLLATE NOCASE);", - "CREATE INDEX idx_alt_id ON alt(id);", - "CREATE INDEX idx_alt_term ON alt(term COLLATE NOCASE);", - ): - try: - con.execute(query) - except Exception as e: # noqa: PERF203 - log.error(f"query: {query}") - raise e - - for key, value in self._glos.iterInfo(): - con.execute( - "INSERT INTO meta (key, value) VALUES (?, ?);", - (key, value), - ) - - if self._fuzzy: - con.execute( - "CREATE TABLE fuzzy3 ('sub' TEXT NOT NULL, " - "'term' TEXT NOT NULL, " - "id INTEGER NOT NULL);", - ) - con.execute( - "CREATE INDEX idx_fuzzy3_sub ON fuzzy3(sub COLLATE NOCASE);", - ) - - con.commit() - - def finish(self) -> None: - if self._con is None or self._cur is None: - return - - self._con.commit() - self._con.close() - self._con = None - self._cur = None - - def xdxf_setup(self) -> None: - from pyglossary.xdxf.transform import XdxfTransformer - - # if self._xsl: - # self._xdxfTr = XslXdxfTransformer(encoding="utf-8") - # return - self._xdxfTr = XdxfTransformer(encoding="utf-8") - - def xdxf_transform(self, text: str) -> str: - if self._xdxfTr is 
None: - self.xdxf_setup() - return self._xdxfTr.transformByInnerString(text) # type: ignore - - def write(self) -> Generator[None, EntryType, None]: - import hashlib - - cur = self._cur - if cur is None: - raise ValueError("cur is None") - hash_ = hashlib.md5() - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # can save it with entry.save(directory) - continue - defi = entry.defi - entry.detectDefiFormat() - if entry.defiFormat == "m": - if "\n" in defi: - defi = f"<pre>{defi}</pre>" - elif entry.defiFormat == "x": - defi = self.xdxf_transform(defi) - - cur.execute( - "INSERT INTO entry(term, article) VALUES (?, ?);", - (entry.l_word[0], defi), - ) - id_ = cur.lastrowid - if id_ is None: - raise ValueError("lastrowid is None") - for alt in entry.l_word[1:]: - cur.execute( - "INSERT INTO alt(id, term) VALUES (?, ?);", - (id_, alt), - ) - hash_.update(entry.s_word.encode("utf-8")) - if self._fuzzy: - self.addFuzzy(id_, entry.l_word) - - cur.execute( - "INSERT INTO meta (key, value) VALUES (?, ?);", - ("hash", hash_.hexdigest()), - ) - - def addFuzzy(self, id_: int, terms: list[str]) -> None: - cur = self._cur - if cur is None: - raise ValueError("cur is None") - for term in terms: - subs: set[str] = set() - for word in term.split(" "): - eword = "\n" + word - subs.update(eword[i : i + 3] for i in range(len(eword) - 2)) - for sub in subs: - cur.execute( - "INSERT INTO fuzzy3(sub, term, id) VALUES (?, ?, ?);", - (sub, term, id_), - ) diff --git a/pyglossary/plugins/ayandict_sqlite/reader.py b/pyglossary/plugins/ayandict_sqlite/reader.py new file mode 100644 index 000000000..b1ed0b6eb --- /dev/null +++ b/pyglossary/plugins/ayandict_sqlite/reader.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + self._cur.execute("SELECT key, value FROM meta;") + for row in self._cur.fetchall(): + if row[0] == "hash": + continue + self._glos.setInfo(row[0], row[1]) + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(id) from entry") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + from json import loads + + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "SELECT entry.term, entry.article, " + "json_group_array(alt.term)" + "FROM entry LEFT JOIN alt ON entry.id=alt.id " + "GROUP BY entry.id;", + ) + for row in self._cur.fetchall(): + terms = [row[0]] + [alt for alt in loads(row[2]) if alt] + article = row[1] + yield self._glos.newEntry(terms, article, defiFormat="h") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/ayandict_sqlite/writer.py b/pyglossary/plugins/ayandict_sqlite/writer.py new file mode 100644 index 000000000..810631c71 --- /dev/null +++ 
b/pyglossary/plugins/ayandict_sqlite/writer.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, +) + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.xdxf.transform import XdxfTransformer + +from pyglossary.core import log + + +class Writer: + _fuzzy: int = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + self._xdxfTr: XdxfTransformer | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + con = self._con = connect(filename) + self._cur = self._con.cursor() + + for query in ( + "CREATE TABLE meta ('key' TEXT PRIMARY KEY NOT NULL, 'value' TEXT);", + ( + "CREATE TABLE entry ('id' INTEGER PRIMARY KEY NOT NULL, " + "'term' TEXT, 'article' TEXT);" + ), + "CREATE TABLE alt ('id' INTEGER NOT NULL, 'term' TEXT);", + "CREATE INDEX idx_meta ON meta(key);", + "CREATE INDEX idx_entry_term ON entry(term COLLATE NOCASE);", + "CREATE INDEX idx_alt_id ON alt(id);", + "CREATE INDEX idx_alt_term ON alt(term COLLATE NOCASE);", + ): + try: + con.execute(query) + except Exception as e: # noqa: PERF203 + log.error(f"query: {query}") + raise e + + for key, value in self._glos.iterInfo(): + con.execute( + "INSERT INTO meta (key, value) VALUES (?, ?);", + (key, value), + ) + + if self._fuzzy: + con.execute( + "CREATE TABLE fuzzy3 ('sub' TEXT NOT NULL, " + "'term' TEXT NOT NULL, " + "id INTEGER NOT NULL);", + ) + con.execute( + "CREATE INDEX idx_fuzzy3_sub ON fuzzy3(sub COLLATE NOCASE);", + ) + + con.commit() + + def finish(self) -> None: + if self._con is None or self._cur is None: + return + + self._con.commit() + self._con.close() + self._con = None + self._cur = None + + def xdxf_setup(self) -> None: + from pyglossary.xdxf.transform import XdxfTransformer + + # if self._xsl: + # self._xdxfTr = XslXdxfTransformer(encoding="utf-8") + # return + self._xdxfTr = XdxfTransformer(encoding="utf-8") + + def xdxf_transform(self, text: str) -> str: + if self._xdxfTr is None: + self.xdxf_setup() + return self._xdxfTr.transformByInnerString(text) # type: ignore + + def write(self) -> Generator[None, EntryType, None]: + import hashlib + + cur = self._cur + if cur is None: + raise ValueError("cur is None") + hash_ = hashlib.md5() + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # can save it with entry.save(directory) + continue + defi = entry.defi + entry.detectDefiFormat() + if entry.defiFormat == "m": + if "\n" in defi: + defi = f"<pre>{defi}</pre>" + elif entry.defiFormat == "x": + defi = self.xdxf_transform(defi) + + cur.execute( + "INSERT INTO entry(term, article) VALUES (?, ?);", + (entry.l_word[0], defi), + ) + id_ = cur.lastrowid + if id_ is None: + raise ValueError("lastrowid is None") + for alt in entry.l_word[1:]: + cur.execute( + "INSERT INTO alt(id, term) VALUES (?, ?);", + (id_, alt), + ) + hash_.update(entry.s_word.encode("utf-8")) + if self._fuzzy: + self.addFuzzy(id_, entry.l_word) + + cur.execute( + "INSERT INTO meta (key, value) VALUES (?, ?);", + ("hash", hash_.hexdigest()), + ) + + def addFuzzy(self, id_: int, terms: list[str]) -> None: + cur = self._cur + if cur is None: + raise ValueError("cur is None") + for term in terms: + subs: set[str] = set() 
+ for word in term.split(" "): + eword = "\n" + word + subs.update(eword[i : i + 3] for i in range(len(eword) - 2)) + for sub in subs: + cur.execute( + "INSERT INTO fuzzy3(sub, term, id) VALUES (?, ?, ?);", + (sub, term, id_), + ) diff --git a/pyglossary/plugins/cc_kedict/__init__.py b/pyglossary/plugins/cc_kedict/__init__.py index 772c2ff6b..5289633ef 100644 --- a/pyglossary/plugins/cc_kedict/__init__.py +++ b/pyglossary/plugins/cc_kedict/__init__.py @@ -2,20 +2,12 @@ # mypy: ignore-errors from __future__ import annotations -from io import BytesIO -from os.path import isdir, join -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING if TYPE_CHECKING: - from collections.abc import Callable, Iterator - - import lxml - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option -from pyglossary.core import exc_note, log, pip -from pyglossary.text_reader import TextGlossaryReader +from .reader import Reader __all__ = [ "Reader", @@ -46,295 +38,3 @@ "@mhagiwara/cc-kedict", ) optionsProp: dict[str, Option] = {} - - -class YamlReader(TextGlossaryReader): - tagStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - def __init__( # noqa: PLR0913 - self, - glos: GlossaryType, - spellKey: str = "", - posKey: str = "", - synsKey: str = "", - tagsKey: str = "", - ) -> None: - TextGlossaryReader.__init__(self, glos) - self._spellKey = spellKey - self._posKey = posKey - self._synsKey = synsKey - self._tagsKey = tagsKey - - self._posMapping = { - "n": "noun", - "v": "verb", - "a": "adjective", - "pron": "pronoun", - "propn": "proper noun", - "intj": "interjection", - "det": "determiner", - "part": "particle", - "adv": "adverb", - "num": "number", - "abbrev": "abbreviation", - "suf": "suffix", - "pref": "prefix", - } - - @classmethod - def isInfoWord(cls, _word: str) -> bool: - return False - - @classmethod - def fixInfoWord(cls, _word: str) -> str: - return "" - - @staticmethod - def _makeList( - hf: lxml.etree.htmlfile, - input_objects: list[Any], - processor: Callable, - single_prefix: str | None = None, - skip_single: bool = True, - ) -> None: - """Wrap elements into <ol> if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) == 1: - # if single_prefix is None: - # single_prefix = ET.Element("br") - if single_prefix: - hf.write(single_prefix) - processor(hf, input_objects[0], 1) - return - - with hf.element("ol"): - for el in input_objects: - with hf.element("li"): - processor(hf, el, len(input_objects)) - - def _processExample( # noqa: PLR6301 - self, - hf: lxml.etree.htmlfile, - exampleDict: dict, - _count: int, - ) -> None: - from lxml import etree as ET - - if not exampleDict.get("example"): - log.error(f"invalid example: {exampleDict}") - return - - hf.write(exampleDict["example"]) - - transliteration = exampleDict.get("transliteration") - if transliteration: - hf.write(ET.Element("br")) - with hf.element("font", color="green"): - hf.write(f"{transliteration}") - - translation = exampleDict.get("translation") - if translation: - hf.write(ET.Element("br")) - with hf.element("i"): - hf.write(f"{translation}") - - def _processDef( - self, - hf: lxml.etree.htmlfile, - defDict: dict, - count: int, - ) -> None: - from lxml import etree as ET - - text = defDict.get("def", "") - if text: - hf.write(text) - - examples = defDict.get("examples") - if examples: - if text: - if count == 1: 
- hf.write(ET.Element("br")) - hf.write(ET.Element("br")) - with hf.element("i"): - hf.write("Examples:") - self._makeList( - hf, - examples, - self._processExample, - skip_single=False, - ) - - def _processNote( # noqa: PLR6301 - self, - hf: lxml.etree.htmlfile, - note: str, - _count: int, - ) -> None: - hf.write(note) - - def _processEntry( - self, - hf: lxml.etree.htmlfile, - edict: dict, - ) -> None: - from lxml import etree as ET - - if self._spellKey and self._spellKey in edict: - spelling = edict[self._spellKey] - if not isinstance(spelling, str): - log.error(f"{spelling=}, {type(spelling)=}, {edict=}") - # https://github.com/mhagiwara/cc-kedict/pull/1 - spelling = "on" if spelling is True else "" - if spelling: - with hf.element("font", color="green"): - hf.write(spelling) - hf.write(ET.Element("br")) - - if self._posKey and self._posKey in edict: - pos = edict[self._posKey] - pos = self._posMapping.get(pos, pos) - with hf.element("i"): - hf.write(pos.capitalize()) - hf.write(ET.Element("br")) - - if self._tagsKey and self._tagsKey in edict: - tags = edict[self._tagsKey] - for i, tag in enumerate(tags): - if i > 0: - hf.write(" ") - with hf.element("span", style=self.tagStyle): - hf.write(tag) - hf.write(ET.Element("br")) - - defs = edict.get("defs") - if defs: - self._makeList( - hf, - defs, - self._processDef, - ) - - if self._synsKey and self._synsKey in edict: - hf.write("Synonyms: ") - for i, word in enumerate(edict[self._synsKey]): - if i > 0: - with hf.element("big"): - hf.write(" | ") # NESTED: 5 - with hf.element("a", href=f"bword://{word}"): - hf.write(word) - hf.write(ET.Element("br")) - - notes = edict.get("notes") - if notes: - hf.write(ET.Element("br")) - hf.write("Notes:") - self._makeList( - hf, - notes, - self._processNote, - skip_single=False, - ) - - def _createEntry( - self, - yamlBlock: str, - ) -> tuple[str, str, None] | None: - from lxml import etree as ET - from yaml import load - - try: - from yaml import CLoader as Loader - except ImportError: - from yaml import Loader - - edict = load(yamlBlock, Loader=Loader) - word = edict.get("word") - if not word: - log.error(f"no word in {edict}") - return None - - f = BytesIO() - - with ET.htmlfile(f, encoding="utf-8") as hf: - with hf.element("div"): - self._processEntry(hf, edict) - - defi = f.getvalue().decode("utf-8") - return word, defi, None - - def nextBlock(self) -> EntryType: - if not self._file: - raise StopIteration - lines: list[str] = [] - while True: - line = self.readline() - if not line: - break - line = line.rstrip("\n\r") - if not line: - continue - if line.startswith("- "): - line = " " + line[1:] - if lines: - self._bufferLine = line - return self._createEntry("\n".join(lines)) - - lines.append(line) - - if lines: - return self._createEntry("\n".join(lines)) - - raise StopIteration - - -class Reader: - depends = { - "yaml": "PyYAML", - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._yaml = YamlReader( - glos, - spellKey="romaja", - posKey="pos", - synsKey="syns", - tagsKey="tags", - ) - - def __len__(self) -> int: - return 0 - - def open(self, filename: str) -> None: - try: - from lxml import etree as ET # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - if isdir(filename): - filename = join(filename, "kedict.yml") - self._filename = filename - - self._glos.sourceLangName = "Korean" - self._glos.targetLangName = "English" - - self._glos.setDefaultDefiFormat("h") - 
self._yaml.open(filename) - - def close(self) -> None: - self._yaml.close() - - def __iter__(self) -> Iterator[EntryType]: - yield from self._yaml diff --git a/pyglossary/plugins/cc_kedict/reader.py b/pyglossary/plugins/cc_kedict/reader.py new file mode 100644 index 000000000..1a9efcb4f --- /dev/null +++ b/pyglossary/plugins/cc_kedict/reader.py @@ -0,0 +1,309 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +from __future__ import annotations + +from io import BytesIO +from os.path import isdir, join +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from collections.abc import Callable, Iterator + + import lxml + + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import exc_note, log, pip +from pyglossary.text_reader import TextGlossaryReader + + +class YamlReader(TextGlossaryReader): + tagStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + def __init__( # noqa: PLR0913 + self, + glos: GlossaryType, + spellKey: str = "", + posKey: str = "", + synsKey: str = "", + tagsKey: str = "", + ) -> None: + TextGlossaryReader.__init__(self, glos) + self._spellKey = spellKey + self._posKey = posKey + self._synsKey = synsKey + self._tagsKey = tagsKey + + self._posMapping = { + "n": "noun", + "v": "verb", + "a": "adjective", + "pron": "pronoun", + "propn": "proper noun", + "intj": "interjection", + "det": "determiner", + "part": "particle", + "adv": "adverb", + "num": "number", + "abbrev": "abbreviation", + "suf": "suffix", + "pref": "prefix", + } + + @classmethod + def isInfoWord(cls, _word: str) -> bool: + return False + + @classmethod + def fixInfoWord(cls, _word: str) -> str: + return "" + + @staticmethod + def _makeList( + hf: lxml.etree.htmlfile, + input_objects: list[Any], + processor: Callable, + single_prefix: str | None = None, + skip_single: bool = True, + ) -> None: + """Wrap elements into <ol> if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + # if single_prefix is None: + # single_prefix = ET.Element("br") + if single_prefix: + hf.write(single_prefix) + processor(hf, input_objects[0], 1) + return + + with hf.element("ol"): + for el in input_objects: + with hf.element("li"): + processor(hf, el, len(input_objects)) + + def _processExample( # noqa: PLR6301 + self, + hf: lxml.etree.htmlfile, + exampleDict: dict, + _count: int, + ) -> None: + from lxml import etree as ET + + if not exampleDict.get("example"): + log.error(f"invalid example: {exampleDict}") + return + + hf.write(exampleDict["example"]) + + transliteration = exampleDict.get("transliteration") + if transliteration: + hf.write(ET.Element("br")) + with hf.element("font", color="green"): + hf.write(f"{transliteration}") + + translation = exampleDict.get("translation") + if translation: + hf.write(ET.Element("br")) + with hf.element("i"): + hf.write(f"{translation}") + + def _processDef( + self, + hf: lxml.etree.htmlfile, + defDict: dict, + count: int, + ) -> None: + from lxml import etree as ET + + text = defDict.get("def", "") + if text: + hf.write(text) + + examples = defDict.get("examples") + if examples: + if text: + if count == 1: + hf.write(ET.Element("br")) + hf.write(ET.Element("br")) + with hf.element("i"): + hf.write("Examples:") + self._makeList( + hf, + examples, + self._processExample, + skip_single=False, + ) + + def _processNote( # noqa: PLR6301 + self, + hf: lxml.etree.htmlfile, + note: str, 
+ _count: int, + ) -> None: + hf.write(note) + + def _processEntry( + self, + hf: lxml.etree.htmlfile, + edict: dict, + ) -> None: + from lxml import etree as ET + + if self._spellKey and self._spellKey in edict: + spelling = edict[self._spellKey] + if not isinstance(spelling, str): + log.error(f"{spelling=}, {type(spelling)=}, {edict=}") + # https://github.com/mhagiwara/cc-kedict/pull/1 + spelling = "on" if spelling is True else "" + if spelling: + with hf.element("font", color="green"): + hf.write(spelling) + hf.write(ET.Element("br")) + + if self._posKey and self._posKey in edict: + pos = edict[self._posKey] + pos = self._posMapping.get(pos, pos) + with hf.element("i"): + hf.write(pos.capitalize()) + hf.write(ET.Element("br")) + + if self._tagsKey and self._tagsKey in edict: + tags = edict[self._tagsKey] + for i, tag in enumerate(tags): + if i > 0: + hf.write(" ") + with hf.element("span", style=self.tagStyle): + hf.write(tag) + hf.write(ET.Element("br")) + + defs = edict.get("defs") + if defs: + self._makeList( + hf, + defs, + self._processDef, + ) + + if self._synsKey and self._synsKey in edict: + hf.write("Synonyms: ") + for i, word in enumerate(edict[self._synsKey]): + if i > 0: + with hf.element("big"): + hf.write(" | ") # NESTED: 5 + with hf.element("a", href=f"bword://{word}"): + hf.write(word) + hf.write(ET.Element("br")) + + notes = edict.get("notes") + if notes: + hf.write(ET.Element("br")) + hf.write("Notes:") + self._makeList( + hf, + notes, + self._processNote, + skip_single=False, + ) + + def _createEntry( + self, + yamlBlock: str, + ) -> tuple[str, str, None] | None: + from lxml import etree as ET + from yaml import load + + try: + from yaml import CLoader as Loader + except ImportError: + from yaml import Loader + + edict = load(yamlBlock, Loader=Loader) + word = edict.get("word") + if not word: + log.error(f"no word in {edict}") + return None + + f = BytesIO() + + with ET.htmlfile(f, encoding="utf-8") as hf: + with hf.element("div"): + self._processEntry(hf, edict) + + defi = f.getvalue().decode("utf-8") + return word, defi, None + + def nextBlock(self) -> EntryType: + if not self._file: + raise StopIteration + lines: list[str] = [] + while True: + line = self.readline() + if not line: + break + line = line.rstrip("\n\r") + if not line: + continue + if line.startswith("- "): + line = " " + line[1:] + if lines: + self._bufferLine = line + return self._createEntry("\n".join(lines)) + + lines.append(line) + + if lines: + return self._createEntry("\n".join(lines)) + + raise StopIteration + + +class Reader: + depends = { + "yaml": "PyYAML", + "lxml": "lxml", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._yaml = YamlReader( + glos, + spellKey="romaja", + posKey="pos", + synsKey="syns", + tagsKey="tags", + ) + + def __len__(self) -> int: + return 0 + + def open(self, filename: str) -> None: + try: + from lxml import etree as ET # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + if isdir(filename): + filename = join(filename, "kedict.yml") + self._filename = filename + + self._glos.sourceLangName = "Korean" + self._glos.targetLangName = "English" + + self._glos.setDefaultDefiFormat("h") + self._yaml.open(filename) + + def close(self) -> None: + self._yaml.close() + + def __iter__(self) -> Iterator[EntryType]: + yield from self._yaml diff --git a/pyglossary/plugins/crawler_dir/__init__.py b/pyglossary/plugins/crawler_dir/__init__.py index 9c0ec0557..ae64f6e5c 100644 --- 
a/pyglossary/plugins/crawler_dir/__init__.py +++ b/pyglossary/plugins/crawler_dir/__init__.py @@ -1,28 +1,13 @@ # mypy: ignore-errors from __future__ import annotations -from hashlib import sha1 -from os import listdir, makedirs -from os.path import dirname, isdir, isfile, join, splitext -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - compressionOpenFunc, -) -from pyglossary.core import log from pyglossary.option import ( Option, StrOption, ) -from pyglossary.text_utils import ( - escapeNTB, - splitByBarUnescapeNTB, -) - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -56,147 +41,3 @@ comment="Compression Algorithm", ), } - - -class Writer: - _compression: str = "" - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = None - - def finish(self) -> None: - pass - - def open(self, filename: str) -> None: - self._filename = filename - if not isdir(filename): - makedirs(filename) - - @staticmethod - def filePathFromWord(b_word: bytes) -> str: - bw = b_word.lower() - if len(bw) <= 2: - return bw.hex() - if len(bw) <= 4: - return join( - bw[:2].hex() + ".d", - bw[2:].hex(), - ) - return join( - bw[:2].hex() + ".d", - bw[2:4].hex() + ".d", - bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], # noqa: S324 - ) - - def write(self) -> None: - from pyglossary.json_utils import dataToPrettyJson - - filename = self._filename - - wordCount = 0 - compression = self._compression - c_open = compressionOpenFunc(compression) - if not c_open: - raise ValueError(f"invalid compression {compression!r}") - while True: - entry = yield - if entry is None: - break - if entry.isData(): - continue - fpath = join(filename, self.filePathFromWord(entry.b_word)) - if compression: - fpath = f"{fpath}.{compression}" - parentDir = dirname(fpath) - if not isdir(parentDir): - makedirs(parentDir) - if isfile(fpath): - log.warning(f"file exists: {fpath}") - fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" # noqa: S324 - with c_open(fpath, "wt", encoding="utf-8") as _file: - _file.write( - f"{escapeNTB(entry.s_word)}\n{entry.defi}", - ) - wordCount += 1 - - with open( - join(filename, "info.json"), - mode="w", - encoding="utf-8", - ) as infoFile: - info = {} - info["name"] = self._glos.getInfo("name") - info["wordCount"] = wordCount - info |= self._glos.getExtraInfos(["name", "wordCount"]) - - infoFile.write(dataToPrettyJson(info)) - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = None - self._wordCount = 0 - - def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToData - - self._filename = filename - - with open(join(filename, "info.json"), encoding="utf-8") as infoFp: - info = jsonToData(infoFp.read()) - self._wordCount = info.pop("wordCount") - for key, value in info.items(): - self._glos.setInfo(key, value) - - def close(self) -> None: - pass - - def __len__(self) -> int: - return self._wordCount - - def _fromFile(self, fpath: str) -> EntryType: - _, ext = splitext(fpath) - c_open = compressionOpenFunc(ext.lstrip(".")) - if not c_open: - log.error(f"invalid extension {ext}") - c_open = open - with c_open(fpath, "rt", encoding="utf-8") as _file: - words = splitByBarUnescapeNTB(_file.readline().rstrip("\n")) - defi = _file.read() - return self._glos.newEntry(words, defi) - - @staticmethod - def _listdirSortKey(name: 
str) -> str: - name_nox, ext = splitext(name) - if ext == ".d": - return name - return name_nox - - def _readDir( - self, - dpath: str, - exclude: set[str] | None, - ) -> Generator[EntryType, None, None]: - children = listdir(dpath) - if exclude: - children = [name for name in children if name not in exclude] - children.sort(key=self._listdirSortKey) - for name in children: - cpath = join(dpath, name) - if isfile(cpath): - yield self._fromFile(cpath) - continue - if isdir(cpath): - yield from self._readDir(cpath, None) - continue - log.error(f"Not a file nor a directory: {cpath}") - - def __iter__(self) -> Iterator[EntryType]: - yield from self._readDir( - self._filename, - { - "info.json", - }, - ) diff --git a/pyglossary/plugins/crawler_dir/reader.py b/pyglossary/plugins/crawler_dir/reader.py new file mode 100644 index 000000000..9bb6b0369 --- /dev/null +++ b/pyglossary/plugins/crawler_dir/reader.py @@ -0,0 +1,88 @@ +# mypy: ignore-errors +from __future__ import annotations + +from os import listdir +from os.path import isdir, isfile, join, splitext +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + compressionOpenFunc, +) +from pyglossary.core import log +from pyglossary.text_utils import ( + splitByBarUnescapeNTB, +) + +if TYPE_CHECKING: + from collections.abc import Generator, Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = None + self._wordCount = 0 + + def open(self, filename: str) -> None: + from pyglossary.json_utils import jsonToData + + self._filename = filename + + with open(join(filename, "info.json"), encoding="utf-8") as infoFp: + info = jsonToData(infoFp.read()) + self._wordCount = info.pop("wordCount") + for key, value in info.items(): + self._glos.setInfo(key, value) + + def close(self) -> None: + pass + + def __len__(self) -> int: + return self._wordCount + + def _fromFile(self, fpath: str) -> EntryType: + _, ext = splitext(fpath) + c_open = compressionOpenFunc(ext.lstrip(".")) + if not c_open: + log.error(f"invalid extension {ext}") + c_open = open + with c_open(fpath, "rt", encoding="utf-8") as _file: + words = splitByBarUnescapeNTB(_file.readline().rstrip("\n")) + defi = _file.read() + return self._glos.newEntry(words, defi) + + @staticmethod + def _listdirSortKey(name: str) -> str: + name_nox, ext = splitext(name) + if ext == ".d": + return name + return name_nox + + def _readDir( + self, + dpath: str, + exclude: set[str] | None, + ) -> Generator[EntryType, None, None]: + children = listdir(dpath) + if exclude: + children = [name for name in children if name not in exclude] + children.sort(key=self._listdirSortKey) + for name in children: + cpath = join(dpath, name) + if isfile(cpath): + yield self._fromFile(cpath) + continue + if isdir(cpath): + yield from self._readDir(cpath, None) + continue + log.error(f"Not a file nor a directory: {cpath}") + + def __iter__(self) -> Iterator[EntryType]: + yield from self._readDir( + self._filename, + { + "info.json", + }, + ) diff --git a/pyglossary/plugins/crawler_dir/writer.py b/pyglossary/plugins/crawler_dir/writer.py new file mode 100644 index 000000000..6171a341e --- /dev/null +++ b/pyglossary/plugins/crawler_dir/writer.py @@ -0,0 +1,93 @@ +# mypy: ignore-errors +from __future__ import annotations + +from hashlib import sha1 +from os import makedirs +from os.path import dirname, isdir, isfile, join +from typing import TYPE_CHECKING + +from pyglossary.compression 
import ( + compressionOpenFunc, +) +from pyglossary.core import log +from pyglossary.text_utils import ( + escapeNTB, +) + +if TYPE_CHECKING: + from pyglossary.glossary_types import GlossaryType + + +class Writer: + _compression: str = "" + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = None + + def finish(self) -> None: + pass + + def open(self, filename: str) -> None: + self._filename = filename + if not isdir(filename): + makedirs(filename) + + @staticmethod + def filePathFromWord(b_word: bytes) -> str: + bw = b_word.lower() + if len(bw) <= 2: + return bw.hex() + if len(bw) <= 4: + return join( + bw[:2].hex() + ".d", + bw[2:].hex(), + ) + return join( + bw[:2].hex() + ".d", + bw[2:4].hex() + ".d", + bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], # noqa: S324 + ) + + def write(self) -> None: + from pyglossary.json_utils import dataToPrettyJson + + filename = self._filename + + wordCount = 0 + compression = self._compression + c_open = compressionOpenFunc(compression) + if not c_open: + raise ValueError(f"invalid compression {compression!r}") + while True: + entry = yield + if entry is None: + break + if entry.isData(): + continue + fpath = join(filename, self.filePathFromWord(entry.b_word)) + if compression: + fpath = f"{fpath}.{compression}" + parentDir = dirname(fpath) + if not isdir(parentDir): + makedirs(parentDir) + if isfile(fpath): + log.warning(f"file exists: {fpath}") + fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" # noqa: S324 + with c_open(fpath, "wt", encoding="utf-8") as _file: + _file.write( + f"{escapeNTB(entry.s_word)}\n{entry.defi}", + ) + wordCount += 1 + + with open( + join(filename, "info.json"), + mode="w", + encoding="utf-8", + ) as infoFile: + info = {} + info["name"] = self._glos.getInfo("name") + info["wordCount"] = wordCount + info |= self._glos.getExtraInfos(["name", "wordCount"]) + + infoFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/csv_plugin/__init__.py b/pyglossary/plugins/csv_plugin/__init__.py index 1f9aebb29..36916b243 100644 --- a/pyglossary/plugins/csv_plugin/__init__.py +++ b/pyglossary/plugins/csv_plugin/__init__.py @@ -20,16 +20,7 @@ from __future__ import annotations import csv -import os -from os.path import isdir, join -from typing import TYPE_CHECKING, cast -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import log -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, EncodingOption, @@ -37,11 +28,8 @@ Option, ) -if TYPE_CHECKING: - import io - from collections.abc import Generator, Iterable, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -94,231 +82,3 @@ } csv.field_size_limit(0x7FFFFFFF) - - -class Reader: - compressions = stdCompressions - - _encoding: str = "utf-8" - _newline: str = "\n" - _delimiter: str = "," - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self.clear() - - def clear(self) -> None: - self._filename = "" - self._file: io.TextIOBase = nullTextIO - self._fileSize = 0 - self._leadingLinesCount = 0 - self._wordCount: int | None = None - self._pos = -1 - self._csvReader: Iterable[list[str]] | None = None - self._resDir = "" - self._resFileNames: list[str] = [] - self._bufferRow: list[str] | None = None - - def open( - self, - filename: str, - ) -> None: - from pyglossary.text_reader import TextFilePosWrapper - - self._filename = filename - 
cfile = cast( - "io.TextIOBase", - compressionOpen( - filename, - mode="rt", - encoding=self._encoding, - newline=self._newline, - ), - ) - - if self._glos.progressbar: - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - # self._glos.setInfo("input_file_size", f"{self._fileSize}") - else: - log.warning("CSV Reader: file is not seekable") - - self._file = TextFilePosWrapper(cfile, self._encoding) - self._csvReader = csv.reader( - self._file, - dialect="excel", - delimiter=self._delimiter, - ) - self._resDir = filename + "_res" - if isdir(self._resDir): - self._resFileNames = os.listdir(self._resDir) - else: - self._resDir = "" - self._resFileNames = [] - for row in self._csvReader: - if not row: - continue - if not row[0].startswith("#"): - self._bufferRow = row - break - if len(row) < 2: - log.error(f"invalid row: {row}") - continue - self._glos.setInfo(row[0].lstrip("#"), row[1]) - - def close(self) -> None: - if self._file: - try: - self._file.close() - except Exception: - log.exception("error while closing csv file") - self.clear() - - def __len__(self) -> int: - from pyglossary.file_utils import fileCountLines - - if self._wordCount is None: - if hasattr(self._file, "compression"): - return 0 - log.debug("Try not to use len(reader) as it takes extra time") - self._wordCount = fileCountLines(self._filename) - self._leadingLinesCount - return self._wordCount + len(self._resFileNames) - - def _iterRows(self) -> Iterator[list[str]]: - if self._csvReader is None: - raise RuntimeError("self._csvReader is None") - if self._bufferRow: - yield self._bufferRow - yield from self._csvReader - - def _processRow(self, row: list[str]) -> EntryType | None: - if not row: - return None - - word: str | list[str] - try: - word = row[0] - defi = row[1] - except IndexError: - log.error(f"invalid row: {row!r}") - return None - - try: - alts = row[2].split(",") - except IndexError: - pass - else: - word = [word] + alts - - return self._glos.newEntry( - word, - defi, - byteProgress=( - (self._file.tell(), self._fileSize) if self._fileSize else None - ), - ) - - def __iter__(self) -> Iterator[EntryType | None]: - if not self._csvReader: - raise RuntimeError("iterating over a reader while it's not open") - - wordCount = 0 - for row in self._iterRows(): - wordCount += 1 - yield self._processRow(row) - - self._wordCount = wordCount - - resDir = self._resDir - for fname in self._resFileNames: - with open(join(resDir, fname), "rb") as _file: - yield self._glos.newDataEntry( - fname, - _file.read(), - ) - - -class Writer: - compressions = stdCompressions - - _encoding: str = "utf-8" - _newline: str = "\n" - _resources: bool = True - _delimiter: str = "," - _add_defi_format: bool = False - _enable_info: bool = True - _word_title: bool = False - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._file: io.TextIOBase = nullTextIO - - def open(self, filename: str) -> None: - self._filename = filename - self._file = cast( - "io.TextIOBase", - compressionOpen( - filename, - mode="wt", - encoding=self._encoding, - newline=self._newline, - ), - ) - self._resDir = resDir = filename + "_res" - self._csvWriter = csv.writer( - self._file, - dialect="excel", - quoting=csv.QUOTE_ALL, # FIXME - delimiter=self._delimiter, - ) - if not isdir(resDir): - os.mkdir(resDir) - if self._enable_info: - for key, value in self._glos.iterInfo(): - self._csvWriter.writerow([f"#{key}", value]) - - def finish(self) -> None: - self._filename = "" - self._file.close() - self._file = 
nullTextIO - if not os.listdir(self._resDir): - os.rmdir(self._resDir) - - def write(self) -> Generator[None, EntryType, None]: - resources = self._resources - add_defi_format = self._add_defi_format - glos = self._glos - resDir = self._resDir - writer = self._csvWriter - word_title = self._word_title - while True: - entry = yield - if entry is None: - break - if entry.isData(): - if resources: - entry.save(resDir) - continue - - words = entry.l_word - if not words: - continue - word, alts = words[0], words[1:] - defi = entry.defi - - if word_title: - defi = glos.wordTitleStr(words[0]) + defi - - row = [ - word, - defi, - ] - if add_defi_format: - entry.detectDefiFormat() - row.append(entry.defiFormat) - if alts: - row.append(",".join(alts)) - - writer.writerow(row) diff --git a/pyglossary/plugins/csv_plugin/reader.py b/pyglossary/plugins/csv_plugin/reader.py new file mode 100644 index 000000000..8087e9e92 --- /dev/null +++ b/pyglossary/plugins/csv_plugin/reader.py @@ -0,0 +1,182 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2013-2019 Saeed Rasooli <saeed.gnu@gmail.com> (ilius) +# This file is part of PyGlossary project, https://github.com/ilius/pyglossary +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL +# If not, see <http://www.gnu.org/licenses/gpl.txt>. 
+ +from __future__ import annotations + +import csv +import os +from os.path import isdir, join +from typing import TYPE_CHECKING, cast + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import log +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Iterable, Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + compressions = stdCompressions + + _encoding: str = "utf-8" + _newline: str = "\n" + _delimiter: str = "," + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.clear() + + def clear(self) -> None: + self._filename = "" + self._file: io.TextIOBase = nullTextIO + self._fileSize = 0 + self._leadingLinesCount = 0 + self._wordCount: int | None = None + self._pos = -1 + self._csvReader: Iterable[list[str]] | None = None + self._resDir = "" + self._resFileNames: list[str] = [] + self._bufferRow: list[str] | None = None + + def open( + self, + filename: str, + ) -> None: + from pyglossary.text_reader import TextFilePosWrapper + + self._filename = filename + cfile = cast( + "io.TextIOBase", + compressionOpen( + filename, + mode="rt", + encoding=self._encoding, + newline=self._newline, + ), + ) + + if self._glos.progressbar: + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + # self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("CSV Reader: file is not seekable") + + self._file = TextFilePosWrapper(cfile, self._encoding) + self._csvReader = csv.reader( + self._file, + dialect="excel", + delimiter=self._delimiter, + ) + self._resDir = filename + "_res" + if isdir(self._resDir): + self._resFileNames = os.listdir(self._resDir) + else: + self._resDir = "" + self._resFileNames = [] + for row in self._csvReader: + if not row: + continue + if not row[0].startswith("#"): + self._bufferRow = row + break + if len(row) < 2: + log.error(f"invalid row: {row}") + continue + self._glos.setInfo(row[0].lstrip("#"), row[1]) + + def close(self) -> None: + if self._file: + try: + self._file.close() + except Exception: + log.exception("error while closing csv file") + self.clear() + + def __len__(self) -> int: + from pyglossary.file_utils import fileCountLines + + if self._wordCount is None: + if hasattr(self._file, "compression"): + return 0 + log.debug("Try not to use len(reader) as it takes extra time") + self._wordCount = fileCountLines(self._filename) - self._leadingLinesCount + return self._wordCount + len(self._resFileNames) + + def _iterRows(self) -> Iterator[list[str]]: + if self._csvReader is None: + raise RuntimeError("self._csvReader is None") + if self._bufferRow: + yield self._bufferRow + yield from self._csvReader + + def _processRow(self, row: list[str]) -> EntryType | None: + if not row: + return None + + word: str | list[str] + try: + word = row[0] + defi = row[1] + except IndexError: + log.error(f"invalid row: {row!r}") + return None + + try: + alts = row[2].split(",") + except IndexError: + pass + else: + word = [word] + alts + + return self._glos.newEntry( + word, + defi, + byteProgress=( + (self._file.tell(), self._fileSize) if self._fileSize else None + ), + ) + + def __iter__(self) -> Iterator[EntryType | None]: + if not self._csvReader: + raise RuntimeError("iterating over a reader while it's not open") + + wordCount = 0 + for row in self._iterRows(): + wordCount += 1 + yield self._processRow(row) + + self._wordCount = wordCount + + resDir = 
self._resDir + for fname in self._resFileNames: + with open(join(resDir, fname), "rb") as _file: + yield self._glos.newDataEntry( + fname, + _file.read(), + ) diff --git a/pyglossary/plugins/csv_plugin/writer.py b/pyglossary/plugins/csv_plugin/writer.py new file mode 100644 index 000000000..ff1c42920 --- /dev/null +++ b/pyglossary/plugins/csv_plugin/writer.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2013-2019 Saeed Rasooli <saeed.gnu@gmail.com> (ilius) +# This file is part of PyGlossary project, https://github.com/ilius/pyglossary +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL +# If not, see <http://www.gnu.org/licenses/gpl.txt>. + +from __future__ import annotations + +import csv +import os +from os.path import isdir +from typing import TYPE_CHECKING, cast + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + compressions = stdCompressions + + _encoding: str = "utf-8" + _newline: str = "\n" + _resources: bool = True + _delimiter: str = "," + _add_defi_format: bool = False + _enable_info: bool = True + _word_title: bool = False + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._file: io.TextIOBase = nullTextIO + + def open(self, filename: str) -> None: + self._filename = filename + self._file = cast( + "io.TextIOBase", + compressionOpen( + filename, + mode="wt", + encoding=self._encoding, + newline=self._newline, + ), + ) + self._resDir = resDir = filename + "_res" + self._csvWriter = csv.writer( + self._file, + dialect="excel", + quoting=csv.QUOTE_ALL, # FIXME + delimiter=self._delimiter, + ) + if not isdir(resDir): + os.mkdir(resDir) + if self._enable_info: + for key, value in self._glos.iterInfo(): + self._csvWriter.writerow([f"#{key}", value]) + + def finish(self) -> None: + self._filename = "" + self._file.close() + self._file = nullTextIO + if not os.listdir(self._resDir): + os.rmdir(self._resDir) + + def write(self) -> Generator[None, EntryType, None]: + resources = self._resources + add_defi_format = self._add_defi_format + glos = self._glos + resDir = self._resDir + writer = self._csvWriter + word_title = self._word_title + while True: + entry = yield + if entry is None: + break + if entry.isData(): + if resources: + entry.save(resDir) + continue + + words = entry.l_word + if not words: + continue + word, alts = words[0], words[1:] + defi = entry.defi + + if word_title: + defi = glos.wordTitleStr(words[0]) + defi + + row = [ + word, + defi, + ] + if add_defi_format: + entry.detectDefiFormat() + row.append(entry.defiFormat) + if alts: + row.append(",".join(alts)) + + writer.writerow(row) diff --git a/pyglossary/plugins/dicformids/__init__.py b/pyglossary/plugins/dicformids/__init__.py index 625b9b7f3..8e1f4ca76 100644 
--- a/pyglossary/plugins/dicformids/__init__.py +++ b/pyglossary/plugins/dicformids/__init__.py @@ -2,22 +2,16 @@ # mypy: ignore-errors from __future__ import annotations -import operator -import os -import re -from os.path import join from typing import TYPE_CHECKING -from pyglossary.core import log -from pyglossary.flags import ALWAYS -from pyglossary.plugins.tabfile import Reader as TabfileReader - if TYPE_CHECKING: - from collections.abc import Generator, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from pyglossary.flags import ALWAYS + +from .reader import Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -52,243 +46,3 @@ ) optionsProp: dict[str, Option] = {} - - -PROP_TEMPLATE = """#DictionaryForMIDs property file -infoText={name}, author: {author} -indexFileMaxSize={indexFileMaxSize}\n -language1IndexNumberOfSourceEntries={wordCount} -language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate -indexCharEncoding=ISO-8859-1 -dictionaryFileSeparationCharacter='\\t' -language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation -language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate -logLevel=0 -language1FilePostfix={directoryPostfix} -dictionaryCharEncoding=UTF-8 -numberOfAvailableLanguages=2 -language1IsSearchable=true -language2GenerateIndex=false -dictionaryFileMaxSize={dicMaxSize} -language2FilePostfix={language2FilePostfix} -searchListFileMaxSize=20000 -language2IsSearchable=false -fileEncodingFormat=plain_format1 -language1HasSeparateDictionaryFile=true -searchListCharEncoding=ISO-8859-1 -searchListFileSeparationCharacter='\t' -indexFileSeparationCharacter='\t' -language1DisplayText={sourceLang} -language2HasSeparateDictionaryFile=false -dictionaryGenerationInputCharEncoding=UTF-8 -language1GenerateIndex=true -language2DisplayText={targetLang} -language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng -""" - - -class Reader: - re_number = re.compile(r"\d+") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._tabFileNames: list[str] = [] - self._tabFileReader = None - - def open(self, dirname: str) -> None: - self._dirname = dirname - orderFileNames: list[tuple[int, str]] = [] - for fname in os.listdir(dirname): - if not fname.startswith("directory"): - continue - try: - num = self.re_number.findall(fname)[-1] - except IndexError: - pass - else: - orderFileNames.append((num, fname)) - orderFileNames.sort( - key=operator.itemgetter(0), - reverse=True, - ) - self._tabFileNames = [x[1] for x in orderFileNames] - self.nextTabFile() - - def __len__(self) -> int: - raise NotImplementedError # FIXME - - def __iter__(self) -> Iterator[EntryType]: - return self - - def __next__(self) -> EntryType: - for _ in range(10): - try: - return next(self._tabFileReader) - except StopIteration: # noqa: PERF203 - self._tabFileReader.close() - self.nextTabFile() - return None - - def nextTabFile(self) -> None: - try: - tabFileName = self._tabFileNames.pop() - except IndexError: - raise StopIteration from None - self._tabFileReader = TabfileReader(self._glos, hasInfo=False) - self._tabFileReader.open(join(self._dirname, tabFileName), newline="\n") - - def close(self) -> None: - if self._tabFileReader: - try: - self._tabFileReader.close() - except Exception: - pass # noqa: S110 - self._tabFileReader = None - self._tabFileNames = [] - - -class Writer: - def __init__(self, glos: 
GlossaryType) -> None: - self._glos = glos - self.linesPerDirectoryFile = 500 # 200 - self.indexFileMaxSize = 32722 # 30000 - self.directoryPostfix = "" - self.indexPostfix = "" - self._dirname = "" - # looks like we need to remove tabs, because app gives error - # but based on the java code, all punctuations should be removed - # as well, including '|' - self.re_punc = re.compile( - r"""[!"$§%&/()=?´`\\{}\[\]^°+*~#'\-_.:,;<>@|]*""", # noqa: RUF001 - ) - self.re_spaces = re.compile(" +") - self.re_tabs = re.compile("\t+") - - def normateWord(self, word: str) -> str: - word = word.strip() - word = self.re_punc.sub("", word) - word = self.re_spaces.sub(" ", word) - word = self.re_tabs.sub(" ", word) - word = word.lower() - return word # noqa: RET504 - - def writeProbs(self) -> None: - glos = self._glos - probsPath = join( - self._dirname, - "DictionaryForMIDs.properties", - ) - with open(probsPath, mode="w", newline="\n", encoding="utf-8") as fileObj: - fileObj.write( - PROP_TEMPLATE.format( - name=glos.getInfo("name"), - author=glos.author, - indexFileMaxSize=self.indexFileMaxSize, - wordCount=self.wordCount, - directoryPostfix=self.directoryPostfix, - dicMaxSize=self.dicMaxSize + 1, - language2FilePostfix="fa", # FIXME - sourceLang=glos.sourceLangName, - targetLang=glos.targetLangName, - ), - ) - - def nextIndex(self) -> None: - try: - self.indexFp.close() - except AttributeError: - self.indexIndex = 0 - - self.indexIndex += 1 - fname = f"index{self.indexPostfix}{self.indexIndex}.csv" - fpath = join(self._dirname, fname) - self.indexFp = open(fpath, mode="w", encoding="utf-8", newline="\n") - - def finish(self) -> None: - pass - - def open(self, dirname: str) -> None: - self._dirname = dirname - if not os.path.isdir(dirname): - os.mkdir(dirname) - - def write(self) -> Generator[None, EntryType, None]: - self.nextIndex() - - dicMaxSize = 0 - indexData: list[tuple[str, int, int]] = [] - - def writeBucket(dicIndex: int, entryList: list[EntryType]) -> None: - nonlocal dicMaxSize - log.debug( - f"{dicIndex=}, {len(entryList)=}, {dicMaxSize=}", - ) - dicFp = open( - join( - self._dirname, - f"directory{self.directoryPostfix}{dicIndex + 1}.csv", - ), - mode="w", - encoding="utf-8", - newline="\n", - ) - for entry in entryList: - word = entry.s_word - n_word = self.normateWord(word) - defi = entry.defi - dicLine = word + "\t" + defi + "\n" - dicPos = dicFp.tell() - dicFp.write(dicLine) - indexData.append((n_word, dicIndex + 1, dicPos)) - - dicMaxSize = max(dicMaxSize, dicFp.tell()) - dicFp.close() - - bucketSize = self.linesPerDirectoryFile - wordCount = 0 - dicIndex = 0 - entryList: list[EntryType] = [] # aka bucket - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # FIXME - continue - wordCount += 1 - entryList.append(entry) - if len(entryList) >= bucketSize: - writeBucket(dicIndex, entryList) - dicIndex += 1 - entryList = [] - - if entryList: - writeBucket(dicIndex, entryList) - entryList = [] - - self.dicMaxSize = dicMaxSize - self.wordCount = wordCount - - langSearchListFp = open( - join( - self._dirname, - f"searchlist{self.directoryPostfix}.csv", - ), - mode="w", - newline="\n", - encoding="utf-8", - ) - - langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n") - - for word, dicIndex, dicPos in indexData: - indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n" - if (self.indexFp.tell() + len(indexLine)) > self.indexFileMaxSize - 10: - self.nextIndex() - langSearchListFp.write(f"{word}\t{self.indexIndex}\n") - self.indexFp.write(indexLine) - - 
self.indexFp.close() - langSearchListFp.close() - - self.writeProbs() diff --git a/pyglossary/plugins/dicformids/reader.py b/pyglossary/plugins/dicformids/reader.py new file mode 100644 index 000000000..9ae2bd1a8 --- /dev/null +++ b/pyglossary/plugins/dicformids/reader.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +from __future__ import annotations + +import operator +import os +import re +from os.path import join +from typing import TYPE_CHECKING + +from pyglossary.plugins.tabfile import Reader as TabfileReader + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + re_number = re.compile(r"\d+") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._tabFileNames: list[str] = [] + self._tabFileReader = None + + def open(self, dirname: str) -> None: + self._dirname = dirname + orderFileNames: list[tuple[int, str]] = [] + for fname in os.listdir(dirname): + if not fname.startswith("directory"): + continue + try: + num = self.re_number.findall(fname)[-1] + except IndexError: + pass + else: + orderFileNames.append((num, fname)) + orderFileNames.sort( + key=operator.itemgetter(0), + reverse=True, + ) + self._tabFileNames = [x[1] for x in orderFileNames] + self.nextTabFile() + + def __len__(self) -> int: + raise NotImplementedError # FIXME + + def __iter__(self) -> Iterator[EntryType]: + return self + + def __next__(self) -> EntryType: + for _ in range(10): + try: + return next(self._tabFileReader) + except StopIteration: # noqa: PERF203 + self._tabFileReader.close() + self.nextTabFile() + return None + + def nextTabFile(self) -> None: + try: + tabFileName = self._tabFileNames.pop() + except IndexError: + raise StopIteration from None + self._tabFileReader = TabfileReader(self._glos, hasInfo=False) + self._tabFileReader.open(join(self._dirname, tabFileName), newline="\n") + + def close(self) -> None: + if self._tabFileReader: + try: + self._tabFileReader.close() + except Exception: + pass # noqa: S110 + self._tabFileReader = None + self._tabFileNames = [] diff --git a/pyglossary/plugins/dicformids/writer.py b/pyglossary/plugins/dicformids/writer.py new file mode 100644 index 000000000..44dc07ebd --- /dev/null +++ b/pyglossary/plugins/dicformids/writer.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +from __future__ import annotations + +import os +import re +from os.path import join +from typing import TYPE_CHECKING + +from pyglossary.core import log + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +PROP_TEMPLATE = """#DictionaryForMIDs property file +infoText={name}, author: {author} +indexFileMaxSize={indexFileMaxSize}\n +language1IndexNumberOfSourceEntries={wordCount} +language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate +indexCharEncoding=ISO-8859-1 +dictionaryFileSeparationCharacter='\\t' +language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation +language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate +logLevel=0 +language1FilePostfix={directoryPostfix} +dictionaryCharEncoding=UTF-8 +numberOfAvailableLanguages=2 +language1IsSearchable=true +language2GenerateIndex=false +dictionaryFileMaxSize={dicMaxSize} +language2FilePostfix={language2FilePostfix} +searchListFileMaxSize=20000 +language2IsSearchable=false +fileEncodingFormat=plain_format1 
+language1HasSeparateDictionaryFile=true +searchListCharEncoding=ISO-8859-1 +searchListFileSeparationCharacter='\t' +indexFileSeparationCharacter='\t' +language1DisplayText={sourceLang} +language2HasSeparateDictionaryFile=false +dictionaryGenerationInputCharEncoding=UTF-8 +language1GenerateIndex=true +language2DisplayText={targetLang} +language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng +""" + + +class Writer: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.linesPerDirectoryFile = 500 # 200 + self.indexFileMaxSize = 32722 # 30000 + self.directoryPostfix = "" + self.indexPostfix = "" + self._dirname = "" + # looks like we need to remove tabs, because app gives error + # but based on the java code, all punctuations should be removed + # as well, including '|' + self.re_punc = re.compile( + r"""[!"$§%&/()=?´`\\{}\[\]^°+*~#'\-_.:,;<>@|]*""", # noqa: RUF001 + ) + self.re_spaces = re.compile(" +") + self.re_tabs = re.compile("\t+") + + def normateWord(self, word: str) -> str: + word = word.strip() + word = self.re_punc.sub("", word) + word = self.re_spaces.sub(" ", word) + word = self.re_tabs.sub(" ", word) + word = word.lower() + return word # noqa: RET504 + + def writeProbs(self) -> None: + glos = self._glos + probsPath = join( + self._dirname, + "DictionaryForMIDs.properties", + ) + with open(probsPath, mode="w", newline="\n", encoding="utf-8") as fileObj: + fileObj.write( + PROP_TEMPLATE.format( + name=glos.getInfo("name"), + author=glos.author, + indexFileMaxSize=self.indexFileMaxSize, + wordCount=self.wordCount, + directoryPostfix=self.directoryPostfix, + dicMaxSize=self.dicMaxSize + 1, + language2FilePostfix="fa", # FIXME + sourceLang=glos.sourceLangName, + targetLang=glos.targetLangName, + ), + ) + + def nextIndex(self) -> None: + try: + self.indexFp.close() + except AttributeError: + self.indexIndex = 0 + + self.indexIndex += 1 + fname = f"index{self.indexPostfix}{self.indexIndex}.csv" + fpath = join(self._dirname, fname) + self.indexFp = open(fpath, mode="w", encoding="utf-8", newline="\n") + + def finish(self) -> None: + pass + + def open(self, dirname: str) -> None: + self._dirname = dirname + if not os.path.isdir(dirname): + os.mkdir(dirname) + + def write(self) -> Generator[None, EntryType, None]: + self.nextIndex() + + dicMaxSize = 0 + indexData: list[tuple[str, int, int]] = [] + + def writeBucket(dicIndex: int, entryList: list[EntryType]) -> None: + nonlocal dicMaxSize + log.debug( + f"{dicIndex=}, {len(entryList)=}, {dicMaxSize=}", + ) + dicFp = open( + join( + self._dirname, + f"directory{self.directoryPostfix}{dicIndex + 1}.csv", + ), + mode="w", + encoding="utf-8", + newline="\n", + ) + for entry in entryList: + word = entry.s_word + n_word = self.normateWord(word) + defi = entry.defi + dicLine = word + "\t" + defi + "\n" + dicPos = dicFp.tell() + dicFp.write(dicLine) + indexData.append((n_word, dicIndex + 1, dicPos)) + + dicMaxSize = max(dicMaxSize, dicFp.tell()) + dicFp.close() + + bucketSize = self.linesPerDirectoryFile + wordCount = 0 + dicIndex = 0 + entryList: list[EntryType] = [] # aka bucket + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # FIXME + continue + wordCount += 1 + entryList.append(entry) + if len(entryList) >= bucketSize: + writeBucket(dicIndex, entryList) + dicIndex += 1 + entryList = [] + + if entryList: + writeBucket(dicIndex, entryList) + entryList = [] + + self.dicMaxSize = dicMaxSize + self.wordCount = wordCount + + langSearchListFp = open( + join( + 
self._dirname, + f"searchlist{self.directoryPostfix}.csv", + ), + mode="w", + newline="\n", + encoding="utf-8", + ) + + langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n") + + for word, dicIndex, dicPos in indexData: + indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n" + if (self.indexFp.tell() + len(indexLine)) > self.indexFileMaxSize - 10: + self.nextIndex() + langSearchListFp.write(f"{word}\t{self.indexIndex}\n") + self.indexFp.write(indexLine) + + self.indexFp.close() + langSearchListFp.close() + + self.writeProbs() diff --git a/pyglossary/plugins/dict_cc/__init__.py b/pyglossary/plugins/dict_cc/__init__.py index 9105a963e..c75ec3d64 100644 --- a/pyglossary/plugins/dict_cc/__init__.py +++ b/pyglossary/plugins/dict_cc/__init__.py @@ -1,20 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html -from operator import itemgetter -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Callable, Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element, T_htmlfile from pyglossary.option import Option -from pyglossary.core import log +from .reader import Reader __all__ = [ "Reader", @@ -45,192 +38,3 @@ "dict.cc dictionary - Google Play", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "select count(distinct term1)+count(distinct term2) from main_ft", - ) - return self._cur.fetchone()[0] - - @staticmethod - def makeList( - hf: T_htmlfile, - input_elements: list[Element], - processor: Callable, - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into <ol> if more than one element.""" - if not input_elements: - return - - if skip_single and len(input_elements) == 1: - hf.write(single_prefix) - processor(hf, input_elements[0]) - return - - with hf.element("ol"): - for el in input_elements: - with hf.element("li"): - processor(hf, el) - - @staticmethod - def makeGroupsList( - hf: T_htmlfile, - groups: list[tuple[str, str]], - processor: Callable[[T_htmlfile, tuple[str, str]], None], - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into <ol> if more than one element.""" - if not groups: - return - - if skip_single and len(groups) == 1: - hf.write(single_prefix) - processor(hf, groups[0]) - return - - with hf.element("ol"): - for el in groups: - with hf.element("li"): - processor(hf, el) - - def writeSense( # noqa: PLR6301 - self, - hf: T_htmlfile, - row: tuple[str, str], - ) -> None: - from lxml import etree as ET - - trans, entry_type = row - if entry_type: - with hf.element("i"): - hf.write(f"{entry_type}") # noqa: FURB183 - hf.write(ET.Element("br")) - try: - hf.write(trans + " ") - except Exception as e: - log.error(f"error in writing {trans!r}, {e}") - hf.write(repr(trans) + " ") - else: - with hf.element("big"): - with hf.element("a", href=f"bword://{trans}"): - hf.write("⏎") - - def iterRows( - self, - 
column1: str, - column2: str, - ) -> Iterator[tuple[str, str, str]]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - f"select {column1}, {column2}, entry_type from main_ft" - f" order by {column1}", - ) - for row in self._cur.fetchall(): - term1 = row[0] - term2 = row[1] - try: - term1 = html.unescape(term1) - except Exception as e: - log.error(f"html.unescape({term1!r}) -> {e}") - try: - term2 = html.unescape(term2) - except Exception as e: - log.error(f"html.unescape({term2!r}) -> {e}") - yield term1, term2, row[2] - - def parseGender(self, headword: str) -> tuple[str | None, str]: # noqa: PLR6301 - # {m} masc masculine German: maskulin - # {f} fem feminine German: feminin - # {n} neut neutral German: neutral - # { } ???? - i = headword.find(" {") - if i <= 0: - return None, headword - if len(headword) < i + 4: - return None, headword - if headword[i + 3] != "}": - return None, headword - g = headword[i + 2] - gender = None - if g == "m": - gender = "masculine" - elif g == "f": - gender = "feminine" - elif g == "n": - gender = "neutral" - else: - log.warning(f"invalid gender {g!r}") - return None, headword - headword = headword[:i] + headword[i + 4 :] - return gender, headword - - def _iterOneDirection( - self, - column1: str, - column2: str, - ) -> Iterator[EntryType]: - from io import BytesIO - from itertools import groupby - - from lxml import etree as ET - - glos = self._glos - for headwordEscaped, groupsOrig in groupby( - self.iterRows(column1, column2), - key=itemgetter(0), - ): - headword = html.unescape(headwordEscaped) - groups: list[tuple[str, str]] = [ - (term2, entry_type) for _, term2, entry_type in groupsOrig - ] - f = BytesIO() - gender, headword = self.parseGender(headword) - with ET.htmlfile(f, encoding="utf-8") as hf: - with hf.element("div"): - if gender: - with hf.element("i"): - hf.write(gender) - hf.write(ET.Element("br")) - self.makeGroupsList( - cast("T_htmlfile", hf), - groups, - self.writeSense, - ) - defi = f.getvalue().decode("utf-8") - yield glos.newEntry(headword, defi, defiFormat="h") - - def __iter__(self) -> Iterator[EntryType]: - yield from self._iterOneDirection("term1", "term2") - yield from self._iterOneDirection("term2", "term1") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/dict_cc/reader.py b/pyglossary/plugins/dict_cc/reader.py new file mode 100644 index 000000000..e6615604a --- /dev/null +++ b/pyglossary/plugins/dict_cc/reader.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from operator import itemgetter +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Callable, Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element, T_htmlfile + + +from pyglossary.core import log + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "select count(distinct 
term1)+count(distinct term2) from main_ft", + ) + return self._cur.fetchone()[0] + + @staticmethod + def makeList( + hf: T_htmlfile, + input_elements: list[Element], + processor: Callable, + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into <ol> if more than one element.""" + if not input_elements: + return + + if skip_single and len(input_elements) == 1: + hf.write(single_prefix) + processor(hf, input_elements[0]) + return + + with hf.element("ol"): + for el in input_elements: + with hf.element("li"): + processor(hf, el) + + @staticmethod + def makeGroupsList( + hf: T_htmlfile, + groups: list[tuple[str, str]], + processor: Callable[[T_htmlfile, tuple[str, str]], None], + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into <ol> if more than one element.""" + if not groups: + return + + if skip_single and len(groups) == 1: + hf.write(single_prefix) + processor(hf, groups[0]) + return + + with hf.element("ol"): + for el in groups: + with hf.element("li"): + processor(hf, el) + + def writeSense( # noqa: PLR6301 + self, + hf: T_htmlfile, + row: tuple[str, str], + ) -> None: + from lxml import etree as ET + + trans, entry_type = row + if entry_type: + with hf.element("i"): + hf.write(f"{entry_type}") # noqa: FURB183 + hf.write(ET.Element("br")) + try: + hf.write(trans + " ") + except Exception as e: + log.error(f"error in writing {trans!r}, {e}") + hf.write(repr(trans) + " ") + else: + with hf.element("big"): + with hf.element("a", href=f"bword://{trans}"): + hf.write("⏎") + + def iterRows( + self, + column1: str, + column2: str, + ) -> Iterator[tuple[str, str, str]]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + f"select {column1}, {column2}, entry_type from main_ft" + f" order by {column1}", + ) + for row in self._cur.fetchall(): + term1 = row[0] + term2 = row[1] + try: + term1 = html.unescape(term1) + except Exception as e: + log.error(f"html.unescape({term1!r}) -> {e}") + try: + term2 = html.unescape(term2) + except Exception as e: + log.error(f"html.unescape({term2!r}) -> {e}") + yield term1, term2, row[2] + + def parseGender(self, headword: str) -> tuple[str | None, str]: # noqa: PLR6301 + # {m} masc masculine German: maskulin + # {f} fem feminine German: feminin + # {n} neut neutral German: neutral + # { } ???? 
+ i = headword.find(" {") + if i <= 0: + return None, headword + if len(headword) < i + 4: + return None, headword + if headword[i + 3] != "}": + return None, headword + g = headword[i + 2] + gender = None + if g == "m": + gender = "masculine" + elif g == "f": + gender = "feminine" + elif g == "n": + gender = "neutral" + else: + log.warning(f"invalid gender {g!r}") + return None, headword + headword = headword[:i] + headword[i + 4 :] + return gender, headword + + def _iterOneDirection( + self, + column1: str, + column2: str, + ) -> Iterator[EntryType]: + from io import BytesIO + from itertools import groupby + + from lxml import etree as ET + + glos = self._glos + for headwordEscaped, groupsOrig in groupby( + self.iterRows(column1, column2), + key=itemgetter(0), + ): + headword = html.unescape(headwordEscaped) + groups: list[tuple[str, str]] = [ + (term2, entry_type) for _, term2, entry_type in groupsOrig + ] + f = BytesIO() + gender, headword = self.parseGender(headword) + with ET.htmlfile(f, encoding="utf-8") as hf: + with hf.element("div"): + if gender: + with hf.element("i"): + hf.write(gender) + hf.write(ET.Element("br")) + self.makeGroupsList( + cast("T_htmlfile", hf), + groups, + self.writeSense, + ) + defi = f.getvalue().decode("utf-8") + yield glos.newEntry(headword, defi, defiFormat="h") + + def __iter__(self) -> Iterator[EntryType]: + yield from self._iterOneDirection("term1", "term2") + yield from self._iterOneDirection("term2", "term1") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/dict_cc_split/__init__.py b/pyglossary/plugins/dict_cc_split/__init__.py index daa096949..69fbb799c 100644 --- a/pyglossary/plugins/dict_cc_split/__init__.py +++ b/pyglossary/plugins/dict_cc_split/__init__.py @@ -1,17 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option -from pyglossary.core import log +from .reader import Reader __all__ = [ "Reader", @@ -42,73 +37,3 @@ "dict.cc dictionary - Google Play", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("m") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) * 2 from main_ft") - return self._cur.fetchone()[0] - - def iterRows( - self, - column1: str, - column2: str, - ) -> Iterator[tuple[str, str, str]]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - f"select {column1}, {column2}, entry_type from main_ft" - f" order by {column1}", - ) - for row in self._cur.fetchall(): - term1 = row[0] - term2 = row[1] - try: - term1 = html.unescape(term1) - except Exception as e: - log.error(f"html.unescape({term1!r}) -> {e}") - try: - term2 = html.unescape(term2) - except Exception as e: - log.error(f"html.unescape({term2!r}) -> {e}") - yield term1, term2, row[2] - - def _iterOneDirection( - 
self, - column1: str, - column2: str, - ) -> Iterator[EntryType]: - for word, defi, entry_type in self.iterRows(column1, column2): - if entry_type: - word = f"{word} {{{entry_type}}}" # noqa: PLW2901 - yield self._glos.newEntry(word, defi, defiFormat="m") - - def __iter__(self) -> Iterator[EntryType]: - yield from self._iterOneDirection("term1", "term2") - yield from self._iterOneDirection("term2", "term1") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/dict_cc_split/reader.py b/pyglossary/plugins/dict_cc_split/reader.py new file mode 100644 index 000000000..1e5205f28 --- /dev/null +++ b/pyglossary/plugins/dict_cc_split/reader.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + +from pyglossary.core import log + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("m") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) * 2 from main_ft") + return self._cur.fetchone()[0] + + def iterRows( + self, + column1: str, + column2: str, + ) -> Iterator[tuple[str, str, str]]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + f"select {column1}, {column2}, entry_type from main_ft" + f" order by {column1}", + ) + for row in self._cur.fetchall(): + term1 = row[0] + term2 = row[1] + try: + term1 = html.unescape(term1) + except Exception as e: + log.error(f"html.unescape({term1!r}) -> {e}") + try: + term2 = html.unescape(term2) + except Exception as e: + log.error(f"html.unescape({term2!r}) -> {e}") + yield term1, term2, row[2] + + def _iterOneDirection( + self, + column1: str, + column2: str, + ) -> Iterator[EntryType]: + for word, defi, entry_type in self.iterRows(column1, column2): + if entry_type: + word = f"{word} {{{entry_type}}}" # noqa: PLW2901 + yield self._glos.newEntry(word, defi, defiFormat="m") + + def __iter__(self) -> Iterator[EntryType]: + yield from self._iterOneDirection("term1", "term2") + yield from self._iterOneDirection("term2", "term1") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/dict_org/__init__.py b/pyglossary/plugins/dict_org/__init__.py index 8331d3adb..9af2bf0b3 100644 --- a/pyglossary/plugins/dict_org/__init__.py +++ b/pyglossary/plugins/dict_org/__init__.py @@ -2,20 +2,11 @@ from __future__ import annotations -import os -import re -from os.path import isdir, splitext -from typing import TYPE_CHECKING - -from pyglossary.core import log from pyglossary.flags import DEFAULT_NO from pyglossary.option import BoolOption, Option -from pyglossary.plugin_lib.dictdlib import DictDB - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer 
__all__ = [ "Reader", @@ -51,146 +42,3 @@ "http://dict.org/bin/Dict", "The DICT Development Group", ) - - -def installToDictd(filename: str, dictzip: bool) -> None: - """Filename is without extension (neither .index or .dict or .dict.dz).""" - import shutil - import subprocess - - targetDir = "/usr/share/dictd/" - if filename.startswith(targetDir): - return - - if not isdir(targetDir): - log.warning(f"Directory {targetDir!r} does not exist, skipping install") - return - - log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}") - - if dictzip and os.path.isfile(filename + ".dict.dz"): - dictExt = ".dict.dz" - elif os.path.isfile(filename + ".dict"): - dictExt = ".dict" - else: - log.error(f"No .dict file, could not install dictd file {filename!r}") - return - - if not filename.startswith(targetDir): - shutil.copy(filename + ".index", targetDir) - shutil.copy(filename + dictExt, targetDir) - - # update /var/lib/dictd/db.list - if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0: - log.error( - "failed to update /var/lib/dictd/db.list file" - ", try manually running: sudo /usr/sbin/dictdconfig -w", - ) - - log.info("don't forget to restart dictd server") - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._dictdb: DictDB | None = None - - # regular expression patterns used to prettify definition text - self._re_newline_in_braces = re.compile( - r"\{(?P<left>.*?)\n(?P<right>.*?)?\}", - ) - self._re_words_in_braces = re.compile( - r"\{(?P<word>.+?)\}", - ) - - def open(self, filename: str) -> None: - filename = filename.removesuffix(".index") - self._filename = filename - self._dictdb = DictDB(filename, "read", 1) - - def close(self) -> None: - if self._dictdb is not None: - self._dictdb.close() - # self._dictdb.finish() - self._dictdb = None - - def prettifyDefinitionText(self, defi: str) -> str: - # Handle words in {} - # First, we remove any \n in {} pairs - defi = self._re_newline_in_braces.sub(r"{\g<left>\g<right>}", defi) - - # Then, replace any {words} into <a href="bword://words">words</a>, - # so it can be rendered as link correctly - defi = self._re_words_in_braces.sub( - r'<a href="bword://\g<word>">\g<word></a>', - defi, - ) - - # Use <br /> so it can be rendered as newline correctly - return defi.replace("\n", "<br />") - - def __len__(self) -> int: - if self._dictdb is None: - return 0 - return len(self._dictdb) - - def __iter__(self) -> Iterator[EntryType]: - if self._dictdb is None: - raise RuntimeError("iterating over a reader while it's not open") - dictdb = self._dictdb - for word in dictdb.getDefList(): - b_defi = b"\n\n<hr>\n\n".join(dictdb.getDef(word)) - try: - defi = b_defi.decode("utf_8", "ignore") - defi = self.prettifyDefinitionText(defi) - except Exception as e: - log.error(f"{b_defi = }") - raise e - yield self._glos.newEntry(word, defi) - - -class Writer: - _dictzip: bool = False - _install: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._dictdb: DictDB | None = None - - def finish(self) -> None: - from pyglossary.os_utils import runDictzip - - if self._dictdb is None: - raise RuntimeError("self._dictdb is None") - - self._dictdb.finish(dosort=True) - if self._dictzip: - runDictzip(f"{self._filename}.dict") - if self._install: - installToDictd( - self._filename, - self._dictzip, - ) - self._filename = "" - - def open(self, filename: str) -> None: - filename_nox, ext = splitext(filename) - if ext.lower() == 
".index": - filename = filename_nox - self._dictdb = DictDB(filename, "write", 1) - self._filename = filename - - def write(self) -> Generator[None, EntryType, None]: - dictdb = self._dictdb - if dictdb is None: - raise RuntimeError("self._dictdb is None") - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # does dictd support resources? and how? FIXME - continue - dictdb.addEntry(entry.defi, entry.l_word) diff --git a/pyglossary/plugins/dict_org/reader.py b/pyglossary/plugins/dict_org/reader.py new file mode 100644 index 000000000..71a47fc13 --- /dev/null +++ b/pyglossary/plugins/dict_org/reader.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.plugin_lib.dictdlib import DictDB + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._dictdb: DictDB | None = None + + # regular expression patterns used to prettify definition text + self._re_newline_in_braces = re.compile( + r"\{(?P<left>.*?)\n(?P<right>.*?)?\}", + ) + self._re_words_in_braces = re.compile( + r"\{(?P<word>.+?)\}", + ) + + def open(self, filename: str) -> None: + filename = filename.removesuffix(".index") + self._filename = filename + self._dictdb = DictDB(filename, "read", 1) + + def close(self) -> None: + if self._dictdb is not None: + self._dictdb.close() + # self._dictdb.finish() + self._dictdb = None + + def prettifyDefinitionText(self, defi: str) -> str: + # Handle words in {} + # First, we remove any \n in {} pairs + defi = self._re_newline_in_braces.sub(r"{\g<left>\g<right>}", defi) + + # Then, replace any {words} into <a href="bword://words">words</a>, + # so it can be rendered as link correctly + defi = self._re_words_in_braces.sub( + r'<a href="bword://\g<word>">\g<word></a>', + defi, + ) + + # Use <br /> so it can be rendered as newline correctly + return defi.replace("\n", "<br />") + + def __len__(self) -> int: + if self._dictdb is None: + return 0 + return len(self._dictdb) + + def __iter__(self) -> Iterator[EntryType]: + if self._dictdb is None: + raise RuntimeError("iterating over a reader while it's not open") + dictdb = self._dictdb + for word in dictdb.getDefList(): + b_defi = b"\n\n<hr>\n\n".join(dictdb.getDef(word)) + try: + defi = b_defi.decode("utf_8", "ignore") + defi = self.prettifyDefinitionText(defi) + except Exception as e: + log.error(f"{b_defi = }") + raise e + yield self._glos.newEntry(word, defi) diff --git a/pyglossary/plugins/dict_org/writer.py b/pyglossary/plugins/dict_org/writer.py new file mode 100644 index 000000000..5cc2762e7 --- /dev/null +++ b/pyglossary/plugins/dict_org/writer.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from os.path import splitext +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.plugin_lib.dictdlib import DictDB + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def installToDictd(filename: str, dictzip: bool) -> None: + """Filename is without extension (neither .index or .dict or .dict.dz).""" + import shutil + import subprocess + from os.path import isdir, isfile + + targetDir = "/usr/share/dictd/" + if filename.startswith(targetDir): + return + + if not 
isdir(targetDir): + log.warning(f"Directory {targetDir!r} does not exist, skipping install") + return + + log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}") + + if dictzip and isfile(filename + ".dict.dz"): + dictExt = ".dict.dz" + elif isfile(filename + ".dict"): + dictExt = ".dict" + else: + log.error(f"No .dict file, could not install dictd file {filename!r}") + return + + if not filename.startswith(targetDir): + shutil.copy(filename + ".index", targetDir) + shutil.copy(filename + dictExt, targetDir) + + # update /var/lib/dictd/db.list + if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0: + log.error( + "failed to update /var/lib/dictd/db.list file" + ", try manually running: sudo /usr/sbin/dictdconfig -w", + ) + + log.info("don't forget to restart dictd server") + + +class Writer: + _dictzip: bool = False + _install: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._dictdb: DictDB | None = None + + def finish(self) -> None: + from pyglossary.os_utils import runDictzip + + if self._dictdb is None: + raise RuntimeError("self._dictdb is None") + + self._dictdb.finish(dosort=True) + if self._dictzip: + runDictzip(f"{self._filename}.dict") + if self._install: + installToDictd( + self._filename, + self._dictzip, + ) + self._filename = "" + + def open(self, filename: str) -> None: + filename_nox, ext = splitext(filename) + if ext.lower() == ".index": + filename = filename_nox + self._dictdb = DictDB(filename, "write", 1) + self._filename = filename + + def write(self) -> Generator[None, EntryType, None]: + dictdb = self._dictdb + if dictdb is None: + raise RuntimeError("self._dictdb is None") + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # does dictd support resources? and how? 
FIXME + continue + dictdb.addEntry(entry.defi, entry.l_word) diff --git a/pyglossary/plugins/dict_org_source/__init__.py b/pyglossary/plugins/dict_org_source/__init__.py index 5c899f1fe..9a9d63233 100644 --- a/pyglossary/plugins/dict_org_source/__init__.py +++ b/pyglossary/plugins/dict_org_source/__init__.py @@ -1,14 +1,9 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING - from pyglossary.option import BoolOption, Option -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -41,36 +36,3 @@ optionsProp: dict[str, Option] = { "remove_html_all": BoolOption(comment="Remove all HTML tags"), } - - -class Writer: - _remove_html_all: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - - def finish(self) -> None: - self._filename = "" - - def open(self, filename: str) -> None: - self._filename = filename - if self._remove_html_all: - self._glos.removeHtmlTagsAll() - # TODO: add another bool flag to only remove html tags that are not - # supported by GtkTextView - - @staticmethod - def _defiEscapeFunc(defi: str) -> str: - return defi.replace("\r", "") - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.text_writer import writeTxt - - yield from writeTxt( - self._glos, - entryFmt=":{word}:{defi}\n", - filename=self._filename, - defiEscapeFunc=self._defiEscapeFunc, - ext=".dtxt", - ) diff --git a/pyglossary/plugins/dict_org_source/writer.py b/pyglossary/plugins/dict_org_source/writer.py new file mode 100644 index 000000000..1548f5975 --- /dev/null +++ b/pyglossary/plugins/dict_org_source/writer.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + _remove_html_all: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + + def finish(self) -> None: + self._filename = "" + + def open(self, filename: str) -> None: + self._filename = filename + if self._remove_html_all: + self._glos.removeHtmlTagsAll() + # TODO: add another bool flag to only remove html tags that are not + # supported by GtkTextView + + @staticmethod + def _defiEscapeFunc(defi: str) -> str: + return defi.replace("\r", "") + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.text_writer import writeTxt + + yield from writeTxt( + self._glos, + entryFmt=":{word}:{defi}\n", + filename=self._filename, + defiEscapeFunc=self._defiEscapeFunc, + ext=".dtxt", + ) diff --git a/pyglossary/plugins/dictunformat/__init__.py b/pyglossary/plugins/dictunformat/__init__.py index a05c55459..77e5f8233 100644 --- a/pyglossary/plugins/dictunformat/__init__.py +++ b/pyglossary/plugins/dictunformat/__init__.py @@ -1,8 +1,8 @@ from __future__ import annotations -from pyglossary.core import log from pyglossary.option import EncodingOption, Option, StrOption -from pyglossary.text_reader import TextGlossaryReader + +from .reader import Reader __all__ = [ "Reader", @@ -38,89 +38,3 @@ comment="separator for headword and alternates", ), } - - -def unescapeDefi(defi: str) -> str: - return defi - - -class Reader(TextGlossaryReader): - _headword_separator = "; " - # https://github.com/cheusov/dictd/blob/master/dictfmt/dictunformat.in#L14 - - 
@classmethod - def isInfoWord(cls, word: str) -> bool: - return word.startswith("00-database-") - - @classmethod - def fixInfoWord(cls, word: str) -> str: - return word - - def setInfo(self, word: str, defi: str) -> None: - if word == "00-database-short": - self._glos.setInfo("name", defi) - return - - if word != "00-database-info": - return - - glos = self._glos - - lastKey = "" - for line in defi.split("\n"): - if not line.startswith("##:"): - if lastKey: - glos.setInfo(word, f"{glos.getInfo(lastKey)}\n{line}") - continue - - parts = line[3:].split(":") - if len(parts) < 2: - log.error(f"unexpected line: {line}") - key = lastKey = parts[0] - value = ":".join(parts[1:]) - glos.setInfo(key, value) - - def nextBlock(self) -> tuple[str | list[str], str, None] | None: - if not self._file: - raise StopIteration - word = "" - defiLines: list[str] = [] - - while True: - line = self.readline() - if not line: - break - line = line.rstrip("\n\r") - if not line: - continue - - if not line.strip("_"): - if not word: - continue - if not defiLines: - log.warning(f"no definition/value for {word!r}") - defi = unescapeDefi("\n".join(defiLines)) - words = word.split(self._headword_separator) - return words, defi, None - - if not word: - word = line - continue - - if line == word: - continue - if line.lower() == word: - word = line - continue - - defiLines.append(line) - - if word: - defi = unescapeDefi("\n".join(defiLines)) - if word.startswith("00-database-") and defi == "unknown": - log.info(f"ignoring {word} -> {defi}") - return None - words = word.split(self._headword_separator) - return words, defi, None - - raise StopIteration diff --git a/pyglossary/plugins/dictunformat/reader.py b/pyglossary/plugins/dictunformat/reader.py new file mode 100644 index 000000000..c66a0f937 --- /dev/null +++ b/pyglossary/plugins/dictunformat/reader.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from pyglossary.core import log +from pyglossary.text_reader import TextGlossaryReader + + +def unescapeDefi(defi: str) -> str: + return defi + + +class Reader(TextGlossaryReader): + _headword_separator = "; " + # https://github.com/cheusov/dictd/blob/master/dictfmt/dictunformat.in#L14 + + @classmethod + def isInfoWord(cls, word: str) -> bool: + return word.startswith("00-database-") + + @classmethod + def fixInfoWord(cls, word: str) -> str: + return word + + def setInfo(self, word: str, defi: str) -> None: + if word == "00-database-short": + self._glos.setInfo("name", defi) + return + + if word != "00-database-info": + return + + glos = self._glos + + lastKey = "" + for line in defi.split("\n"): + if not line.startswith("##:"): + if lastKey: + glos.setInfo(word, f"{glos.getInfo(lastKey)}\n{line}") + continue + + parts = line[3:].split(":") + if len(parts) < 2: + log.error(f"unexpected line: {line}") + key = lastKey = parts[0] + value = ":".join(parts[1:]) + glos.setInfo(key, value) + + def nextBlock(self) -> tuple[str | list[str], str, None] | None: + if not self._file: + raise StopIteration + word = "" + defiLines: list[str] = [] + + while True: + line = self.readline() + if not line: + break + line = line.rstrip("\n\r") + if not line: + continue + + if not line.strip("_"): + if not word: + continue + if not defiLines: + log.warning(f"no definition/value for {word!r}") + defi = unescapeDefi("\n".join(defiLines)) + words = word.split(self._headword_separator) + return words, defi, None + + if not word: + word = line + continue + + if line == word: + continue + if line.lower() == word: + word = line + continue + + 
defiLines.append(line) + + if word: + defi = unescapeDefi("\n".join(defiLines)) + if word.startswith("00-database-") and defi == "unknown": + log.info(f"ignoring {word} -> {defi}") + return None + words = word.split(self._headword_separator) + return words, defi, None + + raise StopIteration diff --git a/pyglossary/plugins/digitalnk/__init__.py b/pyglossary/plugins/digitalnk/__init__.py index cf35cef73..08c23d4eb 100644 --- a/pyglossary/plugins/digitalnk/__init__.py +++ b/pyglossary/plugins/digitalnk/__init__.py @@ -1,16 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType from pyglossary.option import Option +from .reader import Reader + __all__ = [ "Reader", "description", @@ -40,51 +37,3 @@ "@digitalprk/dicrs", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("m") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) from dictionary") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "select word, definition from dictionary order by word", - ) - # iteration over self._cur stops after one entry - # and self._cur.fetchone() returns None - # no idea why! 
- # https://github.com/ilius/pyglossary/issues/282 - # for row in self._cur: - for row in self._cur.fetchall(): - word = html.unescape(row[0]) - definition = row[1] - yield self._glos.newEntry(word, definition, defiFormat="m") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/digitalnk/reader.py b/pyglossary/plugins/digitalnk/reader.py new file mode 100644 index 000000000..5eb2ba373 --- /dev/null +++ b/pyglossary/plugins/digitalnk/reader.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("m") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) from dictionary") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "select word, definition from dictionary order by word", + ) + # iteration over self._cur stops after one entry + # and self._cur.fetchone() returns None + # no idea why! + # https://github.com/ilius/pyglossary/issues/282 + # for row in self._cur: + for row in self._cur.fetchall(): + word = html.unescape(row[0]) + definition = row[1] + yield self._glos.newEntry(word, definition, defiFormat="m") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/dikt_json/__init__.py b/pyglossary/plugins/dikt_json/__init__.py index e47315cd5..39eeecf74 100644 --- a/pyglossary/plugins/dikt_json/__init__.py +++ b/pyglossary/plugins/dikt_json/__init__.py @@ -4,23 +4,13 @@ from __future__ import annotations -import re -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - # compressionOpen, - stdCompressions, -) from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -55,65 +45,3 @@ comment="add headwords title to beginning of definition", ), } - - -class Writer: - _encoding: str = "utf-8" - _enable_info: bool = True - _resources: bool = True - _word_title: bool = False - - compressions = stdCompressions - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = None - glos.preventDuplicateWords() - - def open(self, filename: str) -> None: - self._filename = filename - - def finish(self) -> None: - self._filename = None - - def write(self) -> Generator[None, EntryType, None]: - from json import dumps - - from pyglossary.text_writer import writeTxt - - glos = self._glos - encoding = self._encoding - enable_info = self._enable_info - resources = self._resources - - ensure_ascii = encoding == "ascii" - - def escape(st: str) -> str: - # 
remove styling from HTML tags - st2 = re.sub(r' style="[^"]*"', "", st) - st2 = re.sub(r' class="[^"]*"', "", st2) - st2 = re.sub(r"<font [^>]*>", "", st2) - st2 = st2.replace("</font>", "") - st2 = re.sub(r"\n", "", st2) - st2 = st2.replace("<div></div>", "") - st2 = st2.replace("<span></span>", "") - # fix russian dictionary issues, - # such as hyphenation in word (e.g. абб{[']}а{[/']}т) - st2 = re.sub(r"\{\['\]\}", "", st2) - st2 = re.sub(r"\{\[/'\]\}", "", st2) - return dumps(st2, ensure_ascii=ensure_ascii) - - yield from writeTxt( - glos, - entryFmt="\t{word}: {defi},\n", - filename=self._filename, - encoding=encoding, - writeInfo=enable_info, - wordEscapeFunc=escape, - defiEscapeFunc=escape, - ext=".json", - head="{\n", - tail='\t"": ""\n}', - resources=resources, - word_title=self._word_title, - ) diff --git a/pyglossary/plugins/dikt_json/writer.py b/pyglossary/plugins/dikt_json/writer.py new file mode 100644 index 000000000..e7827ae4b --- /dev/null +++ b/pyglossary/plugins/dikt_json/writer.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +# from https://github.com/maxim-saplin/pyglossary + +from __future__ import annotations + +import re +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Writer: + _encoding: str = "utf-8" + _enable_info: bool = True + _resources: bool = True + _word_title: bool = False + + compressions = stdCompressions + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = None + glos.preventDuplicateWords() + + def open(self, filename: str) -> None: + self._filename = filename + + def finish(self) -> None: + self._filename = None + + def write(self) -> Generator[None, EntryType, None]: + from json import dumps + + from pyglossary.text_writer import writeTxt + + glos = self._glos + encoding = self._encoding + enable_info = self._enable_info + resources = self._resources + + ensure_ascii = encoding == "ascii" + + def escape(st: str) -> str: + # remove styling from HTML tags + st2 = re.sub(r' style="[^"]*"', "", st) + st2 = re.sub(r' class="[^"]*"', "", st2) + st2 = re.sub(r"<font [^>]*>", "", st2) + st2 = st2.replace("</font>", "") + st2 = re.sub(r"\n", "", st2) + st2 = st2.replace("<div></div>", "") + st2 = st2.replace("<span></span>", "") + # fix russian dictionary issues, + # such as hyphenation in word (e.g. 
абб{[']}а{[/']}т)
+			st2 = re.sub(r"\{\['\]\}", "", st2)
+			st2 = re.sub(r"\{\[/'\]\}", "", st2)
+			return dumps(st2, ensure_ascii=ensure_ascii)
+
+		yield from writeTxt(
+			glos,
+			entryFmt="\t{word}: {defi},\n",
+			filename=self._filename,
+			encoding=encoding,
+			writeInfo=enable_info,
+			wordEscapeFunc=escape,
+			defiEscapeFunc=escape,
+			ext=".json",
+			head="{\n",
+			tail='\t"": ""\n}',
+			resources=resources,
+			word_title=self._word_title,
+		)
diff --git a/pyglossary/plugins/ebook_epub2/__init__.py b/pyglossary/plugins/ebook_epub2/__init__.py
index 8bf34801b..baabf0036 100644
--- a/pyglossary/plugins/ebook_epub2/__init__.py
+++ b/pyglossary/plugins/ebook_epub2/__init__.py
@@ -1,27 +1,7 @@
 # -*- coding: utf-8 -*-
-# The MIT License (MIT)
-# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
-# Copyright © 2016-2019 Saeed Rasooli <saeed.gnu@gmail.com>
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-from __future__ import annotations -from typing import TYPE_CHECKING, Any +from __future__ import annotations -from pyglossary.ebook_base import EbookWriter from pyglossary.flags import ALWAYS from pyglossary.option import ( BoolOption, @@ -30,8 +10,7 @@ StrOption, ) -if TYPE_CHECKING: - from pyglossary.glossary_types import GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -85,209 +64,3 @@ comment="Path to cover file", ), } - - -class Writer(EbookWriter): - # these class attrs are only in Epub - # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS - # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE - - MIMETYPE_CONTENTS = "application/epub+zip" - CONTAINER_XML_CONTENTS = """<?xml version="1.0" encoding="UTF-8" ?> -<container version="1.0" - xmlns="urn:oasis:names:tc:opendocument:xmlns:container"> - <rootfiles> - <rootfile full-path="OEBPS/content.opf" - media-type="application/oebps-package+xml"/> - </rootfiles> -</container>""" - - NCX_TEMPLATE = """<?xml version="1.0" encoding="utf-8" ?> -<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" - "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd"> -<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> - <head> - <meta name="dtb:uid" content="{identifier}" /> - <meta name="dtb:depth" content="1" /> - <meta name="dtb:totalPageCount" content="0" /> - <meta name="dtb:maxPageNumber" content="0" /> - </head> - <docTitle> - <text>{title}</text> - </docTitle> - <navMap> -{ncx_items} - </navMap> -</ncx>""" - - NCX_NAVPOINT_TEMPLATE = """\t<navPoint id="n{index:06d}" playOrder="{index:d}"> - <navLabel> - <text>{text}</text> - </navLabel> - <content src="{src}" /> - </navPoint>""" - - CSS_CONTENTS = b"""@charset "UTF-8"; -body { - margin: 10px 25px 10px 25px; -} -h1 { - font-size: 200%; -} -h2 { - font-size: 150%; -} -p { - margin-left: 0em; - margin-right: 0em; - margin-top: 0em; - margin-bottom: 0em; - line-height: 2em; - text-align: justify; -} -a, a:focus, a:active, a:visited { - color: black; - text-decoration: none; -} -body.indexPage {} -h1.indexTitle {} -p.indexGroups { - font-size: 150%; -} -span.indexGroup {} -body.groupPage {} -h1.groupTitle {} -div.groupNavigation {} -span.groupHeadword {} -div.groupEntry { - margin-top: 0; - margin-bottom: 1em; -} -h2.groupHeadword { - margin-left: 5%; -} -p.groupDefinition { - margin-left: 10%; - margin-right: 10%; -} -""" - - GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8" standalone="no"?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" - "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> - <head> - <title>{title}</title> - <link rel="stylesheet" type="text/css" href="style.css" /> - </head> - <body id="groupPage" class="groupPage"> - <h1 class="groupTitle">{group_title}</h1> - <div class="groupNavigation"> - <a href="{previous_link}">[ Previous ]</a> -{index_link} - <a href="{next_link}">[ Next ]</a> - </div> -{group_contents} - </body> -</html>""" - GROUP_XHTML_INDEX_LINK = '\t\t<a href="index.xhtml">[ Index ]</a>' - - GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t<div class="groupEntry"> - <h2 class="groupHeadword">{headword}</h2> - <p class="groupDefinition">{definition}</p> - </div>""" - - OPF_TEMPLATE = """<?xml version="1.0" encoding="utf-8" ?> -<package xmlns="http://www.idpf.org/2007/opf" version="2.0" - unique-identifier="uid"> - <metadata xmlns:opf="http://www.idpf.org/2007/opf" - xmlns:dc="http://purl.org/dc/elements/1.1/"> - <dc:identifier id="uid" opf:scheme="uuid">{identifier}</dc:identifier> - 
<dc:language>{sourceLang}</dc:language> - <dc:title>{title}</dc:title> - <dc:creator opf:role="aut">{creator}</dc:creator> - <dc:rights>{copyright}</dc:rights> - <dc:date opf:event="creation">{creationDate}</dc:date> - {cover} - </metadata> - <manifest> -{manifest} - </manifest> - <spine toc="toc.ncx"> -{spine} - </spine> -</package>""" - - COVER_TEMPLATE = '<meta name="cover" content="{cover}" />' - - def __init__(self, glos: GlossaryType) -> None: - import uuid - - EbookWriter.__init__( - self, - glos, - ) - glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) - - @classmethod - def cls_get_prefix( - cls: type[EbookWriter], - options: dict[str, Any], - word: str, - ) -> str: - if not word: - return "" - length = options.get("group_by_prefix_length", cls._group_by_prefix_length) - prefix = word[:length].lower() - if prefix[0] < "a": - return "SPECIAL" - return prefix - - def get_prefix(self, word: str) -> str: - if not word: - return "" - length = self._group_by_prefix_length - prefix = word[:length].lower() - if prefix[0] < "a": - return "SPECIAL" - return prefix - - def write_ncx(self, group_labels: list[str]) -> None: - """ - write_ncx - only for epub. - """ - ncx_items: list[str] = [] - index = 1 - if self._include_index_page: - ncx_items.append( - self.NCX_NAVPOINT_TEMPLATE.format( - index=index, - text="Index", - src="index.xhtml", - ), - ) - index += 1 - for group_label in group_labels: - ncx_items.append( - self.NCX_NAVPOINT_TEMPLATE.format( - index=index, - text=group_label, - src=self.get_group_xhtml_file_name_from_index(index), - ), - ) - index += 1 - ncx_items_unicode = "\n".join(ncx_items) - ncx_contents = self.NCX_TEMPLATE.format( - identifier=self._glos.getInfo("uuid"), - title=self._glos.getInfo("name"), - ncx_items=ncx_items_unicode, - ).encode("utf-8") - self.add_file_manifest( - "OEBPS/toc.ncx", - "toc.ncx", - ncx_contents, - "application/x-dtbncx+xml", - ) - - # inherits write from EbookWriter diff --git a/pyglossary/plugins/ebook_epub2/writer.py b/pyglossary/plugins/ebook_epub2/writer.py new file mode 100644 index 000000000..eba888c33 --- /dev/null +++ b/pyglossary/plugins/ebook_epub2/writer.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) +# Copyright © 2016-2019 Saeed Rasooli <saeed.gnu@gmail.com> +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from pyglossary.ebook_base import EbookWriter + +if TYPE_CHECKING: + from pyglossary.glossary_types import GlossaryType + + +class Writer(EbookWriter): + # these class attrs are only in Epub + # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS + # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE + + MIMETYPE_CONTENTS = "application/epub+zip" + CONTAINER_XML_CONTENTS = """<?xml version="1.0" encoding="UTF-8" ?> +<container version="1.0" + xmlns="urn:oasis:names:tc:opendocument:xmlns:container"> + <rootfiles> + <rootfile full-path="OEBPS/content.opf" + media-type="application/oebps-package+xml"/> + </rootfiles> +</container>""" + + NCX_TEMPLATE = """<?xml version="1.0" encoding="utf-8" ?> +<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" + "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd"> +<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> + <head> + <meta name="dtb:uid" content="{identifier}" /> + <meta name="dtb:depth" content="1" /> + <meta name="dtb:totalPageCount" content="0" /> + <meta name="dtb:maxPageNumber" content="0" /> + </head> + <docTitle> + <text>{title}</text> + </docTitle> + <navMap> +{ncx_items} + </navMap> +</ncx>""" + + NCX_NAVPOINT_TEMPLATE = """\t<navPoint id="n{index:06d}" playOrder="{index:d}"> + <navLabel> + <text>{text}</text> + </navLabel> + <content src="{src}" /> + </navPoint>""" + + CSS_CONTENTS = b"""@charset "UTF-8"; +body { + margin: 10px 25px 10px 25px; +} +h1 { + font-size: 200%; +} +h2 { + font-size: 150%; +} +p { + margin-left: 0em; + margin-right: 0em; + margin-top: 0em; + margin-bottom: 0em; + line-height: 2em; + text-align: justify; +} +a, a:focus, a:active, a:visited { + color: black; + text-decoration: none; +} +body.indexPage {} +h1.indexTitle {} +p.indexGroups { + font-size: 150%; +} +span.indexGroup {} +body.groupPage {} +h1.groupTitle {} +div.groupNavigation {} +span.groupHeadword {} +div.groupEntry { + margin-top: 0; + margin-bottom: 1em; +} +h2.groupHeadword { + margin-left: 5%; +} +p.groupDefinition { + margin-left: 10%; + margin-right: 10%; +} +""" + + GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8" standalone="no"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" + "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> + <head> + <title>{title}</title> + <link rel="stylesheet" type="text/css" href="style.css" /> + </head> + <body id="groupPage" class="groupPage"> + <h1 class="groupTitle">{group_title}</h1> + <div class="groupNavigation"> + <a href="{previous_link}">[ Previous ]</a> +{index_link} + <a href="{next_link}">[ Next ]</a> + </div> +{group_contents} + </body> +</html>""" + GROUP_XHTML_INDEX_LINK = '\t\t<a href="index.xhtml">[ Index ]</a>' + + GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t<div class="groupEntry"> + <h2 class="groupHeadword">{headword}</h2> + <p class="groupDefinition">{definition}</p> + </div>""" + + OPF_TEMPLATE = """<?xml version="1.0" encoding="utf-8" ?> +<package xmlns="http://www.idpf.org/2007/opf" version="2.0" + unique-identifier="uid"> + <metadata xmlns:opf="http://www.idpf.org/2007/opf" + xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:identifier id="uid" opf:scheme="uuid">{identifier}</dc:identifier> + <dc:language>{sourceLang}</dc:language> + <dc:title>{title}</dc:title> + <dc:creator opf:role="aut">{creator}</dc:creator> + <dc:rights>{copyright}</dc:rights> + <dc:date opf:event="creation">{creationDate}</dc:date> + {cover} + </metadata> + <manifest> +{manifest} + 
</manifest> + <spine toc="toc.ncx"> +{spine} + </spine> +</package>""" + + COVER_TEMPLATE = '<meta name="cover" content="{cover}" />' + + def __init__(self, glos: GlossaryType) -> None: + import uuid + + EbookWriter.__init__( + self, + glos, + ) + glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) + + @classmethod + def cls_get_prefix( + cls: type[EbookWriter], + options: dict[str, Any], + word: str, + ) -> str: + if not word: + return "" + length = options.get("group_by_prefix_length", cls._group_by_prefix_length) + prefix = word[:length].lower() + if prefix[0] < "a": + return "SPECIAL" + return prefix + + def get_prefix(self, word: str) -> str: + if not word: + return "" + length = self._group_by_prefix_length + prefix = word[:length].lower() + if prefix[0] < "a": + return "SPECIAL" + return prefix + + def write_ncx(self, group_labels: list[str]) -> None: + """ + write_ncx + only for epub. + """ + ncx_items: list[str] = [] + index = 1 + if self._include_index_page: + ncx_items.append( + self.NCX_NAVPOINT_TEMPLATE.format( + index=index, + text="Index", + src="index.xhtml", + ), + ) + index += 1 + for group_label in group_labels: + ncx_items.append( + self.NCX_NAVPOINT_TEMPLATE.format( + index=index, + text=group_label, + src=self.get_group_xhtml_file_name_from_index(index), + ), + ) + index += 1 + ncx_items_unicode = "\n".join(ncx_items) + ncx_contents = self.NCX_TEMPLATE.format( + identifier=self._glos.getInfo("uuid"), + title=self._glos.getInfo("name"), + ncx_items=ncx_items_unicode, + ).encode("utf-8") + self.add_file_manifest( + "OEBPS/toc.ncx", + "toc.ncx", + ncx_contents, + "application/x-dtbncx+xml", + ) + + # inherits write from EbookWriter diff --git a/pyglossary/plugins/ebook_kobo/__init__.py b/pyglossary/plugins/ebook_kobo/__init__.py index 02a108f88..cbd9b6f90 100644 --- a/pyglossary/plugins/ebook_kobo/__init__.py +++ b/pyglossary/plugins/ebook_kobo/__init__.py @@ -1,41 +1,14 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) -# Copyright © 2022 Saeed Rasooli <saeed.gnu@gmail.com> -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
+ from __future__ import annotations -import re -import unicodedata -from gzip import compress, decompress -from operator import itemgetter -from pathlib import Path -from pickle import dumps, loads from typing import TYPE_CHECKING -from pyglossary import core -from pyglossary.core import exc_note, log, pip from pyglossary.flags import NEVER -from pyglossary.os_utils import indir -if TYPE_CHECKING: - from collections.abc import Generator +from .writer import Writer - from pyglossary.glossary_types import EntryType, GlossaryType +if TYPE_CHECKING: from pyglossary.option import Option __all__ = [ @@ -75,199 +48,3 @@ # Penelope option: marisa_index_size=1000000 - - -def is_cyrillic_char(c: str) -> bool: - # U+0400 - U+04FF: Cyrillic - # U+0500 - U+052F: Cyrillic Supplement - if "\u0400" <= c <= "\u052f": - return True - - # U+2DE0 - U+2DFF: Cyrillic Extended-A - if "\u2de0" <= c <= "\u2dff": - return True - - # U+A640 - U+A69F: Cyrillic Extended-B - if "\ua640" <= c <= "\ua69f": - return True - - # U+1C80 - U+1C8F: Cyrillic Extended-C - if "\u1c80" <= c <= "\u1c8f": - return True - - # U+FE2E, U+FE2F: Combining Half Marks - # U+1D2B, U+1D78: Phonetic Extensions - return c in {"\ufe2e", "\ufe2f", "\u1d2b", "\u1d78"} - - -def fixFilename(fname: str) -> str: - return Path(fname.replace("/", "2F").replace("\\", "5C")).name - - -class Writer: - WORDS_FILE_NAME = "words" - - depends = { - "marisa_trie": "marisa-trie", - } - - @staticmethod - def stripFullHtmlError(entry: EntryType, error: str) -> None: - log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._words: list[str] = [] - self._img_pattern = re.compile( - '<img src="([^<>"]*?)"( [^<>]*?)?>', - re.DOTALL, - ) - # img tag has no closing - glos.stripFullHtml(errorHandler=self.stripFullHtmlError) - - def get_prefix(self, word: str) -> str: # noqa: PLR6301 - if not word: - return "11" - wo = word[:2].strip().lower() - if not wo: - return "11" - if wo[0] == "\x00": - return "11" - if len(wo) > 1 and wo[1] == "\x00": - wo = wo[:1] - if is_cyrillic_char(wo[0]): - return wo - # if either of the first 2 chars are not unicode letters, return "11" - for c in wo: - if not unicodedata.category(c).startswith("L"): - return "11" - return wo.ljust(2, "a") - - def fix_defi(self, defi: str) -> str: - # @pgaskin on #219: Kobo supports images in dictionaries, - # but these have a lot of gotchas - # (see https://pgaskin.net/dictutil/dicthtml/format.html). 
- # Basically, The best way to do it is to encode the images as a - # base64 data URL after shrinking it and making it grayscale - # (if it's JPG, this is as simple as only keeping the Y channel) - - # for now we just skip data entries and remove '<img' tags - return self._img_pattern.sub("[Image: \\1]", defi) - - def write_groups(self) -> Generator[None, EntryType, None]: - import gzip - - dataEntryCount = 0 - - htmlHeader = '<?xml version="1.0" encoding="utf-8"?><html>\n' - - groupCounter = 0 - htmlContents = htmlHeader - - def writeGroup(lastPrefix: str) -> None: - nonlocal htmlContents - group_fname = fixFilename(lastPrefix) - htmlContents += "</html>" - core.trace( - log, - f"writeGroup: {lastPrefix!r}, " - f"{group_fname!r}, count={groupCounter}", - ) - with gzip.open(group_fname + ".html", mode="wb") as gzipFile: - gzipFile.write(htmlContents.encode("utf-8")) - htmlContents = htmlHeader - - allWords: list[str] = [] - # TODO: switch to SQLite, like StarDict writer - data: list[tuple[str, bytes]] = [] - - while True: - entry = yield - if entry is None: - break - if entry.isData(): - dataEntryCount += 1 - continue - l_word = entry.l_word - allWords += l_word - wordsByPrefix: dict[str, list[str]] = {} - for word in l_word: - prefix = self.get_prefix(word) - if prefix in wordsByPrefix: - wordsByPrefix[prefix].append(word) - else: - wordsByPrefix[prefix] = [word] - defi = self.fix_defi(entry.defi) - mainHeadword = l_word[0] - for prefix, p_words in wordsByPrefix.items(): - headword, *variants = p_words - if headword != mainHeadword: - headword = f"{mainHeadword}, {headword}" - data.append( - ( - prefix, - compress( - dumps( - ( - headword, - variants, - defi, - ), - ), - ), - ), - ) - del entry - - log.info("Kobo: sorting entries...") - data.sort(key=itemgetter(0)) - - log.info("Kobo: writing entries...") - - lastPrefix = "" - for prefix, row in data: - headword, variants, defi = loads(decompress(row)) - if lastPrefix and prefix != lastPrefix: - writeGroup(lastPrefix) - groupCounter = 0 - lastPrefix = prefix - - htmlVariants = "".join( - f'<variant name="{v.strip().lower()}"/>' for v in variants - ) - body = f"<div><b>{headword}</b><var>{htmlVariants}</var><br/>{defi}</div>" - htmlContents += f'<w><a name="{headword}" />{body}</w>\n' - groupCounter += 1 - del data - - if groupCounter > 0: - writeGroup(lastPrefix) - - if dataEntryCount > 0: - log.warning( - f"ignored {dataEntryCount} files (data entries)" - " and replaced '<img ...' 
tags in definitions with placeholders", - ) - - self._words = allWords - - def open(self, filename: str) -> None: - try: - import marisa_trie # type: ignore # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install marisa-trie` to install") - raise - self._filename = filename - - def write(self) -> Generator[None, EntryType, None]: - with indir(self._filename, create=True): - yield from self.write_groups() - - def finish(self) -> None: - import marisa_trie - - with indir(self._filename, create=False): - trie = marisa_trie.Trie(self._words) - trie.save(self.WORDS_FILE_NAME) - self._filename = "" diff --git a/pyglossary/plugins/ebook_kobo/writer.py b/pyglossary/plugins/ebook_kobo/writer.py new file mode 100644 index 000000000..5b26aff01 --- /dev/null +++ b/pyglossary/plugins/ebook_kobo/writer.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) +# Copyright © 2022 Saeed Rasooli <saeed.gnu@gmail.com> +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +import re +import unicodedata +from gzip import compress, decompress +from operator import itemgetter +from pathlib import Path +from pickle import dumps, loads +from typing import TYPE_CHECKING + +from pyglossary import core +from pyglossary.core import exc_note, log, pip +from pyglossary.os_utils import indir + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def is_cyrillic_char(c: str) -> bool: + # U+0400 - U+04FF: Cyrillic + # U+0500 - U+052F: Cyrillic Supplement + if "\u0400" <= c <= "\u052f": + return True + + # U+2DE0 - U+2DFF: Cyrillic Extended-A + if "\u2de0" <= c <= "\u2dff": + return True + + # U+A640 - U+A69F: Cyrillic Extended-B + if "\ua640" <= c <= "\ua69f": + return True + + # U+1C80 - U+1C8F: Cyrillic Extended-C + if "\u1c80" <= c <= "\u1c8f": + return True + + # U+FE2E, U+FE2F: Combining Half Marks + # U+1D2B, U+1D78: Phonetic Extensions + return c in {"\ufe2e", "\ufe2f", "\u1d2b", "\u1d78"} + + +def fixFilename(fname: str) -> str: + return Path(fname.replace("/", "2F").replace("\\", "5C")).name + + +class Writer: + WORDS_FILE_NAME = "words" + + depends = { + "marisa_trie": "marisa-trie", + } + + @staticmethod + def stripFullHtmlError(entry: EntryType, error: str) -> None: + log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._words: list[str] = [] + self._img_pattern = re.compile( + '<img src="([^<>"]*?)"( [^<>]*?)?>', + re.DOTALL, + ) + # img tag has no closing + glos.stripFullHtml(errorHandler=self.stripFullHtmlError) + + def get_prefix(self, word: str) -> str: # noqa: PLR6301 + if not word: + return "11" + wo = word[:2].strip().lower() + if not wo: + return "11" + if wo[0] == "\x00": + return "11" + if len(wo) > 1 and wo[1] == "\x00": + wo = wo[:1] + if is_cyrillic_char(wo[0]): + return wo + # if either of the first 2 chars are not unicode letters, return "11" + for c in wo: + if not unicodedata.category(c).startswith("L"): + return "11" + return wo.ljust(2, "a") + + def fix_defi(self, defi: str) -> str: + # @pgaskin on #219: Kobo supports images in dictionaries, + # but these have a lot of gotchas + # (see https://pgaskin.net/dictutil/dicthtml/format.html). 
+ # Basically, The best way to do it is to encode the images as a + # base64 data URL after shrinking it and making it grayscale + # (if it's JPG, this is as simple as only keeping the Y channel) + + # for now we just skip data entries and remove '<img' tags + return self._img_pattern.sub("[Image: \\1]", defi) + + def write_groups(self) -> Generator[None, EntryType, None]: + import gzip + + dataEntryCount = 0 + + htmlHeader = '<?xml version="1.0" encoding="utf-8"?><html>\n' + + groupCounter = 0 + htmlContents = htmlHeader + + def writeGroup(lastPrefix: str) -> None: + nonlocal htmlContents + group_fname = fixFilename(lastPrefix) + htmlContents += "</html>" + core.trace( + log, + f"writeGroup: {lastPrefix!r}, " + f"{group_fname!r}, count={groupCounter}", + ) + with gzip.open(group_fname + ".html", mode="wb") as gzipFile: + gzipFile.write(htmlContents.encode("utf-8")) + htmlContents = htmlHeader + + allWords: list[str] = [] + # TODO: switch to SQLite, like StarDict writer + data: list[tuple[str, bytes]] = [] + + while True: + entry = yield + if entry is None: + break + if entry.isData(): + dataEntryCount += 1 + continue + l_word = entry.l_word + allWords += l_word + wordsByPrefix: dict[str, list[str]] = {} + for word in l_word: + prefix = self.get_prefix(word) + if prefix in wordsByPrefix: + wordsByPrefix[prefix].append(word) + else: + wordsByPrefix[prefix] = [word] + defi = self.fix_defi(entry.defi) + mainHeadword = l_word[0] + for prefix, p_words in wordsByPrefix.items(): + headword, *variants = p_words + if headword != mainHeadword: + headword = f"{mainHeadword}, {headword}" + data.append( + ( + prefix, + compress( + dumps( + ( + headword, + variants, + defi, + ), + ), + ), + ), + ) + del entry + + log.info("Kobo: sorting entries...") + data.sort(key=itemgetter(0)) + + log.info("Kobo: writing entries...") + + lastPrefix = "" + for prefix, row in data: + headword, variants, defi = loads(decompress(row)) + if lastPrefix and prefix != lastPrefix: + writeGroup(lastPrefix) + groupCounter = 0 + lastPrefix = prefix + + htmlVariants = "".join( + f'<variant name="{v.strip().lower()}"/>' for v in variants + ) + body = f"<div><b>{headword}</b><var>{htmlVariants}</var><br/>{defi}</div>" + htmlContents += f'<w><a name="{headword}" />{body}</w>\n' + groupCounter += 1 + del data + + if groupCounter > 0: + writeGroup(lastPrefix) + + if dataEntryCount > 0: + log.warning( + f"ignored {dataEntryCount} files (data entries)" + " and replaced '<img ...' 
tags in definitions with placeholders", + ) + + self._words = allWords + + def open(self, filename: str) -> None: + try: + import marisa_trie # type: ignore # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install marisa-trie` to install") + raise + self._filename = filename + + def write(self) -> Generator[None, EntryType, None]: + with indir(self._filename, create=True): + yield from self.write_groups() + + def finish(self) -> None: + import marisa_trie + + with indir(self._filename, create=False): + trie = marisa_trie.Trie(self._words) + trie.save(self.WORDS_FILE_NAME) + self._filename = "" diff --git a/pyglossary/plugins/ebook_kobo_dictfile/__init__.py b/pyglossary/plugins/ebook_kobo_dictfile/__init__.py index 7ec327ee3..946b18dfd 100644 --- a/pyglossary/plugins/ebook_kobo_dictfile/__init__.py +++ b/pyglossary/plugins/ebook_kobo_dictfile/__init__.py @@ -1,42 +1,15 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2020-2021 Saeed Rasooli <saeed.gnu@gmail.com> -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-from __future__ import annotations -import os -from os.path import isdir -from typing import TYPE_CHECKING +from __future__ import annotations -from pyglossary.core import exc_note, log, pip -from pyglossary.image_utils import extractInlineHtmlImages -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from pyglossary.text_reader import TextGlossaryReader -if TYPE_CHECKING: - import io - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -73,155 +46,3 @@ "encoding": EncodingOption(), "extract_inline_images": BoolOption(comment="Extract inline images"), } - - -def fixWord(word: str) -> str: - return word.replace("\n", " ") - - -def escapeDefi(defi: str) -> str: - return defi.replace("\n@", "\n @").replace("\n:", "\n :").replace("\n&", "\n &") - - -class Reader(TextGlossaryReader): - depends = { - "mistune": "mistune==3.0.1", - } - - _extract_inline_images: bool = True - - def __init__(self, glos: GlossaryType) -> None: - TextGlossaryReader.__init__(self, glos, hasInfo=False) - - def open(self, filename: str) -> None: - try: - import mistune # type: ignore # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install mistune` to install") - raise - TextGlossaryReader.open(self, filename) - self._glos.setDefaultDefiFormat("h") - - @classmethod - def isInfoWord(cls, _word: str) -> bool: - return False - - @classmethod - def fixInfoWord(cls, _word: str) -> str: - raise NotImplementedError - - def fixDefi( - self, - defi: str, - html: bool, - ) -> tuple[str, list[tuple[str, str]] | None]: - import mistune - - defi = ( - defi.replace("\n @", "\n@") - .replace("\n :", "\n:") - .replace("\n &", "\n&") - .replace("</p><br />", "</p>") - .replace("</p><br/>", "</p>") - .replace("</p></br>", "</p>") - ) - defi = defi.strip() - if html: - pass - else: - defi = mistune.html(defi) - images: list[tuple[str, str]] | None = None - if self._extract_inline_images: - defi, images = extractInlineHtmlImages( - defi, - self._glos.tmpDataDir, - fnamePrefix="", # maybe f"{self._pos:06d}-" - ) - return defi, images - - def nextBlock( - self, - ) -> tuple[list[str], str, list[tuple[str, str]] | None]: - words: list[str] = [] - defiLines: list[str] = [] - html = False - - while True: - line = self.readline() - if not line: - break - line = line.rstrip("\n\r") - if line.startswith("@"): - if words: - self._bufferLine = line - defi, images = self.fixDefi("\n".join(defiLines), html=html) - return words, defi, images - words = [line[1:].strip()] - continue - if line.startswith(": "): - defiLines.append(line[2:]) - continue - if line.startswith("::"): - continue - if line.startswith("&"): - words.append(line[1:].strip()) - continue - if line.startswith("<html>"): - line = line[6:] - html = True - defiLines.append(line) - - if words: - defi, images = self.fixDefi("\n".join(defiLines), html=html) - return words, defi, images - - raise StopIteration - - -class Writer: - _encoding: str = "utf-8" - - @staticmethod - def stripFullHtmlError(entry: EntryType, error: str) -> None: - log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._file: io.TextIOBase = nullTextIO - glos.stripFullHtml(errorHandler=self.stripFullHtmlError) - - def finish(self) -> None: - self._file.close() - if not os.listdir(self._resDir): - 
os.rmdir(self._resDir) - - def open(self, filename: str) -> None: - self._file = open(filename, "w", encoding=self._encoding) - # dictgen's ParseDictFile does not seem to support glossary info / metedata - self._resDir = filename + "_res" - if not isdir(self._resDir): - os.mkdir(self._resDir) - - def write( - self, - ) -> Generator[None, EntryType, None]: - fileObj = self._file - resDir = self._resDir - while True: - entry = yield - if entry is None: - break - if entry.isData(): - entry.save(resDir) - continue - words = entry.l_word - defi = entry.defi - - entry.detectDefiFormat() - if entry.defiFormat == "h": - defi = f"<html>{entry.defi}" - - fileObj.write(f"@ {fixWord(words[0])}\n") - for alt in words[1:]: - fileObj.write(f"& {fixWord(alt)}\n") - fileObj.write(f"{escapeDefi(defi)}\n\n") diff --git a/pyglossary/plugins/ebook_kobo_dictfile/reader.py b/pyglossary/plugins/ebook_kobo_dictfile/reader.py new file mode 100644 index 000000000..131ab6190 --- /dev/null +++ b/pyglossary/plugins/ebook_kobo_dictfile/reader.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2020-2021 Saeed Rasooli <saeed.gnu@gmail.com> +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.core import exc_note, pip +from pyglossary.image_utils import extractInlineHtmlImages +from pyglossary.text_reader import TextGlossaryReader + +if TYPE_CHECKING: + from pyglossary.glossary_types import GlossaryType + + +class Reader(TextGlossaryReader): + depends = { + "mistune": "mistune==3.0.1", + } + + _extract_inline_images: bool = True + + def __init__(self, glos: GlossaryType) -> None: + TextGlossaryReader.__init__(self, glos, hasInfo=False) + + def open(self, filename: str) -> None: + try: + import mistune # type: ignore # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install mistune` to install") + raise + TextGlossaryReader.open(self, filename) + self._glos.setDefaultDefiFormat("h") + + @classmethod + def isInfoWord(cls, _word: str) -> bool: + return False + + @classmethod + def fixInfoWord(cls, _word: str) -> str: + raise NotImplementedError + + def fixDefi( + self, + defi: str, + html: bool, + ) -> tuple[str, list[tuple[str, str]] | None]: + import mistune + + defi = ( + defi.replace("\n @", "\n@") + .replace("\n :", "\n:") + .replace("\n &", "\n&") + .replace("</p><br />", "</p>") + .replace("</p><br/>", "</p>") + .replace("</p></br>", "</p>") + ) + defi = defi.strip() + if html: + pass + else: + defi = mistune.html(defi) + images: list[tuple[str, str]] | None = None + if self._extract_inline_images: + defi, images = extractInlineHtmlImages( + defi, + self._glos.tmpDataDir, + fnamePrefix="", # maybe f"{self._pos:06d}-" + ) + return defi, images + + def nextBlock( + self, + ) -> tuple[list[str], str, list[tuple[str, str]] | None]: + words: list[str] = [] + defiLines: list[str] = [] + html = False + + while True: + line = self.readline() + if not line: + break + line = line.rstrip("\n\r") + if line.startswith("@"): + if words: + self._bufferLine = line + defi, images = self.fixDefi("\n".join(defiLines), html=html) + return words, defi, images + words = [line[1:].strip()] + continue + if line.startswith(": "): + defiLines.append(line[2:]) + continue + if line.startswith("::"): + continue + if line.startswith("&"): + words.append(line[1:].strip()) + continue + if line.startswith("<html>"): + line = line[6:] + html = True + defiLines.append(line) + + if words: + defi, images = self.fixDefi("\n".join(defiLines), html=html) + return words, defi, images + + raise StopIteration diff --git a/pyglossary/plugins/ebook_kobo_dictfile/writer.py b/pyglossary/plugins/ebook_kobo_dictfile/writer.py new file mode 100644 index 000000000..60c9c9651 --- /dev/null +++ b/pyglossary/plugins/ebook_kobo_dictfile/writer.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2020-2021 Saeed Rasooli <saeed.gnu@gmail.com> +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +from __future__ import annotations + +import os +from os.path import isdir +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +def fixWord(word: str) -> str: + return word.replace("\n", " ") + + +def escapeDefi(defi: str) -> str: + return defi.replace("\n@", "\n @").replace("\n:", "\n :").replace("\n&", "\n &") + + +class Writer: + _encoding: str = "utf-8" + + @staticmethod + def stripFullHtmlError(entry: EntryType, error: str) -> None: + log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._file: io.TextIOBase = nullTextIO + glos.stripFullHtml(errorHandler=self.stripFullHtmlError) + + def finish(self) -> None: + self._file.close() + if not os.listdir(self._resDir): + os.rmdir(self._resDir) + + def open(self, filename: str) -> None: + self._file = open(filename, "w", encoding=self._encoding) + # dictgen's ParseDictFile does not seem to support glossary info / metedata + self._resDir = filename + "_res" + if not isdir(self._resDir): + os.mkdir(self._resDir) + + def write( + self, + ) -> Generator[None, EntryType, None]: + fileObj = self._file + resDir = self._resDir + while True: + entry = yield + if entry is None: + break + if entry.isData(): + entry.save(resDir) + continue + words = entry.l_word + defi = entry.defi + + entry.detectDefiFormat() + if entry.defiFormat == "h": + defi = f"<html>{entry.defi}" + + fileObj.write(f"@ {fixWord(words[0])}\n") + for alt in words[1:]: + fileObj.write(f"& {fixWord(alt)}\n") + fileObj.write(f"{escapeDefi(defi)}\n\n") diff --git a/pyglossary/plugins/ebook_mobi/__init__.py b/pyglossary/plugins/ebook_mobi/__init__.py index 9ac4e18ec..00da1e1ad 100644 --- a/pyglossary/plugins/ebook_mobi/__init__.py +++ b/pyglossary/plugins/ebook_mobi/__init__.py @@ -1,33 +1,8 @@ # -*- coding: utf-8 -*- -# The MIT License (MIT) -# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) -# Copyright © 2016-2022 Saeed Rasooli <saeed.gnu@gmail.com> -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -from __future__ import annotations -import os -from datetime import datetime -from os.path import join, split -from typing import TYPE_CHECKING +from __future__ import annotations -from pyglossary.core import log -from pyglossary.ebook_base import EbookWriter from pyglossary.flags import DEFAULT_YES -from pyglossary.langs import Lang from pyglossary.option import ( BoolOption, FileSizeOption, @@ -36,10 +11,7 @@ StrOption, ) -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -121,277 +93,3 @@ " for creating Mobipocket e-books.", ), ] - - -class GroupStateBySize: - def __init__(self, writer: Writer) -> None: - self.writer = writer - self.group_index = -1 - self.reset() - - def reset(self) -> None: - self.group_contents: list[str] = [] - self.group_size = 0 - - def add(self, entry: EntryType) -> None: - defi = entry.defi - content = self.writer.format_group_content( - entry.l_word[0], - defi, - variants=entry.l_word[1:], - ) - self.group_contents.append(content) - self.group_size += len(content.encode("utf-8")) - - -class Writer(EbookWriter): - _compress: bool = False - _keep: bool = False - _kindlegen_path: str = "" - _file_size_approx: int = 271360 - _hide_word_index: bool = False - _spellcheck: bool = True - _exact: bool = False - CSS_CONTENTS = b""""@charset "UTF-8";""" - GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8" \ -standalone="no"?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" \ -"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> -<html xmlns:cx=\ -"https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf" \ -xmlns:dc="http://purl.org/dc/elements/1.1/" \ -xmlns:idx="https://kindlegen.s3.amazonaws.com\ -/AmazonKindlePublishingGuidelines.pdf" \ -xmlns:math="http://exslt.org/math" \ -xmlns:mbp="https://kindlegen.s3.amazonaws.com\ -/AmazonKindlePublishingGuidelines.pdf" \ -xmlns:mmc="https://kindlegen.s3.amazonaws.com\ -/AmazonKindlePublishingGuidelines.pdf" \ -xmlns:saxon="http://saxon.sf.net/" xmlns:svg="http://www.w3.org/2000/svg" \ -xmlns:tl="https://kindlegen.s3.amazonaws.com\ -/AmazonKindlePublishingGuidelines.pdf" \ -xmlns:xs="http://www.w3.org/2001/XMLSchema" \ -xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> -<head> -<meta content="text/html; charset=utf-8" http-equiv="Content-Type" /> -<link href="style.css" rel="stylesheet" type="text/css" /> -</head> -<body> -<mbp:frameset> -{group_contents} -</mbp:frameset> -</body> -</html>""" - - GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """<idx:entry \ -scriptable="yes"{spellcheck_str}> -<idx:orth{value_headword}>{headword_visible}{infl} -</idx:orth> -<br/>{definition} -</idx:entry> -<hr/>""" - - GROUP_XHTML_WORD_INFL_TEMPLATE = """<idx:infl> -{iforms_str} -</idx:infl>""" - - GROUP_XHTML_WORD_IFORM_TEMPLATE = """<idx:iform \ -value="{inflword}"{exact_str} />""" - - OPF_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?> -<package unique-identifier="uid"> -<metadata> -<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" -xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/"> -<dc:Title>{title}</dc:Title> -<dc:Language>{sourceLang}</dc:Language> -<dc:Identifier 
id="uid">{identifier}</dc:Identifier> -<dc:Creator>{creator}</dc:Creator> -<dc:Rights>{copyright}</dc:Rights> -<dc:description>{description}</dc:description> -<dc:Subject BASICCode="REF008000">Dictionaries</dc:Subject> -</dc-metadata> -<x-metadata> -<output encoding="utf-8"></output> -<DictionaryInLanguage>{sourceLang}</DictionaryInLanguage> -<DictionaryOutLanguage>{targetLang}</DictionaryOutLanguage> -<EmbeddedCover>{cover}</EmbeddedCover> -</x-metadata> -</metadata> -<manifest> -{manifest} -</manifest> -<spine> -{spine} -</spine> -<tours></tours> -<guide></guide> -</package>""" - - def __init__(self, glos: GlossaryType) -> None: - import uuid - - EbookWriter.__init__( - self, - glos, - ) - glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) - # FIXME: check if full html pages/documents as entry do work - # glos.stripFullHtml(errorHandler=None) - - def get_prefix(self, word: str) -> str: - if not word: - return "" - length = self._group_by_prefix_length - prefix = word[:length].lower() - if prefix[0] < "a": - return "SPECIAL" - return prefix - - def format_group_content( - self, - word: str, - defi: str, - variants: list[str] | None = None, - ) -> str: - hide_word_index = self._hide_word_index - infl = "" - if variants: - iforms_list = [ - self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format( - inflword=variant, - exact_str=' exact="yes"' if self._exact else "", - ) - for variant in variants - ] - infl = "\n" + self.GROUP_XHTML_WORD_INFL_TEMPLATE.format( - iforms_str="\n".join(iforms_list), - ) - - headword = self.escape_if_needed(word) - - defi = self.escape_if_needed(defi) - - if hide_word_index: - headword_visible = "" - value_headword = f' value="{headword}"' - else: - headword_visible = "\n" + self._glos.wordTitleStr(headword) - value_headword = "" - - return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format( - spellcheck_str=' spell="yes"' if self._spellcheck else "", - headword_visible=headword_visible, - value_headword=value_headword, - definition=defi, - infl=infl, - ) - - @staticmethod - def getLangCode(lang: Lang | None) -> str: - return lang.code if isinstance(lang, Lang) else "" - - def get_opf_contents( - self, - manifest_contents: str, - spine_contents: str, - ) -> bytes: - cover = "" - if self.cover: - cover = self.COVER_TEMPLATE.format(cover=self.cover) - creationDate = datetime.now().strftime("%Y-%m-%d") - - return self.OPF_TEMPLATE.format( - identifier=self._glos.getInfo("uuid"), - # use Language code instead name for kindlegen - sourceLang=self.getLangCode(self._glos.sourceLang), - targetLang=self.getLangCode(self._glos.targetLang), - title=self._glos.getInfo("name"), - creator=self._glos.author, - copyright=self._glos.getInfo("copyright"), - description=self._glos.getInfo("description"), - creationDate=creationDate, - cover=cover, - manifest=manifest_contents, - spine=spine_contents, - ).encode("utf-8") - - def write_groups(self) -> Generator[None, EntryType, None]: - def add_group(state: GroupStateBySize) -> None: - if state.group_size <= 0: - return - state.group_index += 1 - index = state.group_index + self.GROUP_START_INDEX - group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) - self.add_file_manifest( - "OEBPS/" + group_xhtml_path, - group_xhtml_path, - self.GROUP_XHTML_TEMPLATE.format( - group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join( - state.group_contents, - ), - ).encode("utf-8"), - "application/xhtml+xml", - ) - - state = GroupStateBySize(self) - while True: - entry = yield - if entry is None: - break - if entry.isData(): - continue - 
- if state.group_size >= self._file_size_approx: - add_group(state) - state.reset() - - state.add(entry) - - add_group(state) - - def write(self) -> Generator[None, EntryType, None]: - import shutil - import subprocess - - filename = self._filename - kindlegen_path = self._kindlegen_path - - yield from EbookWriter.write(self) - - # download kindlegen from this page: - # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211 - - # run kindlegen - if not kindlegen_path: - kindlegen_path = shutil.which("kindlegen") or "" - if not kindlegen_path: - log.warning( - f"Not running kindlegen, the raw files are located in {filename}", - ) - log.warning( - "Provide KindleGen path with: --write-options 'kindlegen_path=...'", - ) - return - - # name = self._glos.getInfo("name") - log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}") - direc, filename = split(filename) - cmd = [ - kindlegen_path, - join(filename, "OEBPS", "content.opf"), - "-gen_ff_mobi7", - "-o", - "content.mobi", - ] - proc = subprocess.Popen( - cmd, - cwd=direc, - stdout=subprocess.PIPE, - stdin=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - output = proc.communicate() - log.info(output[0].decode("utf-8")) - mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi") - log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}") diff --git a/pyglossary/plugins/ebook_mobi/writer.py b/pyglossary/plugins/ebook_mobi/writer.py new file mode 100644 index 000000000..36484ff8e --- /dev/null +++ b/pyglossary/plugins/ebook_mobi/writer.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- +# The MIT License (MIT) +# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it) +# Copyright © 2016-2022 Saeed Rasooli <saeed.gnu@gmail.com> +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
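+#
+# This module holds the Mobipocket (.mobi) Writer, split out of
+# plugins/ebook_mobi/__init__.py. The Writer subclasses EbookWriter: it
+# batches entries into XHTML "groups" of roughly _file_size_approx bytes,
+# renders them with the idx:entry/idx:orth templates defined below, and
+# finally runs the external kindlegen tool (from the kindlegen_path option,
+# or whatever "kindlegen" is found on PATH) to compile OEBPS/content.opf
+# into content.mobi.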
+from __future__ import annotations + +import os +from datetime import datetime +from os.path import join, split +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.ebook_base import EbookWriter +from pyglossary.langs import Lang + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class GroupStateBySize: + def __init__(self, writer: Writer) -> None: + self.writer = writer + self.group_index = -1 + self.reset() + + def reset(self) -> None: + self.group_contents: list[str] = [] + self.group_size = 0 + + def add(self, entry: EntryType) -> None: + defi = entry.defi + content = self.writer.format_group_content( + entry.l_word[0], + defi, + variants=entry.l_word[1:], + ) + self.group_contents.append(content) + self.group_size += len(content.encode("utf-8")) + + +class Writer(EbookWriter): + _compress: bool = False + _keep: bool = False + _kindlegen_path: str = "" + _file_size_approx: int = 271360 + _hide_word_index: bool = False + _spellcheck: bool = True + _exact: bool = False + CSS_CONTENTS = b""""@charset "UTF-8";""" + GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8" \ +standalone="no"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" \ +"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> +<html xmlns:cx=\ +"https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf" \ +xmlns:dc="http://purl.org/dc/elements/1.1/" \ +xmlns:idx="https://kindlegen.s3.amazonaws.com\ +/AmazonKindlePublishingGuidelines.pdf" \ +xmlns:math="http://exslt.org/math" \ +xmlns:mbp="https://kindlegen.s3.amazonaws.com\ +/AmazonKindlePublishingGuidelines.pdf" \ +xmlns:mmc="https://kindlegen.s3.amazonaws.com\ +/AmazonKindlePublishingGuidelines.pdf" \ +xmlns:saxon="http://saxon.sf.net/" xmlns:svg="http://www.w3.org/2000/svg" \ +xmlns:tl="https://kindlegen.s3.amazonaws.com\ +/AmazonKindlePublishingGuidelines.pdf" \ +xmlns:xs="http://www.w3.org/2001/XMLSchema" \ +xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> +<head> +<meta content="text/html; charset=utf-8" http-equiv="Content-Type" /> +<link href="style.css" rel="stylesheet" type="text/css" /> +</head> +<body> +<mbp:frameset> +{group_contents} +</mbp:frameset> +</body> +</html>""" + + GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """<idx:entry \ +scriptable="yes"{spellcheck_str}> +<idx:orth{value_headword}>{headword_visible}{infl} +</idx:orth> +<br/>{definition} +</idx:entry> +<hr/>""" + + GROUP_XHTML_WORD_INFL_TEMPLATE = """<idx:infl> +{iforms_str} +</idx:infl>""" + + GROUP_XHTML_WORD_IFORM_TEMPLATE = """<idx:iform \ +value="{inflword}"{exact_str} />""" + + OPF_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?> +<package unique-identifier="uid"> +<metadata> +<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" +xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/"> +<dc:Title>{title}</dc:Title> +<dc:Language>{sourceLang}</dc:Language> +<dc:Identifier id="uid">{identifier}</dc:Identifier> +<dc:Creator>{creator}</dc:Creator> +<dc:Rights>{copyright}</dc:Rights> +<dc:description>{description}</dc:description> +<dc:Subject BASICCode="REF008000">Dictionaries</dc:Subject> +</dc-metadata> +<x-metadata> +<output encoding="utf-8"></output> +<DictionaryInLanguage>{sourceLang}</DictionaryInLanguage> +<DictionaryOutLanguage>{targetLang}</DictionaryOutLanguage> +<EmbeddedCover>{cover}</EmbeddedCover> +</x-metadata> +</metadata> +<manifest> +{manifest} +</manifest> +<spine> +{spine} +</spine> +<tours></tours> +<guide></guide> 
+</package>""" + + def __init__(self, glos: GlossaryType) -> None: + import uuid + + EbookWriter.__init__( + self, + glos, + ) + glos.setInfo("uuid", str(uuid.uuid4()).replace("-", "")) + # FIXME: check if full html pages/documents as entry do work + # glos.stripFullHtml(errorHandler=None) + + def get_prefix(self, word: str) -> str: + if not word: + return "" + length = self._group_by_prefix_length + prefix = word[:length].lower() + if prefix[0] < "a": + return "SPECIAL" + return prefix + + def format_group_content( + self, + word: str, + defi: str, + variants: list[str] | None = None, + ) -> str: + hide_word_index = self._hide_word_index + infl = "" + if variants: + iforms_list = [ + self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format( + inflword=variant, + exact_str=' exact="yes"' if self._exact else "", + ) + for variant in variants + ] + infl = "\n" + self.GROUP_XHTML_WORD_INFL_TEMPLATE.format( + iforms_str="\n".join(iforms_list), + ) + + headword = self.escape_if_needed(word) + + defi = self.escape_if_needed(defi) + + if hide_word_index: + headword_visible = "" + value_headword = f' value="{headword}"' + else: + headword_visible = "\n" + self._glos.wordTitleStr(headword) + value_headword = "" + + return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format( + spellcheck_str=' spell="yes"' if self._spellcheck else "", + headword_visible=headword_visible, + value_headword=value_headword, + definition=defi, + infl=infl, + ) + + @staticmethod + def getLangCode(lang: Lang | None) -> str: + return lang.code if isinstance(lang, Lang) else "" + + def get_opf_contents( + self, + manifest_contents: str, + spine_contents: str, + ) -> bytes: + cover = "" + if self.cover: + cover = self.COVER_TEMPLATE.format(cover=self.cover) + creationDate = datetime.now().strftime("%Y-%m-%d") + + return self.OPF_TEMPLATE.format( + identifier=self._glos.getInfo("uuid"), + # use Language code instead name for kindlegen + sourceLang=self.getLangCode(self._glos.sourceLang), + targetLang=self.getLangCode(self._glos.targetLang), + title=self._glos.getInfo("name"), + creator=self._glos.author, + copyright=self._glos.getInfo("copyright"), + description=self._glos.getInfo("description"), + creationDate=creationDate, + cover=cover, + manifest=manifest_contents, + spine=spine_contents, + ).encode("utf-8") + + def write_groups(self) -> Generator[None, EntryType, None]: + def add_group(state: GroupStateBySize) -> None: + if state.group_size <= 0: + return + state.group_index += 1 + index = state.group_index + self.GROUP_START_INDEX + group_xhtml_path = self.get_group_xhtml_file_name_from_index(index) + self.add_file_manifest( + "OEBPS/" + group_xhtml_path, + group_xhtml_path, + self.GROUP_XHTML_TEMPLATE.format( + group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join( + state.group_contents, + ), + ).encode("utf-8"), + "application/xhtml+xml", + ) + + state = GroupStateBySize(self) + while True: + entry = yield + if entry is None: + break + if entry.isData(): + continue + + if state.group_size >= self._file_size_approx: + add_group(state) + state.reset() + + state.add(entry) + + add_group(state) + + def write(self) -> Generator[None, EntryType, None]: + import shutil + import subprocess + + filename = self._filename + kindlegen_path = self._kindlegen_path + + yield from EbookWriter.write(self) + + # download kindlegen from this page: + # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211 + + # run kindlegen + if not kindlegen_path: + kindlegen_path = shutil.which("kindlegen") or "" + if not kindlegen_path: + 
log.warning( + f"Not running kindlegen, the raw files are located in {filename}", + ) + log.warning( + "Provide KindleGen path with: --write-options 'kindlegen_path=...'", + ) + return + + # name = self._glos.getInfo("name") + log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}") + direc, filename = split(filename) + cmd = [ + kindlegen_path, + join(filename, "OEBPS", "content.opf"), + "-gen_ff_mobi7", + "-o", + "content.mobi", + ] + proc = subprocess.Popen( + cmd, + cwd=direc, + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + output = proc.communicate() + log.info(output[0].decode("utf-8")) + mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi") + log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}") diff --git a/pyglossary/plugins/edict2/__init__.py b/pyglossary/plugins/edict2/__init__.py index f0cb45408..50b9a2466 100644 --- a/pyglossary/plugins/edict2/__init__.py +++ b/pyglossary/plugins/edict2/__init__.py @@ -1,23 +1,12 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from pyglossary.core import log -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from . import conv - -if TYPE_CHECKING: - import io - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - +from .reader import Reader __all__ = [ "Reader", @@ -71,78 +60,3 @@ comment="Set to false to disable tones coloring", ), } - - -class Reader: - depends = { - "lxml": "lxml", - } - - _encoding: str = "utf-8" - _traditional_title: bool = False - _colorize_tones: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self.file: io.TextIOBase = nullTextIO - self._fileSize = 0 - - def open(self, filename: str) -> None: - # self._glos.sourceLangName = "Chinese" - # self._glos.targetLangName = "English" - - cfile = self.file = open(filename, encoding=self._encoding) - - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - # self._glos.setInfo("input_file_size", f"{self._fileSize}") - else: - log.warning("EDICT2 Reader: file is not seekable") - - def close(self) -> None: - self.file.close() - self.file = nullTextIO - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType]: - file = self.file - fileSize = self._fileSize - glos = self._glos - - render_syllables = ( - conv.render_syllables_color - if self._colorize_tones - else conv.render_syllables_no_color - ) - parse_line = ( - conv.parse_line_trad if self._traditional_title else conv.parse_line_simp - ) - - while True: - line = file.readline() - if not line: - break - line = line.rstrip("\n") - if not line: - continue - if line.startswith("#"): - continue - parts = parse_line(line) - if parts is None: - log.warning(f"bad line: {line!r}") - continue - names, article_text = conv.render_article( - render_syllables, - conv.Article(*parts), - ) - entry = glos.newEntry( - names, - article_text, - defiFormat="h", - byteProgress=(file.tell(), fileSize) if fileSize else None, - ) - yield entry diff --git a/pyglossary/plugins/edict2/reader.py b/pyglossary/plugins/edict2/reader.py new file mode 100644 index 000000000..378cc0251 --- /dev/null +++ b/pyglossary/plugins/edict2/reader.py @@ -0,0 +1,89 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.io_utils import nullTextIO + +from . 
import conv + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + depends = { + "lxml": "lxml", + } + + _encoding: str = "utf-8" + _traditional_title: bool = False + _colorize_tones: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.file: io.TextIOBase = nullTextIO + self._fileSize = 0 + + def open(self, filename: str) -> None: + # self._glos.sourceLangName = "Chinese" + # self._glos.targetLangName = "English" + + cfile = self.file = open(filename, encoding=self._encoding) + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + # self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("EDICT2 Reader: file is not seekable") + + def close(self) -> None: + self.file.close() + self.file = nullTextIO + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType]: + file = self.file + fileSize = self._fileSize + glos = self._glos + + render_syllables = ( + conv.render_syllables_color + if self._colorize_tones + else conv.render_syllables_no_color + ) + parse_line = ( + conv.parse_line_trad if self._traditional_title else conv.parse_line_simp + ) + + while True: + line = file.readline() + if not line: + break + line = line.rstrip("\n") + if not line: + continue + if line.startswith("#"): + continue + parts = parse_line(line) + if parts is None: + log.warning(f"bad line: {line!r}") + continue + names, article_text = conv.render_article( + render_syllables, + conv.Article(*parts), + ) + entry = glos.newEntry( + names, + article_text, + defiFormat="h", + byteProgress=(file.tell(), fileSize) if fileSize else None, + ) + yield entry diff --git a/pyglossary/plugins/edlin/__init__.py b/pyglossary/plugins/edlin/__init__.py index fc5e428f8..6f6664762 100644 --- a/pyglossary/plugins/edlin/__init__.py +++ b/pyglossary/plugins/edlin/__init__.py @@ -1,45 +1,15 @@ # -*- coding: utf-8 -*- -# edlin.py -# -# Copyright © 2016-2019 Saeed Rasooli <saeed.gnu@gmail.com> (ilius) -# This file is part of PyGlossary project, https://github.com/ilius/pyglossary -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 3, or (at your option) -# any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL -# If not, see <http://www.gnu.org/licenses/gpl.txt>. 
from __future__ import annotations -import os -from os.path import dirname, isdir, isfile, join -from typing import TYPE_CHECKING - -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from pyglossary.text_utils import ( - escapeNTB, - splitByBarUnescapeNTB, - unescapeNTB, -) - -if TYPE_CHECKING: - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -72,241 +42,3 @@ "encoding": EncodingOption(), "prev_link": BoolOption(comment="Enable link to previous entry"), } - - -def makeDir(direc: str) -> None: - if not isdir(direc): - os.makedirs(direc) - - -class Reader: - _encoding: str = "utf-8" - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def close(self) -> None: - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._prev_link = True - self._wordCount = None - self._rootPath = None - self._resDir = "" - self._resFileNames: list[str] = [] - - def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToData - - if isdir(filename): - infoFname = join(filename, "info.json") - elif isfile(filename): - infoFname = filename - filename = dirname(filename) - else: - raise ValueError( - f"error while opening {filename!r}: no such file or directory", - ) - self._filename = filename - - with open(infoFname, encoding=self._encoding) as infoFp: - info = jsonToData(infoFp.read()) - self._wordCount = info.pop("wordCount") - self._prev_link = info.pop("prev_link") - self._rootPath = info.pop("root") - for key, value in info.items(): - self._glos.setInfo(key, value) - - self._resDir = join(filename, "res") - if isdir(self._resDir): - self._resFileNames = os.listdir(self._resDir) - else: - self._resDir = "" - self._resFileNames = [] - - def __len__(self) -> int: - if self._wordCount is None: - log.error("called len() on a reader which is not open") - return 0 - return self._wordCount + len(self._resFileNames) - - def __iter__(self) -> Iterator[EntryType]: - if not self._rootPath: - raise RuntimeError("iterating over a reader while it's not open") - - wordCount = 0 - nextPath = self._rootPath - while nextPath != "END": - wordCount += 1 - # before or after reading word and defi - # (and skipping empty entry)? 
FIXME - - with open( - join(self._filename, nextPath), - encoding=self._encoding, - ) as _file: - header = _file.readline().rstrip() - if self._prev_link: - _prevPath, nextPath = header.split(" ") - else: - nextPath = header - word = _file.readline() - if not word: - yield None # update progressbar - continue - defi = _file.read() - if not defi: - log.warning( - f"Edlin Reader: no definition for word {word!r}, skipping", - ) - yield None # update progressbar - continue - word = word.rstrip() - defi = defi.rstrip() - - if self._glos.alts: - word = splitByBarUnescapeNTB(word) - if len(word) == 1: - word = word[0] - else: - word = unescapeNTB(word, bar=False) - - # defi = unescapeNTB(defi) - yield self._glos.newEntry(word, defi) - - if wordCount != self._wordCount: - log.warning( - f"{wordCount} words found, " - f"wordCount in info.json was {self._wordCount}", - ) - self._wordCount = wordCount - - resDir = self._resDir - for fname in self._resFileNames: - with open(join(resDir, fname), "rb") as _file: - yield self._glos.newDataEntry( - fname, - _file.read(), - ) - - -class Writer: - _encoding: str = "utf-8" - _prev_link: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def finish(self) -> None: - self._clear() - - def open(self, filename: str) -> None: - self._filename = filename - self._resDir = join(filename, "res") - os.makedirs(filename) - os.mkdir(self._resDir) - - def _clear(self) -> None: - self._filename = "" - self._resDir = "" - self._encoding = "utf-8" - self._hashSet: set[str] = set() - # self._wordCount = None - - @staticmethod - def hashToPath(h: str) -> str: - return h[:2] + "/" + h[2:] - - def getEntryHash(self, entry: EntryType) -> str: - """ - Return hash string for given entry - don't call it twice for one entry, if you do you will get a - different hash string. 
- """ - from hashlib import sha1 - - hash_ = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] # noqa: S324 - if hash_ not in self._hashSet: - self._hashSet.add(hash_) - return hash_ - index = 0 - while True: - tmp_hash = hash_ + f"{index:x}" - if tmp_hash not in self._hashSet: - self._hashSet.add(tmp_hash) - return tmp_hash - index += 1 - - def saveEntry( - self, - thisEntry: EntryType, - thisHash: str, - prevHash: str | None, - nextHash: str | None, - ) -> None: - dpath = join(self._filename, thisHash[:2]) - makeDir(dpath) - with open( - join(dpath, thisHash[2:]), - "w", - encoding=self._encoding, - ) as toFile: - nextPath = self.hashToPath(nextHash) if nextHash else "END" - if self._prev_link: - prevPath = self.hashToPath(prevHash) if prevHash else "START" - header = prevPath + " " + nextPath - else: - header = nextPath - toFile.write( - "\n".join( - [ - header, - escapeNTB(thisEntry.s_word, bar=False), - thisEntry.defi, - ], - ), - ) - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.json_utils import dataToPrettyJson - - thisEntry = yield - if thisEntry is None: - raise ValueError("glossary is empty") - - count = 1 - rootHash = thisHash = self.getEntryHash(thisEntry) - prevHash = None - - while True: - nextEntry = yield - if nextEntry is None: - break - if nextEntry.isData(): - nextEntry.save(self._resDir) - continue - nextHash = self.getEntryHash(nextEntry) - self.saveEntry(thisEntry, thisHash, prevHash, nextHash) - thisEntry = nextEntry - prevHash, thisHash = thisHash, nextHash - count += 1 - self.saveEntry(thisEntry, thisHash, prevHash, None) - - with open( - join(self._filename, "info.json"), - "w", - encoding=self._encoding, - ) as toFile: - info = {} - info["name"] = self._glos.getInfo("name") - info["root"] = self.hashToPath(rootHash) - info["prev_link"] = self._prev_link - info["wordCount"] = count - # info["modified"] = - - info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"]) - - toFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/edlin/reader.py b/pyglossary/plugins/edlin/reader.py new file mode 100644 index 000000000..a14526777 --- /dev/null +++ b/pyglossary/plugins/edlin/reader.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from os.path import dirname, isdir, isfile, join +from typing import TYPE_CHECKING + +from pyglossary.core import log +from pyglossary.text_utils import ( + splitByBarUnescapeNTB, + unescapeNTB, +) + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + +class Reader: + _encoding: str = "utf-8" + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def close(self) -> None: + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._prev_link = True + self._wordCount = None + self._rootPath = None + self._resDir = "" + self._resFileNames: list[str] = [] + + def open(self, filename: str) -> None: + from pyglossary.json_utils import jsonToData + + if isdir(filename): + infoFname = join(filename, "info.json") + elif isfile(filename): + infoFname = filename + filename = dirname(filename) + else: + raise ValueError( + f"error while opening {filename!r}: no such file or directory", + ) + self._filename = filename + + with open(infoFname, encoding=self._encoding) as infoFp: + info = jsonToData(infoFp.read()) + self._wordCount = info.pop("wordCount") + self._prev_link = info.pop("prev_link") + self._rootPath = 
info.pop("root") + for key, value in info.items(): + self._glos.setInfo(key, value) + + self._resDir = join(filename, "res") + if isdir(self._resDir): + self._resFileNames = os.listdir(self._resDir) + else: + self._resDir = "" + self._resFileNames = [] + + def __len__(self) -> int: + if self._wordCount is None: + log.error("called len() on a reader which is not open") + return 0 + return self._wordCount + len(self._resFileNames) + + def __iter__(self) -> Iterator[EntryType]: + if not self._rootPath: + raise RuntimeError("iterating over a reader while it's not open") + + wordCount = 0 + nextPath = self._rootPath + while nextPath != "END": + wordCount += 1 + # before or after reading word and defi + # (and skipping empty entry)? FIXME + + with open( + join(self._filename, nextPath), + encoding=self._encoding, + ) as _file: + header = _file.readline().rstrip() + if self._prev_link: + _prevPath, nextPath = header.split(" ") + else: + nextPath = header + word = _file.readline() + if not word: + yield None # update progressbar + continue + defi = _file.read() + if not defi: + log.warning( + f"Edlin Reader: no definition for word {word!r}, skipping", + ) + yield None # update progressbar + continue + word = word.rstrip() + defi = defi.rstrip() + + if self._glos.alts: + word = splitByBarUnescapeNTB(word) + if len(word) == 1: + word = word[0] + else: + word = unescapeNTB(word, bar=False) + + # defi = unescapeNTB(defi) + yield self._glos.newEntry(word, defi) + + if wordCount != self._wordCount: + log.warning( + f"{wordCount} words found, " + f"wordCount in info.json was {self._wordCount}", + ) + self._wordCount = wordCount + + resDir = self._resDir + for fname in self._resFileNames: + with open(join(resDir, fname), "rb") as _file: + yield self._glos.newDataEntry( + fname, + _file.read(), + ) diff --git a/pyglossary/plugins/edlin/writer.py b/pyglossary/plugins/edlin/writer.py new file mode 100644 index 000000000..6bd53b0a4 --- /dev/null +++ b/pyglossary/plugins/edlin/writer.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from os.path import isdir, join +from typing import TYPE_CHECKING + +from pyglossary.text_utils import ( + escapeNTB, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + + + +def makeDir(direc: str) -> None: + if not isdir(direc): + os.makedirs(direc) + + + +class Writer: + _encoding: str = "utf-8" + _prev_link: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def finish(self) -> None: + self._clear() + + def open(self, filename: str) -> None: + self._filename = filename + self._resDir = join(filename, "res") + os.makedirs(filename) + os.mkdir(self._resDir) + + def _clear(self) -> None: + self._filename = "" + self._resDir = "" + self._encoding = "utf-8" + self._hashSet: set[str] = set() + # self._wordCount = None + + @staticmethod + def hashToPath(h: str) -> str: + return h[:2] + "/" + h[2:] + + def getEntryHash(self, entry: EntryType) -> str: + """ + Return hash string for given entry + don't call it twice for one entry, if you do you will get a + different hash string. 
+ """ + from hashlib import sha1 + + hash_ = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] # noqa: S324 + if hash_ not in self._hashSet: + self._hashSet.add(hash_) + return hash_ + index = 0 + while True: + tmp_hash = hash_ + f"{index:x}" + if tmp_hash not in self._hashSet: + self._hashSet.add(tmp_hash) + return tmp_hash + index += 1 + + def saveEntry( + self, + thisEntry: EntryType, + thisHash: str, + prevHash: str | None, + nextHash: str | None, + ) -> None: + dpath = join(self._filename, thisHash[:2]) + makeDir(dpath) + with open( + join(dpath, thisHash[2:]), + "w", + encoding=self._encoding, + ) as toFile: + nextPath = self.hashToPath(nextHash) if nextHash else "END" + if self._prev_link: + prevPath = self.hashToPath(prevHash) if prevHash else "START" + header = prevPath + " " + nextPath + else: + header = nextPath + toFile.write( + "\n".join( + [ + header, + escapeNTB(thisEntry.s_word, bar=False), + thisEntry.defi, + ], + ), + ) + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.json_utils import dataToPrettyJson + + thisEntry = yield + if thisEntry is None: + raise ValueError("glossary is empty") + + count = 1 + rootHash = thisHash = self.getEntryHash(thisEntry) + prevHash = None + + while True: + nextEntry = yield + if nextEntry is None: + break + if nextEntry.isData(): + nextEntry.save(self._resDir) + continue + nextHash = self.getEntryHash(nextEntry) + self.saveEntry(thisEntry, thisHash, prevHash, nextHash) + thisEntry = nextEntry + prevHash, thisHash = thisHash, nextHash + count += 1 + self.saveEntry(thisEntry, thisHash, prevHash, None) + + with open( + join(self._filename, "info.json"), + "w", + encoding=self._encoding, + ) as toFile: + info = {} + info["name"] = self._glos.getInfo("name") + info["root"] = self.hashToPath(rootHash) + info["prev_link"] = self._prev_link + info["wordCount"] = count + # info["modified"] = + + info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"]) + + toFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/gettext_po/__init__.py b/pyglossary/plugins/gettext_po/__init__.py index 978b7c455..cd6dd9887 100644 --- a/pyglossary/plugins/gettext_po/__init__.py +++ b/pyglossary/plugins/gettext_po/__init__.py @@ -2,23 +2,13 @@ from __future__ import annotations -import os -from os.path import isdir -from typing import TYPE_CHECKING - -from pyglossary.core import exc_note, log, pip -from pyglossary.io_utils import nullTextIO from pyglossary.option import ( BoolOption, Option, ) -from pyglossary.text_utils import splitByBar - -if TYPE_CHECKING: - import io - from collections.abc import Generator, Iterator - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -52,166 +42,3 @@ optionsProp: dict[str, Option] = { "resources": BoolOption(comment="Enable resources / data files"), } - - -class Reader: - depends = { - "polib": "polib", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._alts = glos.alts - self.clear() - - def clear(self) -> None: - self._filename = "" - self._file: io.TextIOBase = nullTextIO - self._wordCount: int | None = None - self._resDir = "" - self._resFileNames: list[str] = [] - - def open(self, filename: str) -> None: - self._filename = filename - self._file = open(filename, encoding="utf-8") - self._resDir = filename + "_res" - if isdir(self._resDir): - self._resFileNames = os.listdir(self._resDir) - else: - self._resDir = "" - self._resFileNames = [] - - def 
close(self) -> None: - self._file.close() - self._file = nullTextIO - self.clear() - - def __len__(self) -> int: - from pyglossary.file_utils import fileCountLines - - if self._wordCount is None: - log.debug("Try not to use len(reader) as it takes extra time") - self._wordCount = fileCountLines( - self._filename, - newline=b"\nmsgid", - ) - return self._wordCount - - def makeEntry(self, word: str, defi: str) -> EntryType: - if self._alts: - return self._glos.newEntry(splitByBar(word), defi) - return self._glos.newEntry(word, defi) - - def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 - try: - from polib import unescape as po_unescape - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install polib` to install") - raise - - file = self._file - - word = "" - defi = "" - msgstr = False - wordCount = 0 - for line_ in file: - line = line_.strip() # noqa: PLW2901 - if not line: - continue - if line.startswith("#"): - continue - if line.startswith("msgid "): - if word: - yield self.makeEntry(word, defi) - wordCount += 1 - word = "" - defi = "" - else: - pass - # TODO: parse defi and set glos info? - # but this should be done in self.open - word = po_unescape(line[6:]) - if word.startswith('"'): - if len(word) < 2 or word[-1] != '"': - raise ValueError("invalid po line: line") - word = word[1:-1] - msgstr = False - continue - if line.startswith("msgstr "): - if msgstr: - log.error("msgid omitted!") - defi = po_unescape(line[7:]) - if defi.startswith('"'): - if len(defi) < 2 or defi[-1] != '"': - raise ValueError("invalid po line: line") - defi = defi[1:-1] - msgstr = True - continue - - line = po_unescape(line) - if line.startswith('"'): - if len(line) < 2 or line[-1] != '"': - raise ValueError("invalid po line: line") - line = line[1:-1] - - if msgstr: - defi += line - else: - word += line - if word: - yield self.makeEntry(word, defi) - wordCount += 1 - self._wordCount = wordCount - - -class Writer: - depends = { - "polib": "polib", - } - - _resources: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.TextIOBase = nullTextIO - glos.preventDuplicateWords() - - def open(self, filename: str) -> None: - try: - from polib import escape as po_escape - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install polib` to install") - raise - - self._filename = filename - self._file = file = open(filename, mode="w", encoding="utf-8") - file.write('#\nmsgid ""\nmsgstr ""\n') - for key, value in self._glos.iterInfo(): - file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n') - - def finish(self) -> None: - self._filename = "" - self._file.close() - self._file = nullTextIO - - def write(self) -> Generator[None, EntryType, None]: - from polib import escape as po_escape - - file = self._file - - resources = self._resources - filename = self._filename - while True: - entry = yield - if entry is None: - break - if entry.isData(): - if resources: - entry.save(filename + "_res") - continue - file.write( - f'msgid "{po_escape(entry.s_word)}"\n' - f'msgstr "{po_escape(entry.defi)}"\n\n', - ) diff --git a/pyglossary/plugins/gettext_po/reader.py b/pyglossary/plugins/gettext_po/reader.py new file mode 100644 index 000000000..126288488 --- /dev/null +++ b/pyglossary/plugins/gettext_po/reader.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import os +from os.path import isdir +from typing import TYPE_CHECKING + +from pyglossary.core import exc_note, log, pip +from 
pyglossary.io_utils import nullTextIO +from pyglossary.text_utils import splitByBar + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + + +class Reader: + depends = { + "polib": "polib", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._alts = glos.alts + self.clear() + + def clear(self) -> None: + self._filename = "" + self._file: io.TextIOBase = nullTextIO + self._wordCount: int | None = None + self._resDir = "" + self._resFileNames: list[str] = [] + + def open(self, filename: str) -> None: + self._filename = filename + self._file = open(filename, encoding="utf-8") + self._resDir = filename + "_res" + if isdir(self._resDir): + self._resFileNames = os.listdir(self._resDir) + else: + self._resDir = "" + self._resFileNames = [] + + def close(self) -> None: + self._file.close() + self._file = nullTextIO + self.clear() + + def __len__(self) -> int: + from pyglossary.file_utils import fileCountLines + + if self._wordCount is None: + log.debug("Try not to use len(reader) as it takes extra time") + self._wordCount = fileCountLines( + self._filename, + newline=b"\nmsgid", + ) + return self._wordCount + + def makeEntry(self, word: str, defi: str) -> EntryType: + if self._alts: + return self._glos.newEntry(splitByBar(word), defi) + return self._glos.newEntry(word, defi) + + def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 + try: + from polib import unescape as po_unescape + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install polib` to install") + raise + + file = self._file + + word = "" + defi = "" + msgstr = False + wordCount = 0 + for line_ in file: + line = line_.strip() # noqa: PLW2901 + if not line: + continue + if line.startswith("#"): + continue + if line.startswith("msgid "): + if word: + yield self.makeEntry(word, defi) + wordCount += 1 + word = "" + defi = "" + else: + pass + # TODO: parse defi and set glos info? 
+ # but this should be done in self.open + word = po_unescape(line[6:]) + if word.startswith('"'): + if len(word) < 2 or word[-1] != '"': + raise ValueError("invalid po line: line") + word = word[1:-1] + msgstr = False + continue + if line.startswith("msgstr "): + if msgstr: + log.error("msgid omitted!") + defi = po_unescape(line[7:]) + if defi.startswith('"'): + if len(defi) < 2 or defi[-1] != '"': + raise ValueError("invalid po line: line") + defi = defi[1:-1] + msgstr = True + continue + + line = po_unescape(line) + if line.startswith('"'): + if len(line) < 2 or line[-1] != '"': + raise ValueError("invalid po line: line") + line = line[1:-1] + + if msgstr: + defi += line + else: + word += line + if word: + yield self.makeEntry(word, defi) + wordCount += 1 + self._wordCount = wordCount diff --git a/pyglossary/plugins/gettext_po/writer.py b/pyglossary/plugins/gettext_po/writer.py new file mode 100644 index 000000000..7364e1d19 --- /dev/null +++ b/pyglossary/plugins/gettext_po/writer.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.core import exc_note, pip +from pyglossary.io_utils import nullTextIO + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + +class Writer: + depends = { + "polib": "polib", + } + + _resources: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.TextIOBase = nullTextIO + glos.preventDuplicateWords() + + def open(self, filename: str) -> None: + try: + from polib import escape as po_escape + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install polib` to install") + raise + + self._filename = filename + self._file = file = open(filename, mode="w", encoding="utf-8") + file.write('#\nmsgid ""\nmsgstr ""\n') + for key, value in self._glos.iterInfo(): + file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n') + + def finish(self) -> None: + self._filename = "" + self._file.close() + self._file = nullTextIO + + def write(self) -> Generator[None, EntryType, None]: + from polib import escape as po_escape + + file = self._file + + resources = self._resources + filename = self._filename + while True: + entry = yield + if entry is None: + break + if entry.isData(): + if resources: + entry.save(filename + "_res") + continue + file.write( + f'msgid "{po_escape(entry.s_word)}"\n' + f'msgstr "{po_escape(entry.defi)}"\n\n', + ) diff --git a/pyglossary/plugins/html_dir/__init__.py b/pyglossary/plugins/html_dir/__init__.py index 8931a0697..d47850759 100644 --- a/pyglossary/plugins/html_dir/__init__.py +++ b/pyglossary/plugins/html_dir/__init__.py @@ -1,24 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html -import os -import re -import time -from functools import lru_cache -from os.path import isdir, isfile, join -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - import io - from collections.abc import Generator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, @@ -26,10 +8,8 @@ Option, StrOption, ) -from pyglossary.text_utils import ( - escapeNTB, - unescapeNTB, -) + +from .writer import Writer __all__ = [ "Writer", @@ -80,469 +60,3 @@ comment="Add headwords title to beginning of definition", ), } - -nbsp = "\xa0" -# nbsp = " " - -darkStyle = """ -body {{ - 
background-color: #373737; - color: #eee; -}} -a {{ color: #aaaaff; }} -a.broken {{ color: #e0c0c0; }} -a.no_ul {{ text-decoration: none; }} -b.headword {{ font-size: 1.5em; color: #c7ffb9; }} -h1 {{ font-size: 1.5em; color: #c7ffb9;}} -h2 {{ font-size: 1.3em;}} -h3 {{ font-size: 1.0em;}} -h4 {{ font-size: 1.0em;}} -h5 {{ font-size: 1.0em;}} -h6 {{ font-size: 1.0em;}} -""" - - -class Writer: - _encoding: str = "utf-8" - _resources: bool = True - _max_file_size: int = 102400 - _filename_format: str = "{n:05d}.html" - _escape_defi: bool = False - _dark: bool = True - _css: str = "" - _word_title: bool = True - - @staticmethod - def stripFullHtmlError(entry: EntryType, error: str) -> None: - log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._fileObj: io.IOBase | None = None - self._encoding = "utf-8" - self._filename_format = "{n:05d}.html" - self._tail = "</body></html>" - self._filenameList: list[str] = [] - glos.stripFullHtml(errorHandler=self.stripFullHtmlError) - - self._resSrcPattern = re.compile(' src="([^"]*)"') - - def open(self, filename: str) -> None: - self._filename = filename - self._resDir = resDir = join(filename, "res") - if not isdir(filename): - os.mkdir(filename) - if not isdir(resDir): - os.mkdir(resDir) - if self._css: - self.copyCSS(self._css) - - def copyCSS(self, cssPath: str) -> None: - import shutil - - shutil.copy(cssPath, join(self._filename, "style.css")) - - def finish(self) -> None: - pass - - def getNextFilename(self) -> str: - return self._filename_format.format( - n=len(self._filenameList), - ) - - def nextFile(self) -> io.TextIOBase: - if self._fileObj: - self._fileObj.write(self._tail) - self._fileObj.close() - filename = self.getNextFilename() - self._filenameList.append(filename) - self._fileObj = open( - join( - self._filename, - filename, - ), - mode="w", - encoding=self._encoding, - ) - return self._fileObj - - def fixLinks(self, linkTargetSet: set[str]) -> None: # noqa: PLR0912 - import gc - - gc.collect() - dirn = self._filename - - filenameList = self._filenameList - - fileByWord: dict[str, list[tuple[str, int]]] = {} - for line in open(join(dirn, "index.txt"), encoding="utf-8"): - line = line.rstrip("\n") # noqa: PLW2901 - if not line: - continue - entryIndexStr, wordEsc, filename, _ = line.split("\t") - entryIndex = int(entryIndexStr) - # entryId = f"entry{entryIndex}" - word = unescapeNTB(wordEsc) - if word not in linkTargetSet: - continue - if word in fileByWord: - fileByWord[word].append((filename, entryIndex)) - else: - fileByWord[word] = [(filename, entryIndex)] - - # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile: - # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t") - - @lru_cache(maxsize=10) - def getLinksByFile(fileIndex: int) -> io.TextIOBase: - return open( - join(dirn, f"links{fileIndex}"), - mode="a", - encoding="utf-8", - ) - - log.info("") - for line in open(join(dirn, "links.txt"), encoding="utf-8"): - line = line.rstrip("\n") # noqa: PLW2901 - if not line: - continue - target, fileIndexStr, x_start, x_size = line.split("\t") - target = unescapeNTB(target) - if target not in fileByWord: - targetNew = "" - else: - targetFilename, targetEntryIndex = fileByWord[target][0] - if targetFilename == filename: - continue - targetNew = f"{targetFilename}#entry{targetEntryIndex}" - file = getLinksByFile(int(fileIndexStr)) - file.write( - f"{x_start}\t{x_size}\t{targetNew}\n", - ) - 
file.flush() - - linkTargetSet.clear() - del fileByWord, linkTargetSet - gc.collect() - - if os.sep == "\\": - time.sleep(0.1) - - entry_url_fmt = self._glos.getInfo("entry_url") - - re_href = re.compile( - b' href="[^<>"]*?"', - re.IGNORECASE, - ) - - for fileIndex, filename in enumerate(filenameList): - if not isfile(join(dirn, f"links{fileIndex}")): - continue - with open(join(dirn, filename), mode="rb") as inFile: - with open(join(dirn, f"{filename}.new"), mode="wb") as outFile: - for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"): - outFile.flush() - ( - b_x_start, - b_x_size, - b_target, - ) = linkLine.rstrip(b"\n").split(b"\t") - outFile.write( - inFile.read( - int(b_x_start, 16) - inFile.tell(), - ), - ) - curLink = inFile.read(int(b_x_size, 16)) - - if b_target: - outFile.write( - re_href.sub( - b' href="./' + b_target + b'"', - curLink, - ), - ) - continue - - if not entry_url_fmt: - outFile.write( - curLink.replace( - b' href="#', - b' class="broken" href="#', - ), - ) - continue - - st = curLink.decode("utf-8") - i = st.find('href="#') - j = st.find('"', i + 7) - word = st[i + 7 : j] - url = entry_url_fmt.format(word=word) - outFile.write( - ( - st[:i] + f'class="broken" href="{url}"' + st[j + 1 :] - ).encode("utf-8"), - ) - - outFile.write(inFile.read()) - - os.remove(join(dirn, filename)) - os.rename(join(dirn, f"{filename}.new"), join(dirn, filename)) - os.remove(join(dirn, f"links{fileIndex}")) - - def writeInfo(self, filename: str, header: str) -> None: - glos = self._glos - title = glos.getInfo("name") - customStyle = ( - "table, th, td {border: 1px solid black; " - "border-collapse: collapse; padding: 5px;}" - ) - infoHeader = header.format( - pageTitle=f"Info: {title}", - customStyle=customStyle, - ) - with open( - join(filename, "info.html"), - mode="w", - encoding=self._encoding, - ) as _file: - _file.write( - infoHeader + "<table>" - "<tr>" - '<th width="%10">Key</th>' - '<th width="%90">Value</th>' - "</tr>\n", - ) - for key, value in glos.iterInfo(): - _file.write( - f"<tr><td>{key}</td><td>{value}</td></tr>\n", - ) - _file.write("</table></body></html>") - - @staticmethod - def _subResSrc(m: re.Match) -> str: - url = m.group(1) - if "://" in url: - return m.group(0) - url = "res/" + url - return f' src="{url}"' - - def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912 - encoding = self._encoding - resources = self._resources - max_file_size = self._max_file_size - filename_format = self._filename_format - escape_defi = self._escape_defi - - wordSep = ' <font color="red">|</font> ' - - initFileSizeMax = 100 - - glos = self._glos - - filename = self._filename - self._encoding = encoding - self._filename_format = filename_format - - entry_url_fmt = glos.getInfo("entry_url") - - def getEntryWebLink(entry: EntryType) -> str: - if not entry_url_fmt: - return "" - url = entry_url_fmt.format(word=html.escape(entry.l_word[0])) - return f'{nbsp}<a class="no_ul" href="{url}">🌏</a>' - - # from math import log2, ceil - # maxPosHexLen = int(ceil(log2(max_file_size) / 4)) - - indexTxtFileObj = open( - join(filename, "index.txt"), - mode="w", - encoding="utf-8", - ) - linksTxtFileObj = open( - join(filename, "links.txt"), - mode="w", - encoding="utf-8", - ) - - title = glos.getInfo("name") - style = "" - if self._dark: - style = darkStyle - - cssLink = '<link rel="stylesheet" href="style.css" />' if self._css else "" - - header = ( - "<!DOCTYPE html>\n" - "<html><head>" - "<title>{pageTitle}</title>" - f'<meta charset="{encoding}">' - f'<style 
type="text/css">{style}{{customStyle}}</style>{cssLink}' - "</meta></head><body>\n" - ) - - def pageHeader(n: int) -> str: - return header.format( - pageTitle=f"Page {n} of {title}", - customStyle="", - ) - - def navBar() -> str: - links: list[str] = [] - if len(self._filenameList) > 1: - links.append(f'<a href="./{self._filenameList[-2]}">◀</a>') - links.extend( - [ - f'<a href="./{self.getNextFilename()}">▶</a>', - '<a href="./info.html">ℹ️</a></div>', # noqa: RUF001 - ], - ) - return ( - '<nav style="text-align: center; font-size: 2.5em;">' - + f"{nbsp}{nbsp}{nbsp}".join(links) - + "</nav>" - ) - - tailSize = len(self._tail.encode(encoding)) - - if max_file_size < len(header) + tailSize: - raise ValueError(f"{max_file_size=} is too small") - - max_file_size -= tailSize - - if not isdir(self._filename): - os.mkdir(self._filename) - - fileObj = self.nextFile() - fileObj.write(pageHeader(0)) - fileObj.write(navBar()) - - re_fixed_link = re.compile( - r'<a (?:[^<>]*? )?href="#([^<>"]+?)">[^<>]+?</a>', - re.IGNORECASE, - ) - - linkTargetSet = set() - - def replaceBword(text: str) -> str: - return text.replace( - ' href="bword://', - ' href="#', - ) - - def addLinks(text: str, pos: int) -> None: - for m in re_fixed_link.finditer(text): - if ' class="entry_link"' in m.group(0): - continue - if m.group(0).count("href=") != 1: - log.error(f"unexpected match: {m.group(0)}") - target = html.unescape(m.group(1)) - linkTargetSet.add(target) - start = m.start() - b_start = len(text[:start].encode(encoding)) - b_size = len(text[start : m.end()].encode(encoding)) - linksTxtFileObj.write( - f"{escapeNTB(target)}\t" - f"{len(self._filenameList) - 1}\t" - f"{pos + b_start:x}\t" - f"{b_size:x}\n", - ) - linksTxtFileObj.flush() - - self.writeInfo(filename, header) - - word_title = self._word_title - - resDir = self._resDir - entryIndex = -1 - while True: - entryIndex += 1 - entry = yield - if entry is None: - break - if entry.isData(): - if resources: - entry.save(resDir) - continue - - entry.detectDefiFormat() - defi = entry.defi - defiFormat = entry.defiFormat - - if defi.startswith("<!DOCTYPE html>") and defiFormat != "h": - log.error(f"bad {defiFormat=}") - defiFormat = "h" - - if defiFormat == "m": - defi = html.escape(defi) - if "\n" in defi: - # could be markdown or unformatted plaintext - # FIXME: this changes the font to a monospace - defi = f"<pre>{defi}</pre>" - elif defiFormat == "h": - defi = self._resSrcPattern.sub(self._subResSrc, defi) - if escape_defi: - defi = html.escape(defi) - - entryId = f"entry{entryIndex}" - - if word_title: - words = [html.escape(word) for word in entry.l_word] - title = glos.wordTitleStr( - wordSep.join(words), - sample=entry.l_word[0], - class_="headword", - ) - - if not title: - title = f"Entry {entryIndex}" - - # entry_link_sym = "¶" - entry_link_sym = "🔗" - text = ( - f'<div id="{entryId}">{title}{nbsp}{nbsp}' - f'<a class="no_ul" class="entry_link" href="#{entryId}">' - f"{entry_link_sym}</a>" - f"{getEntryWebLink(entry)}" - f"<br>\n{defi}" - "</div>\n" - "<hr>\n" - ) - pos = fileObj.tell() - if pos > initFileSizeMax and pos > max_file_size - len( - text.encode(encoding), - ): - fileObj = self.nextFile() - fileObj.write( - pageHeader( - len(self._filenameList) - 1, - ), - ) - fileObj.write(navBar()) - pos = fileObj.tell() - tmpFilename = escapeNTB(self._filenameList[-1]) - for word in entry.l_word: - indexTxtFileObj.write( - f"{entryIndex}\t" - f"{escapeNTB(word)}\t" - f"{tmpFilename}\t" - f"{pos}\n", - ) - del tmpFilename - text = replaceBword(text) - 
addLinks(text, pos) - fileObj.write(text) - - fileObj.close() - self._fileObj = None - indexTxtFileObj.close() - - linksTxtFileObj.close() - - if linkTargetSet: - log.info(f"{len(linkTargetSet)} link targets found") - log.info("Fixing links, please wait...") - self.fixLinks(linkTargetSet) - - os.remove(join(filename, "links.txt")) diff --git a/pyglossary/plugins/html_dir/writer.py b/pyglossary/plugins/html_dir/writer.py new file mode 100644 index 000000000..6451f09ce --- /dev/null +++ b/pyglossary/plugins/html_dir/writer.py @@ -0,0 +1,491 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +import os +import re +import time +from functools import lru_cache +from os.path import isdir, isfile, join +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + +from pyglossary.core import log +from pyglossary.text_utils import ( + escapeNTB, + unescapeNTB, +) + +nbsp = "\xa0" +# nbsp = " " + +darkStyle = """ +body {{ + background-color: #373737; + color: #eee; +}} +a {{ color: #aaaaff; }} +a.broken {{ color: #e0c0c0; }} +a.no_ul {{ text-decoration: none; }} +b.headword {{ font-size: 1.5em; color: #c7ffb9; }} +h1 {{ font-size: 1.5em; color: #c7ffb9;}} +h2 {{ font-size: 1.3em;}} +h3 {{ font-size: 1.0em;}} +h4 {{ font-size: 1.0em;}} +h5 {{ font-size: 1.0em;}} +h6 {{ font-size: 1.0em;}} +""" + + +class Writer: + _encoding: str = "utf-8" + _resources: bool = True + _max_file_size: int = 102400 + _filename_format: str = "{n:05d}.html" + _escape_defi: bool = False + _dark: bool = True + _css: str = "" + _word_title: bool = True + + @staticmethod + def stripFullHtmlError(entry: EntryType, error: str) -> None: + log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}") + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._fileObj: io.IOBase | None = None + self._encoding = "utf-8" + self._filename_format = "{n:05d}.html" + self._tail = "</body></html>" + self._filenameList: list[str] = [] + glos.stripFullHtml(errorHandler=self.stripFullHtmlError) + + self._resSrcPattern = re.compile(' src="([^"]*)"') + + def open(self, filename: str) -> None: + self._filename = filename + self._resDir = resDir = join(filename, "res") + if not isdir(filename): + os.mkdir(filename) + if not isdir(resDir): + os.mkdir(resDir) + if self._css: + self.copyCSS(self._css) + + def copyCSS(self, cssPath: str) -> None: + import shutil + + shutil.copy(cssPath, join(self._filename, "style.css")) + + def finish(self) -> None: + pass + + def getNextFilename(self) -> str: + return self._filename_format.format( + n=len(self._filenameList), + ) + + def nextFile(self) -> io.TextIOBase: + if self._fileObj: + self._fileObj.write(self._tail) + self._fileObj.close() + filename = self.getNextFilename() + self._filenameList.append(filename) + self._fileObj = open( + join( + self._filename, + filename, + ), + mode="w", + encoding=self._encoding, + ) + return self._fileObj + + def fixLinks(self, linkTargetSet: set[str]) -> None: # noqa: PLR0912 + import gc + + gc.collect() + dirn = self._filename + + filenameList = self._filenameList + + fileByWord: dict[str, list[tuple[str, int]]] = {} + for line in open(join(dirn, "index.txt"), encoding="utf-8"): + line = line.rstrip("\n") # noqa: PLW2901 + if not line: + continue + entryIndexStr, wordEsc, filename, _ = line.split("\t") + entryIndex = int(entryIndexStr) + # entryId = 
f"entry{entryIndex}" + word = unescapeNTB(wordEsc) + if word not in linkTargetSet: + continue + if word in fileByWord: + fileByWord[word].append((filename, entryIndex)) + else: + fileByWord[word] = [(filename, entryIndex)] + + # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile: + # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t") + + @lru_cache(maxsize=10) + def getLinksByFile(fileIndex: int) -> io.TextIOBase: + return open( + join(dirn, f"links{fileIndex}"), + mode="a", + encoding="utf-8", + ) + + log.info("") + for line in open(join(dirn, "links.txt"), encoding="utf-8"): + line = line.rstrip("\n") # noqa: PLW2901 + if not line: + continue + target, fileIndexStr, x_start, x_size = line.split("\t") + target = unescapeNTB(target) + if target not in fileByWord: + targetNew = "" + else: + targetFilename, targetEntryIndex = fileByWord[target][0] + if targetFilename == filename: + continue + targetNew = f"{targetFilename}#entry{targetEntryIndex}" + file = getLinksByFile(int(fileIndexStr)) + file.write( + f"{x_start}\t{x_size}\t{targetNew}\n", + ) + file.flush() + + linkTargetSet.clear() + del fileByWord, linkTargetSet + gc.collect() + + if os.sep == "\\": + time.sleep(0.1) + + entry_url_fmt = self._glos.getInfo("entry_url") + + re_href = re.compile( + b' href="[^<>"]*?"', + re.IGNORECASE, + ) + + for fileIndex, filename in enumerate(filenameList): + if not isfile(join(dirn, f"links{fileIndex}")): + continue + with open(join(dirn, filename), mode="rb") as inFile: + with open(join(dirn, f"{filename}.new"), mode="wb") as outFile: + for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"): + outFile.flush() + ( + b_x_start, + b_x_size, + b_target, + ) = linkLine.rstrip(b"\n").split(b"\t") + outFile.write( + inFile.read( + int(b_x_start, 16) - inFile.tell(), + ), + ) + curLink = inFile.read(int(b_x_size, 16)) + + if b_target: + outFile.write( + re_href.sub( + b' href="./' + b_target + b'"', + curLink, + ), + ) + continue + + if not entry_url_fmt: + outFile.write( + curLink.replace( + b' href="#', + b' class="broken" href="#', + ), + ) + continue + + st = curLink.decode("utf-8") + i = st.find('href="#') + j = st.find('"', i + 7) + word = st[i + 7 : j] + url = entry_url_fmt.format(word=word) + outFile.write( + ( + st[:i] + f'class="broken" href="{url}"' + st[j + 1 :] + ).encode("utf-8"), + ) + + outFile.write(inFile.read()) + + os.remove(join(dirn, filename)) + os.rename(join(dirn, f"{filename}.new"), join(dirn, filename)) + os.remove(join(dirn, f"links{fileIndex}")) + + def writeInfo(self, filename: str, header: str) -> None: + glos = self._glos + title = glos.getInfo("name") + customStyle = ( + "table, th, td {border: 1px solid black; " + "border-collapse: collapse; padding: 5px;}" + ) + infoHeader = header.format( + pageTitle=f"Info: {title}", + customStyle=customStyle, + ) + with open( + join(filename, "info.html"), + mode="w", + encoding=self._encoding, + ) as _file: + _file.write( + infoHeader + "<table>" + "<tr>" + '<th width="%10">Key</th>' + '<th width="%90">Value</th>' + "</tr>\n", + ) + for key, value in glos.iterInfo(): + _file.write( + f"<tr><td>{key}</td><td>{value}</td></tr>\n", + ) + _file.write("</table></body></html>") + + @staticmethod + def _subResSrc(m: re.Match) -> str: + url = m.group(1) + if "://" in url: + return m.group(0) + url = "res/" + url + return f' src="{url}"' + + def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912 + encoding = self._encoding + resources = self._resources + max_file_size = self._max_file_size + 
filename_format = self._filename_format + escape_defi = self._escape_defi + + wordSep = ' <font color="red">|</font> ' + + initFileSizeMax = 100 + + glos = self._glos + + filename = self._filename + self._encoding = encoding + self._filename_format = filename_format + + entry_url_fmt = glos.getInfo("entry_url") + + def getEntryWebLink(entry: EntryType) -> str: + if not entry_url_fmt: + return "" + url = entry_url_fmt.format(word=html.escape(entry.l_word[0])) + return f'{nbsp}<a class="no_ul" href="{url}">🌏</a>' + + # from math import log2, ceil + # maxPosHexLen = int(ceil(log2(max_file_size) / 4)) + + indexTxtFileObj = open( + join(filename, "index.txt"), + mode="w", + encoding="utf-8", + ) + linksTxtFileObj = open( + join(filename, "links.txt"), + mode="w", + encoding="utf-8", + ) + + title = glos.getInfo("name") + style = "" + if self._dark: + style = darkStyle + + cssLink = '<link rel="stylesheet" href="style.css" />' if self._css else "" + + header = ( + "<!DOCTYPE html>\n" + "<html><head>" + "<title>{pageTitle}</title>" + f'<meta charset="{encoding}">' + f'<style type="text/css">{style}{{customStyle}}</style>{cssLink}' + "</meta></head><body>\n" + ) + + def pageHeader(n: int) -> str: + return header.format( + pageTitle=f"Page {n} of {title}", + customStyle="", + ) + + def navBar() -> str: + links: list[str] = [] + if len(self._filenameList) > 1: + links.append(f'<a href="./{self._filenameList[-2]}">◀</a>') + links.extend( + [ + f'<a href="./{self.getNextFilename()}">▶</a>', + '<a href="./info.html">ℹ️</a></div>', # noqa: RUF001 + ], + ) + return ( + '<nav style="text-align: center; font-size: 2.5em;">' + + f"{nbsp}{nbsp}{nbsp}".join(links) + + "</nav>" + ) + + tailSize = len(self._tail.encode(encoding)) + + if max_file_size < len(header) + tailSize: + raise ValueError(f"{max_file_size=} is too small") + + max_file_size -= tailSize + + if not isdir(self._filename): + os.mkdir(self._filename) + + fileObj = self.nextFile() + fileObj.write(pageHeader(0)) + fileObj.write(navBar()) + + re_fixed_link = re.compile( + r'<a (?:[^<>]*? 
)?href="#([^<>"]+?)">[^<>]+?</a>', + re.IGNORECASE, + ) + + linkTargetSet = set() + + def replaceBword(text: str) -> str: + return text.replace( + ' href="bword://', + ' href="#', + ) + + def addLinks(text: str, pos: int) -> None: + for m in re_fixed_link.finditer(text): + if ' class="entry_link"' in m.group(0): + continue + if m.group(0).count("href=") != 1: + log.error(f"unexpected match: {m.group(0)}") + target = html.unescape(m.group(1)) + linkTargetSet.add(target) + start = m.start() + b_start = len(text[:start].encode(encoding)) + b_size = len(text[start : m.end()].encode(encoding)) + linksTxtFileObj.write( + f"{escapeNTB(target)}\t" + f"{len(self._filenameList) - 1}\t" + f"{pos + b_start:x}\t" + f"{b_size:x}\n", + ) + linksTxtFileObj.flush() + + self.writeInfo(filename, header) + + word_title = self._word_title + + resDir = self._resDir + entryIndex = -1 + while True: + entryIndex += 1 + entry = yield + if entry is None: + break + if entry.isData(): + if resources: + entry.save(resDir) + continue + + entry.detectDefiFormat() + defi = entry.defi + defiFormat = entry.defiFormat + + if defi.startswith("<!DOCTYPE html>") and defiFormat != "h": + log.error(f"bad {defiFormat=}") + defiFormat = "h" + + if defiFormat == "m": + defi = html.escape(defi) + if "\n" in defi: + # could be markdown or unformatted plaintext + # FIXME: this changes the font to a monospace + defi = f"<pre>{defi}</pre>" + elif defiFormat == "h": + defi = self._resSrcPattern.sub(self._subResSrc, defi) + if escape_defi: + defi = html.escape(defi) + + entryId = f"entry{entryIndex}" + + if word_title: + words = [html.escape(word) for word in entry.l_word] + title = glos.wordTitleStr( + wordSep.join(words), + sample=entry.l_word[0], + class_="headword", + ) + + if not title: + title = f"Entry {entryIndex}" + + # entry_link_sym = "¶" + entry_link_sym = "🔗" + text = ( + f'<div id="{entryId}">{title}{nbsp}{nbsp}' + f'<a class="no_ul" class="entry_link" href="#{entryId}">' + f"{entry_link_sym}</a>" + f"{getEntryWebLink(entry)}" + f"<br>\n{defi}" + "</div>\n" + "<hr>\n" + ) + pos = fileObj.tell() + if pos > initFileSizeMax and pos > max_file_size - len( + text.encode(encoding), + ): + fileObj = self.nextFile() + fileObj.write( + pageHeader( + len(self._filenameList) - 1, + ), + ) + fileObj.write(navBar()) + pos = fileObj.tell() + tmpFilename = escapeNTB(self._filenameList[-1]) + for word in entry.l_word: + indexTxtFileObj.write( + f"{entryIndex}\t" + f"{escapeNTB(word)}\t" + f"{tmpFilename}\t" + f"{pos}\n", + ) + del tmpFilename + text = replaceBword(text) + addLinks(text, pos) + fileObj.write(text) + + fileObj.close() + self._fileObj = None + indexTxtFileObj.close() + + linksTxtFileObj.close() + + if linkTargetSet: + log.info(f"{len(linkTargetSet)} link targets found") + log.info("Fixing links, please wait...") + self.fixLinks(linkTargetSet) + + os.remove(join(filename, "links.txt")) diff --git a/pyglossary/plugins/info_plugin/__init__.py b/pyglossary/plugins/info_plugin/__init__.py index 8c4852ae0..57f4cc719 100644 --- a/pyglossary/plugins/info_plugin/__init__.py +++ b/pyglossary/plugins/info_plugin/__init__.py @@ -6,13 +6,9 @@ from pyglossary.info_writer import InfoWriter as Writer -if TYPE_CHECKING: - from collections.abc import Iterator +from .reader import Reader - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) +if TYPE_CHECKING: from pyglossary.option import Option __all__ = [ @@ -44,25 +40,3 @@ # key is option/argument name, value is instance of Option optionsProp: dict[str, Option] = {} - - 
-class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - - def close(self) -> None: - pass - - def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToData - - with open(filename, encoding="utf-8") as infoFp: - info = jsonToData(infoFp.read()) - for key, value in info.items(): - self._glos.setInfo(key, value) - - def __len__(self) -> int: - return 0 - - def __iter__(self) -> Iterator[EntryType | None]: - yield None diff --git a/pyglossary/plugins/info_plugin/reader.py b/pyglossary/plugins/info_plugin/reader.py new file mode 100644 index 000000000..f8c212230 --- /dev/null +++ b/pyglossary/plugins/info_plugin/reader.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + + def close(self) -> None: + pass + + def open(self, filename: str) -> None: + from pyglossary.json_utils import jsonToData + + with open(filename, encoding="utf-8") as infoFp: + info = jsonToData(infoFp.read()) + assert isinstance(info, dict) + for key, value in info.items(): + self._glos.setInfo(key, value) + + def __len__(self) -> int: + return 0 + + def __iter__(self) -> Iterator[EntryType | None]: + yield None diff --git a/pyglossary/plugins/jmdict/__init__.py b/pyglossary/plugins/jmdict/__init__.py index e5f88f31c..de0297912 100644 --- a/pyglossary/plugins/jmdict/__init__.py +++ b/pyglossary/plugins/jmdict/__init__.py @@ -1,28 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -import re -import unicodedata -from io import BytesIO -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - import io - from collections.abc import Callable, Iterator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - from pyglossary.lxml_types import Element, T_htmlfile - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, pip -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, IntOption, @@ -30,6 +8,8 @@ StrOption, ) +from .reader import Reader + __all__ = [ "Reader", "description", @@ -69,395 +49,3 @@ comment="Add translitation (romaji) of keywords", ), } - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - _example_padding: int = 10 - _example_color: str = "" - # _example_color: str = "#008FE1" - _translitation: bool = False - - tagStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" - re_inf_mapping = { - gikun_key: "gikun/jukujikun", - "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete - "word containing irregular kana usage": "irregular", - } - - @staticmethod - def makeList( - hf: T_htmlfile, - input_objects: list[Element], - processor: Callable, - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into <ol> if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) == 1: - hf.write(single_prefix) - processor(hf, input_objects[0]) - return - - with hf.element("ol"): - for el in input_objects: - with 
hf.element("li"): - processor(hf, el) - - # TODO: break it down - # PLR0912 Too many branches (23 > 12) - def writeSense( # noqa: PLR0912 - self, - hf: T_htmlfile, - sense: Element, - ) -> None: - from lxml import etree as ET - - def br() -> Element: - return ET.Element("br") - - for elem in sense.findall("pos"): - if not elem.text: - continue - desc = elem.text - if desc == "unclassified": - continue - with hf.element("i"): - hf.write(desc.capitalize()) - hf.write(br()) - - glossList = [elem.text.strip() for elem in sense.findall("gloss") if elem.text] - if glossList: - for i, gloss in enumerate(glossList): - if i > 0: - hf.write(", ") - hf.write(gloss) - hf.write(br()) - - relatedWords: list[str] = [] - for elem in sense.findall("xref"): - if not elem.text: - continue - word = elem.text.strip() - word = self._link_number_postfix.sub("", word) - relatedWords.append(word) - - if relatedWords: - hf.write("Related: ") - for i, word in enumerate(relatedWords): - if i > 0: - with hf.element("big"): - hf.write(" | ") - with hf.element("a", href=f"bword://{word}"): - hf.write(word) - hf.write(br()) - - antonymWords: list[str] = [] - for elem in sense.findall("ant"): - if not elem.text: - continue - word = elem.text.strip() - word = self._link_number_postfix.sub("", word) - antonymWords.append(word) - if antonymWords: - hf.write("Antonym: ") - for i, word in enumerate(antonymWords): - if i > 0: - with hf.element("big"): - hf.write(" | ") - with hf.element( - "a", - href=f"bword://{word}", - attrib={"class": "antonym"}, - ): - hf.write(word) - hf.write(br()) - - for i, elem in enumerate(sense.findall("field")): - if not elem.text: - continue - if i > 0: - hf.write(" ") - desc = elem.text - with hf.element("span", style=self.tagStyle): - hf.write(desc) - hf.write(br()) - - for i, elem in enumerate(sense.findall("misc")): - if not elem.text: - continue - if i > 0: - hf.write(" ") - desc = elem.text - with hf.element("small"): - with hf.element("span", style=self.tagStyle): - hf.write(desc) - hf.write(br()) - - examples = sense.findall("example") - # TODO: move to a method - if examples: # noqa: PLR1702 - with hf.element( - "div", - attrib={ - "class": "example", - "style": f"padding: {self._example_padding}px 0px;", - }, - ): - hf.write("Examples:") - with hf.element("ul"): - for i, elem in enumerate(examples): - if not elem.text: - continue - if i > 0: - hf.write(" ") - # one ex_srce (id?), one ex_text, and two ex_sent tags - textElem = elem.find("ex_text") - if textElem is None: - continue - if not textElem.text: - continue - text = textElem.text - sentList: list[str] = [] - for sentElem in elem.findall("ex_sent"): - if not sentElem.text: - continue - sentList.append(sentElem.text) - with hf.element("li"): - style: dict[str, str] = {} - if self._example_color: - style["color"] = self._example_color - with hf.element("font", attrib=style): - hf.write(text) - for sent in sentList: - hf.write(br()) - hf.write(sent) - - # TODO: break it down - def getEntryByElem( # noqa: PLR0912 - self, - entry: Element, - ) -> EntryType: - from lxml import etree as ET - - glos = self._glos - keywords: list[str] = [] - f = BytesIO() - translit = self._translitation - - def br() -> Element: - return ET.Element("br") - - with ET.htmlfile(f, encoding="utf-8") as hf: # noqa: PLR1702 - kebList: list[str] = [] - rebList: list[str] = [] - kebDisplayList: list[str] = [] - rebDisplayList: list[tuple[str, list[str]]] = [] - with hf.element("div"): - for k_ele in entry.findall("k_ele"): - keb = k_ele.find("keb") - if keb is 
None: - continue - if not keb.text: - continue - keb_text = keb.text - keb_text_norm = unicodedata.normalize("NFKC", keb_text) - keywords.append(keb_text_norm) - if keb_text != keb_text_norm: - keywords.append(keb_text) - kebList.append(keb_text) - keb_display = keb_text - if translit: - import romkan # type: ignore - - t_keb = romkan.to_roma(keb_text) - if t_keb and t_keb.isascii(): - keywords.append(t_keb) - keb_display += f" ({t_keb})" - kebDisplayList.append(keb_display) - # for elem in k_ele.findall("ke_pri"): - # log.info(elem.text) - - for r_ele in entry.findall("r_ele"): - reb = r_ele.find("reb") - if reb is None: - continue - if not reb.text: - continue - props: list[str] = [] - if r_ele.find("re_nokanji") is not None: - props.append("no kanji") - inf = r_ele.find("re_inf") - if inf is not None and inf.text: - props.append( - self.re_inf_mapping.get(inf.text, inf.text), - ) - keywords.append(reb.text) - reb_text = reb.text - rebList.append(reb_text) - reb_display = reb_text - if translit: - import romkan - - t_reb = romkan.to_roma(reb.text) - if t_reb and t_reb.isascii(): - keywords.append(t_reb) - reb_display += f" ({t_reb})" - rebDisplayList.append((reb_display, props)) - # for elem in r_ele.findall("re_pri"): - # log.info(elem.text) - - # this is for making internal links valid - # this makes too many alternates! - # but we don't seem to have a choice - # except for scanning and indexing all words once - # and then starting over and fixing/optimizing links - for s_keb in kebList: - for s_reb in rebList: - keywords.append(f"{s_keb}・{s_reb}") # noqa: PERF401 - - if kebDisplayList: - with hf.element(glos.titleTag(kebDisplayList[0])): - for i, s_keb in enumerate(kebDisplayList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - hf.write(s_keb) - hf.write(br()) - - if rebDisplayList: - for i, (s_reb, props) in enumerate(rebDisplayList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - with hf.element("font", color="green"): - hf.write(s_reb) - for prop in props: - hf.write(" ") - with hf.element("small"): - with hf.element("span", style=self.tagStyle): - hf.write(prop) - hf.write(br()) - - hf_ = cast("T_htmlfile", hf) - self.makeList( - hf_, - entry.findall("sense"), - self.writeSense, - ) - - defi = f.getvalue().decode("utf-8") - file = self._file - byteProgress = (file.tell(), self._fileSize) - return self._glos.newEntry( - keywords, - defi, - defiFormat="h", - byteProgress=byteProgress, - ) - - @staticmethod - def tostring(elem: Element) -> str: - from lxml import etree as ET - - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def setCreationTime(self, header: str) -> None: - m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) - if m is None: - return - self._glos.setInfo("creationTime", m.group(1)) - - def setMetadata(self, header: str) -> None: - # TODO: self.set_info("edition", ...) 
- self.setCreationTime(header) - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._wordCount = 0 - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._fileSize = 0 - self._link_number_postfix = re.compile("・[0-9]+$") - - def __len__(self) -> int: - return self._wordCount - - def close(self) -> None: - if self._file: - self._file.close() - self._file = nullBinaryIO - - def open( - self, - filename: str, - ) -> None: - try: - from lxml import etree as ET # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - self._fileSize = os.path.getsize(filename) - - self._glos.sourceLangName = "Japanese" - - self._glos.setDefaultDefiFormat("h") - self._glos.setInfo("definition_has_headwords", "True") - self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") - # also good: f"https://sakuradict.com/search?q={{word}}" - - header = "" - with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: - text_file = cast("io.TextIOBase", text_file) - for line in text_file: - if "<JMdict>" in line: - break - header += line - self.setMetadata(header) - - self._file = compressionOpen(filename, mode="rb") - - def __iter__(self) -> Iterator[EntryType]: - from lxml import etree as ET - - context = ET.iterparse( # type: ignore # noqa: PGH003 - self._file, - events=("end",), - tag="entry", - ) - for _, _elem in context: - elem = cast("Element", _elem) - yield self.getEntryByElem(elem) - # clean up preceding siblings to save memory - # this reduces memory usage from ~64 MB to ~30 MB - parent = elem.getparent() - if parent is None: - continue - while elem.getprevious() is not None: - del parent[0] diff --git a/pyglossary/plugins/jmdict/reader.py b/pyglossary/plugins/jmdict/reader.py new file mode 100644 index 000000000..16de72ffc --- /dev/null +++ b/pyglossary/plugins/jmdict/reader.py @@ -0,0 +1,417 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import os +import re +import unicodedata +from io import BytesIO +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Callable, Iterator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + from pyglossary.lxml_types import Element, T_htmlfile + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, pip +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + _example_padding: int = 10 + _example_color: str = "" + # _example_color: str = "#008FE1" + _translitation: bool = False + + tagStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" + re_inf_mapping = { + gikun_key: "gikun/jukujikun", + "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete + "word containing irregular kana usage": "irregular", + } + + @staticmethod + def makeList( + hf: T_htmlfile, + input_objects: list[Element], + processor: Callable, + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into <ol> if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + hf.write(single_prefix) + processor(hf, input_objects[0]) + return + + with 
hf.element("ol"): + for el in input_objects: + with hf.element("li"): + processor(hf, el) + + # TODO: break it down + # PLR0912 Too many branches (23 > 12) + def writeSense( # noqa: PLR0912 + self, + hf: T_htmlfile, + sense: Element, + ) -> None: + from lxml import etree as ET + + def br() -> Element: + return ET.Element("br") + + for elem in sense.findall("pos"): + if not elem.text: + continue + desc = elem.text + if desc == "unclassified": + continue + with hf.element("i"): + hf.write(desc.capitalize()) + hf.write(br()) + + glossList = [elem.text.strip() for elem in sense.findall("gloss") if elem.text] + if glossList: + for i, gloss in enumerate(glossList): + if i > 0: + hf.write(", ") + hf.write(gloss) + hf.write(br()) + + relatedWords: list[str] = [] + for elem in sense.findall("xref"): + if not elem.text: + continue + word = elem.text.strip() + word = self._link_number_postfix.sub("", word) + relatedWords.append(word) + + if relatedWords: + hf.write("Related: ") + for i, word in enumerate(relatedWords): + if i > 0: + with hf.element("big"): + hf.write(" | ") + with hf.element("a", href=f"bword://{word}"): + hf.write(word) + hf.write(br()) + + antonymWords: list[str] = [] + for elem in sense.findall("ant"): + if not elem.text: + continue + word = elem.text.strip() + word = self._link_number_postfix.sub("", word) + antonymWords.append(word) + if antonymWords: + hf.write("Antonym: ") + for i, word in enumerate(antonymWords): + if i > 0: + with hf.element("big"): + hf.write(" | ") + with hf.element( + "a", + href=f"bword://{word}", + attrib={"class": "antonym"}, + ): + hf.write(word) + hf.write(br()) + + for i, elem in enumerate(sense.findall("field")): + if not elem.text: + continue + if i > 0: + hf.write(" ") + desc = elem.text + with hf.element("span", style=self.tagStyle): + hf.write(desc) + hf.write(br()) + + for i, elem in enumerate(sense.findall("misc")): + if not elem.text: + continue + if i > 0: + hf.write(" ") + desc = elem.text + with hf.element("small"): + with hf.element("span", style=self.tagStyle): + hf.write(desc) + hf.write(br()) + + examples = sense.findall("example") + # TODO: move to a method + if examples: # noqa: PLR1702 + with hf.element( + "div", + attrib={ + "class": "example", + "style": f"padding: {self._example_padding}px 0px;", + }, + ): + hf.write("Examples:") + with hf.element("ul"): + for i, elem in enumerate(examples): + if not elem.text: + continue + if i > 0: + hf.write(" ") + # one ex_srce (id?), one ex_text, and two ex_sent tags + textElem = elem.find("ex_text") + if textElem is None: + continue + if not textElem.text: + continue + text = textElem.text + sentList: list[str] = [] + for sentElem in elem.findall("ex_sent"): + if not sentElem.text: + continue + sentList.append(sentElem.text) + with hf.element("li"): + style: dict[str, str] = {} + if self._example_color: + style["color"] = self._example_color + with hf.element("font", attrib=style): + hf.write(text) + for sent in sentList: + hf.write(br()) + hf.write(sent) + + # TODO: break it down + def getEntryByElem( # noqa: PLR0912 + self, + entry: Element, + ) -> EntryType: + from lxml import etree as ET + + glos = self._glos + keywords: list[str] = [] + f = BytesIO() + translit = self._translitation + + def br() -> Element: + return ET.Element("br") + + with ET.htmlfile(f, encoding="utf-8") as hf: # noqa: PLR1702 + kebList: list[str] = [] + rebList: list[str] = [] + kebDisplayList: list[str] = [] + rebDisplayList: list[tuple[str, list[str]]] = [] + with hf.element("div"): + for k_ele in 
entry.findall("k_ele"): + keb = k_ele.find("keb") + if keb is None: + continue + if not keb.text: + continue + keb_text = keb.text + keb_text_norm = unicodedata.normalize("NFKC", keb_text) + keywords.append(keb_text_norm) + if keb_text != keb_text_norm: + keywords.append(keb_text) + kebList.append(keb_text) + keb_display = keb_text + if translit: + import romkan # type: ignore + + t_keb = romkan.to_roma(keb_text) + if t_keb and t_keb.isascii(): + keywords.append(t_keb) + keb_display += f" ({t_keb})" + kebDisplayList.append(keb_display) + # for elem in k_ele.findall("ke_pri"): + # log.info(elem.text) + + for r_ele in entry.findall("r_ele"): + reb = r_ele.find("reb") + if reb is None: + continue + if not reb.text: + continue + props: list[str] = [] + if r_ele.find("re_nokanji") is not None: + props.append("no kanji") + inf = r_ele.find("re_inf") + if inf is not None and inf.text: + props.append( + self.re_inf_mapping.get(inf.text, inf.text), + ) + keywords.append(reb.text) + reb_text = reb.text + rebList.append(reb_text) + reb_display = reb_text + if translit: + import romkan + + t_reb = romkan.to_roma(reb.text) + if t_reb and t_reb.isascii(): + keywords.append(t_reb) + reb_display += f" ({t_reb})" + rebDisplayList.append((reb_display, props)) + # for elem in r_ele.findall("re_pri"): + # log.info(elem.text) + + # this is for making internal links valid + # this makes too many alternates! + # but we don't seem to have a choice + # except for scanning and indexing all words once + # and then starting over and fixing/optimizing links + for s_keb in kebList: + for s_reb in rebList: + keywords.append(f"{s_keb}・{s_reb}") # noqa: PERF401 + + if kebDisplayList: + with hf.element(glos.titleTag(kebDisplayList[0])): + for i, s_keb in enumerate(kebDisplayList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + hf.write(s_keb) + hf.write(br()) + + if rebDisplayList: + for i, (s_reb, props) in enumerate(rebDisplayList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + with hf.element("font", color="green"): + hf.write(s_reb) + for prop in props: + hf.write(" ") + with hf.element("small"): + with hf.element("span", style=self.tagStyle): + hf.write(prop) + hf.write(br()) + + hf_ = cast("T_htmlfile", hf) + self.makeList( + hf_, + entry.findall("sense"), + self.writeSense, + ) + + defi = f.getvalue().decode("utf-8") + file = self._file + byteProgress = (file.tell(), self._fileSize) + return self._glos.newEntry( + keywords, + defi, + defiFormat="h", + byteProgress=byteProgress, + ) + + @staticmethod + def tostring(elem: Element) -> str: + from lxml import etree as ET + + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def setCreationTime(self, header: str) -> None: + m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) + if m is None: + return + self._glos.setInfo("creationTime", m.group(1)) + + def setMetadata(self, header: str) -> None: + # TODO: self.set_info("edition", ...) 
+ self.setCreationTime(header) + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._wordCount = 0 + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._fileSize = 0 + self._link_number_postfix = re.compile("・[0-9]+$") + + def __len__(self) -> int: + return self._wordCount + + def close(self) -> None: + if self._file: + self._file.close() + self._file = nullBinaryIO + + def open( + self, + filename: str, + ) -> None: + try: + from lxml import etree as ET # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + self._fileSize = os.path.getsize(filename) + + self._glos.sourceLangName = "Japanese" + + self._glos.setDefaultDefiFormat("h") + self._glos.setInfo("definition_has_headwords", "True") + self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") + # also good: f"https://sakuradict.com/search?q={{word}}" + + header = "" + with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: + text_file = cast("io.TextIOBase", text_file) + for line in text_file: + if "<JMdict>" in line: + break + header += line + self.setMetadata(header) + + self._file = compressionOpen(filename, mode="rb") + + def __iter__(self) -> Iterator[EntryType]: + from lxml import etree as ET + + context = ET.iterparse( # type: ignore # noqa: PGH003 + self._file, + events=("end",), + tag="entry", + ) + for _, _elem in context: + elem = cast("Element", _elem) + yield self.getEntryByElem(elem) + # clean up preceding siblings to save memory + # this reduces memory usage from ~64 MB to ~30 MB + parent = elem.getparent() + if parent is None: + continue + while elem.getprevious() is not None: + del parent[0] diff --git a/pyglossary/plugins/jmnedict/__init__.py b/pyglossary/plugins/jmnedict/__init__.py index 59582d936..ba4213465 100644 --- a/pyglossary/plugins/jmnedict/__init__.py +++ b/pyglossary/plugins/jmnedict/__init__.py @@ -1,28 +1,13 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -import re -from io import BytesIO -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING -if TYPE_CHECKING: - import io - from collections.abc import Callable, Iterator +from .reader import Reader - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - from pyglossary.lxml_types import Element, T_htmlfile +if TYPE_CHECKING: from pyglossary.option import Option -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, pip -from pyglossary.io_utils import nullBinaryIO __all__ = [ "Reader", @@ -53,277 +38,3 @@ "EDRDG Wiki", ) optionsProp: dict[str, Option] = {} - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - tagStyle = ( - "color:white;" - "background:green;" - "padding-left:3px;" - "padding-right:3px;" - "border-radius:0.5ex;" - # 0.5ex ~= 0.3em, but "ex" is recommended - ) - - gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" - re_inf_mapping = { - gikun_key: "gikun/jukujikun", - "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete - "word containing irregular kana usage": "irregular", - } - - @staticmethod - def makeList( - hf: T_htmlfile, - input_objects: list[Element], - processor: Callable, - single_prefix: str = "", - skip_single: bool = True, - ) -> None: - """Wrap elements into <ol> if more than one element.""" - if not input_objects: - return - - if skip_single and len(input_objects) 
== 1: - hf.write(single_prefix) - processor(hf, input_objects[0]) - return - - with hf.element("ol"): - for el in input_objects: - with hf.element("li"): - processor(hf, el) - - def writeTrans( - self, - hf: T_htmlfile, - trans: Element, - ) -> None: - from lxml import etree as ET - - def br() -> Element: - return ET.Element("br") - - for elem in trans.findall("name_type"): - if not elem.text: - continue - desc = elem.text - with hf.element("i"): - hf.write(desc.capitalize()) - hf.write(br()) - - for elem in trans.findall("trans_det"): - if not elem.text: - continue - desc = elem.text - hf.write(desc) - hf.write(br()) - - relatedWords: list[str] = [] - for elem in trans.findall("xref"): - if not elem.text: - continue - word = elem.text.strip() - word = self._link_number_postfix.sub("", word) - relatedWords.append(word) - - if relatedWords: - hf.write("Related: ") - for i, word in enumerate(relatedWords): - if i > 0: - with hf.element("big"): - hf.write(" | ") - with hf.element("a", href=f"bword://{word}"): - hf.write(word) - hf.write(br()) - - def getEntryByElem( # noqa: PLR0912 - self, - entry: Element, - ) -> EntryType: - from lxml import etree as ET - - glos = self._glos - keywords: list[str] = [] - f = BytesIO() - - def br() -> Element: - return ET.Element("br") - - with ET.htmlfile(f, encoding="utf-8") as hf: # noqa: PLR1702 - kebList: list[str] = [] - rebList: list[tuple[str, list[str]]] = [] - with hf.element("div"): - for k_ele in entry.findall("k_ele"): - keb = k_ele.find("keb") - if keb is None: - continue - if not keb.text: - continue - kebList.append(keb.text) - keywords.append(keb.text) - # for elem in k_ele.findall("ke_pri"): - # log.info(elem.text) - - for r_ele in entry.findall("r_ele"): - reb = r_ele.find("reb") - if reb is None: - continue - if not reb.text: - continue - props: list[str] = [] - if r_ele.find("re_nokanji") is not None: - props.append("no kanji") - inf = r_ele.find("re_inf") - if inf is not None and inf.text: - props.append( - self.re_inf_mapping.get(inf.text, inf.text), - ) - rebList.append((reb.text, props)) - keywords.append(reb.text) - # for elem in r_ele.findall("re_pri"): - # log.info(elem.text) - - # this is for making internal links valid - # this makes too many alternates! 
- # but we don't seem to have a choice - # except for scanning and indexing all words once - # and then starting over and fixing/optimizing links - for s_keb in kebList: - for s_reb, _ in rebList: - keywords.append(f"{s_keb}・{s_reb}") - - if kebList: - with hf.element(glos.titleTag(kebList[0])): - for i, s_keb in enumerate(kebList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - hf.write(s_keb) - hf.write(br()) - - if rebList: - for i, (s_reb, props) in enumerate(rebList): - if i > 0: - with hf.element("font", color="red"): - hf.write(" | ") - with hf.element("font", color="green"): - hf.write(s_reb) - for prop in props: - hf.write(" ") - with hf.element("small"): - with hf.element("span", style=self.tagStyle): - hf.write(prop) - hf.write(br()) - - hf_ = cast("T_htmlfile", hf) - self.makeList( - hf_, - entry.findall("trans"), - self.writeTrans, - ) - - defi = f.getvalue().decode("utf-8") - file = self._file - byteProgress = (file.tell(), self._fileSize) - return self._glos.newEntry( - keywords, - defi, - defiFormat="h", - byteProgress=byteProgress, - ) - - @staticmethod - def tostring(elem: Element) -> str: - from lxml import etree as ET - - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - def setCreationTime(self, header: str) -> None: - m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) - if m is None: - return - self._glos.setInfo("creationTime", m.group(1)) - - def setMetadata(self, header: str) -> None: - # TODO: self.set_info("edition", ...) - self.setCreationTime(header) - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._wordCount = 0 - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._fileSize = 0 - self._link_number_postfix = re.compile("・[0-9]+$") - - def __len__(self) -> int: - return self._wordCount - - def close(self) -> None: - if self._file: - self._file.close() - self._file = nullBinaryIO - - def open( - self, - filename: str, - ) -> None: - try: - from lxml import etree as ET # noqa: F401 - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - self._fileSize = os.path.getsize(filename) - - self._glos.sourceLangName = "Japanese" - - self._glos.setDefaultDefiFormat("h") - self._glos.setInfo("definition_has_headwords", "True") - self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") - # also good: f"https://sakuradict.com/search?q={{word}}" - - header = "" - with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: - text_file = cast("io.TextIOBase", text_file) - for line in text_file: - if "<JMdict>" in line: - break - header += line - self.setMetadata(header) - - self._file = compressionOpen(filename, mode="rb") - - def __iter__(self) -> Iterator[EntryType]: - from lxml import etree as ET - - context = ET.iterparse( # type: ignore # noqa: PGH003 - self._file, - events=("end",), - tag="entry", - ) - for _, _elem in context: - elem = cast("Element", _elem) - yield self.getEntryByElem(elem) - # clean up preceding siblings to save memory - # this reduces memory usage from ~64 MB to ~30 MB - parent = elem.getparent() - if parent is None: - continue - while elem.getprevious() is not None: - del parent[0] diff --git a/pyglossary/plugins/jmnedict/reader.py b/pyglossary/plugins/jmnedict/reader.py new file mode 100644 index 000000000..8d25b8ce1 --- /dev/null +++ b/pyglossary/plugins/jmnedict/reader.py @@ -0,0 +1,298 @@ +# -*- coding: 
utf-8 -*- +from __future__ import annotations + +import os +import re +from io import BytesIO +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Callable, Iterator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + from pyglossary.lxml_types import Element, T_htmlfile + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, pip +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + tagStyle = ( + "color:white;" + "background:green;" + "padding-left:3px;" + "padding-right:3px;" + "border-radius:0.5ex;" + # 0.5ex ~= 0.3em, but "ex" is recommended + ) + + gikun_key = "gikun (meaning as reading) or jukujikun (special kanji reading)" + re_inf_mapping = { + gikun_key: "gikun/jukujikun", + "out-dated or obsolete kana usage": "obsolete", # outdated/obsolete + "word containing irregular kana usage": "irregular", + } + + @staticmethod + def makeList( + hf: T_htmlfile, + input_objects: list[Element], + processor: Callable, + single_prefix: str = "", + skip_single: bool = True, + ) -> None: + """Wrap elements into <ol> if more than one element.""" + if not input_objects: + return + + if skip_single and len(input_objects) == 1: + hf.write(single_prefix) + processor(hf, input_objects[0]) + return + + with hf.element("ol"): + for el in input_objects: + with hf.element("li"): + processor(hf, el) + + def writeTrans( + self, + hf: T_htmlfile, + trans: Element, + ) -> None: + from lxml import etree as ET + + def br() -> Element: + return ET.Element("br") + + for elem in trans.findall("name_type"): + if not elem.text: + continue + desc = elem.text + with hf.element("i"): + hf.write(desc.capitalize()) + hf.write(br()) + + for elem in trans.findall("trans_det"): + if not elem.text: + continue + desc = elem.text + hf.write(desc) + hf.write(br()) + + relatedWords: list[str] = [] + for elem in trans.findall("xref"): + if not elem.text: + continue + word = elem.text.strip() + word = self._link_number_postfix.sub("", word) + relatedWords.append(word) + + if relatedWords: + hf.write("Related: ") + for i, word in enumerate(relatedWords): + if i > 0: + with hf.element("big"): + hf.write(" | ") + with hf.element("a", href=f"bword://{word}"): + hf.write(word) + hf.write(br()) + + def getEntryByElem( # noqa: PLR0912 + self, + entry: Element, + ) -> EntryType: + from lxml import etree as ET + + glos = self._glos + keywords: list[str] = [] + f = BytesIO() + + def br() -> Element: + return ET.Element("br") + + with ET.htmlfile(f, encoding="utf-8") as hf: # noqa: PLR1702 + kebList: list[str] = [] + rebList: list[tuple[str, list[str]]] = [] + with hf.element("div"): + for k_ele in entry.findall("k_ele"): + keb = k_ele.find("keb") + if keb is None: + continue + if not keb.text: + continue + kebList.append(keb.text) + keywords.append(keb.text) + # for elem in k_ele.findall("ke_pri"): + # log.info(elem.text) + + for r_ele in entry.findall("r_ele"): + reb = r_ele.find("reb") + if reb is None: + continue + if not reb.text: + continue + props: list[str] = [] + if r_ele.find("re_nokanji") is not None: + props.append("no kanji") + inf = r_ele.find("re_inf") + if inf is not None and inf.text: + props.append( + self.re_inf_mapping.get(inf.text, inf.text), + ) + rebList.append((reb.text, props)) + keywords.append(reb.text) + # for elem in r_ele.findall("re_pri"): + # log.info(elem.text) + + # this is for making 
internal links valid + # this makes too many alternates! + # but we don't seem to have a choice + # except for scanning and indexing all words once + # and then starting over and fixing/optimizing links + for s_keb in kebList: + for s_reb, _ in rebList: + keywords.append(f"{s_keb}・{s_reb}") + + if kebList: + with hf.element(glos.titleTag(kebList[0])): + for i, s_keb in enumerate(kebList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + hf.write(s_keb) + hf.write(br()) + + if rebList: + for i, (s_reb, props) in enumerate(rebList): + if i > 0: + with hf.element("font", color="red"): + hf.write(" | ") + with hf.element("font", color="green"): + hf.write(s_reb) + for prop in props: + hf.write(" ") + with hf.element("small"): + with hf.element("span", style=self.tagStyle): + hf.write(prop) + hf.write(br()) + + hf_ = cast("T_htmlfile", hf) + self.makeList( + hf_, + entry.findall("trans"), + self.writeTrans, + ) + + defi = f.getvalue().decode("utf-8") + file = self._file + byteProgress = (file.tell(), self._fileSize) + return self._glos.newEntry( + keywords, + defi, + defiFormat="h", + byteProgress=byteProgress, + ) + + @staticmethod + def tostring(elem: Element) -> str: + from lxml import etree as ET + + return ( + ET.tostring( + elem, + method="html", + pretty_print=True, + ) + .decode("utf-8") + .strip() + ) + + def setCreationTime(self, header: str) -> None: + m = re.search("JMdict created: ([0-9]{4}-[0-9]{2}-[0-9]{2})", header) + if m is None: + return + self._glos.setInfo("creationTime", m.group(1)) + + def setMetadata(self, header: str) -> None: + # TODO: self.set_info("edition", ...) + self.setCreationTime(header) + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._wordCount = 0 + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._fileSize = 0 + self._link_number_postfix = re.compile("・[0-9]+$") + + def __len__(self) -> int: + return self._wordCount + + def close(self) -> None: + if self._file: + self._file.close() + self._file = nullBinaryIO + + def open( + self, + filename: str, + ) -> None: + try: + from lxml import etree as ET # noqa: F401 + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + self._fileSize = os.path.getsize(filename) + + self._glos.sourceLangName = "Japanese" + + self._glos.setDefaultDefiFormat("h") + self._glos.setInfo("definition_has_headwords", "True") + self._glos.setInfo("entry_url", "https://jisho.org/search/{word}") + # also good: f"https://sakuradict.com/search?q={{word}}" + + header = "" + with compressionOpen(filename, mode="rt", encoding="utf-8") as text_file: + text_file = cast("io.TextIOBase", text_file) + for line in text_file: + if "<JMdict>" in line: + break + header += line + self.setMetadata(header) + + self._file = compressionOpen(filename, mode="rb") + + def __iter__(self) -> Iterator[EntryType]: + from lxml import etree as ET + + context = ET.iterparse( # type: ignore # noqa: PGH003 + self._file, + events=("end",), + tag="entry", + ) + for _, _elem in context: + elem = cast("Element", _elem) + yield self.getEntryByElem(elem) + # clean up preceding siblings to save memory + # this reduces memory usage from ~64 MB to ~30 MB + parent = elem.getparent() + if parent is None: + continue + while elem.getprevious() is not None: + del parent[0] diff --git a/pyglossary/plugins/json_plugin/__init__.py b/pyglossary/plugins/json_plugin/__init__.py index 83fdbbb10..a21b50f69 100644 --- 
a/pyglossary/plugins/json_plugin/__init__.py +++ b/pyglossary/plugins/json_plugin/__init__.py @@ -2,25 +2,13 @@ from __future__ import annotations -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - # compressionOpen, - stdCompressions, -) from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) +from .writer import Writer __all__ = [ "Writer", @@ -58,53 +46,3 @@ comment="add headwords title to beginning of definition", ), } - - -class Writer: - _encoding: str = "utf-8" - _enable_info: bool = True - _resources: bool = True - _word_title: bool = False - - compressions = stdCompressions - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - glos.preventDuplicateWords() - - def open(self, filename: str) -> None: - self._filename = filename - - def finish(self) -> None: - self._filename = "" - - def write(self) -> Generator[None, EntryType, None]: - from json import dumps - - from pyglossary.text_writer import writeTxt - - glos = self._glos - encoding = self._encoding - enable_info = self._enable_info - resources = self._resources - - ensure_ascii = encoding == "ascii" - - def escape(st: str) -> str: - return dumps(st, ensure_ascii=ensure_ascii) - - yield from writeTxt( - glos, - entryFmt="\t{word}: {defi},\n", - filename=self._filename, - encoding=encoding, - writeInfo=enable_info, - wordEscapeFunc=escape, - defiEscapeFunc=escape, - ext=".json", - head="{\n", - tail='\t"": ""\n}', - resources=resources, - word_title=self._word_title, - ) diff --git a/pyglossary/plugins/json_plugin/writer.py b/pyglossary/plugins/json_plugin/writer.py new file mode 100644 index 000000000..48375802b --- /dev/null +++ b/pyglossary/plugins/json_plugin/writer.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import ( + EntryType, + GlossaryType, + ) + +class Writer: + _encoding: str = "utf-8" + _enable_info: bool = True + _resources: bool = True + _word_title: bool = False + + compressions = stdCompressions + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + glos.preventDuplicateWords() + + def open(self, filename: str) -> None: + self._filename = filename + + def finish(self) -> None: + self._filename = "" + + def write(self) -> Generator[None, EntryType, None]: + from json import dumps + + from pyglossary.text_writer import writeTxt + + glos = self._glos + encoding = self._encoding + enable_info = self._enable_info + resources = self._resources + + ensure_ascii = encoding == "ascii" + + def escape(st: str) -> str: + return dumps(st, ensure_ascii=ensure_ascii) + + yield from writeTxt( + glos, + entryFmt="\t{word}: {defi},\n", + filename=self._filename, + encoding=encoding, + writeInfo=enable_info, + wordEscapeFunc=escape, + defiEscapeFunc=escape, + ext=".json", + head="{\n", + tail='\t"": ""\n}', + resources=resources, + word_title=self._word_title, + ) diff --git a/pyglossary/plugins/lingoes_ldf/__init__.py b/pyglossary/plugins/lingoes_ldf/__init__.py index 41f9c3269..e63e43e93 100644 --- a/pyglossary/plugins/lingoes_ldf/__init__.py +++ b/pyglossary/plugins/lingoes_ldf/__init__.py @@ -1,27 +1,15 @@ # -*- coding: 
utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING - -from pyglossary.compression import ( - # compressionOpen, - stdCompressions, -) -from pyglossary.core import log -from pyglossary.file_utils import fileCountLines from pyglossary.option import ( BoolOption, EncodingOption, NewlineOption, Option, ) -from pyglossary.text_reader import TextGlossaryReader, nextBlockResultType -from pyglossary.text_utils import splitByBar -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .reader import Reader +from .writer import Writer __all__ = [ "Reader", @@ -57,121 +45,3 @@ "resources": BoolOption(comment="Enable resources / data files"), "encoding": EncodingOption(), } - - -class Reader(TextGlossaryReader): - compressions = stdCompressions - - def __len__(self) -> int: - if self._wordCount is None: - log.debug("Try not to use len(reader) as it takes extra time") - self._wordCount = ( - fileCountLines( - self._filename, - newline=b"\n\n", - ) - - self._leadingLinesCount - ) - return self._wordCount - - @classmethod - def isInfoWord(cls, word: str) -> bool: - if isinstance(word, str): - return word.startswith("#") - - return False - - @classmethod - def fixInfoWord(cls, word: str) -> str: - if isinstance(word, str): - return word.lstrip("#").lower() - - return word - - def nextBlock(self) -> nextBlockResultType: - if not self._file: - raise StopIteration - entryLines: list[str] = [] - while True: - line = self.readline() - if not line: - raise StopIteration - line = line.rstrip("\n\r") # FIXME - if line.startswith("###"): - parts = line.split(":") - key = parts[0].strip() - value = ":".join(parts[1:]).strip() - return key, value, None - - if line: - entryLines.append(line) - continue - - # now `line` is empty, process `entryLines` - if not entryLines: - return None - if len(entryLines) < 2: - log.error( - f"invalid block near pos {self._file.tell()}" - f" in file {self._filename}", - ) - return None - word = entryLines[0] - defi = "\n".join(entryLines[1:]) - defi = defi.replace("<br/>", "\n") # FIXME - - words = splitByBar(word) - - return words, defi, None - - -class Writer: - compressions = stdCompressions - - _newline: str = "\n" - _resources: bool = True - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - - def getInfo(self, key: str) -> str: - return self._glos.getInfo(key).replace("\n", "<br>") - - def getAuthor(self) -> str: - return self._glos.author.replace("\n", "<br>") - - def finish(self) -> None: - self._filename = "" - - def open(self, filename: str) -> None: - self._filename = filename - - @staticmethod - def _defiEscapeFunc(defi: str) -> str: - return defi.replace("\n", "<br/>") - - def write(self) -> Generator[None, EntryType, None]: - from pyglossary.text_writer import writeTxt - - newline = self._newline - resources = self._resources - head = ( - f"###Title: {self.getInfo('title')}\n" - f"###Description: {self.getInfo('description')}\n" - f"###Author: {self.getAuthor()}\n" - f"###Email: {self.getInfo('email')}\n" - f"###Website: {self.getInfo('website')}\n" - f"###Copyright: {self.getInfo('copyright')}\n" - ) - yield from writeTxt( - self._glos, - entryFmt="{word}\n{defi}\n\n", - filename=self._filename, - writeInfo=False, - defiEscapeFunc=self._defiEscapeFunc, - ext=".ldf", - head=head, - newline=newline, - resources=resources, - ) diff --git a/pyglossary/plugins/lingoes_ldf/reader.py b/pyglossary/plugins/lingoes_ldf/reader.py new file 
mode 100644 index 000000000..211056bfe --- /dev/null +++ b/pyglossary/plugins/lingoes_ldf/reader.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) +from pyglossary.core import log +from pyglossary.file_utils import fileCountLines +from pyglossary.text_reader import TextGlossaryReader, nextBlockResultType +from pyglossary.text_utils import splitByBar + + +class Reader(TextGlossaryReader): + compressions = stdCompressions + + def __len__(self) -> int: + if self._wordCount is None: + log.debug("Try not to use len(reader) as it takes extra time") + self._wordCount = ( + fileCountLines( + self._filename, + newline=b"\n\n", + ) + - self._leadingLinesCount + ) + return self._wordCount + + @classmethod + def isInfoWord(cls, word: str) -> bool: + if isinstance(word, str): + return word.startswith("#") + + return False + + @classmethod + def fixInfoWord(cls, word: str) -> str: + if isinstance(word, str): + return word.lstrip("#").lower() + + return word + + def nextBlock(self) -> nextBlockResultType: + if not self._file: + raise StopIteration + entryLines: list[str] = [] + while True: + line = self.readline() + if not line: + raise StopIteration + line = line.rstrip("\n\r") # FIXME + if line.startswith("###"): + parts = line.split(":") + key = parts[0].strip() + value = ":".join(parts[1:]).strip() + return key, value, None + + if line: + entryLines.append(line) + continue + + # now `line` is empty, process `entryLines` + if not entryLines: + return None + if len(entryLines) < 2: + log.error( + f"invalid block near pos {self._file.tell()}" + f" in file {self._filename}", + ) + return None + word = entryLines[0] + defi = "\n".join(entryLines[1:]) + defi = defi.replace("<br/>", "\n") # FIXME + + words = splitByBar(word) + + return words, defi, None diff --git a/pyglossary/plugins/lingoes_ldf/writer.py b/pyglossary/plugins/lingoes_ldf/writer.py new file mode 100644 index 000000000..331c9fd31 --- /dev/null +++ b/pyglossary/plugins/lingoes_ldf/writer.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pyglossary.compression import ( + # compressionOpen, + stdCompressions, +) + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + +class Writer: + compressions = stdCompressions + + _newline: str = "\n" + _resources: bool = True + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + + def getInfo(self, key: str) -> str: + return self._glos.getInfo(key).replace("\n", "<br>") + + def getAuthor(self) -> str: + return self._glos.author.replace("\n", "<br>") + + def finish(self) -> None: + self._filename = "" + + def open(self, filename: str) -> None: + self._filename = filename + + @staticmethod + def _defiEscapeFunc(defi: str) -> str: + return defi.replace("\n", "<br/>") + + def write(self) -> Generator[None, EntryType, None]: + from pyglossary.text_writer import writeTxt + + newline = self._newline + resources = self._resources + head = ( + f"###Title: {self.getInfo('title')}\n" + f"###Description: {self.getInfo('description')}\n" + f"###Author: {self.getAuthor()}\n" + f"###Email: {self.getInfo('email')}\n" + f"###Website: {self.getInfo('website')}\n" + f"###Copyright: {self.getInfo('copyright')}\n" + ) + yield from writeTxt( + self._glos, + entryFmt="{word}\n{defi}\n\n", + filename=self._filename, + 
writeInfo=False, + defiEscapeFunc=self._defiEscapeFunc, + ext=".ldf", + head=head, + newline=newline, + resources=resources, + ) diff --git a/pyglossary/plugins/makindo_medical/__init__.py b/pyglossary/plugins/makindo_medical/__init__.py index 2e2f5f579..07f783113 100644 --- a/pyglossary/plugins/makindo_medical/__init__.py +++ b/pyglossary/plugins/makindo_medical/__init__.py @@ -1,14 +1,11 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import html from typing import TYPE_CHECKING -if TYPE_CHECKING: - import sqlite3 - from collections.abc import Iterator +from .reader import Reader - from pyglossary.glossary_types import EntryType, GlossaryType +if TYPE_CHECKING: from pyglossary.option import Option __all__ = [ @@ -40,50 +37,3 @@ "Makindo.co.uk Comprehensive Medical Encyclopedia", ) optionsProp: dict[str, Option] = {} - - -class Reader: - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._clear() - - def _clear(self) -> None: - self._filename = "" - self._con: sqlite3.Connection | None = None - self._cur: sqlite3.Cursor | None = None - - def open(self, filename: str) -> None: - from sqlite3 import connect - - self._filename = filename - self._con = connect(filename) - self._cur = self._con.cursor() - self._glos.setDefaultDefiFormat("h") - - def __len__(self) -> int: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute("select count(*) from NEW_TABLE") - return self._cur.fetchone()[0] - - def __iter__(self) -> Iterator[EntryType]: - if self._cur is None: - raise ValueError("cur is None") - self._cur.execute( - "select _id, contents from NEW_TABLE where _id is not null", - ) - # FIXME: iteration over self._cur stops after one entry - # and self._cur.fetchone() returns None - # for row in self._cur: - for row in self._cur.fetchall(): - word = html.unescape(row[0]) - definition = row[1].decode("utf-8", errors="ignore") - # print(f"{word!r}, {definition!r}") - yield self._glos.newEntry(word, definition, defiFormat="h") - - def close(self) -> None: - if self._cur: - self._cur.close() - if self._con: - self._con.close() - self._clear() diff --git a/pyglossary/plugins/makindo_medical/reader.py b/pyglossary/plugins/makindo_medical/reader.py new file mode 100644 index 000000000..3d2f027b8 --- /dev/null +++ b/pyglossary/plugins/makindo_medical/reader.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import html +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import sqlite3 + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + +class Reader: + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._clear() + + def _clear(self) -> None: + self._filename = "" + self._con: sqlite3.Connection | None = None + self._cur: sqlite3.Cursor | None = None + + def open(self, filename: str) -> None: + from sqlite3 import connect + + self._filename = filename + self._con = connect(filename) + self._cur = self._con.cursor() + self._glos.setDefaultDefiFormat("h") + + def __len__(self) -> int: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute("select count(*) from NEW_TABLE") + return self._cur.fetchone()[0] + + def __iter__(self) -> Iterator[EntryType]: + if self._cur is None: + raise ValueError("cur is None") + self._cur.execute( + "select _id, contents from NEW_TABLE where _id is not null", + ) + # FIXME: iteration over self._cur stops after one entry + # and self._cur.fetchone() returns None + # for row in 
self._cur: + for row in self._cur.fetchall(): + word = html.unescape(row[0]) + definition = row[1].decode("utf-8", errors="ignore") + # print(f"{word!r}, {definition!r}") + yield self._glos.newEntry(word, definition, defiFormat="h") + + def close(self) -> None: + if self._cur: + self._cur.close() + if self._con: + self._con.close() + self._clear() diff --git a/pyglossary/plugins/octopus_mdict_new/__init__.py b/pyglossary/plugins/octopus_mdict_new/__init__.py index 244609819..bdd3aa239 100644 --- a/pyglossary/plugins/octopus_mdict_new/__init__.py +++ b/pyglossary/plugins/octopus_mdict_new/__init__.py @@ -1,43 +1,13 @@ # -*- coding: utf-8 -*- -# Read Octopus MDict dictionary format, mdx(dictionary)/mdd(data) -# -# Copyright © 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com> -# Copyright © 2013-2021 Saeed Rasooli <saeed.gnu@gmail.com> -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# You can get a copy of GNU General Public License along this program -# But you can always get it from http://www.gnu.org/licenses/gpl.txt -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. from __future__ import annotations -import gc -import os -import re -import sys -from os.path import dirname, extsep, isfile, join, splitext -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Iterator - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.plugin_lib.readmdict import MDD, MDX - - -from pyglossary.core import log from pyglossary.option import ( BoolOption, EncodingOption, Option, ) -from pyglossary.text_utils import toStr + +from .reader import Reader __all__ = [ "Reader", @@ -87,189 +57,3 @@ then try to install [LZO library and Python binding](./doc/lzo.md).""", ), ] - - -class Reader: - _encoding: str = "" - _substyle: bool = True - _same_dir_data_files: bool = False - _audio: bool = False - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self.clear() - self._re_internal_link = re.compile("href=([\"'])(entry://|[dx]:)") - self._re_audio_link = re.compile( - '<a (type="sound" )?([^<>]*? 
)?href="sound://([^<>"]+)"( .*?)?>(.*?)</a>', - ) - - def clear(self) -> None: - self._filename = "" - self._mdx: MDX | None = None - self._mdd: list[MDD] = [] - self._wordCount = 0 - self._dataEntryCount = 0 - - # dict of mainWord -> newline-separated alternatives - self._linksDict: dict[str, str] = {} - - def open(self, filename: str) -> None: - from pyglossary.plugin_lib.readmdict import MDD, MDX - - self._filename = filename - self._mdx = MDX(filename, self._encoding, self._substyle) - - """ - multiple MDD files are supported with this naming schema: - FILE.mdx - FILE.mdd - FILE.1.mdd - FILE.2.mdd - FILE.3.mdd - """ - - filenameNoExt, _ext = splitext(self._filename) - mddBase = filenameNoExt + extsep - for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"): - if isfile(fname): - self._mdd.append(MDD(fname)) - mddN = 2 - while isfile(f"{mddBase}{mddN}.mdd"): - self._mdd.append(MDD(f"{mddBase}{mddN}.mdd")) - mddN += 1 - - dataEntryCount = 0 - for mdd in self._mdd: - dataEntryCount += len(mdd) - self._dataEntryCount = dataEntryCount - log.info(f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries") - - # from pprint import pformat - # log.debug("mdx.header = " + pformat(self._mdx.header)) - # for key, value in self._mdx.header.items(): - # key = key.lower() - # self._glos.setInfo(key, value) - try: - title = toStr(self._mdx.header[b"Title"]) - except KeyError: - pass - else: - title = title.strip() - if title == "Title (No HTML code allowed)": - # TODO: how to avoid this? - title = "" - if title: - self._glos.setInfo("name", title) - desc = toStr(self._mdx.header.get(b"Description", "")) - if desc: - self._glos.setInfo("description", desc) - - self.loadLinks() - - def loadLinks(self) -> None: - from pyglossary.plugin_lib.readmdict import MDX - - mdx = self._mdx - if mdx is None: - raise ValueError("mdx is None") - - log.info("extracting links...") - linksDict: dict[str, str] = {} - word = "" - wordCount = 0 - for b_word, b_defi in mdx.items(): - word = b_word.decode("utf-8") - defi = b_defi.decode("utf-8").strip() - if defi.startswith("@@@LINK="): - if not word: - log.warning(f"unexpected defi: {defi}") - continue - mainWord = defi[8:] - if mainWord in linksDict: - linksDict[mainWord] += "\n" + word - else: - linksDict[mainWord] = word - continue - wordCount += 1 - - log.info( - f"extracting links done, sizeof(linksDict)={sys.getsizeof(linksDict)}", - ) - log.info(f"{wordCount = }") - self._linksDict = linksDict - self._wordCount = wordCount - self._mdx = MDX(self._filename, self._encoding, self._substyle) - - def fixDefi(self, defi: str) -> str: - defi = self._re_internal_link.sub(r"href=\1bword://", defi) - defi = defi.replace(' src="file://', ' src=".') - - if self._audio: - # \5 is the possible elements between <a ...> and </a> - # but anything between <audio...> and </audio> is completely - # ignored by Aaard2 Web and browser - # and there is no point adding it after </audio> - # which makes it shown after audio controls - - # GoldenDict acts completely different, so must use - # audio_goldendict=True option in StarDict writer instead. 
- - defi = self._re_audio_link.sub( - r'<audio controls src="\3"></audio>', - defi, - ) - - return defi - - def __iter__(self) -> Iterator[EntryType]: - if self._mdx is None: - log.error("trying to iterate on a closed MDX file") - return - - glos = self._glos - linksDict = self._linksDict - for b_word, b_defi in self._mdx.items(): - word = b_word.decode("utf-8") - defi = b_defi.decode("utf-8").strip() - if defi.startswith("@@@LINK="): - continue - defi = self.fixDefi(defi) - words = word - altsStr = linksDict.get(word, "") - if altsStr: - words = [word] + altsStr.split("\n") - yield glos.newEntry(words, defi) - - self._mdx = None - del linksDict - self._linksDict = {} - gc.collect() - - if self._same_dir_data_files: - dirPath = dirname(self._filename) - for fname in os.listdir(dirPath): - ext = splitext(fname)[1].lower() - if ext in {".mdx", ".mdd"}: - continue - fpath = join(dirPath, fname) - if not isfile(fpath): - continue - with open(fpath, mode="rb") as _file: - b_data = _file.read() - yield glos.newDataEntry(fname, b_data) - - for mdd in self._mdd: - try: - for b_fname, b_data in mdd.items(): - fname = toStr(b_fname) - fname = fname.replace("\\", os.sep).lstrip(os.sep) - yield glos.newDataEntry(fname, b_data) - except Exception: # noqa: PERF203 - log.exception(f"Error reading {mdd.filename}") - self._mdd = [] - - def __len__(self) -> int: - return self._wordCount + self._dataEntryCount - - def close(self) -> None: - self.clear() diff --git a/pyglossary/plugins/octopus_mdict_new/reader.py b/pyglossary/plugins/octopus_mdict_new/reader.py new file mode 100644 index 000000000..f154200dc --- /dev/null +++ b/pyglossary/plugins/octopus_mdict_new/reader.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- +# Read Octopus MDict dictionary format, mdx(dictionary)/mdd(data) +# +# Copyright © 2013 Xiaoqiang Wang <xiaoqiangwang AT gmail DOT com> +# Copyright © 2013-2021 Saeed Rasooli <saeed.gnu@gmail.com> +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. +# +# You can get a copy of GNU General Public License along this program +# But you can always get it from http://www.gnu.org/licenses/gpl.txt +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +from __future__ import annotations + +import gc +import os +import re +import sys +from os.path import dirname, extsep, isfile, join, splitext +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.plugin_lib.readmdict import MDD, MDX + + +from pyglossary.core import log +from pyglossary.text_utils import toStr + + +class Reader: + _encoding: str = "" + _substyle: bool = True + _same_dir_data_files: bool = False + _audio: bool = False + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self.clear() + self._re_internal_link = re.compile("href=([\"'])(entry://|[dx]:)") + self._re_audio_link = re.compile( + '<a (type="sound" )?([^<>]*? 
)?href="sound://([^<>"]+)"( .*?)?>(.*?)</a>', + ) + + def clear(self) -> None: + self._filename = "" + self._mdx: MDX | None = None + self._mdd: list[MDD] = [] + self._wordCount = 0 + self._dataEntryCount = 0 + + # dict of mainWord -> newline-separated alternatives + self._linksDict: dict[str, str] = {} + + def open(self, filename: str) -> None: + from pyglossary.plugin_lib.readmdict import MDD, MDX + + self._filename = filename + self._mdx = MDX(filename, self._encoding, self._substyle) + + """ + multiple MDD files are supported with this naming schema: + FILE.mdx + FILE.mdd + FILE.1.mdd + FILE.2.mdd + FILE.3.mdd + """ + + filenameNoExt, _ext = splitext(self._filename) + mddBase = filenameNoExt + extsep + for fname in (f"{mddBase}mdd", f"{mddBase}1.mdd"): + if isfile(fname): + self._mdd.append(MDD(fname)) + mddN = 2 + while isfile(f"{mddBase}{mddN}.mdd"): + self._mdd.append(MDD(f"{mddBase}{mddN}.mdd")) + mddN += 1 + + dataEntryCount = 0 + for mdd in self._mdd: + dataEntryCount += len(mdd) + self._dataEntryCount = dataEntryCount + log.info(f"Found {len(self._mdd)} mdd files with {dataEntryCount} entries") + + # from pprint import pformat + # log.debug("mdx.header = " + pformat(self._mdx.header)) + # for key, value in self._mdx.header.items(): + # key = key.lower() + # self._glos.setInfo(key, value) + try: + title = toStr(self._mdx.header[b"Title"]) + except KeyError: + pass + else: + title = title.strip() + if title == "Title (No HTML code allowed)": + # TODO: how to avoid this? + title = "" + if title: + self._glos.setInfo("name", title) + desc = toStr(self._mdx.header.get(b"Description", "")) + if desc: + self._glos.setInfo("description", desc) + + self.loadLinks() + + def loadLinks(self) -> None: + from pyglossary.plugin_lib.readmdict import MDX + + mdx = self._mdx + if mdx is None: + raise ValueError("mdx is None") + + log.info("extracting links...") + linksDict: dict[str, str] = {} + word = "" + wordCount = 0 + for b_word, b_defi in mdx.items(): + word = b_word.decode("utf-8") + defi = b_defi.decode("utf-8").strip() + if defi.startswith("@@@LINK="): + if not word: + log.warning(f"unexpected defi: {defi}") + continue + mainWord = defi[8:] + if mainWord in linksDict: + linksDict[mainWord] += "\n" + word + else: + linksDict[mainWord] = word + continue + wordCount += 1 + + log.info( + f"extracting links done, sizeof(linksDict)={sys.getsizeof(linksDict)}", + ) + log.info(f"{wordCount = }") + self._linksDict = linksDict + self._wordCount = wordCount + self._mdx = MDX(self._filename, self._encoding, self._substyle) + + def fixDefi(self, defi: str) -> str: + defi = self._re_internal_link.sub(r"href=\1bword://", defi) + defi = defi.replace(' src="file://', ' src=".') + + if self._audio: + # \5 is the possible elements between <a ...> and </a> + # but anything between <audio...> and </audio> is completely + # ignored by Aaard2 Web and browser + # and there is no point adding it after </audio> + # which makes it shown after audio controls + + # GoldenDict acts completely different, so must use + # audio_goldendict=True option in StarDict writer instead. 
+ + defi = self._re_audio_link.sub( + r'<audio controls src="\3"></audio>', + defi, + ) + + return defi + + def __iter__(self) -> Iterator[EntryType]: + if self._mdx is None: + log.error("trying to iterate on a closed MDX file") + return + + glos = self._glos + linksDict = self._linksDict + for b_word, b_defi in self._mdx.items(): + word = b_word.decode("utf-8") + defi = b_defi.decode("utf-8").strip() + if defi.startswith("@@@LINK="): + continue + defi = self.fixDefi(defi) + words = word + altsStr = linksDict.get(word, "") + if altsStr: + words = [word] + altsStr.split("\n") + yield glos.newEntry(words, defi) + + self._mdx = None + del linksDict + self._linksDict = {} + gc.collect() + + if self._same_dir_data_files: + dirPath = dirname(self._filename) + for fname in os.listdir(dirPath): + ext = splitext(fname)[1].lower() + if ext in {".mdx", ".mdd"}: + continue + fpath = join(dirPath, fname) + if not isfile(fpath): + continue + with open(fpath, mode="rb") as _file: + b_data = _file.read() + yield glos.newDataEntry(fname, b_data) + + for mdd in self._mdd: + try: + for b_fname, b_data in mdd.items(): + fname = toStr(b_fname) + fname = fname.replace("\\", os.sep).lstrip(os.sep) + yield glos.newDataEntry(fname, b_data) + except Exception: # noqa: PERF203 + log.exception(f"Error reading {mdd.filename}") + self._mdd = [] + + def __len__(self) -> int: + return self._wordCount + self._dataEntryCount + + def close(self) -> None: + self.clear() diff --git a/pyglossary/plugins/sql/__init__.py b/pyglossary/plugins/sql/__init__.py index fce4cfb56..c0629c979 100644 --- a/pyglossary/plugins/sql/__init__.py +++ b/pyglossary/plugins/sql/__init__.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import annotations -from typing import TYPE_CHECKING - from pyglossary.option import ( BoolOption, EncodingOption, @@ -11,11 +9,7 @@ Option, ) -if TYPE_CHECKING: - import io - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType, GlossaryType +from .writer import Writer __all__ = [ "Writer", @@ -49,133 +43,3 @@ "newline": NewlineOption(), "transaction": BoolOption(comment="Use TRANSACTION"), } - - -class Writer: - _encoding: str = "utf-8" - _info_keys: list | None = None - _add_extra_info: bool = True - _newline: str = "<br>" - _transaction: bool = False - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase | None = None - - def finish(self) -> None: - self._filename = "" - if self._file: - self._file.close() - self._file = None - - def open(self, filename: str) -> None: - self._filename = filename - self._file = open(filename, "w", encoding=self._encoding) - self._writeInfo() - - def _writeInfo(self) -> None: - fileObj = self._file - if fileObj is None: - raise ValueError("fileObj is None") - newline = self._newline - info_keys = self._getInfoKeys() - infoDefLine = "CREATE TABLE dbinfo (" - infoValues: list[str] = [] - glos = self._glos - - for key in info_keys: - value = glos.getInfo(key) - value = ( - value.replace("'", "''") - .replace("\x00", "") - .replace("\r", "") - .replace("\n", newline) - ) - infoValues.append(f"'{value}'") - infoDefLine += f"{key} char({len(value)}), " - - infoDefLine = infoDefLine[:-2] + ");" - fileObj.write(infoDefLine + "\n") - - if self._add_extra_info: - fileObj.write( - "CREATE TABLE dbinfo_extra (" - "'id' INTEGER PRIMARY KEY NOT NULL, " - "'name' TEXT UNIQUE, 'value' TEXT);\n", - ) - - fileObj.write( - "CREATE TABLE word ('id' INTEGER PRIMARY KEY NOT NULL, " - "'w' TEXT, 
'm' TEXT);\n", - ) - fileObj.write( - "CREATE TABLE alt ('id' INTEGER NOT NULL, 'w' TEXT);\n", - ) - - if self._transaction: - fileObj.write("BEGIN TRANSACTION;\n") - fileObj.write(f"INSERT INTO dbinfo VALUES({','.join(infoValues)});\n") - - if self._add_extra_info: - extraInfo = glos.getExtraInfos(info_keys) - for index, (key, value) in enumerate(extraInfo.items()): - key2 = key.replace("'", "''") - value2 = value.replace("'", "''") - fileObj.write( - f"INSERT INTO dbinfo_extra VALUES({index + 1}, " - f"'{key2}', '{value2}');\n", - ) - - def _getInfoKeys(self) -> list[str]: - info_keys = self._info_keys - if info_keys: - return info_keys - return [ - "dbname", - "author", - "version", - "direction", - "origLang", - "destLang", - "license", - "category", - "description", - ] - - def write(self) -> Generator[None, EntryType, None]: - newline = self._newline - - fileObj = self._file - if fileObj is None: - raise ValueError("fileObj is None") - - def fixStr(word: str) -> str: - return word.replace("'", "''").replace("\r", "").replace("\n", newline) - - id_ = 1 - while True: - entry = yield - if entry is None: - break - if entry.isData(): - # FIXME - continue - words = entry.l_word - word = fixStr(words[0]) - defi = fixStr(entry.defi) - fileObj.write( - f"INSERT INTO word VALUES({id_}, '{word}', '{defi}');\n", - ) - for alt in words[1:]: - fileObj.write( - f"INSERT INTO alt VALUES({id_}, '{fixStr(alt)}');\n", - ) - id_ += 1 - - if self._transaction: - fileObj.write("END TRANSACTION;\n") - - fileObj.write("CREATE INDEX ix_word_w ON word(w COLLATE NOCASE);\n") - fileObj.write("CREATE INDEX ix_alt_id ON alt(id COLLATE NOCASE);\n") - fileObj.write("CREATE INDEX ix_alt_w ON alt(w COLLATE NOCASE);\n") diff --git a/pyglossary/plugins/sql/writer.py b/pyglossary/plugins/sql/writer.py new file mode 100644 index 000000000..3042b5c7d --- /dev/null +++ b/pyglossary/plugins/sql/writer.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + import io + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType, GlossaryType + +class Writer: + _encoding: str = "utf-8" + _info_keys: list | None = None + _add_extra_info: bool = True + _newline: str = "<br>" + _transaction: bool = False + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase | None = None + + def finish(self) -> None: + self._filename = "" + if self._file: + self._file.close() + self._file = None + + def open(self, filename: str) -> None: + self._filename = filename + self._file = open(filename, "w", encoding=self._encoding) + self._writeInfo() + + def _writeInfo(self) -> None: + fileObj = self._file + if fileObj is None: + raise ValueError("fileObj is None") + newline = self._newline + info_keys = self._getInfoKeys() + infoDefLine = "CREATE TABLE dbinfo (" + infoValues: list[str] = [] + glos = self._glos + + for key in info_keys: + value = glos.getInfo(key) + value = ( + value.replace("'", "''") + .replace("\x00", "") + .replace("\r", "") + .replace("\n", newline) + ) + infoValues.append(f"'{value}'") + infoDefLine += f"{key} char({len(value)}), " + + infoDefLine = infoDefLine[:-2] + ");" + fileObj.write(infoDefLine + "\n") + + if self._add_extra_info: + fileObj.write( + "CREATE TABLE dbinfo_extra (" + "'id' INTEGER PRIMARY KEY NOT NULL, " + "'name' TEXT UNIQUE, 'value' TEXT);\n", + ) + + fileObj.write( + "CREATE TABLE word ('id' INTEGER PRIMARY KEY NOT NULL, " + "'w' 
TEXT, 'm' TEXT);\n", + ) + fileObj.write( + "CREATE TABLE alt ('id' INTEGER NOT NULL, 'w' TEXT);\n", + ) + + if self._transaction: + fileObj.write("BEGIN TRANSACTION;\n") + fileObj.write(f"INSERT INTO dbinfo VALUES({','.join(infoValues)});\n") + + if self._add_extra_info: + extraInfo = glos.getExtraInfos(info_keys) + for index, (key, value) in enumerate(extraInfo.items()): + key2 = key.replace("'", "''") + value2 = value.replace("'", "''") + fileObj.write( + f"INSERT INTO dbinfo_extra VALUES({index + 1}, " + f"'{key2}', '{value2}');\n", + ) + + def _getInfoKeys(self) -> list[str]: + info_keys = self._info_keys + if info_keys: + return info_keys + return [ + "dbname", + "author", + "version", + "direction", + "origLang", + "destLang", + "license", + "category", + "description", + ] + + def write(self) -> Generator[None, EntryType, None]: + newline = self._newline + + fileObj = self._file + if fileObj is None: + raise ValueError("fileObj is None") + + def fixStr(word: str) -> str: + return word.replace("'", "''").replace("\r", "").replace("\n", newline) + + id_ = 1 + while True: + entry = yield + if entry is None: + break + if entry.isData(): + # FIXME + continue + words = entry.l_word + word = fixStr(words[0]) + defi = fixStr(entry.defi) + fileObj.write( + f"INSERT INTO word VALUES({id_}, '{word}', '{defi}');\n", + ) + for alt in words[1:]: + fileObj.write( + f"INSERT INTO alt VALUES({id_}, '{fixStr(alt)}');\n", + ) + id_ += 1 + + if self._transaction: + fileObj.write("END TRANSACTION;\n") + + fileObj.write("CREATE INDEX ix_word_w ON word(w COLLATE NOCASE);\n") + fileObj.write("CREATE INDEX ix_alt_id ON alt(id COLLATE NOCASE);\n") + fileObj.write("CREATE INDEX ix_alt_w ON alt(w COLLATE NOCASE);\n") diff --git a/pyglossary/plugins/stardict_merge_syns/__init__.py b/pyglossary/plugins/stardict_merge_syns/__init__.py index b13cb423f..d1ef62fc7 100644 --- a/pyglossary/plugins/stardict_merge_syns/__init__.py +++ b/pyglossary/plugins/stardict_merge_syns/__init__.py @@ -2,10 +2,6 @@ from __future__ import annotations import os -from time import perf_counter as now -from typing import ( - TYPE_CHECKING, -) from pyglossary.flags import ALWAYS, DEFAULT_YES from pyglossary.option import ( @@ -13,17 +9,8 @@ Option, StrOption, ) -from pyglossary.plugins.stardict import Writer as StdWriter - -if TYPE_CHECKING: - from collections.abc import Generator - - from pyglossary.glossary_types import EntryType - -from pyglossary.core import log -from pyglossary.glossary_utils import Error -from pyglossary.text_utils import uint32ToBytes +from .writer import Writer __all__ = [ "Writer", @@ -97,121 +84,3 @@ if os.getenv("PYGLOSSARY_STARDICT_NO_FORCE_SORT") == "1": sortOnWrite = DEFAULT_YES - - -class Writer(StdWriter): - dictzipSynFile = False - - def fixDefi(self, defi: str, defiFormat: str) -> bytes: # noqa: ARG002, PLR6301 - return defi.encode("utf-8") - - def writeCompact( - self, - defiFormat: str, - ) -> Generator[None, EntryType, None]: - """ - Build StarDict dictionary with sametypesequence option specified. - Every item definition consists of a single article. - All articles have the same format, specified in defiFormat parameter. 
- - defiFormat - format of article definition: h - html, m - plain text - """ - log.debug(f"writeCompact: {defiFormat=}") - - idxBlockList = self.newIdxList() - altIndexList = self.newSynList() - - dictFile = open(self._filename + ".dict", "wb") - - t0 = now() - - dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() - - dictMark, entryIndex = 0, -1 - while True: - entry = yield - if entry is None: - break - if entry.isData(): - entry.save(self._resDir) - continue - entryIndex += 1 - - b_dictBlock = self.fixDefi(entry.defi, defiFormat) - dictFile.write(b_dictBlock) - - b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) - for b_word in entry.lb_word: - idxBlockList.append((b_word, b_idxBlock)) - - dictMark += len(b_dictBlock) - - if dictMark > dictMarkMax: - raise Error( - f"StarDict: {dictMark = } is too big, set option large_file=true", - ) - - dictFile.close() - log.info(f"Writing dict file took {now() - t0:.2f} seconds") - - self.writeIdxFile(idxBlockList) - - self.writeIfoFile( - len(idxBlockList), - len(altIndexList), - ) - - def writeGeneral(self) -> Generator[None, EntryType, None]: - """ - Build StarDict dictionary in general case. - Every item definition may consist of an arbitrary number of articles. - sametypesequence option is not used. - """ - log.debug("writeGeneral") - idxBlockList = self.newIdxList() - altIndexList = self.newSynList() - - dictFile = open(self._filename + ".dict", "wb") - - t0 = now() - - dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() - - dictMark, entryIndex = 0, -1 - while True: - entry = yield - if entry is None: - break - if entry.isData(): - entry.save(self._resDir) - continue - entryIndex += 1 - - defiFormat = entry.detectDefiFormat("m") # call no more than once - - b_defi = self.fixDefi(entry.defi, defiFormat) - b_dictBlock = defiFormat.encode("ascii") + b_defi + b"\x00" - dictFile.write(b_dictBlock) - - b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) - for b_word in entry.lb_word: - idxBlockList.append((b_word, b_idxBlock)) - - dictMark += len(b_dictBlock) - - if dictMark > dictMarkMax: - raise Error( - f"StarDict: {dictMark = } is too big, set option large_file=true", - ) - - dictFile.close() - log.info(f"Writing dict file took {now() - t0:.2f} seconds") - - self.writeIdxFile(idxBlockList) - - self.writeIfoFile( - len(idxBlockList), - len(altIndexList), - ) - - # TODO: override getDescription to indicate merge_syns diff --git a/pyglossary/plugins/stardict_merge_syns/writer.py b/pyglossary/plugins/stardict_merge_syns/writer.py new file mode 100644 index 000000000..ba0349d04 --- /dev/null +++ b/pyglossary/plugins/stardict_merge_syns/writer.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from time import perf_counter as now +from typing import ( + TYPE_CHECKING, +) + +from pyglossary.plugins.stardict import Writer as StdWriter + +if TYPE_CHECKING: + from collections.abc import Generator + + from pyglossary.glossary_types import EntryType + + +from pyglossary.core import log +from pyglossary.glossary_utils import Error +from pyglossary.text_utils import uint32ToBytes + + +class Writer(StdWriter): + dictzipSynFile = False + + def fixDefi(self, defi: str, defiFormat: str) -> bytes: # noqa: ARG002, PLR6301 + return defi.encode("utf-8") + + def writeCompact( + self, + defiFormat: str, + ) -> Generator[None, EntryType, None]: + """ + Build StarDict dictionary with sametypesequence option specified. + Every item definition consists of a single article. 
+ All articles have the same format, specified in defiFormat parameter. + + defiFormat - format of article definition: h - html, m - plain text + """ + log.debug(f"writeCompact: {defiFormat=}") + + idxBlockList = self.newIdxList() + altIndexList = self.newSynList() + + dictFile = open(self._filename + ".dict", "wb") + + t0 = now() + + dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() + + dictMark, entryIndex = 0, -1 + while True: + entry = yield + if entry is None: + break + if entry.isData(): + entry.save(self._resDir) + continue + entryIndex += 1 + + b_dictBlock = self.fixDefi(entry.defi, defiFormat) + dictFile.write(b_dictBlock) + + b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) + for b_word in entry.lb_word: + idxBlockList.append((b_word, b_idxBlock)) + + dictMark += len(b_dictBlock) + + if dictMark > dictMarkMax: + raise Error( + f"StarDict: {dictMark = } is too big, set option large_file=true", + ) + + dictFile.close() + log.info(f"Writing dict file took {now() - t0:.2f} seconds") + + self.writeIdxFile(idxBlockList) + + self.writeIfoFile( + len(idxBlockList), + len(altIndexList), + ) + + def writeGeneral(self) -> Generator[None, EntryType, None]: + """ + Build StarDict dictionary in general case. + Every item definition may consist of an arbitrary number of articles. + sametypesequence option is not used. + """ + log.debug("writeGeneral") + idxBlockList = self.newIdxList() + altIndexList = self.newSynList() + + dictFile = open(self._filename + ".dict", "wb") + + t0 = now() + + dictMarkToBytes, dictMarkMax = self.dictMarkToBytesFunc() + + dictMark, entryIndex = 0, -1 + while True: + entry = yield + if entry is None: + break + if entry.isData(): + entry.save(self._resDir) + continue + entryIndex += 1 + + defiFormat = entry.detectDefiFormat("m") # call no more than once + + b_defi = self.fixDefi(entry.defi, defiFormat) + b_dictBlock = defiFormat.encode("ascii") + b_defi + b"\x00" + dictFile.write(b_dictBlock) + + b_idxBlock = dictMarkToBytes(dictMark) + uint32ToBytes(len(b_dictBlock)) + for b_word in entry.lb_word: + idxBlockList.append((b_word, b_idxBlock)) + + dictMark += len(b_dictBlock) + + if dictMark > dictMarkMax: + raise Error( + f"StarDict: {dictMark = } is too big, set option large_file=true", + ) + + dictFile.close() + log.info(f"Writing dict file took {now() - t0:.2f} seconds") + + self.writeIdxFile(idxBlockList) + + self.writeIfoFile( + len(idxBlockList), + len(altIndexList), + ) + + # TODO: override getDescription to indicate merge_syns diff --git a/pyglossary/plugins/stardict_textual/__init__.py b/pyglossary/plugins/stardict_textual/__init__.py index a54d04266..80dc78d69 100644 --- a/pyglossary/plugins/stardict_textual/__init__.py +++ b/pyglossary/plugins/stardict_textual/__init__.py @@ -1,34 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import annotations -import os -from os.path import dirname, isdir, join -from typing import TYPE_CHECKING, cast - -if TYPE_CHECKING: - import io - from collections.abc import Generator, Iterator - - from lxml import builder - - from pyglossary.glossary_types import EntryType, GlossaryType - from pyglossary.lxml_types import Element - from pyglossary.xdxf.transform import XdxfTransformer - - -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, log, pip -from pyglossary.html_utils import unescape_unicode -from pyglossary.io_utils import nullBinaryIO from pyglossary.option import ( BoolOption, EncodingOption, Option, ) +from .reader import 
Reader +from .writer import Writer + __all__ = [ "Reader", "Writer", @@ -66,337 +47,3 @@ comment="Convert XDXF entries to HTML", ), } - - -class Reader: - _encoding: str = "utf-8" - _xdxf_to_html: bool = True - - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file: io.IOBase = nullBinaryIO - self._fileSize = 0 - self._xdxfTr: XdxfTransformer | None = None - - def xdxf_setup(self) -> XdxfTransformer: - from pyglossary.xdxf.transform import XdxfTransformer - - self._xdxfTr = tr = XdxfTransformer(encoding="utf-8") - return tr - - def xdxf_transform(self, text: str) -> str: - tr = self._xdxfTr - if tr is None: - tr = self.xdxf_setup() - return tr.transformByInnerString(text) - - def __len__(self) -> int: - return 0 - - def close(self) -> None: - self._file.close() - self._file = nullBinaryIO - self._filename = "" - self._fileSize = 0 - - def open(self, filename: str) -> None: - try: - from lxml import etree as ET - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - cfile = compressionOpen(filename, mode="rb") - - if cfile.seekable(): - cfile.seek(0, 2) - self._fileSize = cfile.tell() - cfile.seek(0) - # self._glos.setInfo("input_file_size", f"{self._fileSize}") - else: - log.warning("StarDict Textual File Reader: file is not seekable") - - context = ET.iterparse( # type: ignore # noqa: PGH003 - cfile, - events=("end",), - tag="info", - ) - for _, elem in context: - self.setMetadata(elem) # type: ignore - break - - cfile.close() - - def setGlosInfo(self, key: str, value: str) -> None: - if value is None: - return - self._glos.setInfo(key, unescape_unicode(value)) - - def setMetadata(self, header: Element) -> None: - if (elem := header.find("./bookname")) is not None and elem.text: - self.setGlosInfo("name", elem.text) - - if (elem := header.find("./author")) is not None and elem.text: - self.setGlosInfo("author", elem.text) - - if (elem := header.find("./email")) is not None and elem.text: - self.setGlosInfo("email", elem.text) - - if (elem := header.find("./website")) is not None and elem.text: - self.setGlosInfo("website", elem.text) - - if (elem := header.find("./description")) is not None and elem.text: - self.setGlosInfo("description", elem.text) - - if (elem := header.find("./bookname")) is not None and elem.text: - self.setGlosInfo("name", elem.text) - - if (elem := header.find("./bookname")) is not None and elem.text: - self.setGlosInfo("name", elem.text) - - if (elem := header.find("./date")) is not None and elem.text: - self.setGlosInfo("creationTime", elem.text) - - # if (elem := header.find("./dicttype")) is not None and elem.text: - # self.setGlosInfo("dicttype", elem.text) - - def renderDefiList( - self, - defisWithFormat: list[tuple[str, str]], - ) -> tuple[str, str]: - if not defisWithFormat: - return "", "" - if len(defisWithFormat) == 1: - return defisWithFormat[0] - - defiFormatSet: set[str] = set() - defiFormatSet.update(_type for _, _type in defisWithFormat) - - if len(defiFormatSet) == 1: - format_ = defiFormatSet.pop() - if format_ == "h": - return "\n<hr>".join([defi for defi, _ in defisWithFormat]), format_ - return "\n".join([defi for defi, _ in defisWithFormat]), format_ - - # convert plaintext or xdxf to html - defis: list[str] = [] - for defi_, format_ in defisWithFormat: - if format_ == "m": - defis.append("<pre>" + defi_.replace("\n", "<br/>") + "</pre>") - elif format_ 
== "x": - defis.append(self.xdxf_transform(defi_)) - else: - defis.append(defi_) - return "\n<hr>\n".join(defis), "h" - - def __iter__(self) -> Iterator[EntryType]: - from lxml import etree as ET - - glos = self._glos - fileSize = self._fileSize - self._file = file = compressionOpen(self._filename, mode="rb") - context = ET.iterparse( # type: ignore # noqa: PGH003 - self._file, - events=("end",), - tag="article", - ) - for _, _elem in context: - elem = cast("Element", _elem) - words: list[str] = [] - defisWithFormat: list[tuple[str, str]] = [] - for child in elem.iterchildren(): - if not child.text: - continue - if child.tag in {"key", "synonym"}: - words.append(child.text) - elif child.tag == "definition": - type_ = child.attrib.get("type", "") - if type_: - new_type = { - "m": "m", - "t": "m", - "y": "m", - "g": "h", - "h": "h", - "x": "x", - }.get(type_, "") - if not new_type: - log.warning(f"unsupported definition type {type_}") - type_ = new_type - if not type_: - type_ = "m" - defi_ = child.text.strip() - if type_ == "x" and self._xdxf_to_html: - defi_ = self.xdxf_transform(defi_) - type_ = "h" - defisWithFormat.append((defi_, type_)) - # TODO: child.tag == "definition-r" - else: - log.warning(f"unknown tag {child.tag}") - - defi, defiFormat = self.renderDefiList(defisWithFormat) - - yield glos.newEntry( - words, - defi, - defiFormat=defiFormat, - byteProgress=(file.tell(), fileSize), - ) - - # clean up preceding siblings to save memory - # this can reduce memory usage from >300 MB to ~25 MB - while elem.getprevious() is not None: - parent = elem.getparent() - if parent is None: - break - del parent[0] - - -class Writer: - _encoding: str = "utf-8" - - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._resDir = "" - - def open( - self, - filename: str, - ) -> None: - self._filename = filename - self._resDir = join(dirname(self._filename), "res") - self._file = compressionOpen( - self._filename, - mode="w", - encoding=self._encoding, - ) - - def finish(self) -> None: - self._file.close() - - def writeInfo( - self, - maker: builder.ElementMaker, - pretty: bool, - ) -> None: - from lxml import etree as ET - - glos = self._glos - - desc = glos.getInfo("description") - copyright_ = glos.getInfo("copyright") - if copyright_: - desc = f"{copyright_}\n{desc}" - publisher = glos.getInfo("publisher") - if publisher: - desc = f"Publisher: {publisher}\n{desc}" - - info = maker.info( - maker.version("3.0.0"), - maker.bookname(glos.getInfo("name")), - maker.author(glos.getInfo("author")), - maker.email(glos.getInfo("email")), - maker.website(glos.getInfo("website")), - maker.description(desc), - maker.date(glos.getInfo("creationTime")), - maker.dicttype(""), - ) - file = self._file - file.write( - cast( - "bytes", - ET.tostring( - info, - encoding=self._encoding, - pretty_print=pretty, - ), - ).decode(self._encoding) - + "\n", - ) - - def writeDataEntry( - self, - maker: builder.ElementMaker, # noqa: ARG002 - entry: EntryType, - ) -> None: - entry.save(self._resDir) - # TODO: create article tag with "definition-r" in it? - # or just save the file to res/ directory? or both? 
- # article = maker.article( - # maker.key(entry.s_word), - # maker.definition_r( - # ET.CDATA(entry.defi), - # **{"type": ext}) - # ) - # ) - - def write(self) -> Generator[None, EntryType, None]: - from lxml import builder - from lxml import etree as ET - - file = self._file - encoding = self._encoding - maker = builder.ElementMaker() - - file.write( - """<?xml version="1.0" encoding="UTF-8" ?> -<stardict xmlns:xi="http://www.w3.org/2003/XInclude"> -""", - ) - - self.writeInfo(maker, pretty=True) - - if not isdir(self._resDir): - os.mkdir(self._resDir) - - pretty = True - while True: - entry = yield - if entry is None: - break - if entry.isData(): - self.writeDataEntry(maker, entry) - continue - entry.detectDefiFormat() - article = maker.article( - maker.key(entry.l_word[0]), - ) - for alt in entry.l_word[1:]: - article.append(maker.synonym(alt)) - article.append( - maker.definition( - ET.CDATA(entry.defi), - type=entry.defiFormat, - ), - ) - ET.indent(article, space="") - articleStr = cast( - "bytes", - ET.tostring( - article, - pretty_print=pretty, - encoding=encoding, - ), - ).decode(encoding) - # for some reason, "´k" becomes " ́k" (for example) # noqa: RUF003 - # stardict-text2bin tool also does this. - # https://en.wiktionary.org/wiki/%CB%88#Translingual - self._file.write(articleStr + "\n") - - file.write("</stardict>") - - if not os.listdir(self._resDir): - os.rmdir(self._resDir) diff --git a/pyglossary/plugins/stardict_textual/reader.py b/pyglossary/plugins/stardict_textual/reader.py new file mode 100644 index 000000000..91fea26c8 --- /dev/null +++ b/pyglossary/plugins/stardict_textual/reader.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + import io + from collections.abc import Iterator + + from pyglossary.glossary_types import EntryType, GlossaryType + from pyglossary.lxml_types import Element + from pyglossary.xdxf.transform import XdxfTransformer + + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) +from pyglossary.core import exc_note, log, pip +from pyglossary.html_utils import unescape_unicode +from pyglossary.io_utils import nullBinaryIO + + +class Reader: + _encoding: str = "utf-8" + _xdxf_to_html: bool = True + + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._file: io.IOBase = nullBinaryIO + self._fileSize = 0 + self._xdxfTr: XdxfTransformer | None = None + + def xdxf_setup(self) -> XdxfTransformer: + from pyglossary.xdxf.transform import XdxfTransformer + + self._xdxfTr = tr = XdxfTransformer(encoding="utf-8") + return tr + + def xdxf_transform(self, text: str) -> str: + tr = self._xdxfTr + if tr is None: + tr = self.xdxf_setup() + return tr.transformByInnerString(text) + + def __len__(self) -> int: + return 0 + + def close(self) -> None: + self._file.close() + self._file = nullBinaryIO + self._filename = "" + self._fileSize = 0 + + def open(self, filename: str) -> None: + try: + from lxml import etree as ET + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install lxml` to install") + raise + + self._filename = filename + cfile = compressionOpen(filename, mode="rb") + + if cfile.seekable(): + cfile.seek(0, 2) + self._fileSize = cfile.tell() + cfile.seek(0) + # self._glos.setInfo("input_file_size", f"{self._fileSize}") + else: + log.warning("StarDict Textual File Reader: file is not seekable") + + context = 
ET.iterparse( # type: ignore # noqa: PGH003 + cfile, + events=("end",), + tag="info", + ) + for _, elem in context: + self.setMetadata(elem) # type: ignore + break + + cfile.close() + + def setGlosInfo(self, key: str, value: str) -> None: + if value is None: + return + self._glos.setInfo(key, unescape_unicode(value)) + + def setMetadata(self, header: Element) -> None: + if (elem := header.find("./bookname")) is not None and elem.text: + self.setGlosInfo("name", elem.text) + + if (elem := header.find("./author")) is not None and elem.text: + self.setGlosInfo("author", elem.text) + + if (elem := header.find("./email")) is not None and elem.text: + self.setGlosInfo("email", elem.text) + + if (elem := header.find("./website")) is not None and elem.text: + self.setGlosInfo("website", elem.text) + + if (elem := header.find("./description")) is not None and elem.text: + self.setGlosInfo("description", elem.text) + + if (elem := header.find("./bookname")) is not None and elem.text: + self.setGlosInfo("name", elem.text) + + if (elem := header.find("./bookname")) is not None and elem.text: + self.setGlosInfo("name", elem.text) + + if (elem := header.find("./date")) is not None and elem.text: + self.setGlosInfo("creationTime", elem.text) + + # if (elem := header.find("./dicttype")) is not None and elem.text: + # self.setGlosInfo("dicttype", elem.text) + + def renderDefiList( + self, + defisWithFormat: list[tuple[str, str]], + ) -> tuple[str, str]: + if not defisWithFormat: + return "", "" + if len(defisWithFormat) == 1: + return defisWithFormat[0] + + defiFormatSet: set[str] = set() + defiFormatSet.update(_type for _, _type in defisWithFormat) + + if len(defiFormatSet) == 1: + format_ = defiFormatSet.pop() + if format_ == "h": + return "\n<hr>".join([defi for defi, _ in defisWithFormat]), format_ + return "\n".join([defi for defi, _ in defisWithFormat]), format_ + + # convert plaintext or xdxf to html + defis: list[str] = [] + for defi_, format_ in defisWithFormat: + if format_ == "m": + defis.append("<pre>" + defi_.replace("\n", "<br/>") + "</pre>") + elif format_ == "x": + defis.append(self.xdxf_transform(defi_)) + else: + defis.append(defi_) + return "\n<hr>\n".join(defis), "h" + + def __iter__(self) -> Iterator[EntryType]: + from lxml import etree as ET + + glos = self._glos + fileSize = self._fileSize + self._file = file = compressionOpen(self._filename, mode="rb") + context = ET.iterparse( # type: ignore # noqa: PGH003 + self._file, + events=("end",), + tag="article", + ) + for _, _elem in context: + elem = cast("Element", _elem) + words: list[str] = [] + defisWithFormat: list[tuple[str, str]] = [] + for child in elem.iterchildren(): + if not child.text: + continue + if child.tag in {"key", "synonym"}: + words.append(child.text) + elif child.tag == "definition": + type_ = child.attrib.get("type", "") + if type_: + new_type = { + "m": "m", + "t": "m", + "y": "m", + "g": "h", + "h": "h", + "x": "x", + }.get(type_, "") + if not new_type: + log.warning(f"unsupported definition type {type_}") + type_ = new_type + if not type_: + type_ = "m" + defi_ = child.text.strip() + if type_ == "x" and self._xdxf_to_html: + defi_ = self.xdxf_transform(defi_) + type_ = "h" + defisWithFormat.append((defi_, type_)) + # TODO: child.tag == "definition-r" + else: + log.warning(f"unknown tag {child.tag}") + + defi, defiFormat = self.renderDefiList(defisWithFormat) + + yield glos.newEntry( + words, + defi, + defiFormat=defiFormat, + byteProgress=(file.tell(), fileSize), + ) + + # clean up preceding siblings to save 
memory + # this can reduce memory usage from >300 MB to ~25 MB + while elem.getprevious() is not None: + parent = elem.getparent() + if parent is None: + break + del parent[0] diff --git a/pyglossary/plugins/stardict_textual/writer.py b/pyglossary/plugins/stardict_textual/writer.py new file mode 100644 index 000000000..c7681d839 --- /dev/null +++ b/pyglossary/plugins/stardict_textual/writer.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +from __future__ import annotations + +import os +from os.path import dirname, isdir, join +from typing import TYPE_CHECKING, cast + +if TYPE_CHECKING: + from collections.abc import Generator + + from lxml import builder + + from pyglossary.glossary_types import EntryType, GlossaryType + + +from pyglossary.compression import ( + compressionOpen, + stdCompressions, +) + + +class Writer: + _encoding: str = "utf-8" + + compressions = stdCompressions + depends = { + "lxml": "lxml", + } + + def __init__(self, glos: GlossaryType) -> None: + self._glos = glos + self._filename = "" + self._resDir = "" + + def open( + self, + filename: str, + ) -> None: + self._filename = filename + self._resDir = join(dirname(self._filename), "res") + self._file = compressionOpen( + self._filename, + mode="w", + encoding=self._encoding, + ) + + def finish(self) -> None: + self._file.close() + + def writeInfo( + self, + maker: builder.ElementMaker, + pretty: bool, + ) -> None: + from lxml import etree as ET + + glos = self._glos + + desc = glos.getInfo("description") + copyright_ = glos.getInfo("copyright") + if copyright_: + desc = f"{copyright_}\n{desc}" + publisher = glos.getInfo("publisher") + if publisher: + desc = f"Publisher: {publisher}\n{desc}" + + info = maker.info( + maker.version("3.0.0"), + maker.bookname(glos.getInfo("name")), + maker.author(glos.getInfo("author")), + maker.email(glos.getInfo("email")), + maker.website(glos.getInfo("website")), + maker.description(desc), + maker.date(glos.getInfo("creationTime")), + maker.dicttype(""), + ) + file = self._file + file.write( + cast( + "bytes", + ET.tostring( + info, + encoding=self._encoding, + pretty_print=pretty, + ), + ).decode(self._encoding) + + "\n", + ) + + def writeDataEntry( + self, + maker: builder.ElementMaker, # noqa: ARG002 + entry: EntryType, + ) -> None: + entry.save(self._resDir) + # TODO: create article tag with "definition-r" in it? + # or just save the file to res/ directory? or both? 
+ # article = maker.article( + # maker.key(entry.s_word), + # maker.definition_r( + # ET.CDATA(entry.defi), + # **{"type": ext}) + # ) + # ) + + def write(self) -> Generator[None, EntryType, None]: + from lxml import builder + from lxml import etree as ET + + file = self._file + encoding = self._encoding + maker = builder.ElementMaker() + + file.write( + """<?xml version="1.0" encoding="UTF-8" ?> +<stardict xmlns:xi="http://www.w3.org/2003/XInclude"> +""", + ) + + self.writeInfo(maker, pretty=True) + + if not isdir(self._resDir): + os.mkdir(self._resDir) + + pretty = True + while True: + entry = yield + if entry is None: + break + if entry.isData(): + self.writeDataEntry(maker, entry) + continue + entry.detectDefiFormat() + article = maker.article( + maker.key(entry.l_word[0]), + ) + for alt in entry.l_word[1:]: + article.append(maker.synonym(alt)) + article.append( + maker.definition( + ET.CDATA(entry.defi), + type=entry.defiFormat, + ), + ) + ET.indent(article, space="") + articleStr = cast( + "bytes", + ET.tostring( + article, + pretty_print=pretty, + encoding=encoding, + ), + ).decode(encoding) + # for some reason, "´k" becomes " ́k" (for example) # noqa: RUF003 + # stardict-text2bin tool also does this. + # https://en.wiktionary.org/wiki/%CB%88#Translingual + self._file.write(articleStr + "\n") + + file.write("</stardict>") + + if not os.listdir(self._resDir): + os.rmdir(self._resDir) diff --git a/tests/deprecated/glossary_security_test.py b/tests/deprecated/glossary_security_test.py index 78f55f060..81fd531d8 100644 --- a/tests/deprecated/glossary_security_test.py +++ b/tests/deprecated/glossary_security_test.py @@ -62,5 +62,6 @@ def test_convert_4(self): self.assertIsNone(res) self.assertLogCritical("Unable to detect output format!") + if __name__ == "__main__": unittest.main()
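Note on the refactor above: each plugin package's __init__.py now only re-exports the moved classes, so the import surface used by the plugin loader and by external code is unchanged; Reader and/or Writer still appear in each package's __all__, but are now sourced from the new reader.py/writer.py modules. A minimal sketch of that unchanged surface, using two of the packages touched in this diff (the aliases are illustrative and not part of PyGlossary):

    # Illustrative only: imports resolve exactly as before the split.
    from pyglossary.plugins.sql import Writer as SqlWriter
    from pyglossary.plugins.stardict_textual import Reader as TextualReader

    # The classes are re-exported by __init__.py but now live in the split modules.
    assert SqlWriter.__module__ == "pyglossary.plugins.sql.writer"
    assert TextualReader.__module__ == "pyglossary.plugins.stardict_textual.reader"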