From 8fb3b2d4a6eb09342bccdea003f2e813573411a5 Mon Sep 17 00:00:00 2001
From: Saeed Rasooli <saeed.gnu@gmail.com>
Date: Tue, 31 Dec 2024 08:16:23 +0330
Subject: [PATCH] break up plugins
---
pyglossary/plugins/aard2_slob/__init__.py | 393 +----------
pyglossary/plugins/aard2_slob/reader.py | 145 ++++
pyglossary/plugins/aard2_slob/tags.py | 29 +
pyglossary/plugins/aard2_slob/writer.py | 260 +++++++
pyglossary/plugins/almaany/__init__.py | 84 +--
pyglossary/plugins/almaany/reader.py | 88 +++
.../plugins/ayandict_sqlite/__init__.py | 206 +-----
pyglossary/plugins/ayandict_sqlite/reader.py | 66 ++
pyglossary/plugins/ayandict_sqlite/writer.py | 152 ++++
pyglossary/plugins/cc_kedict/__init__.py | 304 +-------
pyglossary/plugins/cc_kedict/reader.py | 309 +++++++++
pyglossary/plugins/crawler_dir/__init__.py | 163 +----
pyglossary/plugins/crawler_dir/reader.py | 88 +++
pyglossary/plugins/crawler_dir/writer.py | 93 +++
pyglossary/plugins/csv_plugin/__init__.py | 244 +------
pyglossary/plugins/csv_plugin/reader.py | 182 +++++
pyglossary/plugins/csv_plugin/writer.py | 121 ++++
pyglossary/plugins/dicformids/__init__.py | 256 +------
pyglossary/plugins/dicformids/reader.py | 76 ++
pyglossary/plugins/dicformids/writer.py | 195 ++++++
pyglossary/plugins/dict_cc/__init__.py | 200 +-----
pyglossary/plugins/dict_cc/reader.py | 205 ++++++
pyglossary/plugins/dict_cc_split/__init__.py | 77 +-
pyglossary/plugins/dict_cc_split/reader.py | 83 +++
pyglossary/plugins/dict_org/__init__.py | 156 +----
pyglossary/plugins/dict_org/reader.py | 74 ++
pyglossary/plugins/dict_org/writer.py | 98 +++
.../plugins/dict_org_source/__init__.py | 40 +-
pyglossary/plugins/dict_org_source/writer.py | 42 ++
pyglossary/plugins/dictunformat/__init__.py | 90 +--
pyglossary/plugins/dictunformat/reader.py | 90 +++
pyglossary/plugins/digitalnk/__init__.py | 55 +-
pyglossary/plugins/digitalnk/reader.py | 59 ++
pyglossary/plugins/dikt_json/__init__.py | 74 +-
pyglossary/plugins/dikt_json/writer.py | 80 +++
pyglossary/plugins/ebook_epub2/__init__.py | 231 +-----
pyglossary/plugins/ebook_epub2/writer.py | 233 +++++++
pyglossary/plugins/ebook_kobo/__init__.py | 229 +-----
pyglossary/plugins/ebook_kobo/writer.py | 233 +++++++
.../plugins/ebook_kobo_dictfile/__init__.py | 185 +----
.../plugins/ebook_kobo_dictfile/reader.py | 123 ++++
.../plugins/ebook_kobo_dictfile/writer.py | 89 +++
pyglossary/plugins/ebook_mobi/__init__.py | 306 +-------
pyglossary/plugins/ebook_mobi/writer.py | 308 ++++++++
pyglossary/plugins/edict2/__init__.py | 88 +--
pyglossary/plugins/edict2/reader.py | 89 +++
pyglossary/plugins/edlin/__init__.py | 272 +-------
pyglossary/plugins/edlin/reader.py | 131 ++++
pyglossary/plugins/edlin/writer.py | 141 ++++
pyglossary/plugins/gettext_po/__init__.py | 177 +----
pyglossary/plugins/gettext_po/reader.py | 128 ++++
pyglossary/plugins/gettext_po/writer.py | 66 ++
pyglossary/plugins/html_dir/__init__.py | 490 +------------
pyglossary/plugins/html_dir/writer.py | 491 +++++++++++++
pyglossary/plugins/info_plugin/__init__.py | 30 +-
pyglossary/plugins/info_plugin/reader.py | 36 +
pyglossary/plugins/jmdict/__init__.py | 416 +----------
pyglossary/plugins/jmdict/reader.py | 417 +++++++++++
pyglossary/plugins/jmnedict/__init__.py | 295 +-------
pyglossary/plugins/jmnedict/reader.py | 298 ++++++++
pyglossary/plugins/json_plugin/__init__.py | 64 +-
pyglossary/plugins/json_plugin/writer.py | 68 ++
pyglossary/plugins/lingoes_ldf/__init__.py | 134 +---
pyglossary/plugins/lingoes_ldf/reader.py | 77 ++
pyglossary/plugins/lingoes_ldf/writer.py | 66 ++
.../plugins/makindo_medical/__init__.py | 54 +-
pyglossary/plugins/makindo_medical/reader.py | 58 ++
.../plugins/octopus_mdict_new/__init__.py | 220 +-----
.../plugins/octopus_mdict_new/reader.py | 221 ++++++
pyglossary/plugins/sql/__init__.py | 138 +---
pyglossary/plugins/sql/writer.py | 140 ++++
.../plugins/stardict_merge_syns/__init__.py | 133 +---
.../plugins/stardict_merge_syns/writer.py | 137 ++++
.../plugins/stardict_textual/__init__.py | 359 +---------
pyglossary/plugins/stardict_textual/reader.py | 212 ++++++
pyglossary/plugins/stardict_textual/writer.py | 162 +++++
pyglossary/plugins/tabfile/__init__.py | 119 +---
pyglossary/plugins/tabfile/reader.py | 49 ++
pyglossary/plugins/tabfile/writer.py | 59 ++
pyglossary/plugins/testformat/__init__.py | 94 +--
pyglossary/plugins/testformat/reader.py | 57 ++
pyglossary/plugins/testformat/writer.py | 43 ++
pyglossary/plugins/wiktextract/__init__.py | 655 +----------------
pyglossary/plugins/wiktextract/reader.py | 656 ++++++++++++++++++
pyglossary/plugins/wordnet/__init__.py | 324 +--------
pyglossary/plugins/wordnet/reader.py | 330 +++++++++
pyglossary/plugins/wordset/__init__.py | 94 +--
pyglossary/plugins/wordset/reader.py | 97 +++
pyglossary/plugins/xdxf/__init__.py | 253 +------
pyglossary/plugins/xdxf/reader.py | 252 +++++++
pyglossary/plugins/xdxf_css/__init__.py | 282 +-------
pyglossary/plugins/xdxf_css/reader.py | 284 ++++++++
pyglossary/plugins/xdxf_lax/__init__.py | 246 +------
pyglossary/plugins/xdxf_lax/reader.py | 246 +++++++
pyglossary/plugins/yomichan/__init__.py | 247 +------
pyglossary/plugins/yomichan/writer.py | 249 +++++++
pyglossary/plugins/zimfile/__init__.py | 184 +----
pyglossary/plugins/zimfile/reader.py | 184 +++++
tests/deprecated/glossary_security_test.py | 1 +
99 files changed, 9068 insertions(+), 8559 deletions(-)
create mode 100644 pyglossary/plugins/aard2_slob/reader.py
create mode 100644 pyglossary/plugins/aard2_slob/tags.py
create mode 100644 pyglossary/plugins/aard2_slob/writer.py
create mode 100644 pyglossary/plugins/almaany/reader.py
create mode 100644 pyglossary/plugins/ayandict_sqlite/reader.py
create mode 100644 pyglossary/plugins/ayandict_sqlite/writer.py
create mode 100644 pyglossary/plugins/cc_kedict/reader.py
create mode 100644 pyglossary/plugins/crawler_dir/reader.py
create mode 100644 pyglossary/plugins/crawler_dir/writer.py
create mode 100644 pyglossary/plugins/csv_plugin/reader.py
create mode 100644 pyglossary/plugins/csv_plugin/writer.py
create mode 100644 pyglossary/plugins/dicformids/reader.py
create mode 100644 pyglossary/plugins/dicformids/writer.py
create mode 100644 pyglossary/plugins/dict_cc/reader.py
create mode 100644 pyglossary/plugins/dict_cc_split/reader.py
create mode 100644 pyglossary/plugins/dict_org/reader.py
create mode 100644 pyglossary/plugins/dict_org/writer.py
create mode 100644 pyglossary/plugins/dict_org_source/writer.py
create mode 100644 pyglossary/plugins/dictunformat/reader.py
create mode 100644 pyglossary/plugins/digitalnk/reader.py
create mode 100644 pyglossary/plugins/dikt_json/writer.py
create mode 100644 pyglossary/plugins/ebook_epub2/writer.py
create mode 100644 pyglossary/plugins/ebook_kobo/writer.py
create mode 100644 pyglossary/plugins/ebook_kobo_dictfile/reader.py
create mode 100644 pyglossary/plugins/ebook_kobo_dictfile/writer.py
create mode 100644 pyglossary/plugins/ebook_mobi/writer.py
create mode 100644 pyglossary/plugins/edict2/reader.py
create mode 100644 pyglossary/plugins/edlin/reader.py
create mode 100644 pyglossary/plugins/edlin/writer.py
create mode 100644 pyglossary/plugins/gettext_po/reader.py
create mode 100644 pyglossary/plugins/gettext_po/writer.py
create mode 100644 pyglossary/plugins/html_dir/writer.py
create mode 100644 pyglossary/plugins/info_plugin/reader.py
create mode 100644 pyglossary/plugins/jmdict/reader.py
create mode 100644 pyglossary/plugins/jmnedict/reader.py
create mode 100644 pyglossary/plugins/json_plugin/writer.py
create mode 100644 pyglossary/plugins/lingoes_ldf/reader.py
create mode 100644 pyglossary/plugins/lingoes_ldf/writer.py
create mode 100644 pyglossary/plugins/makindo_medical/reader.py
create mode 100644 pyglossary/plugins/octopus_mdict_new/reader.py
create mode 100644 pyglossary/plugins/sql/writer.py
create mode 100644 pyglossary/plugins/stardict_merge_syns/writer.py
create mode 100644 pyglossary/plugins/stardict_textual/reader.py
create mode 100644 pyglossary/plugins/stardict_textual/writer.py
create mode 100644 pyglossary/plugins/tabfile/reader.py
create mode 100644 pyglossary/plugins/tabfile/writer.py
create mode 100644 pyglossary/plugins/testformat/reader.py
create mode 100644 pyglossary/plugins/testformat/writer.py
create mode 100644 pyglossary/plugins/wiktextract/reader.py
create mode 100644 pyglossary/plugins/wordnet/reader.py
create mode 100644 pyglossary/plugins/wordset/reader.py
create mode 100644 pyglossary/plugins/xdxf/reader.py
create mode 100644 pyglossary/plugins/xdxf_css/reader.py
create mode 100644 pyglossary/plugins/xdxf_lax/reader.py
create mode 100644 pyglossary/plugins/yomichan/writer.py
create mode 100644 pyglossary/plugins/zimfile/reader.py
diff --git a/pyglossary/plugins/aard2_slob/__init__.py b/pyglossary/plugins/aard2_slob/__init__.py
index 8d75434ff..6e63ead7a 100644
--- a/pyglossary/plugins/aard2_slob/__init__.py
+++ b/pyglossary/plugins/aard2_slob/__init__.py
@@ -1,19 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-import os
-import re
-import shutil
-from os.path import isfile, splitext
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- from collections.abc import Generator, Iterator
-
- from pyglossary import slob
- from pyglossary.glossary_types import EntryType, GlossaryType
-
-from pyglossary.core import cacheDir, exc_note, log, pip
from pyglossary.option import (
BoolOption,
FileSizeOption,
@@ -22,6 +9,9 @@
StrOption,
)
+from .reader import Reader
+from .writer import Writer
+
__all__ = [
"Reader",
"Writer",
@@ -92,380 +82,3 @@
" instructions on how to install PyICU.",
),
]
-
-t_created_at = "created.at"
-t_label = "label"
-t_created_by = "created.by"
-t_copyright = "copyright"
-t_license_name = "license.name"
-t_license_url = "license.url"
-t_uri = "uri"
-t_edition = "edition"
-
-supported_tags = {
- t_label,
- t_created_at,
- t_created_by,
- t_copyright,
- t_uri,
- t_edition,
-}
-
-
-class Reader:
- depends = {
- "icu": "PyICU", # >=1.5
- }
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
- self._re_bword = re.compile(
- "(]+?>)",
- re.IGNORECASE,
- )
-
- def close(self) -> None:
- if self._slobObj is not None:
- self._slobObj.close()
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._slobObj: slob.Slob | None = None
-
- # TODO: PLR0912 Too many branches (13 > 12)
- def open(self, filename: str) -> None: # noqa: PLR0912
- try:
- import icu # type: ignore # noqa: F401
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install PyICU` to install")
- raise
- from pyglossary import slob
-
- self._filename = filename
- self._slobObj = slob.open(filename)
- tags = dict(self._slobObj.tags.items())
-
- if t_label in tags:
- self._glos.setInfo("name", tags[t_label])
-
- if t_created_at in tags:
- self._glos.setInfo("creationTime", tags[t_created_at])
-
- if t_created_by in tags:
- self._glos.setInfo("author", tags[t_created_by])
-
- copyrightLines: list[str] = []
- for key in (t_copyright, t_license_name, t_license_url):
- try:
- value = tags.pop(key)
- except KeyError:
- continue
- copyrightLines.append(value)
- if copyrightLines:
- self._glos.setInfo("copyright", "\n".join(copyrightLines))
-
- if t_uri in tags:
- self._glos.setInfo("website", tags[t_uri])
-
- if t_edition in tags:
- self._glos.setInfo("edition", tags[t_edition])
-
- for key, value in tags.items():
- if key in supported_tags:
- continue
- self._glos.setInfo(f"slob.{key}", value)
-
- def __len__(self) -> int:
- if self._slobObj is None:
- log.error("called len() on a reader which is not open")
- return 0
- return len(self._slobObj)
-
- @staticmethod
- def _href_sub(m: re.Match) -> str:
- st = m.group(0)
- if "//" in st:
- return st
- return st.replace('href="', 'href="bword://').replace(
- "href='",
- "href='bword://",
- )
-
- def __iter__(self) -> Iterator[EntryType | None]:
- from pyglossary.slob import MIME_HTML, MIME_TEXT
-
- if self._slobObj is None:
- raise RuntimeError("iterating over a reader while it's not open")
-
- slobObj = self._slobObj
- blobSet = set()
-
- # slob library gives duplicate blobs when iterating over slobObj
- # even keeping the last id is not enough, since duplicate blobs
- # are not all consecutive. so we have to keep a set of blob IDs
-
- for blob in slobObj:
- id_ = blob.identity
- if id_ in blobSet:
- yield None # update progressbar
- continue
- blobSet.add(id_)
-
- # blob.key is str, blob.content is bytes
- word = blob.key
-
- ctype = blob.content_type.split(";")[0]
- if ctype not in {MIME_HTML, MIME_TEXT}:
- log.debug(f"unknown {blob.content_type=} in {word=}")
- word = word.removeprefix("~/")
- yield self._glos.newDataEntry(word, blob.content)
- continue
- defiFormat = ""
- if ctype == MIME_HTML:
- defiFormat = "h"
- elif ctype == MIME_TEXT:
- defiFormat = "m"
-
- defi = blob.content.decode("utf-8")
- defi = self._re_bword.sub(self._href_sub, defi)
- yield self._glos.newEntry(word, defi, defiFormat=defiFormat)
-
-
-class Writer:
- depends = {
- "icu": "PyICU",
- }
-
- _compression: str = "zlib"
- _content_type: str = ""
- _file_size_approx: int = 0
- _file_size_approx_check_num_entries = 100
- _separate_alternates: bool = False
- _word_title: bool = False
- _version_info: bool = False
-
- _audio_goldendict: bool = False
-
- resourceMimeTypes = {
- "png": "image/png",
- "jpeg": "image/jpeg",
- "jpg": "image/jpeg",
- "gif": "image/gif",
- "svg": "image/svg+xml",
- "webp": "image/webp",
- "tiff": "image/tiff",
- "tif": "image/tiff",
- "bmp": "image/bmp",
- "css": "text/css",
- "js": "application/javascript",
- "json": "application/json",
- "woff": "application/font-woff",
- "woff2": "application/font-woff2",
- "ttf": "application/x-font-ttf",
- "otf": "application/x-font-opentype",
- "mp3": "audio/mpeg",
- "ogg": "audio/ogg",
- "spx": "audio/x-speex",
- "wav": "audio/wav",
- "ini": "text/plain",
- # "application/octet-stream+xapian",
- "eot": "application/vnd.ms-fontobject",
- "pdf": "application/pdf",
- "mp4": "video/mp4",
- }
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
- self._resPrefix = ""
- self._slobWriter: slob.Writer | None = None
-
- @staticmethod
- def _slobObserver(
- event: slob.WriterEvent, # noqa: F401, F821
- ) -> None:
- log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}")
-
- def _open(self, filepath: str, namePostfix: str) -> slob.Writer:
- from pyglossary import slob
-
- if isfile(filepath):
- shutil.move(filepath, f"{filepath}.bak")
- log.warning(f"renamed existing {filepath!r} to {filepath + '.bak'!r}")
- self._slobWriter = slobWriter = slob.Writer(
- filepath,
- observer=self._slobObserver,
- workdir=cacheDir,
- compression=self._compression,
- version_info=self._version_info,
- )
-
- # "label" tag is a dictionary name shown in UI
- slobWriter.tag(t_label, self._glos.getInfo("name") + namePostfix)
-
- createdAt = self._glos.getInfo("creationTime")
- if createdAt is not None:
- slobWriter.tag(t_created_at, createdAt)
- createdBy = self._glos.getInfo("author")
- if createdBy is not None:
- slobWriter.tag(t_created_by, createdBy)
-
- filename = os.path.basename(filepath)
- dic_uri = re.sub(r"[^A-Za-z0-9_-]+", "_", filename)
- # "uri" tag is not web url, it's a part of gloss addressing ID: uri + article ID
- # setting the tag allows bookmark & history migration, if dict file is updated
- # we use source filename as "uri", since it is stable (most likely)
- slobWriter.tag(t_uri, dic_uri)
-
- return slobWriter
-
- def open(self, filename: str) -> None:
- try:
- import icu # noqa: F401
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install PyICU` to install")
- raise
- if isfile(filename):
- raise OSError(f"File '{filename}' already exists")
- namePostfix = ""
- if self._file_size_approx > 0:
- namePostfix = " (part 1)"
- self._open(filename, namePostfix)
- self._filename = filename
-
- def finish(self) -> None:
- from time import perf_counter
-
- self._filename = ""
- if self._slobWriter is None:
- return
- log.info("Finalizing slob file...")
- t0 = perf_counter()
- self._slobWriter.finalize()
- log.info(f"Finalizing slob file took {perf_counter() - t0:.1f} seconds")
- self._slobWriter = None
-
- def addDataEntry(self, entry: EntryType) -> None:
- slobWriter = self._slobWriter
- if slobWriter is None:
- raise ValueError("slobWriter is None")
- rel_path = entry.s_word
- _, ext = splitext(rel_path)
- ext = ext.lstrip(os.path.extsep).lower()
- content_type = self.resourceMimeTypes.get(ext)
- if not content_type:
- log.error(f"Aard2 slob: unknown content type for {rel_path!r}")
- return
- content = entry.data
- key = self._resPrefix + rel_path
- try:
- key.encode(slobWriter.encoding)
- except UnicodeEncodeError:
- log.error(f"Failed to add, broken unicode in key: {key!a}")
- return
- slobWriter.add(content, key, content_type=content_type)
-
- def addEntry(self, entry: EntryType) -> None:
- words = entry.l_word
- b_defi = entry.defi.encode("utf-8")
- ctype = self._content_type
- writer = self._slobWriter
- if writer is None:
- raise ValueError("slobWriter is None")
-
- entry.detectDefiFormat()
- defiFormat = entry.defiFormat
-
- if self._word_title and defiFormat in {"h", "m"}:
- if defiFormat == "m":
- defiFormat = "h"
- title = self._glos.wordTitleStr(
- words[0],
- )
- b_defi = title.encode("utf-8") + b_defi
-
- if defiFormat == "h":
- b_defi = b_defi.replace(b'"bword://', b'"')
- b_defi = b_defi.replace(b"'bword://", b"'")
-
- if not self._audio_goldendict:
- b_defi = b_defi.replace(
- b"""href="sound://""",
- b'''onclick="new Audio(this.href).play(); return false;" href="''',
- )
- b_defi = b_defi.replace(
- b"""href='sound://""",
- b"""onclick="new Audio(this.href).play(); return false;" href='""",
- )
- b_defi = b_defi.replace(b""" Generator[None, EntryType, None]:
- slobWriter = self._slobWriter
- if slobWriter is None:
- raise ValueError("slobWriter is None")
- file_size_approx = int(self._file_size_approx * 0.95)
- entryCount = 0
- sumBlobSize = 0
- fileIndex = 0
- filenameNoExt, _ = splitext(self._filename)
- while True:
- entry = yield
- if entry is None:
- break
-
- if entry.isData():
- self.addDataEntry(entry)
- else:
- self.addEntry(entry)
-
- if file_size_approx <= 0:
- continue
-
- # handle file_size_approx
- check_every = self._file_size_approx_check_num_entries
- entryCount += 1
- if entryCount % check_every == 0:
- sumBlobSize = slobWriter.size_data()
- if sumBlobSize >= file_size_approx:
- slobWriter.finalize()
- fileIndex += 1
- slobWriter = self._open(
- f"{filenameNoExt}.{fileIndex}.slob",
- f" (part {fileIndex + 1})",
- )
- sumBlobSize = 0
- entryCount = 0
diff --git a/pyglossary/plugins/aard2_slob/reader.py b/pyglossary/plugins/aard2_slob/reader.py
new file mode 100644
index 000000000..c80fdffb8
--- /dev/null
+++ b/pyglossary/plugins/aard2_slob/reader.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from collections.abc import Iterator
+
+ from pyglossary import slob
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+from pyglossary.core import exc_note, log, pip
+from pyglossary.plugins.aard2_slob.tags import (
+ supported_tags,
+ t_copyright,
+ t_created_at,
+ t_created_by,
+ t_edition,
+ t_label,
+ t_license_name,
+ t_license_url,
+ t_uri,
+)
+
+
+class Reader:
+ depends = {
+ "icu": "PyICU", # >=1.5
+ }
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+ self._re_bword = re.compile(
+ "(]+?>)",
+ re.IGNORECASE,
+ )
+
+ def close(self) -> None:
+ if self._slobObj is not None:
+ self._slobObj.close()
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._slobObj: slob.Slob | None = None
+
+ # TODO: PLR0912 Too many branches (13 > 12)
+ def open(self, filename: str) -> None: # noqa: PLR0912
+ try:
+ import icu # type: ignore # noqa: F401
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install PyICU` to install")
+ raise
+ from pyglossary import slob
+
+ self._filename = filename
+ self._slobObj = slob.open(filename)
+ tags = dict(self._slobObj.tags.items())
+
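+ # map the known slob tags onto glossary info keys; any remaining
+ # tags are preserved under a "slob." prefix (see the loop below)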
+ if t_label in tags:
+ self._glos.setInfo("name", tags[t_label])
+
+ if t_created_at in tags:
+ self._glos.setInfo("creationTime", tags[t_created_at])
+
+ if t_created_by in tags:
+ self._glos.setInfo("author", tags[t_created_by])
+
+ copyrightLines: list[str] = []
+ for key in (t_copyright, t_license_name, t_license_url):
+ try:
+ value = tags.pop(key)
+ except KeyError:
+ continue
+ copyrightLines.append(value)
+ if copyrightLines:
+ self._glos.setInfo("copyright", "\n".join(copyrightLines))
+
+ if t_uri in tags:
+ self._glos.setInfo("website", tags[t_uri])
+
+ if t_edition in tags:
+ self._glos.setInfo("edition", tags[t_edition])
+
+ for key, value in tags.items():
+ if key in supported_tags:
+ continue
+ self._glos.setInfo(f"slob.{key}", value)
+
+ def __len__(self) -> int:
+ if self._slobObj is None:
+ log.error("called len() on a reader which is not open")
+ return 0
+ return len(self._slobObj)
+
+ @staticmethod
+ def _href_sub(m: re.Match) -> str:
+ st = m.group(0)
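+ # absolute URLs (anything containing "//") are left untouched; bare
+ # hrefs are dictionary references, so they get the "bword://" scheme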
+ if "//" in st:
+ return st
+ return st.replace('href="', 'href="bword://').replace(
+ "href='",
+ "href='bword://",
+ )
+
+ def __iter__(self) -> Iterator[EntryType | None]:
+ from pyglossary.slob import MIME_HTML, MIME_TEXT
+
+ if self._slobObj is None:
+ raise RuntimeError("iterating over a reader while it's not open")
+
+ slobObj = self._slobObj
+ blobSet = set()
+
+ # slob library gives duplicate blobs when iterating over slobObj
+ # even keeping the last id is not enough, since duplicate blobs
+ # are not all consecutive. so we have to keep a set of blob IDs
+
+ for blob in slobObj:
+ id_ = blob.identity
+ if id_ in blobSet:
+ yield None # update progressbar
+ continue
+ blobSet.add(id_)
+
+ # blob.key is str, blob.content is bytes
+ word = blob.key
+
+ ctype = blob.content_type.split(";")[0]
+ if ctype not in {MIME_HTML, MIME_TEXT}:
+ log.debug(f"unknown {blob.content_type=} in {word=}")
+ word = word.removeprefix("~/")
+ yield self._glos.newDataEntry(word, blob.content)
+ continue
+ defiFormat = ""
+ if ctype == MIME_HTML:
+ defiFormat = "h"
+ elif ctype == MIME_TEXT:
+ defiFormat = "m"
+
+ defi = blob.content.decode("utf-8")
+ defi = self._re_bword.sub(self._href_sub, defi)
+ yield self._glos.newEntry(word, defi, defiFormat=defiFormat)
diff --git a/pyglossary/plugins/aard2_slob/tags.py b/pyglossary/plugins/aard2_slob/tags.py
new file mode 100644
index 000000000..e4336a02e
--- /dev/null
+++ b/pyglossary/plugins/aard2_slob/tags.py
@@ -0,0 +1,29 @@
+t_created_at = "created.at"
+t_label = "label"
+t_created_by = "created.by"
+t_copyright = "copyright"
+t_license_name = "license.name"
+t_license_url = "license.url"
+t_uri = "uri"
+t_edition = "edition"
+
+supported_tags = {
+ t_label,
+ t_created_at,
+ t_created_by,
+ t_copyright,
+ t_uri,
+ t_edition,
+}
+
+__all__ = [
+ "supported_tags",
+ "t_copyright",
+ "t_created_at",
+ "t_created_by",
+ "t_edition",
+ "t_label",
+ "t_license_name",
+ "t_license_url",
+ "t_uri",
+]
diff --git a/pyglossary/plugins/aard2_slob/writer.py b/pyglossary/plugins/aard2_slob/writer.py
new file mode 100644
index 000000000..c8519f987
--- /dev/null
+++ b/pyglossary/plugins/aard2_slob/writer.py
@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import os
+import re
+import shutil
+from os.path import isfile, splitext
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary import slob
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+from pyglossary.core import cacheDir, exc_note, log, pip
+from pyglossary.plugins.aard2_slob.tags import (
+ t_created_at,
+ t_created_by,
+ t_label,
+ t_uri,
+)
+
+
+class Writer:
+ depends = {
+ "icu": "PyICU",
+ }
+
+ _compression: str = "zlib"
+ _content_type: str = ""
+ _file_size_approx: int = 0
+ _file_size_approx_check_num_entries = 100
+ _separate_alternates: bool = False
+ _word_title: bool = False
+ _version_info: bool = False
+
+ _audio_goldendict: bool = False
+
+ resourceMimeTypes = {
+ "png": "image/png",
+ "jpeg": "image/jpeg",
+ "jpg": "image/jpeg",
+ "gif": "image/gif",
+ "svg": "image/svg+xml",
+ "webp": "image/webp",
+ "tiff": "image/tiff",
+ "tif": "image/tiff",
+ "bmp": "image/bmp",
+ "css": "text/css",
+ "js": "application/javascript",
+ "json": "application/json",
+ "woff": "application/font-woff",
+ "woff2": "application/font-woff2",
+ "ttf": "application/x-font-ttf",
+ "otf": "application/x-font-opentype",
+ "mp3": "audio/mpeg",
+ "ogg": "audio/ogg",
+ "spx": "audio/x-speex",
+ "wav": "audio/wav",
+ "ini": "text/plain",
+ # "application/octet-stream+xapian",
+ "eot": "application/vnd.ms-fontobject",
+ "pdf": "application/pdf",
+ "mp4": "video/mp4",
+ }
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+ self._resPrefix = ""
+ self._slobWriter: slob.Writer | None = None
+
+ @staticmethod
+ def _slobObserver(
+ event: slob.WriterEvent, # noqa: F401, F821
+ ) -> None:
+ log.debug(f"slob: {event.name}{': ' + event.data if event.data else ''}")
+
+ def _open(self, filepath: str, namePostfix: str) -> slob.Writer:
+ from pyglossary import slob
+
+ if isfile(filepath):
+ shutil.move(filepath, f"{filepath}.bak")
+ log.warning(f"renamed existing {filepath!r} to {filepath + '.bak'!r}")
+ self._slobWriter = slobWriter = slob.Writer(
+ filepath,
+ observer=self._slobObserver,
+ workdir=cacheDir,
+ compression=self._compression,
+ version_info=self._version_info,
+ )
+
+ # "label" tag is a dictionary name shown in UI
+ slobWriter.tag(t_label, self._glos.getInfo("name") + namePostfix)
+
+ createdAt = self._glos.getInfo("creationTime")
+ if createdAt is not None:
+ slobWriter.tag(t_created_at, createdAt)
+ createdBy = self._glos.getInfo("author")
+ if createdBy is not None:
+ slobWriter.tag(t_created_by, createdBy)
+
+ filename = os.path.basename(filepath)
+ dic_uri = re.sub(r"[^A-Za-z0-9_-]+", "_", filename)
+ # "uri" tag is not web url, it's a part of gloss addressing ID: uri + article ID
+ # setting the tag allows bookmark & history migration, if dict file is updated
+ # we use source filename as "uri", since it is stable (most likely)
+ slobWriter.tag(t_uri, dic_uri)
+
+ return slobWriter
+
+ def open(self, filename: str) -> None:
+ try:
+ import icu # noqa: F401
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install PyICU` to install")
+ raise
+ if isfile(filename):
+ raise OSError(f"File '{filename}' already exists")
+ namePostfix = ""
+ if self._file_size_approx > 0:
+ namePostfix = " (part 1)"
+ self._open(filename, namePostfix)
+ self._filename = filename
+
+ def finish(self) -> None:
+ from time import perf_counter
+
+ self._filename = ""
+ if self._slobWriter is None:
+ return
+ log.info("Finalizing slob file...")
+ t0 = perf_counter()
+ self._slobWriter.finalize()
+ log.info(f"Finalizing slob file took {perf_counter() - t0:.1f} seconds")
+ self._slobWriter = None
+
+ def addDataEntry(self, entry: EntryType) -> None:
+ slobWriter = self._slobWriter
+ if slobWriter is None:
+ raise ValueError("slobWriter is None")
+ rel_path = entry.s_word
+ _, ext = splitext(rel_path)
+ ext = ext.lstrip(os.path.extsep).lower()
+ content_type = self.resourceMimeTypes.get(ext)
+ if not content_type:
+ log.error(f"Aard2 slob: unknown content type for {rel_path!r}")
+ return
+ content = entry.data
+ key = self._resPrefix + rel_path
+ try:
+ key.encode(slobWriter.encoding)
+ except UnicodeEncodeError:
+ log.error(f"Failed to add, broken unicode in key: {key!a}")
+ return
+ slobWriter.add(content, key, content_type=content_type)
+
+ def addEntry(self, entry: EntryType) -> None:
+ words = entry.l_word
+ b_defi = entry.defi.encode("utf-8")
+ ctype = self._content_type
+ writer = self._slobWriter
+ if writer is None:
+ raise ValueError("slobWriter is None")
+
+ entry.detectDefiFormat()
+ defiFormat = entry.defiFormat
+
+ if self._word_title and defiFormat in {"h", "m"}:
+ if defiFormat == "m":
+ defiFormat = "h"
+ title = self._glos.wordTitleStr(
+ words[0],
+ )
+ b_defi = title.encode("utf-8") + b_defi
+
+ if defiFormat == "h":
+ b_defi = b_defi.replace(b'"bword://', b'"')
+ b_defi = b_defi.replace(b"'bword://", b"'")
+
+ if not self._audio_goldendict:
+ b_defi = b_defi.replace(
+ b"""href="sound://""",
+ b'''onclick="new Audio(this.href).play(); return false;" href="''',
+ )
+ b_defi = b_defi.replace(
+ b"""href='sound://""",
+ b"""onclick="new Audio(this.href).play(); return false;" href='""",
+ )
+ b_defi = b_defi.replace(b""" Generator[None, EntryType, None]:
+ slobWriter = self._slobWriter
+ if slobWriter is None:
+ raise ValueError("slobWriter is None")
+ file_size_approx = int(self._file_size_approx * 0.95)
+ entryCount = 0
+ sumBlobSize = 0
+ fileIndex = 0
+ filenameNoExt, _ = splitext(self._filename)
+ while True:
+ entry = yield
+ if entry is None:
+ break
+
+ if entry.isData():
+ self.addDataEntry(entry)
+ else:
+ self.addEntry(entry)
+
+ if file_size_approx <= 0:
+ continue
+
+ # handle file_size_approx
+ check_every = self._file_size_approx_check_num_entries
+ entryCount += 1
+ if entryCount % check_every == 0:
+ sumBlobSize = slobWriter.size_data()
+ if sumBlobSize >= file_size_approx:
+ slobWriter.finalize()
+ fileIndex += 1
+ slobWriter = self._open(
+ f"{filenameNoExt}.{fileIndex}.slob",
+ f" (part {fileIndex + 1})",
+ )
+ sumBlobSize = 0
+ entryCount = 0
diff --git a/pyglossary/plugins/almaany/__init__.py b/pyglossary/plugins/almaany/__init__.py
index 9a49bb167..8838cfd62 100644
--- a/pyglossary/plugins/almaany/__init__.py
+++ b/pyglossary/plugins/almaany/__init__.py
@@ -1,16 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-import html
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- import sqlite3
- from collections.abc import Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
from pyglossary.option import Option
+from .reader import Reader
+
__all__ = [
"Reader",
"description",
@@ -40,80 +37,3 @@
"Almaany.com Arabic Dictionary - Google Play",
)
optionsProp: dict[str, Option] = {}
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._con: sqlite3.Connection | None = None
- self._cur: sqlite3.Cursor | None = None
-
- def open(self, filename: str) -> None:
- from sqlite3 import connect
-
- self._filename = filename
- self._con = connect(filename)
- self._cur = self._con.cursor()
- self._glos.setDefaultDefiFormat("h")
-
- def __len__(self) -> int:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute("select count(*) from WordsTable")
- return self._cur.fetchone()[0]
-
- def __iter__(self) -> Iterator[EntryType]:
- if self._cur is None:
- raise ValueError("cur is None")
- from pyglossary.langs.writing_system import getWritingSystemFromText
-
- alternateDict: dict[str, list[str]] = {}
- self._cur.execute("select wordkey, searchwordkey from Keys")
- for row in self._cur.fetchall():
- if row[0] in alternateDict:
- alternateDict[row[0]].append(row[1])
- else:
- alternateDict[row[0]] = [row[1]]
-
- self._cur.execute(
- "select word, searchword, root, meaning from WordsTable order by id",
- )
- # FIXME: iteration over self._cur stops after one entry
- # and self._cur.fetchone() returns None
- # for row in self._cur:
- for row in self._cur.fetchall():
- word = row[0]
- searchword = row[1]
- root = row[2]
- meaning = row[3]
- definition = meaning
- definition = definition.replace("|", "
")
-
- if root:
- definition += (
- f'<br/><a href="bword://{html.escape(root)}">Root: {root}</a>'
- )
-
- ws = getWritingSystemFromText(meaning)
- if ws and ws.direction == "rtl":
- definition = f'<div dir="rtl">{definition}</div>'
-
- words = [word, searchword]
- if word in alternateDict:
- words += alternateDict[word]
- yield self._glos.newEntry(
- words,
- definition,
- defiFormat="h",
- )
-
- def close(self) -> None:
- if self._cur:
- self._cur.close()
- if self._con:
- self._con.close()
- self._clear()
diff --git a/pyglossary/plugins/almaany/reader.py b/pyglossary/plugins/almaany/reader.py
new file mode 100644
index 000000000..3447c1010
--- /dev/null
+++ b/pyglossary/plugins/almaany/reader.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import html
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import sqlite3
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._con: sqlite3.Connection | None = None
+ self._cur: sqlite3.Cursor | None = None
+
+ def open(self, filename: str) -> None:
+ from sqlite3 import connect
+
+ self._filename = filename
+ self._con = connect(filename)
+ self._cur = self._con.cursor()
+ self._glos.setDefaultDefiFormat("h")
+
+ def __len__(self) -> int:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute("select count(*) from WordsTable")
+ return self._cur.fetchone()[0]
+
+ def __iter__(self) -> Iterator[EntryType]:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ from pyglossary.langs.writing_system import getWritingSystemFromText
+
+ alternateDict: dict[str, list[str]] = {}
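+ # the Keys table maps each headword to its alternate search forms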
+ self._cur.execute("select wordkey, searchwordkey from Keys")
+ for row in self._cur.fetchall():
+ if row[0] in alternateDict:
+ alternateDict[row[0]].append(row[1])
+ else:
+ alternateDict[row[0]] = [row[1]]
+
+ self._cur.execute(
+ "select word, searchword, root, meaning from WordsTable order by id",
+ )
+ # FIXME: iteration over self._cur stops after one entry
+ # and self._cur.fetchone() returns None
+ # for row in self._cur:
+ for row in self._cur.fetchall():
+ word = row[0]
+ searchword = row[1]
+ root = row[2]
+ meaning = row[3]
+ definition = meaning
+ definition = definition.replace("|", "
")
+
+ if root:
+ definition += (
+ f'<br/><a href="bword://{html.escape(root)}">Root: {root}</a>'
+ )
+
+ ws = getWritingSystemFromText(meaning)
+ if ws and ws.direction == "rtl":
+ definition = f'<div dir="rtl">{definition}</div>'
+
+ words = [word, searchword]
+ if word in alternateDict:
+ words += alternateDict[word]
+ yield self._glos.newEntry(
+ words,
+ definition,
+ defiFormat="h",
+ )
+
+ def close(self) -> None:
+ if self._cur:
+ self._cur.close()
+ if self._con:
+ self._con.close()
+ self._clear()
diff --git a/pyglossary/plugins/ayandict_sqlite/__init__.py b/pyglossary/plugins/ayandict_sqlite/__init__.py
index 5ac40b37b..a86e83029 100644
--- a/pyglossary/plugins/ayandict_sqlite/__init__.py
+++ b/pyglossary/plugins/ayandict_sqlite/__init__.py
@@ -1,20 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-from typing import (
- TYPE_CHECKING,
-)
-
-if TYPE_CHECKING:
- import sqlite3
- from collections.abc import Generator, Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
- from pyglossary.xdxf.transform import XdxfTransformer
-
-from pyglossary.core import log
from pyglossary.option import BoolOption, Option
+from .reader import Reader
+from .writer import Writer
+
__all__ = [
"Reader",
"Writer",
@@ -49,194 +40,3 @@
comment="Create fuzzy search data",
),
}
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._con: sqlite3.Connection | None = None
- self._cur: sqlite3.Cursor | None = None
-
- def open(self, filename: str) -> None:
- from sqlite3 import connect
-
- self._filename = filename
- self._con = connect(filename)
- self._cur = self._con.cursor()
- self._glos.setDefaultDefiFormat("h")
-
- self._cur.execute("SELECT key, value FROM meta;")
- for row in self._cur.fetchall():
- if row[0] == "hash":
- continue
- self._glos.setInfo(row[0], row[1])
-
- def __len__(self) -> int:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute("select count(id) from entry")
- return self._cur.fetchone()[0]
-
- def __iter__(self) -> Iterator[EntryType]:
- from json import loads
-
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute(
- "SELECT entry.term, entry.article, "
- "json_group_array(alt.term)"
- "FROM entry LEFT JOIN alt ON entry.id=alt.id "
- "GROUP BY entry.id;",
- )
- for row in self._cur.fetchall():
- terms = [row[0]] + [alt for alt in loads(row[2]) if alt]
- article = row[1]
- yield self._glos.newEntry(terms, article, defiFormat="h")
-
- def close(self) -> None:
- if self._cur:
- self._cur.close()
- if self._con:
- self._con.close()
- self._clear()
-
-
-class Writer:
- _fuzzy: int = True
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._con: sqlite3.Connection | None = None
- self._cur: sqlite3.Cursor | None = None
- self._xdxfTr: XdxfTransformer | None = None
-
- def open(self, filename: str) -> None:
- from sqlite3 import connect
-
- self._filename = filename
- con = self._con = connect(filename)
- self._cur = self._con.cursor()
-
- for query in (
- "CREATE TABLE meta ('key' TEXT PRIMARY KEY NOT NULL, 'value' TEXT);",
- (
- "CREATE TABLE entry ('id' INTEGER PRIMARY KEY NOT NULL, "
- "'term' TEXT, 'article' TEXT);"
- ),
- "CREATE TABLE alt ('id' INTEGER NOT NULL, 'term' TEXT);",
- "CREATE INDEX idx_meta ON meta(key);",
- "CREATE INDEX idx_entry_term ON entry(term COLLATE NOCASE);",
- "CREATE INDEX idx_alt_id ON alt(id);",
- "CREATE INDEX idx_alt_term ON alt(term COLLATE NOCASE);",
- ):
- try:
- con.execute(query)
- except Exception as e: # noqa: PERF203
- log.error(f"query: {query}")
- raise e
-
- for key, value in self._glos.iterInfo():
- con.execute(
- "INSERT INTO meta (key, value) VALUES (?, ?);",
- (key, value),
- )
-
- if self._fuzzy:
- con.execute(
- "CREATE TABLE fuzzy3 ('sub' TEXT NOT NULL, "
- "'term' TEXT NOT NULL, "
- "id INTEGER NOT NULL);",
- )
- con.execute(
- "CREATE INDEX idx_fuzzy3_sub ON fuzzy3(sub COLLATE NOCASE);",
- )
-
- con.commit()
-
- def finish(self) -> None:
- if self._con is None or self._cur is None:
- return
-
- self._con.commit()
- self._con.close()
- self._con = None
- self._cur = None
-
- def xdxf_setup(self) -> None:
- from pyglossary.xdxf.transform import XdxfTransformer
-
- # if self._xsl:
- # self._xdxfTr = XslXdxfTransformer(encoding="utf-8")
- # return
- self._xdxfTr = XdxfTransformer(encoding="utf-8")
-
- def xdxf_transform(self, text: str) -> str:
- if self._xdxfTr is None:
- self.xdxf_setup()
- return self._xdxfTr.transformByInnerString(text) # type: ignore
-
- def write(self) -> Generator[None, EntryType, None]:
- import hashlib
-
- cur = self._cur
- if cur is None:
- raise ValueError("cur is None")
- hash_ = hashlib.md5()
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- # can save it with entry.save(directory)
- continue
- defi = entry.defi
- entry.detectDefiFormat()
- if entry.defiFormat == "m":
- if "\n" in defi:
- defi = f"{defi}
"
- elif entry.defiFormat == "x":
- defi = self.xdxf_transform(defi)
-
- cur.execute(
- "INSERT INTO entry(term, article) VALUES (?, ?);",
- (entry.l_word[0], defi),
- )
- id_ = cur.lastrowid
- if id_ is None:
- raise ValueError("lastrowid is None")
- for alt in entry.l_word[1:]:
- cur.execute(
- "INSERT INTO alt(id, term) VALUES (?, ?);",
- (id_, alt),
- )
- hash_.update(entry.s_word.encode("utf-8"))
- if self._fuzzy:
- self.addFuzzy(id_, entry.l_word)
-
- cur.execute(
- "INSERT INTO meta (key, value) VALUES (?, ?);",
- ("hash", hash_.hexdigest()),
- )
-
- def addFuzzy(self, id_: int, terms: list[str]) -> None:
- cur = self._cur
- if cur is None:
- raise ValueError("cur is None")
- for term in terms:
- subs: set[str] = set()
- for word in term.split(" "):
- eword = "\n" + word
- subs.update(eword[i : i + 3] for i in range(len(eword) - 2))
- for sub in subs:
- cur.execute(
- "INSERT INTO fuzzy3(sub, term, id) VALUES (?, ?, ?);",
- (sub, term, id_),
- )
diff --git a/pyglossary/plugins/ayandict_sqlite/reader.py b/pyglossary/plugins/ayandict_sqlite/reader.py
new file mode 100644
index 000000000..b1ed0b6eb
--- /dev/null
+++ b/pyglossary/plugins/ayandict_sqlite/reader.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+from typing import (
+ TYPE_CHECKING,
+)
+
+if TYPE_CHECKING:
+ import sqlite3
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._con: sqlite3.Connection | None = None
+ self._cur: sqlite3.Cursor | None = None
+
+ def open(self, filename: str) -> None:
+ from sqlite3 import connect
+
+ self._filename = filename
+ self._con = connect(filename)
+ self._cur = self._con.cursor()
+ self._glos.setDefaultDefiFormat("h")
+
+ self._cur.execute("SELECT key, value FROM meta;")
+ for row in self._cur.fetchall():
+ if row[0] == "hash":
+ continue
+ self._glos.setInfo(row[0], row[1])
+
+ def __len__(self) -> int:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute("select count(id) from entry")
+ return self._cur.fetchone()[0]
+
+ def __iter__(self) -> Iterator[EntryType]:
+ from json import loads
+
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute(
+ "SELECT entry.term, entry.article, "
+ "json_group_array(alt.term)"
+ "FROM entry LEFT JOIN alt ON entry.id=alt.id "
+ "GROUP BY entry.id;",
+ )
+ for row in self._cur.fetchall():
+ terms = [row[0]] + [alt for alt in loads(row[2]) if alt]
+ article = row[1]
+ yield self._glos.newEntry(terms, article, defiFormat="h")
+
+ def close(self) -> None:
+ if self._cur:
+ self._cur.close()
+ if self._con:
+ self._con.close()
+ self._clear()
diff --git a/pyglossary/plugins/ayandict_sqlite/writer.py b/pyglossary/plugins/ayandict_sqlite/writer.py
new file mode 100644
index 000000000..810631c71
--- /dev/null
+++ b/pyglossary/plugins/ayandict_sqlite/writer.py
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+from typing import (
+ TYPE_CHECKING,
+)
+
+if TYPE_CHECKING:
+ import sqlite3
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+ from pyglossary.xdxf.transform import XdxfTransformer
+
+from pyglossary.core import log
+
+
+class Writer:
+ _fuzzy: int = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._con: sqlite3.Connection | None = None
+ self._cur: sqlite3.Cursor | None = None
+ self._xdxfTr: XdxfTransformer | None = None
+
+ def open(self, filename: str) -> None:
+ from sqlite3 import connect
+
+ self._filename = filename
+ con = self._con = connect(filename)
+ self._cur = self._con.cursor()
+
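+ # schema: a meta key/value table, the main entry table, and an alt
+ # table for alternate terms, each indexed for case-insensitive lookup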
+ for query in (
+ "CREATE TABLE meta ('key' TEXT PRIMARY KEY NOT NULL, 'value' TEXT);",
+ (
+ "CREATE TABLE entry ('id' INTEGER PRIMARY KEY NOT NULL, "
+ "'term' TEXT, 'article' TEXT);"
+ ),
+ "CREATE TABLE alt ('id' INTEGER NOT NULL, 'term' TEXT);",
+ "CREATE INDEX idx_meta ON meta(key);",
+ "CREATE INDEX idx_entry_term ON entry(term COLLATE NOCASE);",
+ "CREATE INDEX idx_alt_id ON alt(id);",
+ "CREATE INDEX idx_alt_term ON alt(term COLLATE NOCASE);",
+ ):
+ try:
+ con.execute(query)
+ except Exception as e: # noqa: PERF203
+ log.error(f"query: {query}")
+ raise e
+
+ for key, value in self._glos.iterInfo():
+ con.execute(
+ "INSERT INTO meta (key, value) VALUES (?, ?);",
+ (key, value),
+ )
+
+ if self._fuzzy:
+ con.execute(
+ "CREATE TABLE fuzzy3 ('sub' TEXT NOT NULL, "
+ "'term' TEXT NOT NULL, "
+ "id INTEGER NOT NULL);",
+ )
+ con.execute(
+ "CREATE INDEX idx_fuzzy3_sub ON fuzzy3(sub COLLATE NOCASE);",
+ )
+
+ con.commit()
+
+ def finish(self) -> None:
+ if self._con is None or self._cur is None:
+ return
+
+ self._con.commit()
+ self._con.close()
+ self._con = None
+ self._cur = None
+
+ def xdxf_setup(self) -> None:
+ from pyglossary.xdxf.transform import XdxfTransformer
+
+ # if self._xsl:
+ # self._xdxfTr = XslXdxfTransformer(encoding="utf-8")
+ # return
+ self._xdxfTr = XdxfTransformer(encoding="utf-8")
+
+ def xdxf_transform(self, text: str) -> str:
+ if self._xdxfTr is None:
+ self.xdxf_setup()
+ return self._xdxfTr.transformByInnerString(text) # type: ignore
+
+ def write(self) -> Generator[None, EntryType, None]:
+ import hashlib
+
+ cur = self._cur
+ if cur is None:
+ raise ValueError("cur is None")
+ hash_ = hashlib.md5()
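+ # write() is a generator: the glossary sends entries into it one by
+ # one, and a None entry signals the end of input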
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ # can save it with entry.save(directory)
+ continue
+ defi = entry.defi
+ entry.detectDefiFormat()
+ if entry.defiFormat == "m":
+ if "\n" in defi:
+ defi = f"{defi}
"
+ elif entry.defiFormat == "x":
+ defi = self.xdxf_transform(defi)
+
+ cur.execute(
+ "INSERT INTO entry(term, article) VALUES (?, ?);",
+ (entry.l_word[0], defi),
+ )
+ id_ = cur.lastrowid
+ if id_ is None:
+ raise ValueError("lastrowid is None")
+ for alt in entry.l_word[1:]:
+ cur.execute(
+ "INSERT INTO alt(id, term) VALUES (?, ?);",
+ (id_, alt),
+ )
+ hash_.update(entry.s_word.encode("utf-8"))
+ if self._fuzzy:
+ self.addFuzzy(id_, entry.l_word)
+
+ cur.execute(
+ "INSERT INTO meta (key, value) VALUES (?, ?);",
+ ("hash", hash_.hexdigest()),
+ )
+
+ def addFuzzy(self, id_: int, terms: list[str]) -> None:
+ cur = self._cur
+ if cur is None:
+ raise ValueError("cur is None")
+ for term in terms:
+ subs: set[str] = set()
+ for word in term.split(" "):
+ eword = "\n" + word
+ subs.update(eword[i : i + 3] for i in range(len(eword) - 2))
+ for sub in subs:
+ cur.execute(
+ "INSERT INTO fuzzy3(sub, term, id) VALUES (?, ?, ?);",
+ (sub, term, id_),
+ )
diff --git a/pyglossary/plugins/cc_kedict/__init__.py b/pyglossary/plugins/cc_kedict/__init__.py
index 772c2ff6b..5289633ef 100644
--- a/pyglossary/plugins/cc_kedict/__init__.py
+++ b/pyglossary/plugins/cc_kedict/__init__.py
@@ -2,20 +2,12 @@
# mypy: ignore-errors
from __future__ import annotations
-from io import BytesIO
-from os.path import isdir, join
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING
if TYPE_CHECKING:
- from collections.abc import Callable, Iterator
-
- import lxml
-
- from pyglossary.glossary_types import EntryType, GlossaryType
from pyglossary.option import Option
-from pyglossary.core import exc_note, log, pip
-from pyglossary.text_reader import TextGlossaryReader
+from .reader import Reader
__all__ = [
"Reader",
@@ -46,295 +38,3 @@
"@mhagiwara/cc-kedict",
)
optionsProp: dict[str, Option] = {}
-
-
-class YamlReader(TextGlossaryReader):
- tagStyle = (
- "color:white;"
- "background:green;"
- "padding-left:3px;"
- "padding-right:3px;"
- "border-radius:0.5ex;"
- # 0.5ex ~= 0.3em, but "ex" is recommended
- )
-
- def __init__( # noqa: PLR0913
- self,
- glos: GlossaryType,
- spellKey: str = "",
- posKey: str = "",
- synsKey: str = "",
- tagsKey: str = "",
- ) -> None:
- TextGlossaryReader.__init__(self, glos)
- self._spellKey = spellKey
- self._posKey = posKey
- self._synsKey = synsKey
- self._tagsKey = tagsKey
-
- self._posMapping = {
- "n": "noun",
- "v": "verb",
- "a": "adjective",
- "pron": "pronoun",
- "propn": "proper noun",
- "intj": "interjection",
- "det": "determiner",
- "part": "particle",
- "adv": "adverb",
- "num": "number",
- "abbrev": "abbreviation",
- "suf": "suffix",
- "pref": "prefix",
- }
-
- @classmethod
- def isInfoWord(cls, _word: str) -> bool:
- return False
-
- @classmethod
- def fixInfoWord(cls, _word: str) -> str:
- return ""
-
- @staticmethod
- def _makeList(
- hf: lxml.etree.htmlfile,
- input_objects: list[Any],
- processor: Callable,
- single_prefix: str | None = None,
- skip_single: bool = True,
- ) -> None:
- """Wrap elements into if more than one element."""
- if not input_objects:
- return
-
- if skip_single and len(input_objects) == 1:
- # if single_prefix is None:
- # single_prefix = ET.Element("br")
- if single_prefix:
- hf.write(single_prefix)
- processor(hf, input_objects[0], 1)
- return
-
- with hf.element("ol"):
- for el in input_objects:
- with hf.element("li"):
- processor(hf, el, len(input_objects))
-
- def _processExample( # noqa: PLR6301
- self,
- hf: lxml.etree.htmlfile,
- exampleDict: dict,
- _count: int,
- ) -> None:
- from lxml import etree as ET
-
- if not exampleDict.get("example"):
- log.error(f"invalid example: {exampleDict}")
- return
-
- hf.write(exampleDict["example"])
-
- transliteration = exampleDict.get("transliteration")
- if transliteration:
- hf.write(ET.Element("br"))
- with hf.element("font", color="green"):
- hf.write(f"{transliteration}")
-
- translation = exampleDict.get("translation")
- if translation:
- hf.write(ET.Element("br"))
- with hf.element("i"):
- hf.write(f"{translation}")
-
- def _processDef(
- self,
- hf: lxml.etree.htmlfile,
- defDict: dict,
- count: int,
- ) -> None:
- from lxml import etree as ET
-
- text = defDict.get("def", "")
- if text:
- hf.write(text)
-
- examples = defDict.get("examples")
- if examples:
- if text:
- if count == 1:
- hf.write(ET.Element("br"))
- hf.write(ET.Element("br"))
- with hf.element("i"):
- hf.write("Examples:")
- self._makeList(
- hf,
- examples,
- self._processExample,
- skip_single=False,
- )
-
- def _processNote( # noqa: PLR6301
- self,
- hf: lxml.etree.htmlfile,
- note: str,
- _count: int,
- ) -> None:
- hf.write(note)
-
- def _processEntry(
- self,
- hf: lxml.etree.htmlfile,
- edict: dict,
- ) -> None:
- from lxml import etree as ET
-
- if self._spellKey and self._spellKey in edict:
- spelling = edict[self._spellKey]
- if not isinstance(spelling, str):
- log.error(f"{spelling=}, {type(spelling)=}, {edict=}")
- # https://github.com/mhagiwara/cc-kedict/pull/1
- spelling = "on" if spelling is True else ""
- if spelling:
- with hf.element("font", color="green"):
- hf.write(spelling)
- hf.write(ET.Element("br"))
-
- if self._posKey and self._posKey in edict:
- pos = edict[self._posKey]
- pos = self._posMapping.get(pos, pos)
- with hf.element("i"):
- hf.write(pos.capitalize())
- hf.write(ET.Element("br"))
-
- if self._tagsKey and self._tagsKey in edict:
- tags = edict[self._tagsKey]
- for i, tag in enumerate(tags):
- if i > 0:
- hf.write(" ")
- with hf.element("span", style=self.tagStyle):
- hf.write(tag)
- hf.write(ET.Element("br"))
-
- defs = edict.get("defs")
- if defs:
- self._makeList(
- hf,
- defs,
- self._processDef,
- )
-
- if self._synsKey and self._synsKey in edict:
- hf.write("Synonyms: ")
- for i, word in enumerate(edict[self._synsKey]):
- if i > 0:
- with hf.element("big"):
- hf.write(" | ") # NESTED: 5
- with hf.element("a", href=f"bword://{word}"):
- hf.write(word)
- hf.write(ET.Element("br"))
-
- notes = edict.get("notes")
- if notes:
- hf.write(ET.Element("br"))
- hf.write("Notes:")
- self._makeList(
- hf,
- notes,
- self._processNote,
- skip_single=False,
- )
-
- def _createEntry(
- self,
- yamlBlock: str,
- ) -> tuple[str, str, None] | None:
- from lxml import etree as ET
- from yaml import load
-
- try:
- from yaml import CLoader as Loader
- except ImportError:
- from yaml import Loader
-
- edict = load(yamlBlock, Loader=Loader)
- word = edict.get("word")
- if not word:
- log.error(f"no word in {edict}")
- return None
-
- f = BytesIO()
-
- with ET.htmlfile(f, encoding="utf-8") as hf:
- with hf.element("div"):
- self._processEntry(hf, edict)
-
- defi = f.getvalue().decode("utf-8")
- return word, defi, None
-
- def nextBlock(self) -> EntryType:
- if not self._file:
- raise StopIteration
- lines: list[str] = []
- while True:
- line = self.readline()
- if not line:
- break
- line = line.rstrip("\n\r")
- if not line:
- continue
- if line.startswith("- "):
- line = " " + line[1:]
- if lines:
- self._bufferLine = line
- return self._createEntry("\n".join(lines))
-
- lines.append(line)
-
- if lines:
- return self._createEntry("\n".join(lines))
-
- raise StopIteration
-
-
-class Reader:
- depends = {
- "yaml": "PyYAML",
- "lxml": "lxml",
- }
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._yaml = YamlReader(
- glos,
- spellKey="romaja",
- posKey="pos",
- synsKey="syns",
- tagsKey="tags",
- )
-
- def __len__(self) -> int:
- return 0
-
- def open(self, filename: str) -> None:
- try:
- from lxml import etree as ET # noqa: F401
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install lxml` to install")
- raise
-
- if isdir(filename):
- filename = join(filename, "kedict.yml")
- self._filename = filename
-
- self._glos.sourceLangName = "Korean"
- self._glos.targetLangName = "English"
-
- self._glos.setDefaultDefiFormat("h")
- self._yaml.open(filename)
-
- def close(self) -> None:
- self._yaml.close()
-
- def __iter__(self) -> Iterator[EntryType]:
- yield from self._yaml
diff --git a/pyglossary/plugins/cc_kedict/reader.py b/pyglossary/plugins/cc_kedict/reader.py
new file mode 100644
index 000000000..1a9efcb4f
--- /dev/null
+++ b/pyglossary/plugins/cc_kedict/reader.py
@@ -0,0 +1,309 @@
+# -*- coding: utf-8 -*-
+# mypy: ignore-errors
+from __future__ import annotations
+
+from io import BytesIO
+from os.path import isdir, join
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+ from collections.abc import Callable, Iterator
+
+ import lxml
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+from pyglossary.core import exc_note, log, pip
+from pyglossary.text_reader import TextGlossaryReader
+
+
+class YamlReader(TextGlossaryReader):
+ tagStyle = (
+ "color:white;"
+ "background:green;"
+ "padding-left:3px;"
+ "padding-right:3px;"
+ "border-radius:0.5ex;"
+ # 0.5ex ~= 0.3em, but "ex" is recommended
+ )
+
+ def __init__( # noqa: PLR0913
+ self,
+ glos: GlossaryType,
+ spellKey: str = "",
+ posKey: str = "",
+ synsKey: str = "",
+ tagsKey: str = "",
+ ) -> None:
+ TextGlossaryReader.__init__(self, glos)
+ self._spellKey = spellKey
+ self._posKey = posKey
+ self._synsKey = synsKey
+ self._tagsKey = tagsKey
+
+ self._posMapping = {
+ "n": "noun",
+ "v": "verb",
+ "a": "adjective",
+ "pron": "pronoun",
+ "propn": "proper noun",
+ "intj": "interjection",
+ "det": "determiner",
+ "part": "particle",
+ "adv": "adverb",
+ "num": "number",
+ "abbrev": "abbreviation",
+ "suf": "suffix",
+ "pref": "prefix",
+ }
+
+ @classmethod
+ def isInfoWord(cls, _word: str) -> bool:
+ return False
+
+ @classmethod
+ def fixInfoWord(cls, _word: str) -> str:
+ return ""
+
+ @staticmethod
+ def _makeList(
+ hf: lxml.etree.htmlfile,
+ input_objects: list[Any],
+ processor: Callable,
+ single_prefix: str | None = None,
+ skip_single: bool = True,
+ ) -> None:
+ """Wrap elements into if more than one element."""
+ if not input_objects:
+ return
+
+ if skip_single and len(input_objects) == 1:
+ # if single_prefix is None:
+ # single_prefix = ET.Element("br")
+ if single_prefix:
+ hf.write(single_prefix)
+ processor(hf, input_objects[0], 1)
+ return
+
+ with hf.element("ol"):
+ for el in input_objects:
+ with hf.element("li"):
+ processor(hf, el, len(input_objects))
+
+ def _processExample( # noqa: PLR6301
+ self,
+ hf: lxml.etree.htmlfile,
+ exampleDict: dict,
+ _count: int,
+ ) -> None:
+ from lxml import etree as ET
+
+ if not exampleDict.get("example"):
+ log.error(f"invalid example: {exampleDict}")
+ return
+
+ hf.write(exampleDict["example"])
+
+ transliteration = exampleDict.get("transliteration")
+ if transliteration:
+ hf.write(ET.Element("br"))
+ with hf.element("font", color="green"):
+ hf.write(f"{transliteration}")
+
+ translation = exampleDict.get("translation")
+ if translation:
+ hf.write(ET.Element("br"))
+ with hf.element("i"):
+ hf.write(f"{translation}")
+
+ def _processDef(
+ self,
+ hf: lxml.etree.htmlfile,
+ defDict: dict,
+ count: int,
+ ) -> None:
+ from lxml import etree as ET
+
+ text = defDict.get("def", "")
+ if text:
+ hf.write(text)
+
+ examples = defDict.get("examples")
+ if examples:
+ if text:
+ if count == 1:
+ hf.write(ET.Element("br"))
+ hf.write(ET.Element("br"))
+ with hf.element("i"):
+ hf.write("Examples:")
+ self._makeList(
+ hf,
+ examples,
+ self._processExample,
+ skip_single=False,
+ )
+
+ def _processNote( # noqa: PLR6301
+ self,
+ hf: lxml.etree.htmlfile,
+ note: str,
+ _count: int,
+ ) -> None:
+ hf.write(note)
+
+ def _processEntry(
+ self,
+ hf: lxml.etree.htmlfile,
+ edict: dict,
+ ) -> None:
+ from lxml import etree as ET
+
+ if self._spellKey and self._spellKey in edict:
+ spelling = edict[self._spellKey]
+ if not isinstance(spelling, str):
+ log.error(f"{spelling=}, {type(spelling)=}, {edict=}")
+ # https://github.com/mhagiwara/cc-kedict/pull/1
+ spelling = "on" if spelling is True else ""
+ if spelling:
+ with hf.element("font", color="green"):
+ hf.write(spelling)
+ hf.write(ET.Element("br"))
+
+ if self._posKey and self._posKey in edict:
+ pos = edict[self._posKey]
+ pos = self._posMapping.get(pos, pos)
+ with hf.element("i"):
+ hf.write(pos.capitalize())
+ hf.write(ET.Element("br"))
+
+ if self._tagsKey and self._tagsKey in edict:
+ tags = edict[self._tagsKey]
+ for i, tag in enumerate(tags):
+ if i > 0:
+ hf.write(" ")
+ with hf.element("span", style=self.tagStyle):
+ hf.write(tag)
+ hf.write(ET.Element("br"))
+
+ defs = edict.get("defs")
+ if defs:
+ self._makeList(
+ hf,
+ defs,
+ self._processDef,
+ )
+
+ if self._synsKey and self._synsKey in edict:
+ hf.write("Synonyms: ")
+ for i, word in enumerate(edict[self._synsKey]):
+ if i > 0:
+ with hf.element("big"):
+ hf.write(" | ") # NESTED: 5
+ with hf.element("a", href=f"bword://{word}"):
+ hf.write(word)
+ hf.write(ET.Element("br"))
+
+ notes = edict.get("notes")
+ if notes:
+ hf.write(ET.Element("br"))
+ hf.write("Notes:")
+ self._makeList(
+ hf,
+ notes,
+ self._processNote,
+ skip_single=False,
+ )
+
+ def _createEntry(
+ self,
+ yamlBlock: str,
+ ) -> tuple[str, str, None] | None:
+ from lxml import etree as ET
+ from yaml import load
+
+ try:
+ from yaml import CLoader as Loader
+ except ImportError:
+ from yaml import Loader
+
+ edict = load(yamlBlock, Loader=Loader)
+ word = edict.get("word")
+ if not word:
+ log.error(f"no word in {edict}")
+ return None
+
+ f = BytesIO()
+
+ with ET.htmlfile(f, encoding="utf-8") as hf:
+ with hf.element("div"):
+ self._processEntry(hf, edict)
+
+ defi = f.getvalue().decode("utf-8")
+ return word, defi, None
+
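+ # A hedged sketch of one input block (field names are the keys this
+ # reader looks up; the real cc-kedict data may carry more fields):
+ #
+ # - word: 가다
+ # pos: v
+ # defs:
+ # - def: to go
+ #
+ # nextBlock() below converts the leading "- " to spaces, so each entry
+ # accumulates into one YAML document for _createEntry() to parse.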
+ def nextBlock(self) -> EntryType:
+ if not self._file:
+ raise StopIteration
+ lines: list[str] = []
+ while True:
+ line = self.readline()
+ if not line:
+ break
+ line = line.rstrip("\n\r")
+ if not line:
+ continue
+ if line.startswith("- "):
+ line = " " + line[1:]
+ if lines:
+ self._bufferLine = line
+ return self._createEntry("\n".join(lines))
+
+ lines.append(line)
+
+ if lines:
+ return self._createEntry("\n".join(lines))
+
+ raise StopIteration
+
+
+class Reader:
+ depends = {
+ "yaml": "PyYAML",
+ "lxml": "lxml",
+ }
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._yaml = YamlReader(
+ glos,
+ spellKey="romaja",
+ posKey="pos",
+ synsKey="syns",
+ tagsKey="tags",
+ )
+
+ def __len__(self) -> int:
+ return 0
+
+ def open(self, filename: str) -> None:
+ try:
+ from lxml import etree as ET # noqa: F401
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install lxml` to install")
+ raise
+
+ if isdir(filename):
+ filename = join(filename, "kedict.yml")
+ self._filename = filename
+
+ self._glos.sourceLangName = "Korean"
+ self._glos.targetLangName = "English"
+
+ self._glos.setDefaultDefiFormat("h")
+ self._yaml.open(filename)
+
+ def close(self) -> None:
+ self._yaml.close()
+
+ def __iter__(self) -> Iterator[EntryType]:
+ yield from self._yaml
diff --git a/pyglossary/plugins/crawler_dir/__init__.py b/pyglossary/plugins/crawler_dir/__init__.py
index 9c0ec0557..ae64f6e5c 100644
--- a/pyglossary/plugins/crawler_dir/__init__.py
+++ b/pyglossary/plugins/crawler_dir/__init__.py
@@ -1,28 +1,13 @@
# mypy: ignore-errors
from __future__ import annotations
-from hashlib import sha1
-from os import listdir, makedirs
-from os.path import dirname, isdir, isfile, join, splitext
-from typing import TYPE_CHECKING
-
-from pyglossary.compression import (
- compressionOpenFunc,
-)
-from pyglossary.core import log
from pyglossary.option import (
Option,
StrOption,
)
-from pyglossary.text_utils import (
- escapeNTB,
- splitByBarUnescapeNTB,
-)
-
-if TYPE_CHECKING:
- from collections.abc import Generator, Iterator
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .reader import Reader
+from .writer import Writer
__all__ = [
"Reader",
@@ -56,147 +41,3 @@
comment="Compression Algorithm",
),
}
-
-
-class Writer:
- _compression: str = ""
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = None
-
- def finish(self) -> None:
- pass
-
- def open(self, filename: str) -> None:
- self._filename = filename
- if not isdir(filename):
- makedirs(filename)
-
- @staticmethod
- def filePathFromWord(b_word: bytes) -> str:
- bw = b_word.lower()
- if len(bw) <= 2:
- return bw.hex()
- if len(bw) <= 4:
- return join(
- bw[:2].hex() + ".d",
- bw[2:].hex(),
- )
- return join(
- bw[:2].hex() + ".d",
- bw[2:4].hex() + ".d",
- bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], # noqa: S324
- )
-
- def write(self) -> None:
- from pyglossary.json_utils import dataToPrettyJson
-
- filename = self._filename
-
- wordCount = 0
- compression = self._compression
- c_open = compressionOpenFunc(compression)
- if not c_open:
- raise ValueError(f"invalid compression {compression!r}")
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- continue
- fpath = join(filename, self.filePathFromWord(entry.b_word))
- if compression:
- fpath = f"{fpath}.{compression}"
- parentDir = dirname(fpath)
- if not isdir(parentDir):
- makedirs(parentDir)
- if isfile(fpath):
- log.warning(f"file exists: {fpath}")
- fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" # noqa: S324
- with c_open(fpath, "wt", encoding="utf-8") as _file:
- _file.write(
- f"{escapeNTB(entry.s_word)}\n{entry.defi}",
- )
- wordCount += 1
-
- with open(
- join(filename, "info.json"),
- mode="w",
- encoding="utf-8",
- ) as infoFile:
- info = {}
- info["name"] = self._glos.getInfo("name")
- info["wordCount"] = wordCount
- info |= self._glos.getExtraInfos(["name", "wordCount"])
-
- infoFile.write(dataToPrettyJson(info))
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = None
- self._wordCount = 0
-
- def open(self, filename: str) -> None:
- from pyglossary.json_utils import jsonToData
-
- self._filename = filename
-
- with open(join(filename, "info.json"), encoding="utf-8") as infoFp:
- info = jsonToData(infoFp.read())
- self._wordCount = info.pop("wordCount")
- for key, value in info.items():
- self._glos.setInfo(key, value)
-
- def close(self) -> None:
- pass
-
- def __len__(self) -> int:
- return self._wordCount
-
- def _fromFile(self, fpath: str) -> EntryType:
- _, ext = splitext(fpath)
- c_open = compressionOpenFunc(ext.lstrip("."))
- if not c_open:
- log.error(f"invalid extension {ext}")
- c_open = open
- with c_open(fpath, "rt", encoding="utf-8") as _file:
- words = splitByBarUnescapeNTB(_file.readline().rstrip("\n"))
- defi = _file.read()
- return self._glos.newEntry(words, defi)
-
- @staticmethod
- def _listdirSortKey(name: str) -> str:
- name_nox, ext = splitext(name)
- if ext == ".d":
- return name
- return name_nox
-
- def _readDir(
- self,
- dpath: str,
- exclude: set[str] | None,
- ) -> Generator[EntryType, None, None]:
- children = listdir(dpath)
- if exclude:
- children = [name for name in children if name not in exclude]
- children.sort(key=self._listdirSortKey)
- for name in children:
- cpath = join(dpath, name)
- if isfile(cpath):
- yield self._fromFile(cpath)
- continue
- if isdir(cpath):
- yield from self._readDir(cpath, None)
- continue
- log.error(f"Not a file nor a directory: {cpath}")
-
- def __iter__(self) -> Iterator[EntryType]:
- yield from self._readDir(
- self._filename,
- {
- "info.json",
- },
- )
diff --git a/pyglossary/plugins/crawler_dir/reader.py b/pyglossary/plugins/crawler_dir/reader.py
new file mode 100644
index 000000000..9bb6b0369
--- /dev/null
+++ b/pyglossary/plugins/crawler_dir/reader.py
@@ -0,0 +1,88 @@
+# mypy: ignore-errors
+from __future__ import annotations
+
+from os import listdir
+from os.path import isdir, isfile, join, splitext
+from typing import TYPE_CHECKING
+
+from pyglossary.compression import (
+ compressionOpenFunc,
+)
+from pyglossary.core import log
+from pyglossary.text_utils import (
+ splitByBarUnescapeNTB,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import Generator, Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = None
+ self._wordCount = 0
+
+ def open(self, filename: str) -> None:
+ from pyglossary.json_utils import jsonToData
+
+ self._filename = filename
+
+ with open(join(filename, "info.json"), encoding="utf-8") as infoFp:
+ info = jsonToData(infoFp.read())
+ self._wordCount = info.pop("wordCount")
+ for key, value in info.items():
+ self._glos.setInfo(key, value)
+
+ def close(self) -> None:
+ pass
+
+ def __len__(self) -> int:
+ return self._wordCount
+
+ def _fromFile(self, fpath: str) -> EntryType:
+ _, ext = splitext(fpath)
+ c_open = compressionOpenFunc(ext.lstrip("."))
+ if not c_open:
+ log.error(f"invalid extension {ext}")
+ c_open = open
+ with c_open(fpath, "rt", encoding="utf-8") as _file:
+ words = splitByBarUnescapeNTB(_file.readline().rstrip("\n"))
+ defi = _file.read()
+ return self._glos.newEntry(words, defi)
+
+ @staticmethod
+ def _listdirSortKey(name: str) -> str:
+ name_nox, ext = splitext(name)
+ if ext == ".d":
+ return name
+ return name_nox
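+
+ # e.g. "6162.gz" and "6162" both sort as "6162", while the
+ # sub-directory "6162.d" keeps its suffix as the sort key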
+
+ def _readDir(
+ self,
+ dpath: str,
+ exclude: set[str] | None,
+ ) -> Generator[EntryType, None, None]:
+ children = listdir(dpath)
+ if exclude:
+ children = [name for name in children if name not in exclude]
+ children.sort(key=self._listdirSortKey)
+ for name in children:
+ cpath = join(dpath, name)
+ if isfile(cpath):
+ yield self._fromFile(cpath)
+ continue
+ if isdir(cpath):
+ yield from self._readDir(cpath, None)
+ continue
+ log.error(f"Not a file nor a directory: {cpath}")
+
+ def __iter__(self) -> Iterator[EntryType]:
+ yield from self._readDir(
+ self._filename,
+ {
+ "info.json",
+ },
+ )
diff --git a/pyglossary/plugins/crawler_dir/writer.py b/pyglossary/plugins/crawler_dir/writer.py
new file mode 100644
index 000000000..6171a341e
--- /dev/null
+++ b/pyglossary/plugins/crawler_dir/writer.py
@@ -0,0 +1,93 @@
+# mypy: ignore-errors
+from __future__ import annotations
+
+from hashlib import sha1
+from os import makedirs
+from os.path import dirname, isdir, isfile, join
+from typing import TYPE_CHECKING
+
+from pyglossary.compression import (
+ compressionOpenFunc,
+)
+from pyglossary.core import log
+from pyglossary.text_utils import (
+ escapeNTB,
+)
+
+if TYPE_CHECKING:
+ from pyglossary.glossary_types import GlossaryType
+
+
+class Writer:
+ _compression: str = ""
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = None
+
+ def finish(self) -> None:
+ pass
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+ if not isdir(filename):
+ makedirs(filename)
+
+ @staticmethod
+ def filePathFromWord(b_word: bytes) -> str:
+ bw = b_word.lower()
+ if len(bw) <= 2:
+ return bw.hex()
+ if len(bw) <= 4:
+ return join(
+ bw[:2].hex() + ".d",
+ bw[2:].hex(),
+ )
+ return join(
+ bw[:2].hex() + ".d",
+ bw[2:4].hex() + ".d",
+ bw[4:8].hex() + "-" + sha1(b_word).hexdigest()[:8], # noqa: S324
+ )
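+
+ # Worked example, added for illustration (hex of ASCII bytes,
+ # POSIX-style separators, sha1 suffix elided):
+ # filePathFromWord(b"ab") -> "6162"
+ # filePathFromWord(b"abcd") -> "6162.d/6364"
+ # filePathFromWord(b"abcdef") -> "6162.d/6364.d/6566-<sha1[:8]>"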
+
+ def write(self) -> None:
+ from pyglossary.json_utils import dataToPrettyJson
+
+ filename = self._filename
+
+ wordCount = 0
+ compression = self._compression
+ c_open = compressionOpenFunc(compression)
+ if not c_open:
+ raise ValueError(f"invalid compression {compression!r}")
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ continue
+ fpath = join(filename, self.filePathFromWord(entry.b_word))
+ if compression:
+ fpath = f"{fpath}.{compression}"
+ parentDir = dirname(fpath)
+ if not isdir(parentDir):
+ makedirs(parentDir)
+ if isfile(fpath):
+ log.warning(f"file exists: {fpath}")
+ fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}" # noqa: S324
+ with c_open(fpath, "wt", encoding="utf-8") as _file:
+ _file.write(
+ f"{escapeNTB(entry.s_word)}\n{entry.defi}",
+ )
+ wordCount += 1
+
+ with open(
+ join(filename, "info.json"),
+ mode="w",
+ encoding="utf-8",
+ ) as infoFile:
+ info = {}
+ info["name"] = self._glos.getInfo("name")
+ info["wordCount"] = wordCount
+ info |= self._glos.getExtraInfos(["name", "wordCount"])
+
+ infoFile.write(dataToPrettyJson(info))
diff --git a/pyglossary/plugins/csv_plugin/__init__.py b/pyglossary/plugins/csv_plugin/__init__.py
index 1f9aebb29..36916b243 100644
--- a/pyglossary/plugins/csv_plugin/__init__.py
+++ b/pyglossary/plugins/csv_plugin/__init__.py
@@ -20,16 +20,7 @@
from __future__ import annotations
import csv
-import os
-from os.path import isdir, join
-from typing import TYPE_CHECKING, cast
-from pyglossary.compression import (
- compressionOpen,
- stdCompressions,
-)
-from pyglossary.core import log
-from pyglossary.io_utils import nullTextIO
from pyglossary.option import (
BoolOption,
EncodingOption,
@@ -37,11 +28,8 @@
Option,
)
-if TYPE_CHECKING:
- import io
- from collections.abc import Generator, Iterable, Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .reader import Reader
+from .writer import Writer
__all__ = [
"Reader",
@@ -94,231 +82,3 @@
}
csv.field_size_limit(0x7FFFFFFF)
-
-
-class Reader:
- compressions = stdCompressions
-
- _encoding: str = "utf-8"
- _newline: str = "\n"
- _delimiter: str = ","
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self.clear()
-
- def clear(self) -> None:
- self._filename = ""
- self._file: io.TextIOBase = nullTextIO
- self._fileSize = 0
- self._leadingLinesCount = 0
- self._wordCount: int | None = None
- self._pos = -1
- self._csvReader: Iterable[list[str]] | None = None
- self._resDir = ""
- self._resFileNames: list[str] = []
- self._bufferRow: list[str] | None = None
-
- def open(
- self,
- filename: str,
- ) -> None:
- from pyglossary.text_reader import TextFilePosWrapper
-
- self._filename = filename
- cfile = cast(
- "io.TextIOBase",
- compressionOpen(
- filename,
- mode="rt",
- encoding=self._encoding,
- newline=self._newline,
- ),
- )
-
- if self._glos.progressbar:
- if cfile.seekable():
- cfile.seek(0, 2)
- self._fileSize = cfile.tell()
- cfile.seek(0)
- # self._glos.setInfo("input_file_size", f"{self._fileSize}")
- else:
- log.warning("CSV Reader: file is not seekable")
-
- self._file = TextFilePosWrapper(cfile, self._encoding)
- self._csvReader = csv.reader(
- self._file,
- dialect="excel",
- delimiter=self._delimiter,
- )
- self._resDir = filename + "_res"
- if isdir(self._resDir):
- self._resFileNames = os.listdir(self._resDir)
- else:
- self._resDir = ""
- self._resFileNames = []
- for row in self._csvReader:
- if not row:
- continue
- if not row[0].startswith("#"):
- self._bufferRow = row
- break
- if len(row) < 2:
- log.error(f"invalid row: {row}")
- continue
- self._glos.setInfo(row[0].lstrip("#"), row[1])
-
- def close(self) -> None:
- if self._file:
- try:
- self._file.close()
- except Exception:
- log.exception("error while closing csv file")
- self.clear()
-
- def __len__(self) -> int:
- from pyglossary.file_utils import fileCountLines
-
- if self._wordCount is None:
- if hasattr(self._file, "compression"):
- return 0
- log.debug("Try not to use len(reader) as it takes extra time")
- self._wordCount = fileCountLines(self._filename) - self._leadingLinesCount
- return self._wordCount + len(self._resFileNames)
-
- def _iterRows(self) -> Iterator[list[str]]:
- if self._csvReader is None:
- raise RuntimeError("self._csvReader is None")
- if self._bufferRow:
- yield self._bufferRow
- yield from self._csvReader
-
- def _processRow(self, row: list[str]) -> EntryType | None:
- if not row:
- return None
-
- word: str | list[str]
- try:
- word = row[0]
- defi = row[1]
- except IndexError:
- log.error(f"invalid row: {row!r}")
- return None
-
- try:
- alts = row[2].split(",")
- except IndexError:
- pass
- else:
- word = [word] + alts
-
- return self._glos.newEntry(
- word,
- defi,
- byteProgress=(
- (self._file.tell(), self._fileSize) if self._fileSize else None
- ),
- )
-
- def __iter__(self) -> Iterator[EntryType | None]:
- if not self._csvReader:
- raise RuntimeError("iterating over a reader while it's not open")
-
- wordCount = 0
- for row in self._iterRows():
- wordCount += 1
- yield self._processRow(row)
-
- self._wordCount = wordCount
-
- resDir = self._resDir
- for fname in self._resFileNames:
- with open(join(resDir, fname), "rb") as _file:
- yield self._glos.newDataEntry(
- fname,
- _file.read(),
- )
-
-
-class Writer:
- compressions = stdCompressions
-
- _encoding: str = "utf-8"
- _newline: str = "\n"
- _resources: bool = True
- _delimiter: str = ","
- _add_defi_format: bool = False
- _enable_info: bool = True
- _word_title: bool = False
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._file: io.TextIOBase = nullTextIO
-
- def open(self, filename: str) -> None:
- self._filename = filename
- self._file = cast(
- "io.TextIOBase",
- compressionOpen(
- filename,
- mode="wt",
- encoding=self._encoding,
- newline=self._newline,
- ),
- )
- self._resDir = resDir = filename + "_res"
- self._csvWriter = csv.writer(
- self._file,
- dialect="excel",
- quoting=csv.QUOTE_ALL, # FIXME
- delimiter=self._delimiter,
- )
- if not isdir(resDir):
- os.mkdir(resDir)
- if self._enable_info:
- for key, value in self._glos.iterInfo():
- self._csvWriter.writerow([f"#{key}", value])
-
- def finish(self) -> None:
- self._filename = ""
- self._file.close()
- self._file = nullTextIO
- if not os.listdir(self._resDir):
- os.rmdir(self._resDir)
-
- def write(self) -> Generator[None, EntryType, None]:
- resources = self._resources
- add_defi_format = self._add_defi_format
- glos = self._glos
- resDir = self._resDir
- writer = self._csvWriter
- word_title = self._word_title
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- if resources:
- entry.save(resDir)
- continue
-
- words = entry.l_word
- if not words:
- continue
- word, alts = words[0], words[1:]
- defi = entry.defi
-
- if word_title:
- defi = glos.wordTitleStr(words[0]) + defi
-
- row = [
- word,
- defi,
- ]
- if add_defi_format:
- entry.detectDefiFormat()
- row.append(entry.defiFormat)
- if alts:
- row.append(",".join(alts))
-
- writer.writerow(row)
diff --git a/pyglossary/plugins/csv_plugin/reader.py b/pyglossary/plugins/csv_plugin/reader.py
new file mode 100644
index 000000000..8087e9e92
--- /dev/null
+++ b/pyglossary/plugins/csv_plugin/reader.py
@@ -0,0 +1,182 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2013-2019 Saeed Rasooli <saeed.gnu@gmail.com> (ilius)
+# This file is part of PyGlossary project, https://github.com/ilius/pyglossary
+#
+# This program is a free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL
+# If not, see <http://www.gnu.org/licenses/gpl.txt>.
+
+from __future__ import annotations
+
+import csv
+import os
+from os.path import isdir, join
+from typing import TYPE_CHECKING, cast
+
+from pyglossary.compression import (
+ compressionOpen,
+ stdCompressions,
+)
+from pyglossary.core import log
+from pyglossary.io_utils import nullTextIO
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Iterable, Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ compressions = stdCompressions
+
+ _encoding: str = "utf-8"
+ _newline: str = "\n"
+ _delimiter: str = ","
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self.clear()
+
+ def clear(self) -> None:
+ self._filename = ""
+ self._file: io.TextIOBase = nullTextIO
+ self._fileSize = 0
+ self._leadingLinesCount = 0
+ self._wordCount: int | None = None
+ self._pos = -1
+ self._csvReader: Iterable[list[str]] | None = None
+ self._resDir = ""
+ self._resFileNames: list[str] = []
+ self._bufferRow: list[str] | None = None
+
+ def open(
+ self,
+ filename: str,
+ ) -> None:
+ from pyglossary.text_reader import TextFilePosWrapper
+
+ self._filename = filename
+ cfile = cast(
+ "io.TextIOBase",
+ compressionOpen(
+ filename,
+ mode="rt",
+ encoding=self._encoding,
+ newline=self._newline,
+ ),
+ )
+
+ if self._glos.progressbar:
+ if cfile.seekable():
+ cfile.seek(0, 2)
+ self._fileSize = cfile.tell()
+ cfile.seek(0)
+ # self._glos.setInfo("input_file_size", f"{self._fileSize}")
+ else:
+ log.warning("CSV Reader: file is not seekable")
+
+ self._file = TextFilePosWrapper(cfile, self._encoding)
+ self._csvReader = csv.reader(
+ self._file,
+ dialect="excel",
+ delimiter=self._delimiter,
+ )
+ self._resDir = filename + "_res"
+ if isdir(self._resDir):
+ self._resFileNames = os.listdir(self._resDir)
+ else:
+ self._resDir = ""
+ self._resFileNames = []
+ for row in self._csvReader:
+ if not row:
+ continue
+ if not row[0].startswith("#"):
+ self._bufferRow = row
+ break
+ if len(row) < 2:
+ log.error(f"invalid row: {row}")
+ continue
+ self._glos.setInfo(row[0].lstrip("#"), row[1])
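+ # e.g. a leading row ["#name", "My Dictionary"] (hypothetical) sets
+ # glossary info name="My Dictionary"; the first non-"#" row is
+ # buffered as the first real entry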
+
+ def close(self) -> None:
+ if self._file:
+ try:
+ self._file.close()
+ except Exception:
+ log.exception("error while closing csv file")
+ self.clear()
+
+ def __len__(self) -> int:
+ from pyglossary.file_utils import fileCountLines
+
+ if self._wordCount is None:
+ if hasattr(self._file, "compression"):
+ return 0
+ log.debug("Try not to use len(reader) as it takes extra time")
+ self._wordCount = fileCountLines(self._filename) - self._leadingLinesCount
+ return self._wordCount + len(self._resFileNames)
+
+ def _iterRows(self) -> Iterator[list[str]]:
+ if self._csvReader is None:
+ raise RuntimeError("self._csvReader is None")
+ if self._bufferRow:
+ yield self._bufferRow
+ yield from self._csvReader
+
+ def _processRow(self, row: list[str]) -> EntryType | None:
+ if not row:
+ return None
+
+ word: str | list[str]
+ try:
+ word = row[0]
+ defi = row[1]
+ except IndexError:
+ log.error(f"invalid row: {row!r}")
+ return None
+
+ try:
+ alts = row[2].split(",")
+ except IndexError:
+ pass
+ else:
+ word = [word] + alts
+
+ return self._glos.newEntry(
+ word,
+ defi,
+ byteProgress=(
+ (self._file.tell(), self._fileSize) if self._fileSize else None
+ ),
+ )
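+
+ # Example (illustrative): the row ["hello", "greeting", "hi,hey"]
+ # yields an entry with words ["hello", "hi", "hey"] and defi "greeting"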
+
+ def __iter__(self) -> Iterator[EntryType | None]:
+ if not self._csvReader:
+ raise RuntimeError("iterating over a reader while it's not open")
+
+ wordCount = 0
+ for row in self._iterRows():
+ wordCount += 1
+ yield self._processRow(row)
+
+ self._wordCount = wordCount
+
+ resDir = self._resDir
+ for fname in self._resFileNames:
+ with open(join(resDir, fname), "rb") as _file:
+ yield self._glos.newDataEntry(
+ fname,
+ _file.read(),
+ )
diff --git a/pyglossary/plugins/csv_plugin/writer.py b/pyglossary/plugins/csv_plugin/writer.py
new file mode 100644
index 000000000..ff1c42920
--- /dev/null
+++ b/pyglossary/plugins/csv_plugin/writer.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2013-2019 Saeed Rasooli <saeed.gnu@gmail.com> (ilius)
+# This file is part of PyGlossary project, https://github.com/ilius/pyglossary
+#
+# This program is a free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL
+# If not, see <http://www.gnu.org/licenses/gpl.txt>.
+
+from __future__ import annotations
+
+import csv
+import os
+from os.path import isdir
+from typing import TYPE_CHECKING, cast
+
+from pyglossary.compression import (
+ compressionOpen,
+ stdCompressions,
+)
+from pyglossary.io_utils import nullTextIO
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Writer:
+ compressions = stdCompressions
+
+ _encoding: str = "utf-8"
+ _newline: str = "\n"
+ _resources: bool = True
+ _delimiter: str = ","
+ _add_defi_format: bool = False
+ _enable_info: bool = True
+ _word_title: bool = False
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._file: io.TextIOBase = nullTextIO
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+ self._file = cast(
+ "io.TextIOBase",
+ compressionOpen(
+ filename,
+ mode="wt",
+ encoding=self._encoding,
+ newline=self._newline,
+ ),
+ )
+ self._resDir = resDir = filename + "_res"
+ self._csvWriter = csv.writer(
+ self._file,
+ dialect="excel",
+ quoting=csv.QUOTE_ALL, # FIXME
+ delimiter=self._delimiter,
+ )
+ if not isdir(resDir):
+ os.mkdir(resDir)
+ if self._enable_info:
+ for key, value in self._glos.iterInfo():
+ self._csvWriter.writerow([f"#{key}", value])
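+ # each info pair becomes a leading row, e.g. "#name","My Dictionary"
+ # (hypothetical name; values are quoted because of QUOTE_ALL)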
+
+ def finish(self) -> None:
+ self._filename = ""
+ self._file.close()
+ self._file = nullTextIO
+ if not os.listdir(self._resDir):
+ os.rmdir(self._resDir)
+
+ def write(self) -> Generator[None, EntryType, None]:
+ resources = self._resources
+ add_defi_format = self._add_defi_format
+ glos = self._glos
+ resDir = self._resDir
+ writer = self._csvWriter
+ word_title = self._word_title
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ if resources:
+ entry.save(resDir)
+ continue
+
+ words = entry.l_word
+ if not words:
+ continue
+ word, alts = words[0], words[1:]
+ defi = entry.defi
+
+ if word_title:
+ defi = glos.wordTitleStr(words[0]) + defi
+
+ row = [
+ word,
+ defi,
+ ]
+ if add_defi_format:
+ entry.detectDefiFormat()
+ row.append(entry.defiFormat)
+ if alts:
+ row.append(",".join(alts))
+
+ writer.writerow(row)
diff --git a/pyglossary/plugins/dicformids/__init__.py b/pyglossary/plugins/dicformids/__init__.py
index 625b9b7f3..8e1f4ca76 100644
--- a/pyglossary/plugins/dicformids/__init__.py
+++ b/pyglossary/plugins/dicformids/__init__.py
@@ -2,22 +2,16 @@
# mypy: ignore-errors
from __future__ import annotations
-import operator
-import os
-import re
-from os.path import join
from typing import TYPE_CHECKING
-from pyglossary.core import log
-from pyglossary.flags import ALWAYS
-from pyglossary.plugins.tabfile import Reader as TabfileReader
-
if TYPE_CHECKING:
- from collections.abc import Generator, Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
from pyglossary.option import Option
+from pyglossary.flags import ALWAYS
+
+from .reader import Reader
+from .writer import Writer
+
__all__ = [
"Reader",
"Writer",
@@ -52,243 +46,3 @@
)
optionsProp: dict[str, Option] = {}
-
-
-PROP_TEMPLATE = """#DictionaryForMIDs property file
-infoText={name}, author: {author}
-indexFileMaxSize={indexFileMaxSize}\n
-language1IndexNumberOfSourceEntries={wordCount}
-language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate
-indexCharEncoding=ISO-8859-1
-dictionaryFileSeparationCharacter='\\t'
-language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation
-language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate
-logLevel=0
-language1FilePostfix={directoryPostfix}
-dictionaryCharEncoding=UTF-8
-numberOfAvailableLanguages=2
-language1IsSearchable=true
-language2GenerateIndex=false
-dictionaryFileMaxSize={dicMaxSize}
-language2FilePostfix={language2FilePostfix}
-searchListFileMaxSize=20000
-language2IsSearchable=false
-fileEncodingFormat=plain_format1
-language1HasSeparateDictionaryFile=true
-searchListCharEncoding=ISO-8859-1
-searchListFileSeparationCharacter='\t'
-indexFileSeparationCharacter='\t'
-language1DisplayText={sourceLang}
-language2HasSeparateDictionaryFile=false
-dictionaryGenerationInputCharEncoding=UTF-8
-language1GenerateIndex=true
-language2DisplayText={targetLang}
-language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng
-"""
-
-
-class Reader:
- re_number = re.compile(r"\d+")
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._tabFileNames: list[str] = []
- self._tabFileReader = None
-
- def open(self, dirname: str) -> None:
- self._dirname = dirname
- orderFileNames: list[tuple[int, str]] = []
- for fname in os.listdir(dirname):
- if not fname.startswith("directory"):
- continue
- try:
- num = self.re_number.findall(fname)[-1]
- except IndexError:
- pass
- else:
- orderFileNames.append((num, fname))
- orderFileNames.sort(
- key=operator.itemgetter(0),
- reverse=True,
- )
- self._tabFileNames = [x[1] for x in orderFileNames]
- self.nextTabFile()
-
- def __len__(self) -> int:
- raise NotImplementedError # FIXME
-
- def __iter__(self) -> Iterator[EntryType]:
- return self
-
- def __next__(self) -> EntryType:
- for _ in range(10):
- try:
- return next(self._tabFileReader)
- except StopIteration: # noqa: PERF203
- self._tabFileReader.close()
- self.nextTabFile()
- return None
-
- def nextTabFile(self) -> None:
- try:
- tabFileName = self._tabFileNames.pop()
- except IndexError:
- raise StopIteration from None
- self._tabFileReader = TabfileReader(self._glos, hasInfo=False)
- self._tabFileReader.open(join(self._dirname, tabFileName), newline="\n")
-
- def close(self) -> None:
- if self._tabFileReader:
- try:
- self._tabFileReader.close()
- except Exception:
- pass # noqa: S110
- self._tabFileReader = None
- self._tabFileNames = []
-
-
-class Writer:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self.linesPerDirectoryFile = 500 # 200
- self.indexFileMaxSize = 32722 # 30000
- self.directoryPostfix = ""
- self.indexPostfix = ""
- self._dirname = ""
- # looks like we need to remove tabs, because app gives error
- # but based on the java code, all punctuations should be removed
- # as well, including '|'
- self.re_punc = re.compile(
- r"""[!"$§%&/()=?´`\\{}\[\]^°+*~#'\-_.:,;<>@|]*""", # noqa: RUF001
- )
- self.re_spaces = re.compile(" +")
- self.re_tabs = re.compile("\t+")
-
- def normateWord(self, word: str) -> str:
- word = word.strip()
- word = self.re_punc.sub("", word)
- word = self.re_spaces.sub(" ", word)
- word = self.re_tabs.sub(" ", word)
- word = word.lower()
- return word # noqa: RET504
-
- def writeProbs(self) -> None:
- glos = self._glos
- probsPath = join(
- self._dirname,
- "DictionaryForMIDs.properties",
- )
- with open(probsPath, mode="w", newline="\n", encoding="utf-8") as fileObj:
- fileObj.write(
- PROP_TEMPLATE.format(
- name=glos.getInfo("name"),
- author=glos.author,
- indexFileMaxSize=self.indexFileMaxSize,
- wordCount=self.wordCount,
- directoryPostfix=self.directoryPostfix,
- dicMaxSize=self.dicMaxSize + 1,
- language2FilePostfix="fa", # FIXME
- sourceLang=glos.sourceLangName,
- targetLang=glos.targetLangName,
- ),
- )
-
- def nextIndex(self) -> None:
- try:
- self.indexFp.close()
- except AttributeError:
- self.indexIndex = 0
-
- self.indexIndex += 1
- fname = f"index{self.indexPostfix}{self.indexIndex}.csv"
- fpath = join(self._dirname, fname)
- self.indexFp = open(fpath, mode="w", encoding="utf-8", newline="\n")
-
- def finish(self) -> None:
- pass
-
- def open(self, dirname: str) -> None:
- self._dirname = dirname
- if not os.path.isdir(dirname):
- os.mkdir(dirname)
-
- def write(self) -> Generator[None, EntryType, None]:
- self.nextIndex()
-
- dicMaxSize = 0
- indexData: list[tuple[str, int, int]] = []
-
- def writeBucket(dicIndex: int, entryList: list[EntryType]) -> None:
- nonlocal dicMaxSize
- log.debug(
- f"{dicIndex=}, {len(entryList)=}, {dicMaxSize=}",
- )
- dicFp = open(
- join(
- self._dirname,
- f"directory{self.directoryPostfix}{dicIndex + 1}.csv",
- ),
- mode="w",
- encoding="utf-8",
- newline="\n",
- )
- for entry in entryList:
- word = entry.s_word
- n_word = self.normateWord(word)
- defi = entry.defi
- dicLine = word + "\t" + defi + "\n"
- dicPos = dicFp.tell()
- dicFp.write(dicLine)
- indexData.append((n_word, dicIndex + 1, dicPos))
-
- dicMaxSize = max(dicMaxSize, dicFp.tell())
- dicFp.close()
-
- bucketSize = self.linesPerDirectoryFile
- wordCount = 0
- dicIndex = 0
- entryList: list[EntryType] = [] # aka bucket
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- # FIXME
- continue
- wordCount += 1
- entryList.append(entry)
- if len(entryList) >= bucketSize:
- writeBucket(dicIndex, entryList)
- dicIndex += 1
- entryList = []
-
- if entryList:
- writeBucket(dicIndex, entryList)
- entryList = []
-
- self.dicMaxSize = dicMaxSize
- self.wordCount = wordCount
-
- langSearchListFp = open(
- join(
- self._dirname,
- f"searchlist{self.directoryPostfix}.csv",
- ),
- mode="w",
- newline="\n",
- encoding="utf-8",
- )
-
- langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n")
-
- for word, dicIndex, dicPos in indexData:
- indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n"
- if (self.indexFp.tell() + len(indexLine)) > self.indexFileMaxSize - 10:
- self.nextIndex()
- langSearchListFp.write(f"{word}\t{self.indexIndex}\n")
- self.indexFp.write(indexLine)
-
- self.indexFp.close()
- langSearchListFp.close()
-
- self.writeProbs()
diff --git a/pyglossary/plugins/dicformids/reader.py b/pyglossary/plugins/dicformids/reader.py
new file mode 100644
index 000000000..9ae2bd1a8
--- /dev/null
+++ b/pyglossary/plugins/dicformids/reader.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+# mypy: ignore-errors
+from __future__ import annotations
+
+import operator
+import os
+import re
+from os.path import join
+from typing import TYPE_CHECKING
+
+from pyglossary.plugins.tabfile import Reader as TabfileReader
+
+if TYPE_CHECKING:
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ re_number = re.compile(r"\d+")
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._tabFileNames: list[str] = []
+ self._tabFileReader = None
+
+ def open(self, dirname: str) -> None:
+ self._dirname = dirname
+ orderFileNames: list[tuple[int, str]] = []
+ for fname in os.listdir(dirname):
+ if not fname.startswith("directory"):
+ continue
+ try:
+ # int() keeps the sort numeric, matching the tuple[int, str] annotation
+ num = int(self.re_number.findall(fname)[-1])
+ except IndexError:
+ pass
+ else:
+ orderFileNames.append((num, fname))
+ orderFileNames.sort(
+ key=operator.itemgetter(0),
+ reverse=True,
+ )
+ self._tabFileNames = [x[1] for x in orderFileNames]
+ self.nextTabFile()
+
+ def __len__(self) -> int:
+ raise NotImplementedError # FIXME
+
+ def __iter__(self) -> Iterator[EntryType]:
+ return self
+
+ def __next__(self) -> EntryType:
+ for _ in range(10):
+ try:
+ return next(self._tabFileReader)
+ except StopIteration: # noqa: PERF203
+ self._tabFileReader.close()
+ self.nextTabFile()
+ return None
+
+ def nextTabFile(self) -> None:
+ try:
+ tabFileName = self._tabFileNames.pop()
+ except IndexError:
+ raise StopIteration from None
+ self._tabFileReader = TabfileReader(self._glos, hasInfo=False)
+ self._tabFileReader.open(join(self._dirname, tabFileName), newline="\n")
+
+ def close(self) -> None:
+ if self._tabFileReader:
+ try:
+ self._tabFileReader.close()
+ except Exception:
+ pass # noqa: S110
+ self._tabFileReader = None
+ self._tabFileNames = []
diff --git a/pyglossary/plugins/dicformids/writer.py b/pyglossary/plugins/dicformids/writer.py
new file mode 100644
index 000000000..44dc07ebd
--- /dev/null
+++ b/pyglossary/plugins/dicformids/writer.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+# mypy: ignore-errors
+from __future__ import annotations
+
+import os
+import re
+from os.path import join
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+PROP_TEMPLATE = """#DictionaryForMIDs property file
+infoText={name}, author: {author}
+indexFileMaxSize={indexFileMaxSize}\n
+language1IndexNumberOfSourceEntries={wordCount}
+language1DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate
+indexCharEncoding=ISO-8859-1
+dictionaryFileSeparationCharacter='\\t'
+language2NormationClassName=de.kugihan.dictionaryformids.translation.Normation
+language2DictionaryUpdateClassName=de.kugihan.dictionaryformids.dictgen.DictionaryUpdate
+logLevel=0
+language1FilePostfix={directoryPostfix}
+dictionaryCharEncoding=UTF-8
+numberOfAvailableLanguages=2
+language1IsSearchable=true
+language2GenerateIndex=false
+dictionaryFileMaxSize={dicMaxSize}
+language2FilePostfix={language2FilePostfix}
+searchListFileMaxSize=20000
+language2IsSearchable=false
+fileEncodingFormat=plain_format1
+language1HasSeparateDictionaryFile=true
+searchListCharEncoding=ISO-8859-1
+searchListFileSeparationCharacter='\t'
+indexFileSeparationCharacter='\t'
+language1DisplayText={sourceLang}
+language2HasSeparateDictionaryFile=false
+dictionaryGenerationInputCharEncoding=UTF-8
+language1GenerateIndex=true
+language2DisplayText={targetLang}
+language1NormationClassName=de.kugihan.dictionaryformids.translation.NormationEng
+"""
+
+
+class Writer:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self.linesPerDirectoryFile = 500 # 200
+ self.indexFileMaxSize = 32722 # 30000
+ self.directoryPostfix = ""
+ self.indexPostfix = ""
+ self._dirname = ""
+ # It looks like we need to remove tabs, because the app gives an error;
+ # but based on the Java code, all punctuation should be removed
+ # as well, including '|'.
+ self.re_punc = re.compile(
+ r"""[!"$§%&/()=?´`\\{}\[\]^°+*~#'\-_.:,;<>@|]*""", # noqa: RUF001
+ )
+ self.re_spaces = re.compile(" +")
+ self.re_tabs = re.compile("\t+")
+
+ def normateWord(self, word: str) -> str:
+ word = word.strip()
+ word = self.re_punc.sub("", word)
+ word = self.re_spaces.sub(" ", word)
+ word = self.re_tabs.sub(" ", word)
+ word = word.lower()
+ return word # noqa: RET504
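+
+ # For example, normateWord("Hello,  World!") -> "hello world":
+ # punctuation is stripped, whitespace runs collapse, case is lowered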
+
+ def writeProbs(self) -> None:
+ glos = self._glos
+ probsPath = join(
+ self._dirname,
+ "DictionaryForMIDs.properties",
+ )
+ with open(probsPath, mode="w", newline="\n", encoding="utf-8") as fileObj:
+ fileObj.write(
+ PROP_TEMPLATE.format(
+ name=glos.getInfo("name"),
+ author=glos.author,
+ indexFileMaxSize=self.indexFileMaxSize,
+ wordCount=self.wordCount,
+ directoryPostfix=self.directoryPostfix,
+ dicMaxSize=self.dicMaxSize + 1,
+ language2FilePostfix="fa", # FIXME
+ sourceLang=glos.sourceLangName,
+ targetLang=glos.targetLangName,
+ ),
+ )
+
+ def nextIndex(self) -> None:
+ try:
+ self.indexFp.close()
+ except AttributeError:
+ self.indexIndex = 0
+
+ self.indexIndex += 1
+ fname = f"index{self.indexPostfix}{self.indexIndex}.csv"
+ fpath = join(self._dirname, fname)
+ self.indexFp = open(fpath, mode="w", encoding="utf-8", newline="\n")
+
+ def finish(self) -> None:
+ pass
+
+ def open(self, dirname: str) -> None:
+ self._dirname = dirname
+ if not os.path.isdir(dirname):
+ os.mkdir(dirname)
+
+ def write(self) -> Generator[None, EntryType, None]:
+ self.nextIndex()
+
+ dicMaxSize = 0
+ indexData: list[tuple[str, int, int]] = []
+
+ def writeBucket(dicIndex: int, entryList: list[EntryType]) -> None:
+ nonlocal dicMaxSize
+ log.debug(
+ f"{dicIndex=}, {len(entryList)=}, {dicMaxSize=}",
+ )
+ dicFp = open(
+ join(
+ self._dirname,
+ f"directory{self.directoryPostfix}{dicIndex + 1}.csv",
+ ),
+ mode="w",
+ encoding="utf-8",
+ newline="\n",
+ )
+ for entry in entryList:
+ word = entry.s_word
+ n_word = self.normateWord(word)
+ defi = entry.defi
+ dicLine = word + "\t" + defi + "\n"
+ dicPos = dicFp.tell()
+ dicFp.write(dicLine)
+ indexData.append((n_word, dicIndex + 1, dicPos))
+
+ dicMaxSize = max(dicMaxSize, dicFp.tell())
+ dicFp.close()
+
+ bucketSize = self.linesPerDirectoryFile
+ wordCount = 0
+ dicIndex = 0
+ entryList: list[EntryType] = [] # aka bucket
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ # FIXME
+ continue
+ wordCount += 1
+ entryList.append(entry)
+ if len(entryList) >= bucketSize:
+ writeBucket(dicIndex, entryList)
+ dicIndex += 1
+ entryList = []
+
+ if entryList:
+ writeBucket(dicIndex, entryList)
+ entryList = []
+
+ self.dicMaxSize = dicMaxSize
+ self.wordCount = wordCount
+
+ langSearchListFp = open(
+ join(
+ self._dirname,
+ f"searchlist{self.directoryPostfix}.csv",
+ ),
+ mode="w",
+ newline="\n",
+ encoding="utf-8",
+ )
+
+ langSearchListFp.write(f"{indexData[0][0]}\t{self.indexIndex}\n")
+
+ for word, dicIndex, dicPos in indexData:
+ indexLine = f"{word}\t{dicIndex}-{dicPos}-B\n"
+ if (self.indexFp.tell() + len(indexLine)) > self.indexFileMaxSize - 10:
+ self.nextIndex()
+ langSearchListFp.write(f"{word}\t{self.indexIndex}\n")
+ self.indexFp.write(indexLine)
+
+ self.indexFp.close()
+ langSearchListFp.close()
+
+ self.writeProbs()
diff --git a/pyglossary/plugins/dict_cc/__init__.py b/pyglossary/plugins/dict_cc/__init__.py
index 9105a963e..c75ec3d64 100644
--- a/pyglossary/plugins/dict_cc/__init__.py
+++ b/pyglossary/plugins/dict_cc/__init__.py
@@ -1,20 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-import html
-from operator import itemgetter
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING
if TYPE_CHECKING:
- import sqlite3
- from collections.abc import Callable, Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
- from pyglossary.lxml_types import Element, T_htmlfile
from pyglossary.option import Option
-from pyglossary.core import log
+from .reader import Reader
__all__ = [
"Reader",
@@ -45,192 +38,3 @@
"dict.cc dictionary - Google Play",
)
optionsProp: dict[str, Option] = {}
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._con: sqlite3.Connection | None = None
- self._cur: sqlite3.Cursor | None = None
-
- def open(self, filename: str) -> None:
- from sqlite3 import connect
-
- self._filename = filename
- self._con = connect(filename)
- self._cur = self._con.cursor()
- self._glos.setDefaultDefiFormat("h")
-
- def __len__(self) -> int:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute(
- "select count(distinct term1)+count(distinct term2) from main_ft",
- )
- return self._cur.fetchone()[0]
-
- @staticmethod
- def makeList(
- hf: T_htmlfile,
- input_elements: list[Element],
- processor: Callable,
- single_prefix: str = "",
- skip_single: bool = True,
- ) -> None:
- """Wrap elements into if more than one element."""
- if not input_elements:
- return
-
- if skip_single and len(input_elements) == 1:
- hf.write(single_prefix)
- processor(hf, input_elements[0])
- return
-
- with hf.element("ol"):
- for el in input_elements:
- with hf.element("li"):
- processor(hf, el)
-
- @staticmethod
- def makeGroupsList(
- hf: T_htmlfile,
- groups: list[tuple[str, str]],
- processor: Callable[[T_htmlfile, tuple[str, str]], None],
- single_prefix: str = "",
- skip_single: bool = True,
- ) -> None:
- """Wrap elements into if more than one element."""
- if not groups:
- return
-
- if skip_single and len(groups) == 1:
- hf.write(single_prefix)
- processor(hf, groups[0])
- return
-
- with hf.element("ol"):
- for el in groups:
- with hf.element("li"):
- processor(hf, el)
-
- def writeSense( # noqa: PLR6301
- self,
- hf: T_htmlfile,
- row: tuple[str, str],
- ) -> None:
- from lxml import etree as ET
-
- trans, entry_type = row
- if entry_type:
- with hf.element("i"):
- hf.write(f"{entry_type}") # noqa: FURB183
- hf.write(ET.Element("br"))
- try:
- hf.write(trans + " ")
- except Exception as e:
- log.error(f"error in writing {trans!r}, {e}")
- hf.write(repr(trans) + " ")
- else:
- with hf.element("big"):
- with hf.element("a", href=f"bword://{trans}"):
- hf.write("⏎")
-
- def iterRows(
- self,
- column1: str,
- column2: str,
- ) -> Iterator[tuple[str, str, str]]:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute(
- f"select {column1}, {column2}, entry_type from main_ft"
- f" order by {column1}",
- )
- for row in self._cur.fetchall():
- term1 = row[0]
- term2 = row[1]
- try:
- term1 = html.unescape(term1)
- except Exception as e:
- log.error(f"html.unescape({term1!r}) -> {e}")
- try:
- term2 = html.unescape(term2)
- except Exception as e:
- log.error(f"html.unescape({term2!r}) -> {e}")
- yield term1, term2, row[2]
-
- def parseGender(self, headword: str) -> tuple[str | None, str]: # noqa: PLR6301
- # {m} masc masculine German: maskulin
- # {f} fem feminine German: feminin
- # {n} neut neutral German: neutral
- # { } ????
- i = headword.find(" {")
- if i <= 0:
- return None, headword
- if len(headword) < i + 4:
- return None, headword
- if headword[i + 3] != "}":
- return None, headword
- g = headword[i + 2]
- gender = None
- if g == "m":
- gender = "masculine"
- elif g == "f":
- gender = "feminine"
- elif g == "n":
- gender = "neutral"
- else:
- log.warning(f"invalid gender {g!r}")
- return None, headword
- headword = headword[:i] + headword[i + 4 :]
- return gender, headword
-
- def _iterOneDirection(
- self,
- column1: str,
- column2: str,
- ) -> Iterator[EntryType]:
- from io import BytesIO
- from itertools import groupby
-
- from lxml import etree as ET
-
- glos = self._glos
- for headwordEscaped, groupsOrig in groupby(
- self.iterRows(column1, column2),
- key=itemgetter(0),
- ):
- headword = html.unescape(headwordEscaped)
- groups: list[tuple[str, str]] = [
- (term2, entry_type) for _, term2, entry_type in groupsOrig
- ]
- f = BytesIO()
- gender, headword = self.parseGender(headword)
- with ET.htmlfile(f, encoding="utf-8") as hf:
- with hf.element("div"):
- if gender:
- with hf.element("i"):
- hf.write(gender)
- hf.write(ET.Element("br"))
- self.makeGroupsList(
- cast("T_htmlfile", hf),
- groups,
- self.writeSense,
- )
- defi = f.getvalue().decode("utf-8")
- yield glos.newEntry(headword, defi, defiFormat="h")
-
- def __iter__(self) -> Iterator[EntryType]:
- yield from self._iterOneDirection("term1", "term2")
- yield from self._iterOneDirection("term2", "term1")
-
- def close(self) -> None:
- if self._cur:
- self._cur.close()
- if self._con:
- self._con.close()
- self._clear()
diff --git a/pyglossary/plugins/dict_cc/reader.py b/pyglossary/plugins/dict_cc/reader.py
new file mode 100644
index 000000000..e6615604a
--- /dev/null
+++ b/pyglossary/plugins/dict_cc/reader.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import html
+from operator import itemgetter
+from typing import TYPE_CHECKING, cast
+
+if TYPE_CHECKING:
+ import sqlite3
+ from collections.abc import Callable, Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+ from pyglossary.lxml_types import Element, T_htmlfile
+
+
+from pyglossary.core import log
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._con: sqlite3.Connection | None = None
+ self._cur: sqlite3.Cursor | None = None
+
+ def open(self, filename: str) -> None:
+ from sqlite3 import connect
+
+ self._filename = filename
+ self._con = connect(filename)
+ self._cur = self._con.cursor()
+ self._glos.setDefaultDefiFormat("h")
+
+ def __len__(self) -> int:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute(
+ "select count(distinct term1)+count(distinct term2) from main_ft",
+ )
+ return self._cur.fetchone()[0]
+
+ @staticmethod
+ def makeList(
+ hf: T_htmlfile,
+ input_elements: list[Element],
+ processor: Callable,
+ single_prefix: str = "",
+ skip_single: bool = True,
+ ) -> None:
+ """Wrap elements into if more than one element."""
+ if not input_elements:
+ return
+
+ if skip_single and len(input_elements) == 1:
+ hf.write(single_prefix)
+ processor(hf, input_elements[0])
+ return
+
+ with hf.element("ol"):
+ for el in input_elements:
+ with hf.element("li"):
+ processor(hf, el)
+
+ @staticmethod
+ def makeGroupsList(
+ hf: T_htmlfile,
+ groups: list[tuple[str, str]],
+ processor: Callable[[T_htmlfile, tuple[str, str]], None],
+ single_prefix: str = "",
+ skip_single: bool = True,
+ ) -> None:
+ """Wrap elements into if more than one element."""
+ if not groups:
+ return
+
+ if skip_single and len(groups) == 1:
+ hf.write(single_prefix)
+ processor(hf, groups[0])
+ return
+
+ with hf.element("ol"):
+ for el in groups:
+ with hf.element("li"):
+ processor(hf, el)
+
+ def writeSense( # noqa: PLR6301
+ self,
+ hf: T_htmlfile,
+ row: tuple[str, str],
+ ) -> None:
+ from lxml import etree as ET
+
+ trans, entry_type = row
+ if entry_type:
+ with hf.element("i"):
+ hf.write(f"{entry_type}") # noqa: FURB183
+ hf.write(ET.Element("br"))
+ try:
+ hf.write(trans + " ")
+ except Exception as e:
+ log.error(f"error in writing {trans!r}, {e}")
+ hf.write(repr(trans) + " ")
+ else:
+ with hf.element("big"):
+ with hf.element("a", href=f"bword://{trans}"):
+ hf.write("⏎")
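+
+ # Illustrative result for row ("Haus", "noun"): roughly
+ # <i>noun</i><br/>Haus <big><a href="bword://Haus">⏎</a></big>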
+
+ def iterRows(
+ self,
+ column1: str,
+ column2: str,
+ ) -> Iterator[tuple[str, str, str]]:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute(
+ f"select {column1}, {column2}, entry_type from main_ft"
+ f" order by {column1}",
+ )
+ for row in self._cur.fetchall():
+ term1 = row[0]
+ term2 = row[1]
+ try:
+ term1 = html.unescape(term1)
+ except Exception as e:
+ log.error(f"html.unescape({term1!r}) -> {e}")
+ try:
+ term2 = html.unescape(term2)
+ except Exception as e:
+ log.error(f"html.unescape({term2!r}) -> {e}")
+ yield term1, term2, row[2]
+
+ def parseGender(self, headword: str) -> tuple[str | None, str]: # noqa: PLR6301
+ # {m} masc masculine German: maskulin
+ # {f} fem feminine German: feminin
+ # {n} neut neutral German: neutral
+ # { } ????
+ i = headword.find(" {")
+ if i <= 0:
+ return None, headword
+ if len(headword) < i + 4:
+ return None, headword
+ if headword[i + 3] != "}":
+ return None, headword
+ g = headword[i + 2]
+ gender = None
+ if g == "m":
+ gender = "masculine"
+ elif g == "f":
+ gender = "feminine"
+ elif g == "n":
+ gender = "neutral"
+ else:
+ log.warning(f"invalid gender {g!r}")
+ return None, headword
+ headword = headword[:i] + headword[i + 4 :]
+ return gender, headword
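+
+ # Example: parseGender("Haus {n}") -> ("neutral", "Haus");
+ # a headword without a gender brace passes through unchanged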
+
+ def _iterOneDirection(
+ self,
+ column1: str,
+ column2: str,
+ ) -> Iterator[EntryType]:
+ from io import BytesIO
+ from itertools import groupby
+
+ from lxml import etree as ET
+
+ glos = self._glos
+ for headwordEscaped, groupsOrig in groupby(
+ self.iterRows(column1, column2),
+ key=itemgetter(0),
+ ):
+ headword = html.unescape(headwordEscaped)
+ groups: list[tuple[str, str]] = [
+ (term2, entry_type) for _, term2, entry_type in groupsOrig
+ ]
+ f = BytesIO()
+ gender, headword = self.parseGender(headword)
+ with ET.htmlfile(f, encoding="utf-8") as hf:
+ with hf.element("div"):
+ if gender:
+ with hf.element("i"):
+ hf.write(gender)
+ hf.write(ET.Element("br"))
+ self.makeGroupsList(
+ cast("T_htmlfile", hf),
+ groups,
+ self.writeSense,
+ )
+ defi = f.getvalue().decode("utf-8")
+ yield glos.newEntry(headword, defi, defiFormat="h")
+
+ def __iter__(self) -> Iterator[EntryType]:
+ yield from self._iterOneDirection("term1", "term2")
+ yield from self._iterOneDirection("term2", "term1")
+
+ def close(self) -> None:
+ if self._cur:
+ self._cur.close()
+ if self._con:
+ self._con.close()
+ self._clear()
diff --git a/pyglossary/plugins/dict_cc_split/__init__.py b/pyglossary/plugins/dict_cc_split/__init__.py
index daa096949..69fbb799c 100644
--- a/pyglossary/plugins/dict_cc_split/__init__.py
+++ b/pyglossary/plugins/dict_cc_split/__init__.py
@@ -1,17 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-import html
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- import sqlite3
- from collections.abc import Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
from pyglossary.option import Option
-from pyglossary.core import log
+from .reader import Reader
__all__ = [
"Reader",
@@ -42,73 +37,3 @@
"dict.cc dictionary - Google Play",
)
optionsProp: dict[str, Option] = {}
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._con: sqlite3.Connection | None = None
- self._cur: sqlite3.Cursor | None = None
-
- def open(self, filename: str) -> None:
- from sqlite3 import connect
-
- self._filename = filename
- self._con = connect(filename)
- self._cur = self._con.cursor()
- self._glos.setDefaultDefiFormat("m")
-
- def __len__(self) -> int:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute("select count(*) * 2 from main_ft")
- return self._cur.fetchone()[0]
-
- def iterRows(
- self,
- column1: str,
- column2: str,
- ) -> Iterator[tuple[str, str, str]]:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute(
- f"select {column1}, {column2}, entry_type from main_ft"
- f" order by {column1}",
- )
- for row in self._cur.fetchall():
- term1 = row[0]
- term2 = row[1]
- try:
- term1 = html.unescape(term1)
- except Exception as e:
- log.error(f"html.unescape({term1!r}) -> {e}")
- try:
- term2 = html.unescape(term2)
- except Exception as e:
- log.error(f"html.unescape({term2!r}) -> {e}")
- yield term1, term2, row[2]
-
- def _iterOneDirection(
- self,
- column1: str,
- column2: str,
- ) -> Iterator[EntryType]:
- for word, defi, entry_type in self.iterRows(column1, column2):
- if entry_type:
- word = f"{word} {{{entry_type}}}" # noqa: PLW2901
- yield self._glos.newEntry(word, defi, defiFormat="m")
-
- def __iter__(self) -> Iterator[EntryType]:
- yield from self._iterOneDirection("term1", "term2")
- yield from self._iterOneDirection("term2", "term1")
-
- def close(self) -> None:
- if self._cur:
- self._cur.close()
- if self._con:
- self._con.close()
- self._clear()
diff --git a/pyglossary/plugins/dict_cc_split/reader.py b/pyglossary/plugins/dict_cc_split/reader.py
new file mode 100644
index 000000000..1e5205f28
--- /dev/null
+++ b/pyglossary/plugins/dict_cc_split/reader.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import html
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import sqlite3
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+from pyglossary.core import log
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._con: sqlite3.Connection | None = None
+ self._cur: sqlite3.Cursor | None = None
+
+ def open(self, filename: str) -> None:
+ from sqlite3 import connect
+
+ self._filename = filename
+ self._con = connect(filename)
+ self._cur = self._con.cursor()
+ self._glos.setDefaultDefiFormat("m")
+
+ def __len__(self) -> int:
+ if self._cur is None:
+ raise ValueError("cur is None")
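+ # each row is counted twice because __iter__ yields it once per direction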
+ self._cur.execute("select count(*) * 2 from main_ft")
+ return self._cur.fetchone()[0]
+
+ def iterRows(
+ self,
+ column1: str,
+ column2: str,
+ ) -> Iterator[tuple[str, str, str]]:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute(
+ f"select {column1}, {column2}, entry_type from main_ft"
+ f" order by {column1}",
+ )
+ for row in self._cur.fetchall():
+ term1 = row[0]
+ term2 = row[1]
+ try:
+ term1 = html.unescape(term1)
+ except Exception as e:
+ log.error(f"html.unescape({term1!r}) -> {e}")
+ try:
+ term2 = html.unescape(term2)
+ except Exception as e:
+ log.error(f"html.unescape({term2!r}) -> {e}")
+ yield term1, term2, row[2]
+
+ def _iterOneDirection(
+ self,
+ column1: str,
+ column2: str,
+ ) -> Iterator[EntryType]:
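+ # e.g. (illustrative) term "Haus" with entry_type "noun" is
+ # yielded as the headword "Haus {noun}"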
+ for word, defi, entry_type in self.iterRows(column1, column2):
+ if entry_type:
+ word = f"{word} {{{entry_type}}}" # noqa: PLW2901
+ yield self._glos.newEntry(word, defi, defiFormat="m")
+
+ def __iter__(self) -> Iterator[EntryType]:
+ yield from self._iterOneDirection("term1", "term2")
+ yield from self._iterOneDirection("term2", "term1")
+
+ def close(self) -> None:
+ if self._cur:
+ self._cur.close()
+ if self._con:
+ self._con.close()
+ self._clear()
diff --git a/pyglossary/plugins/dict_org/__init__.py b/pyglossary/plugins/dict_org/__init__.py
index 8331d3adb..9af2bf0b3 100644
--- a/pyglossary/plugins/dict_org/__init__.py
+++ b/pyglossary/plugins/dict_org/__init__.py
@@ -2,20 +2,11 @@
from __future__ import annotations
-import os
-import re
-from os.path import isdir, splitext
-from typing import TYPE_CHECKING
-
-from pyglossary.core import log
from pyglossary.flags import DEFAULT_NO
from pyglossary.option import BoolOption, Option
-from pyglossary.plugin_lib.dictdlib import DictDB
-
-if TYPE_CHECKING:
- from collections.abc import Generator, Iterator
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .reader import Reader
+from .writer import Writer
__all__ = [
"Reader",
@@ -51,146 +42,3 @@
"http://dict.org/bin/Dict",
"The DICT Development Group",
)
-
-
-def installToDictd(filename: str, dictzip: bool) -> None:
- """Filename is without extension (neither .index or .dict or .dict.dz)."""
- import shutil
- import subprocess
-
- targetDir = "/usr/share/dictd/"
- if filename.startswith(targetDir):
- return
-
- if not isdir(targetDir):
- log.warning(f"Directory {targetDir!r} does not exist, skipping install")
- return
-
- log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}")
-
- if dictzip and os.path.isfile(filename + ".dict.dz"):
- dictExt = ".dict.dz"
- elif os.path.isfile(filename + ".dict"):
- dictExt = ".dict"
- else:
- log.error(f"No .dict file, could not install dictd file {filename!r}")
- return
-
- if not filename.startswith(targetDir):
- shutil.copy(filename + ".index", targetDir)
- shutil.copy(filename + dictExt, targetDir)
-
- # update /var/lib/dictd/db.list
- if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0:
- log.error(
- "failed to update /var/lib/dictd/db.list file"
- ", try manually running: sudo /usr/sbin/dictdconfig -w",
- )
-
- log.info("don't forget to restart dictd server")
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
- self._dictdb: DictDB | None = None
-
- # regular expression patterns used to prettify definition text
- self._re_newline_in_braces = re.compile(
- r"\{(?P.*?)\n(?P.*?)?\}",
- )
- self._re_words_in_braces = re.compile(
- r"\{(?P.+?)\}",
- )
-
- def open(self, filename: str) -> None:
- filename = filename.removesuffix(".index")
- self._filename = filename
- self._dictdb = DictDB(filename, "read", 1)
-
- def close(self) -> None:
- if self._dictdb is not None:
- self._dictdb.close()
- # self._dictdb.finish()
- self._dictdb = None
-
- def prettifyDefinitionText(self, defi: str) -> str:
- # Handle words in {}
- # First, we remove any \n in {} pairs
- defi = self._re_newline_in_braces.sub(r"{\g<left> \g<right>}", defi)
-
- # Then, replace any {words} into <a href="bword://words">words</a>,
- # so it can be rendered as link correctly
- defi = self._re_words_in_braces.sub(
- r'<a href="bword://\g<word>">\g<word></a>',
- defi,
- )
-
- # Use <br/> so it can be rendered as newline correctly
- return defi.replace("\n", "<br/>")
-
- def __len__(self) -> int:
- if self._dictdb is None:
- return 0
- return len(self._dictdb)
-
- def __iter__(self) -> Iterator[EntryType]:
- if self._dictdb is None:
- raise RuntimeError("iterating over a reader while it's not open")
- dictdb = self._dictdb
- for word in dictdb.getDefList():
- b_defi = b"\n\n
\n\n".join(dictdb.getDef(word))
- try:
- defi = b_defi.decode("utf_8", "ignore")
- defi = self.prettifyDefinitionText(defi)
- except Exception as e:
- log.error(f"{b_defi = }")
- raise e
- yield self._glos.newEntry(word, defi)
-
-
-class Writer:
- _dictzip: bool = False
- _install: bool = True
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
- self._dictdb: DictDB | None = None
-
- def finish(self) -> None:
- from pyglossary.os_utils import runDictzip
-
- if self._dictdb is None:
- raise RuntimeError("self._dictdb is None")
-
- self._dictdb.finish(dosort=True)
- if self._dictzip:
- runDictzip(f"{self._filename}.dict")
- if self._install:
- installToDictd(
- self._filename,
- self._dictzip,
- )
- self._filename = ""
-
- def open(self, filename: str) -> None:
- filename_nox, ext = splitext(filename)
- if ext.lower() == ".index":
- filename = filename_nox
- self._dictdb = DictDB(filename, "write", 1)
- self._filename = filename
-
- def write(self) -> Generator[None, EntryType, None]:
- dictdb = self._dictdb
- if dictdb is None:
- raise RuntimeError("self._dictdb is None")
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- # does dictd support resources? and how? FIXME
- continue
- dictdb.addEntry(entry.defi, entry.l_word)
diff --git a/pyglossary/plugins/dict_org/reader.py b/pyglossary/plugins/dict_org/reader.py
new file mode 100644
index 000000000..71a47fc13
--- /dev/null
+++ b/pyglossary/plugins/dict_org/reader.py
@@ -0,0 +1,74 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+from pyglossary.plugin_lib.dictdlib import DictDB
+
+if TYPE_CHECKING:
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+ self._dictdb: DictDB | None = None
+
+ # regular expression patterns used to prettify definition text
+ self._re_newline_in_braces = re.compile(
+ r"\{(?P.*?)\n(?P.*?)?\}",
+ )
+ self._re_words_in_braces = re.compile(
+ r"\{(?P.+?)\}",
+ )
+
+ def open(self, filename: str) -> None:
+ filename = filename.removesuffix(".index")
+ self._filename = filename
+ self._dictdb = DictDB(filename, "read", 1)
+
+ def close(self) -> None:
+ if self._dictdb is not None:
+ self._dictdb.close()
+ # self._dictdb.finish()
+ self._dictdb = None
+
+ def prettifyDefinitionText(self, defi: str) -> str:
+ # Handle words in {}
+ # First, we remove any \n in {} pairs
+ defi = self._re_newline_in_braces.sub(r"{\g<left> \g<right>}", defi)
+
+ # Then, replace any {words} into <a href="bword://words">words</a>,
+ # so it can be rendered as link correctly
+ defi = self._re_words_in_braces.sub(
+ r'<a href="bword://\g<word>">\g<word></a>',
+ defi,
+ )
+
+ # Use <br/> so it can be rendered as newline correctly
+ return defi.replace("\n", "<br/>")
+
+ def __len__(self) -> int:
+ if self._dictdb is None:
+ return 0
+ return len(self._dictdb)
+
+ def __iter__(self) -> Iterator[EntryType]:
+ if self._dictdb is None:
+ raise RuntimeError("iterating over a reader while it's not open")
+ dictdb = self._dictdb
+ for word in dictdb.getDefList():
+ b_defi = b"\n\n
\n\n".join(dictdb.getDef(word))
+ try:
+ defi = b_defi.decode("utf_8", "ignore")
+ defi = self.prettifyDefinitionText(defi)
+ except Exception as e:
+ log.error(f"{b_defi = }")
+ raise e
+ yield self._glos.newEntry(word, defi)
diff --git a/pyglossary/plugins/dict_org/writer.py b/pyglossary/plugins/dict_org/writer.py
new file mode 100644
index 000000000..5cc2762e7
--- /dev/null
+++ b/pyglossary/plugins/dict_org/writer.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+from os.path import splitext
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+from pyglossary.plugin_lib.dictdlib import DictDB
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+def installToDictd(filename: str, dictzip: bool) -> None:
+ """Filename is without extension (neither .index or .dict or .dict.dz)."""
+ import shutil
+ import subprocess
+ from os.path import isdir, isfile
+
+ targetDir = "/usr/share/dictd/"
+ if filename.startswith(targetDir):
+ return
+
+ if not isdir(targetDir):
+ log.warning(f"Directory {targetDir!r} does not exist, skipping install")
+ return
+
+ log.info(f"Installing {filename!r} to DICTD server directory: {targetDir}")
+
+ if dictzip and isfile(filename + ".dict.dz"):
+ dictExt = ".dict.dz"
+ elif isfile(filename + ".dict"):
+ dictExt = ".dict"
+ else:
+ log.error(f"No .dict file, could not install dictd file {filename!r}")
+ return
+
+ if not filename.startswith(targetDir):
+ shutil.copy(filename + ".index", targetDir)
+ shutil.copy(filename + dictExt, targetDir)
+
+ # update /var/lib/dictd/db.list
+ if subprocess.call(["/usr/sbin/dictdconfig", "-w"]) != 0:
+ log.error(
+ "failed to update /var/lib/dictd/db.list file"
+ ", try manually running: sudo /usr/sbin/dictdconfig -w",
+ )
+
+ log.info("don't forget to restart dictd server")
+
+
+class Writer:
+ _dictzip: bool = False
+ _install: bool = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+ self._dictdb: DictDB | None = None
+
+ def finish(self) -> None:
+ from pyglossary.os_utils import runDictzip
+
+ if self._dictdb is None:
+ raise RuntimeError("self._dictdb is None")
+
+ self._dictdb.finish(dosort=True)
+ if self._dictzip:
+ runDictzip(f"{self._filename}.dict")
+ if self._install:
+ installToDictd(
+ self._filename,
+ self._dictzip,
+ )
+ self._filename = ""
+
+ def open(self, filename: str) -> None:
+ filename_nox, ext = splitext(filename)
+ if ext.lower() == ".index":
+ filename = filename_nox
+ self._dictdb = DictDB(filename, "write", 1)
+ self._filename = filename
+
+ def write(self) -> Generator[None, EntryType, None]:
+ dictdb = self._dictdb
+ if dictdb is None:
+ raise RuntimeError("self._dictdb is None")
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ # does dictd support resources? and how? FIXME
+ continue
+ dictdb.addEntry(entry.defi, entry.l_word)
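
For readers unfamiliar with PyGlossary's writer protocol: `write()` is a coroutine-style generator, driven roughly like this (a sketch, not the actual driver in the `Glossary` class):

```python
# Prime the generator, push entries into it, then send None as a sentinel.
def drive(writer, entries):
    gen = writer.write()
    next(gen)  # run up to the first `entry = yield`
    for entry in entries:
        gen.send(entry)  # resumes the writer with this entry
    try:
        gen.send(None)  # writer sees None and breaks out of its loop
    except StopIteration:
        pass
    writer.finish()  # for dict_org: sorts the DictDB index, optionally dictzips
```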
diff --git a/pyglossary/plugins/dict_org_source/__init__.py b/pyglossary/plugins/dict_org_source/__init__.py
index 5c899f1fe..9a9d63233 100644
--- a/pyglossary/plugins/dict_org_source/__init__.py
+++ b/pyglossary/plugins/dict_org_source/__init__.py
@@ -1,14 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-from typing import TYPE_CHECKING
-
from pyglossary.option import BoolOption, Option
-if TYPE_CHECKING:
- from collections.abc import Generator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .writer import Writer
__all__ = [
"Writer",
@@ -41,36 +36,3 @@
optionsProp: dict[str, Option] = {
"remove_html_all": BoolOption(comment="Remove all HTML tags"),
}
-
-
-class Writer:
- _remove_html_all: bool = True
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
-
- def finish(self) -> None:
- self._filename = ""
-
- def open(self, filename: str) -> None:
- self._filename = filename
- if self._remove_html_all:
- self._glos.removeHtmlTagsAll()
- # TODO: add another bool flag to only remove html tags that are not
- # supported by GtkTextView
-
- @staticmethod
- def _defiEscapeFunc(defi: str) -> str:
- return defi.replace("\r", "")
-
- def write(self) -> Generator[None, EntryType, None]:
- from pyglossary.text_writer import writeTxt
-
- yield from writeTxt(
- self._glos,
- entryFmt=":{word}:{defi}\n",
- filename=self._filename,
- defiEscapeFunc=self._defiEscapeFunc,
- ext=".dtxt",
- )
diff --git a/pyglossary/plugins/dict_org_source/writer.py b/pyglossary/plugins/dict_org_source/writer.py
new file mode 100644
index 000000000..1548f5975
--- /dev/null
+++ b/pyglossary/plugins/dict_org_source/writer.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Writer:
+ _remove_html_all: bool = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+
+ def finish(self) -> None:
+ self._filename = ""
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+ if self._remove_html_all:
+ self._glos.removeHtmlTagsAll()
+ # TODO: add another bool flag to only remove html tags that are not
+ # supported by GtkTextView
+
+ @staticmethod
+ def _defiEscapeFunc(defi: str) -> str:
+ return defi.replace("\r", "")
+
+ def write(self) -> Generator[None, EntryType, None]:
+ from pyglossary.text_writer import writeTxt
+
+ yield from writeTxt(
+ self._glos,
+ entryFmt=":{word}:{defi}\n",
+ filename=self._filename,
+ defiEscapeFunc=self._defiEscapeFunc,
+ ext=".dtxt",
+ )
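
With `entryFmt=":{word}:{defi}\n"`, each entry becomes one dictfmt-style "source" line; for example, an entry ("apple", "a round, edible fruit") would be written as:

```
:apple:a round, edible fruit
```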
diff --git a/pyglossary/plugins/dictunformat/__init__.py b/pyglossary/plugins/dictunformat/__init__.py
index a05c55459..77e5f8233 100644
--- a/pyglossary/plugins/dictunformat/__init__.py
+++ b/pyglossary/plugins/dictunformat/__init__.py
@@ -1,8 +1,8 @@
from __future__ import annotations
-from pyglossary.core import log
from pyglossary.option import EncodingOption, Option, StrOption
-from pyglossary.text_reader import TextGlossaryReader
+
+from .reader import Reader
__all__ = [
"Reader",
@@ -38,89 +38,3 @@
comment="separator for headword and alternates",
),
}
-
-
-def unescapeDefi(defi: str) -> str:
- return defi
-
-
-class Reader(TextGlossaryReader):
- _headword_separator = "; "
- # https://github.com/cheusov/dictd/blob/master/dictfmt/dictunformat.in#L14
-
- @classmethod
- def isInfoWord(cls, word: str) -> bool:
- return word.startswith("00-database-")
-
- @classmethod
- def fixInfoWord(cls, word: str) -> str:
- return word
-
- def setInfo(self, word: str, defi: str) -> None:
- if word == "00-database-short":
- self._glos.setInfo("name", defi)
- return
-
- if word != "00-database-info":
- return
-
- glos = self._glos
-
- lastKey = ""
- for line in defi.split("\n"):
- if not line.startswith("##:"):
- if lastKey:
- glos.setInfo(word, f"{glos.getInfo(lastKey)}\n{line}")
- continue
-
- parts = line[3:].split(":")
- if len(parts) < 2:
- log.error(f"unexpected line: {line}")
- key = lastKey = parts[0]
- value = ":".join(parts[1:])
- glos.setInfo(key, value)
-
- def nextBlock(self) -> tuple[str | list[str], str, None] | None:
- if not self._file:
- raise StopIteration
- word = ""
- defiLines: list[str] = []
-
- while True:
- line = self.readline()
- if not line:
- break
- line = line.rstrip("\n\r")
- if not line:
- continue
-
- if not line.strip("_"):
- if not word:
- continue
- if not defiLines:
- log.warning(f"no definition/value for {word!r}")
- defi = unescapeDefi("\n".join(defiLines))
- words = word.split(self._headword_separator)
- return words, defi, None
-
- if not word:
- word = line
- continue
-
- if line == word:
- continue
- if line.lower() == word:
- word = line
- continue
-
- defiLines.append(line)
-
- if word:
- defi = unescapeDefi("\n".join(defiLines))
- if word.startswith("00-database-") and defi == "unknown":
- log.info(f"ignoring {word} -> {defi}")
- return None
- words = word.split(self._headword_separator)
- return words, defi, None
-
- raise StopIteration
diff --git a/pyglossary/plugins/dictunformat/reader.py b/pyglossary/plugins/dictunformat/reader.py
new file mode 100644
index 000000000..c66a0f937
--- /dev/null
+++ b/pyglossary/plugins/dictunformat/reader.py
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+from pyglossary.core import log
+from pyglossary.text_reader import TextGlossaryReader
+
+
+def unescapeDefi(defi: str) -> str:
+ return defi
+
+
+class Reader(TextGlossaryReader):
+ _headword_separator = "; "
+ # https://github.com/cheusov/dictd/blob/master/dictfmt/dictunformat.in#L14
+
+ @classmethod
+ def isInfoWord(cls, word: str) -> bool:
+ return word.startswith("00-database-")
+
+ @classmethod
+ def fixInfoWord(cls, word: str) -> str:
+ return word
+
+ def setInfo(self, word: str, defi: str) -> None:
+ if word == "00-database-short":
+ self._glos.setInfo("name", defi)
+ return
+
+ if word != "00-database-info":
+ return
+
+ glos = self._glos
+
+ lastKey = ""
+ for line in defi.split("\n"):
+ if not line.startswith("##:"):
+ if lastKey:
+ glos.setInfo(word, f"{glos.getInfo(lastKey)}\n{line}")
+ continue
+
+ parts = line[3:].split(":")
+ if len(parts) < 2:
+ log.error(f"unexpected line: {line}")
+ key = lastKey = parts[0]
+ value = ":".join(parts[1:])
+ glos.setInfo(key, value)
+
+ def nextBlock(self) -> tuple[str | list[str], str, None] | None:
+ if not self._file:
+ raise StopIteration
+ word = ""
+ defiLines: list[str] = []
+
+ while True:
+ line = self.readline()
+ if not line:
+ break
+ line = line.rstrip("\n\r")
+ if not line:
+ continue
+
+ if not line.strip("_"):
+ if not word:
+ continue
+ if not defiLines:
+ log.warning(f"no definition/value for {word!r}")
+ defi = unescapeDefi("\n".join(defiLines))
+ words = word.split(self._headword_separator)
+ return words, defi, None
+
+ if not word:
+ word = line
+ continue
+
+ if line == word:
+ continue
+ if line.lower() == word:
+ word = line
+ continue
+
+ defiLines.append(line)
+
+ if word:
+ defi = unescapeDefi("\n".join(defiLines))
+ if word.startswith("00-database-") and defi == "unknown":
+ log.info(f"ignoring {word} -> {defi}")
+ return None
+ words = word.split(self._headword_separator)
+ return words, defi, None
+
+ raise StopIteration
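
For reference, `nextBlock()` expects the layout that dictunformat emits: blocks separated by underscore-only lines, with the headword repeated at the top of the article body. A minimal hand-made sample that parses to one entry:

```
_____

apple

apple
a round, edible fruit of the genus Malus
_____
```

The duplicated "apple" line is skipped by the `line == word` check, and a headword like "apple; apples" would be split on the `"; "` separator into alternates.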
diff --git a/pyglossary/plugins/digitalnk/__init__.py b/pyglossary/plugins/digitalnk/__init__.py
index cf35cef73..08c23d4eb 100644
--- a/pyglossary/plugins/digitalnk/__init__.py
+++ b/pyglossary/plugins/digitalnk/__init__.py
@@ -1,16 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-import html
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- import sqlite3
- from collections.abc import Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
from pyglossary.option import Option
+from .reader import Reader
+
__all__ = [
"Reader",
"description",
@@ -40,51 +37,3 @@
"@digitalprk/dicrs",
)
optionsProp: dict[str, Option] = {}
-
-
-class Reader:
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._con: sqlite3.Connection | None = None
- self._cur: sqlite3.Cursor | None = None
-
- def open(self, filename: str) -> None:
- from sqlite3 import connect
-
- self._filename = filename
- self._con = connect(filename)
- self._cur = self._con.cursor()
- self._glos.setDefaultDefiFormat("m")
-
- def __len__(self) -> int:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute("select count(*) from dictionary")
- return self._cur.fetchone()[0]
-
- def __iter__(self) -> Iterator[EntryType]:
- if self._cur is None:
- raise ValueError("cur is None")
- self._cur.execute(
- "select word, definition from dictionary order by word",
- )
- # iteration over self._cur stops after one entry
- # and self._cur.fetchone() returns None
- # no idea why!
- # https://github.com/ilius/pyglossary/issues/282
- # for row in self._cur:
- for row in self._cur.fetchall():
- word = html.unescape(row[0])
- definition = row[1]
- yield self._glos.newEntry(word, definition, defiFormat="m")
-
- def close(self) -> None:
- if self._cur:
- self._cur.close()
- if self._con:
- self._con.close()
- self._clear()
diff --git a/pyglossary/plugins/digitalnk/reader.py b/pyglossary/plugins/digitalnk/reader.py
new file mode 100644
index 000000000..5eb2ba373
--- /dev/null
+++ b/pyglossary/plugins/digitalnk/reader.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import html
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import sqlite3
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._con: sqlite3.Connection | None = None
+ self._cur: sqlite3.Cursor | None = None
+
+ def open(self, filename: str) -> None:
+ from sqlite3 import connect
+
+ self._filename = filename
+ self._con = connect(filename)
+ self._cur = self._con.cursor()
+ self._glos.setDefaultDefiFormat("m")
+
+ def __len__(self) -> int:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute("select count(*) from dictionary")
+ return self._cur.fetchone()[0]
+
+ def __iter__(self) -> Iterator[EntryType]:
+ if self._cur is None:
+ raise ValueError("cur is None")
+ self._cur.execute(
+ "select word, definition from dictionary order by word",
+ )
+ # iteration over self._cur stops after one entry
+ # and self._cur.fetchone() returns None
+ # no idea why!
+ # https://github.com/ilius/pyglossary/issues/282
+ # for row in self._cur:
+ for row in self._cur.fetchall():
+ word = html.unescape(row[0])
+ definition = row[1]
+ yield self._glos.newEntry(word, definition, defiFormat="m")
+
+ def close(self) -> None:
+ if self._cur:
+ self._cur.close()
+ if self._con:
+ self._con.close()
+ self._clear()
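
The `fetchall()` workaround above can be shown in isolation; the symptom reported in issue #282 was lazy cursor iteration stopping after one row, so the reader materializes all rows up front:

```python
# Minimal illustration of the pattern the reader relies on.
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("create table dictionary(word text, definition text)")
con.executemany(
    "insert into dictionary values (?, ?)",
    [("사전", "dictionary"), ("한국", "Korea")],
)
cur = con.cursor()
cur.execute("select word, definition from dictionary order by word")
rows = cur.fetchall()  # materialize before any further use of the cursor
print(len(rows))  # 2
con.close()
```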
diff --git a/pyglossary/plugins/dikt_json/__init__.py b/pyglossary/plugins/dikt_json/__init__.py
index e47315cd5..39eeecf74 100644
--- a/pyglossary/plugins/dikt_json/__init__.py
+++ b/pyglossary/plugins/dikt_json/__init__.py
@@ -4,23 +4,13 @@
from __future__ import annotations
-import re
-from typing import TYPE_CHECKING
-
-from pyglossary.compression import (
- # compressionOpen,
- stdCompressions,
-)
from pyglossary.option import (
BoolOption,
EncodingOption,
Option,
)
-if TYPE_CHECKING:
- from collections.abc import Generator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .writer import Writer
__all__ = [
"Writer",
@@ -55,65 +45,3 @@
comment="add headwords title to beginning of definition",
),
}
-
-
-class Writer:
- _encoding: str = "utf-8"
- _enable_info: bool = True
- _resources: bool = True
- _word_title: bool = False
-
- compressions = stdCompressions
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = None
- glos.preventDuplicateWords()
-
- def open(self, filename: str) -> None:
- self._filename = filename
-
- def finish(self) -> None:
- self._filename = None
-
- def write(self) -> Generator[None, EntryType, None]:
- from json import dumps
-
- from pyglossary.text_writer import writeTxt
-
- glos = self._glos
- encoding = self._encoding
- enable_info = self._enable_info
- resources = self._resources
-
- ensure_ascii = encoding == "ascii"
-
- def escape(st: str) -> str:
- # remove styling from HTML tags
- st2 = re.sub(r' style="[^"]*"', "", st)
- st2 = re.sub(r' class="[^"]*"', "", st2)
- st2 = re.sub(r"]*>", "", st2)
- st2 = st2.replace("", "")
- st2 = re.sub(r"\n", "", st2)
- st2 = st2.replace("", "")
- st2 = st2.replace("", "")
- # fix russian dictionary issues,
- # such as hyphenation in word (e.g. абб{[']}а{[/']}т)
- st2 = re.sub(r"\{\['\]\}", "", st2)
- st2 = re.sub(r"\{\[/'\]\}", "", st2)
- return dumps(st2, ensure_ascii=ensure_ascii)
-
- yield from writeTxt(
- glos,
- entryFmt="\t{word}: {defi},\n",
- filename=self._filename,
- encoding=encoding,
- writeInfo=enable_info,
- wordEscapeFunc=escape,
- defiEscapeFunc=escape,
- ext=".json",
- head="{\n",
- tail='\t"": ""\n}',
- resources=resources,
- word_title=self._word_title,
- )
diff --git a/pyglossary/plugins/dikt_json/writer.py b/pyglossary/plugins/dikt_json/writer.py
new file mode 100644
index 000000000..e7827ae4b
--- /dev/null
+++ b/pyglossary/plugins/dikt_json/writer.py
@@ -0,0 +1,80 @@
+# -*- coding: utf-8 -*-
+# mypy: ignore-errors
+# from https://github.com/maxim-saplin/pyglossary
+
+from __future__ import annotations
+
+import re
+from typing import TYPE_CHECKING
+
+from pyglossary.compression import (
+ # compressionOpen,
+ stdCompressions,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Writer:
+ _encoding: str = "utf-8"
+ _enable_info: bool = True
+ _resources: bool = True
+ _word_title: bool = False
+
+ compressions = stdCompressions
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = None
+ glos.preventDuplicateWords()
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+
+ def finish(self) -> None:
+ self._filename = None
+
+ def write(self) -> Generator[None, EntryType, None]:
+ from json import dumps
+
+ from pyglossary.text_writer import writeTxt
+
+ glos = self._glos
+ encoding = self._encoding
+ enable_info = self._enable_info
+ resources = self._resources
+
+ ensure_ascii = encoding == "ascii"
+
+ def escape(st: str) -> str:
+ # remove styling from HTML tags
+ st2 = re.sub(r' style="[^"]*"', "", st)
+ st2 = re.sub(r' class="[^"]*"', "", st2)
+ st2 = re.sub(r"]*>", "", st2)
+ st2 = st2.replace("", "")
+ st2 = re.sub(r"\n", "", st2)
+ st2 = st2.replace("", "")
+ st2 = st2.replace("", "")
+ # fix russian dictionary issues,
+ # such as hyphenation in word (e.g. абб{[']}а{[/']}т)
+ st2 = re.sub(r"\{\['\]\}", "", st2)
+ st2 = re.sub(r"\{\[/'\]\}", "", st2)
+ return dumps(st2, ensure_ascii=ensure_ascii)
+
+ yield from writeTxt(
+ glos,
+ entryFmt="\t{word}: {defi},\n",
+ filename=self._filename,
+ encoding=encoding,
+ writeInfo=enable_info,
+ wordEscapeFunc=escape,
+ defiEscapeFunc=escape,
+ ext=".json",
+ head="{\n",
+ tail='\t"": ""\n}',
+ resources=resources,
+ word_title=self._word_title,
+ )
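
Given the `head`, `tail`, and `entryFmt` above, the whole glossary serializes to a single flat JSON object; the `"": ""` tail entry exists to absorb the trailing comma after the last real entry. A two-entry glossary would come out roughly as:

```json
{
	"apple": "a fruit",
	"tree": "a large plant",
	"": ""
}
```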
diff --git a/pyglossary/plugins/ebook_epub2/__init__.py b/pyglossary/plugins/ebook_epub2/__init__.py
index 8bf34801b..baabf0036 100644
--- a/pyglossary/plugins/ebook_epub2/__init__.py
+++ b/pyglossary/plugins/ebook_epub2/__init__.py
@@ -1,27 +1,7 @@
# -*- coding: utf-8 -*-
-# The MIT License (MIT)
-# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
-# Copyright © 2016-2019 Saeed Rasooli
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-from __future__ import annotations
-from typing import TYPE_CHECKING, Any
+from __future__ import annotations
-from pyglossary.ebook_base import EbookWriter
from pyglossary.flags import ALWAYS
from pyglossary.option import (
BoolOption,
@@ -30,8 +10,7 @@
StrOption,
)
-if TYPE_CHECKING:
- from pyglossary.glossary_types import GlossaryType
+from .writer import Writer
__all__ = [
"Writer",
@@ -85,209 +64,3 @@
comment="Path to cover file",
),
}
-
-
-class Writer(EbookWriter):
- # these class attrs are only in Epub
- # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS
- # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE
-
- MIMETYPE_CONTENTS = "application/epub+zip"
- CONTAINER_XML_CONTENTS = """<?xml version="1.0" encoding="UTF-8" ?>
-<container version="1.0"
- xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
- <rootfiles>
- <rootfile full-path="OEBPS/content.opf"
- media-type="application/oebps-package+xml"/>
- </rootfiles>
-</container>"""
-
- NCX_TEMPLATE = """
-
-
-
-
-
-
-
-
-
- {title}
-
-
-{ncx_items}
-
-"""
-
- NCX_NAVPOINT_TEMPLATE = """\t<navPoint id="n{index:06d}" playOrder="{index:d}">
- <navLabel>
- <text>{text}</text>
- </navLabel>
- <content src="{src}" />
- </navPoint>"""
-
- CSS_CONTENTS = b"""@charset "UTF-8";
-body {
- margin: 10px 25px 10px 25px;
-}
-h1 {
- font-size: 200%;
-}
-h2 {
- font-size: 150%;
-}
-p {
- margin-left: 0em;
- margin-right: 0em;
- margin-top: 0em;
- margin-bottom: 0em;
- line-height: 2em;
- text-align: justify;
-}
-a, a:focus, a:active, a:visited {
- color: black;
- text-decoration: none;
-}
-body.indexPage {}
-h1.indexTitle {}
-p.indexGroups {
- font-size: 150%;
-}
-span.indexGroup {}
-body.groupPage {}
-h1.groupTitle {}
-div.groupNavigation {}
-span.groupHeadword {}
-div.groupEntry {
- margin-top: 0;
- margin-bottom: 1em;
-}
-h2.groupHeadword {
- margin-left: 5%;
-}
-p.groupDefinition {
- margin-left: 10%;
- margin-right: 10%;
-}
-"""
-
- GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8" ?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
- "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <title>{title}</title>
- <link rel="stylesheet" href="style.css" type="text/css" />
- </head>
- <body id="groupPage" class="groupPage">
- <h1 class="groupTitle">{group_title}</h1>
- <div class="groupNavigation">
-{group_contents}
- </div>
-</body></html>"""
- GROUP_XHTML_INDEX_LINK = '\t\t[ <a href="index.xhtml">Index</a> ]'
-
- GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t<div class="groupEntry">
- <h2 class="groupHeadword">{headword}</h2>
- <p class="groupDefinition">{definition}</p>
- </div>"""
-
- OPF_TEMPLATE = """
-
-
- {identifier}
- {sourceLang}
- {title}
- {creator}
- {copyright}
- {creationDate}
- {cover}
-
-
-{manifest}
-
-
-{spine}
-
-"""
-
- COVER_TEMPLATE = '<meta name="cover" content="{cover}" />'
-
- def __init__(self, glos: GlossaryType) -> None:
- import uuid
-
- EbookWriter.__init__(
- self,
- glos,
- )
- glos.setInfo("uuid", str(uuid.uuid4()).replace("-", ""))
-
- @classmethod
- def cls_get_prefix(
- cls: type[EbookWriter],
- options: dict[str, Any],
- word: str,
- ) -> str:
- if not word:
- return ""
- length = options.get("group_by_prefix_length", cls._group_by_prefix_length)
- prefix = word[:length].lower()
- if prefix[0] < "a":
- return "SPECIAL"
- return prefix
-
- def get_prefix(self, word: str) -> str:
- if not word:
- return ""
- length = self._group_by_prefix_length
- prefix = word[:length].lower()
- if prefix[0] < "a":
- return "SPECIAL"
- return prefix
-
- def write_ncx(self, group_labels: list[str]) -> None:
- """
- write_ncx
- only for epub.
- """
- ncx_items: list[str] = []
- index = 1
- if self._include_index_page:
- ncx_items.append(
- self.NCX_NAVPOINT_TEMPLATE.format(
- index=index,
- text="Index",
- src="index.xhtml",
- ),
- )
- index += 1
- for group_label in group_labels:
- ncx_items.append(
- self.NCX_NAVPOINT_TEMPLATE.format(
- index=index,
- text=group_label,
- src=self.get_group_xhtml_file_name_from_index(index),
- ),
- )
- index += 1
- ncx_items_unicode = "\n".join(ncx_items)
- ncx_contents = self.NCX_TEMPLATE.format(
- identifier=self._glos.getInfo("uuid"),
- title=self._glos.getInfo("name"),
- ncx_items=ncx_items_unicode,
- ).encode("utf-8")
- self.add_file_manifest(
- "OEBPS/toc.ncx",
- "toc.ncx",
- ncx_contents,
- "application/x-dtbncx+xml",
- )
-
- # inherits write from EbookWriter
diff --git a/pyglossary/plugins/ebook_epub2/writer.py b/pyglossary/plugins/ebook_epub2/writer.py
new file mode 100644
index 000000000..eba888c33
--- /dev/null
+++ b/pyglossary/plugins/ebook_epub2/writer.py
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+# The MIT License (MIT)
+# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
+# Copyright © 2016-2019 Saeed Rasooli
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from pyglossary.ebook_base import EbookWriter
+
+if TYPE_CHECKING:
+ from pyglossary.glossary_types import GlossaryType
+
+
+class Writer(EbookWriter):
+ # these class attrs are only in Epub
+ # MIMETYPE_CONTENTS, CONTAINER_XML_CONTENTS
+ # NCX_TEMPLATE, NCX_NAVPOINT_TEMPLATE
+
+ MIMETYPE_CONTENTS = "application/epub+zip"
+ CONTAINER_XML_CONTENTS = """<?xml version="1.0" encoding="UTF-8" ?>
+<container version="1.0"
+ xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
+ <rootfiles>
+ <rootfile full-path="OEBPS/content.opf"
+ media-type="application/oebps-package+xml"/>
+ </rootfiles>
+</container>"""
+
+ NCX_TEMPLATE = """
+
+
+
+
+
+
+
+
+
+ {title}
+
+
+{ncx_items}
+
+"""
+
+ NCX_NAVPOINT_TEMPLATE = """\t<navPoint id="n{index:06d}" playOrder="{index:d}">
+ <navLabel>
+ <text>{text}</text>
+ </navLabel>
+ <content src="{src}" />
+ </navPoint>"""
+
+ CSS_CONTENTS = b"""@charset "UTF-8";
+body {
+ margin: 10px 25px 10px 25px;
+}
+h1 {
+ font-size: 200%;
+}
+h2 {
+ font-size: 150%;
+}
+p {
+ margin-left: 0em;
+ margin-right: 0em;
+ margin-top: 0em;
+ margin-bottom: 0em;
+ line-height: 2em;
+ text-align: justify;
+}
+a, a:focus, a:active, a:visited {
+ color: black;
+ text-decoration: none;
+}
+body.indexPage {}
+h1.indexTitle {}
+p.indexGroups {
+ font-size: 150%;
+}
+span.indexGroup {}
+body.groupPage {}
+h1.groupTitle {}
+div.groupNavigation {}
+span.groupHeadword {}
+div.groupEntry {
+ margin-top: 0;
+ margin-bottom: 1em;
+}
+h2.groupHeadword {
+ margin-left: 5%;
+}
+p.groupDefinition {
+ margin-left: 10%;
+ margin-right: 10%;
+}
+"""
+
+ GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8" ?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>{title}</title>
+ <link rel="stylesheet" href="style.css" type="text/css" />
+ </head>
+ <body id="groupPage" class="groupPage">
+ <h1 class="groupTitle">{group_title}</h1>
+ <div class="groupNavigation">
+{group_contents}
+ </div>
+</body></html>"""
+ GROUP_XHTML_INDEX_LINK = '\t\t[ <a href="index.xhtml">Index</a> ]'
+
+ GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """\t<div class="groupEntry">
+ <h2 class="groupHeadword">{headword}</h2>
+ <p class="groupDefinition">{definition}</p>
+ </div>"""
+
+ OPF_TEMPLATE = """
+
+
+ {identifier}
+ {sourceLang}
+ {title}
+ {creator}
+ {copyright}
+ {creationDate}
+ {cover}
+
+
+{manifest}
+
+
+{spine}
+
+"""
+
+ COVER_TEMPLATE = '<meta name="cover" content="{cover}" />'
+
+ def __init__(self, glos: GlossaryType) -> None:
+ import uuid
+
+ EbookWriter.__init__(
+ self,
+ glos,
+ )
+ glos.setInfo("uuid", str(uuid.uuid4()).replace("-", ""))
+
+ @classmethod
+ def cls_get_prefix(
+ cls: type[EbookWriter],
+ options: dict[str, Any],
+ word: str,
+ ) -> str:
+ if not word:
+ return ""
+ length = options.get("group_by_prefix_length", cls._group_by_prefix_length)
+ prefix = word[:length].lower()
+ if prefix[0] < "a":
+ return "SPECIAL"
+ return prefix
+
+ def get_prefix(self, word: str) -> str:
+ if not word:
+ return ""
+ length = self._group_by_prefix_length
+ prefix = word[:length].lower()
+ if prefix[0] < "a":
+ return "SPECIAL"
+ return prefix
+
+ def write_ncx(self, group_labels: list[str]) -> None:
+ """
+ write_ncx
+ only for epub.
+ """
+ ncx_items: list[str] = []
+ index = 1
+ if self._include_index_page:
+ ncx_items.append(
+ self.NCX_NAVPOINT_TEMPLATE.format(
+ index=index,
+ text="Index",
+ src="index.xhtml",
+ ),
+ )
+ index += 1
+ for group_label in group_labels:
+ ncx_items.append(
+ self.NCX_NAVPOINT_TEMPLATE.format(
+ index=index,
+ text=group_label,
+ src=self.get_group_xhtml_file_name_from_index(index),
+ ),
+ )
+ index += 1
+ ncx_items_unicode = "\n".join(ncx_items)
+ ncx_contents = self.NCX_TEMPLATE.format(
+ identifier=self._glos.getInfo("uuid"),
+ title=self._glos.getInfo("name"),
+ ncx_items=ncx_items_unicode,
+ ).encode("utf-8")
+ self.add_file_manifest(
+ "OEBPS/toc.ncx",
+ "toc.ncx",
+ ncx_contents,
+ "application/x-dtbncx+xml",
+ )
+
+ # inherits write from EbookWriter
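
The two prefix helpers above implement the same bucketing rule (one reachable from class-level options, one from an instance). In isolation, assuming the default `group_by_prefix_length` of 2 from the plugin options:

```python
# Words are grouped by their lowercased leading letters; anything that sorts
# before "a" (digits, punctuation) falls into a shared "SPECIAL" group.
def get_prefix(word: str, length: int = 2) -> str:
    if not word:
        return ""
    prefix = word[:length].lower()
    if prefix[0] < "a":
        return "SPECIAL"
    return prefix

assert get_prefix("Apple") == "ap"
assert get_prefix("42nd") == "SPECIAL"
assert get_prefix("zebra") == "ze"
```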
diff --git a/pyglossary/plugins/ebook_kobo/__init__.py b/pyglossary/plugins/ebook_kobo/__init__.py
index 02a108f88..cbd9b6f90 100644
--- a/pyglossary/plugins/ebook_kobo/__init__.py
+++ b/pyglossary/plugins/ebook_kobo/__init__.py
@@ -1,41 +1,14 @@
# -*- coding: utf-8 -*-
-# The MIT License (MIT)
-# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
-# Copyright © 2022 Saeed Rasooli
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
+
from __future__ import annotations
-import re
-import unicodedata
-from gzip import compress, decompress
-from operator import itemgetter
-from pathlib import Path
-from pickle import dumps, loads
from typing import TYPE_CHECKING
-from pyglossary import core
-from pyglossary.core import exc_note, log, pip
from pyglossary.flags import NEVER
-from pyglossary.os_utils import indir
-if TYPE_CHECKING:
- from collections.abc import Generator
+from .writer import Writer
- from pyglossary.glossary_types import EntryType, GlossaryType
+if TYPE_CHECKING:
from pyglossary.option import Option
__all__ = [
@@ -75,199 +48,3 @@
# Penelope option: marisa_index_size=1000000
-
-
-def is_cyrillic_char(c: str) -> bool:
- # U+0400 - U+04FF: Cyrillic
- # U+0500 - U+052F: Cyrillic Supplement
- if "\u0400" <= c <= "\u052f":
- return True
-
- # U+2DE0 - U+2DFF: Cyrillic Extended-A
- if "\u2de0" <= c <= "\u2dff":
- return True
-
- # U+A640 - U+A69F: Cyrillic Extended-B
- if "\ua640" <= c <= "\ua69f":
- return True
-
- # U+1C80 - U+1C8F: Cyrillic Extended-C
- if "\u1c80" <= c <= "\u1c8f":
- return True
-
- # U+FE2E, U+FE2F: Combining Half Marks
- # U+1D2B, U+1D78: Phonetic Extensions
- return c in {"\ufe2e", "\ufe2f", "\u1d2b", "\u1d78"}
-
-
-def fixFilename(fname: str) -> str:
- return Path(fname.replace("/", "2F").replace("\\", "5C")).name
-
-
-class Writer:
- WORDS_FILE_NAME = "words"
-
- depends = {
- "marisa_trie": "marisa-trie",
- }
-
- @staticmethod
- def stripFullHtmlError(entry: EntryType, error: str) -> None:
- log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}")
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
- self._words: list[str] = []
- self._img_pattern = re.compile(
- '<img( [^>]*?)?>',
- re.DOTALL,
- )
- # img tag has no closing
- glos.stripFullHtml(errorHandler=self.stripFullHtmlError)
-
- def get_prefix(self, word: str) -> str: # noqa: PLR6301
- if not word:
- return "11"
- wo = word[:2].strip().lower()
- if not wo:
- return "11"
- if wo[0] == "\x00":
- return "11"
- if len(wo) > 1 and wo[1] == "\x00":
- wo = wo[:1]
- if is_cyrillic_char(wo[0]):
- return wo
- # if either of the first 2 chars are not unicode letters, return "11"
- for c in wo:
- if not unicodedata.category(c).startswith("L"):
- return "11"
- return wo.ljust(2, "a")
-
- def fix_defi(self, defi: str) -> str:
- # @pgaskin on #219: Kobo supports images in dictionaries,
- # but these have a lot of gotchas
- # (see https://pgaskin.net/dictutil/dicthtml/format.html).
- # Basically, the best way to do it is to encode the images as a
- # base64 data URL after shrinking it and making it grayscale
- # (if it's JPG, this is as simple as only keeping the Y channel)
-
- # for now we just skip data entries and remove '<img' tags
- defi = self._img_pattern.sub("", defi)
- return defi
-
- def write_groups(self) -> Generator[None, EntryType, None]:
- import gzip
-
- dataEntryCount = 0
-
- htmlHeader = '<?xml version="1.0" encoding="utf-8"?><html>\n'
-
- groupCounter = 0
- htmlContents = htmlHeader
-
- def writeGroup(lastPrefix: str) -> None:
- nonlocal htmlContents
- group_fname = fixFilename(lastPrefix)
- htmlContents += ""
- core.trace(
- log,
- f"writeGroup: {lastPrefix!r}, "
- f"{group_fname!r}, count={groupCounter}",
- )
- with gzip.open(group_fname + ".html", mode="wb") as gzipFile:
- gzipFile.write(htmlContents.encode("utf-8"))
- htmlContents = htmlHeader
-
- allWords: list[str] = []
- # TODO: switch to SQLite, like StarDict writer
- data: list[tuple[str, bytes]] = []
-
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- dataEntryCount += 1
- continue
- l_word = entry.l_word
- allWords += l_word
- wordsByPrefix: dict[str, list[str]] = {}
- for word in l_word:
- prefix = self.get_prefix(word)
- if prefix in wordsByPrefix:
- wordsByPrefix[prefix].append(word)
- else:
- wordsByPrefix[prefix] = [word]
- defi = self.fix_defi(entry.defi)
- mainHeadword = l_word[0]
- for prefix, p_words in wordsByPrefix.items():
- headword, *variants = p_words
- if headword != mainHeadword:
- headword = f"{mainHeadword}, {headword}"
- data.append(
- (
- prefix,
- compress(
- dumps(
- (
- headword,
- variants,
- defi,
- ),
- ),
- ),
- ),
- )
- del entry
-
- log.info("Kobo: sorting entries...")
- data.sort(key=itemgetter(0))
-
- log.info("Kobo: writing entries...")
-
- lastPrefix = ""
- for prefix, row in data:
- headword, variants, defi = loads(decompress(row))
- if lastPrefix and prefix != lastPrefix:
- writeGroup(lastPrefix)
- groupCounter = 0
- lastPrefix = prefix
-
- htmlVariants = "".join(
- f'' for v in variants
- )
- body = f"{headword}{htmlVariants}
{defi}
"
- htmlContents += f'{body}\n'
- groupCounter += 1
- del data
-
- if groupCounter > 0:
- writeGroup(lastPrefix)
-
- if dataEntryCount > 0:
- log.warning(
- f"ignored {dataEntryCount} files (data entries)"
- " and replaced '<img' tags in definitions",
- )
-
- def open(self, filename: str) -> None:
- try:
- import marisa_trie # type: ignore # noqa: F401
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install marisa-trie` to install")
- raise
- self._filename = filename
-
- def write(self) -> Generator[None, EntryType, None]:
- with indir(self._filename, create=True):
- yield from self.write_groups()
-
- def finish(self) -> None:
- import marisa_trie
-
- with indir(self._filename, create=False):
- trie = marisa_trie.Trie(self._words)
- trie.save(self.WORDS_FILE_NAME)
- self._filename = ""
diff --git a/pyglossary/plugins/ebook_kobo/writer.py b/pyglossary/plugins/ebook_kobo/writer.py
new file mode 100644
index 000000000..5b26aff01
--- /dev/null
+++ b/pyglossary/plugins/ebook_kobo/writer.py
@@ -0,0 +1,233 @@
+# -*- coding: utf-8 -*-
+# The MIT License (MIT)
+# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
+# Copyright © 2022 Saeed Rasooli
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from __future__ import annotations
+
+import re
+import unicodedata
+from gzip import compress, decompress
+from operator import itemgetter
+from pathlib import Path
+from pickle import dumps, loads
+from typing import TYPE_CHECKING
+
+from pyglossary import core
+from pyglossary.core import exc_note, log, pip
+from pyglossary.os_utils import indir
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+def is_cyrillic_char(c: str) -> bool:
+ # U+0400 - U+04FF: Cyrillic
+ # U+0500 - U+052F: Cyrillic Supplement
+ if "\u0400" <= c <= "\u052f":
+ return True
+
+ # U+2DE0 - U+2DFF: Cyrillic Extended-A
+ if "\u2de0" <= c <= "\u2dff":
+ return True
+
+ # U+A640 - U+A69F: Cyrillic Extended-B
+ if "\ua640" <= c <= "\ua69f":
+ return True
+
+ # U+1C80 - U+1C8F: Cyrillic Extended-C
+ if "\u1c80" <= c <= "\u1c8f":
+ return True
+
+ # U+FE2E, U+FE2F: Combining Half Marks
+ # U+1D2B, U+1D78: Phonetic Extensions
+ return c in {"\ufe2e", "\ufe2f", "\u1d2b", "\u1d78"}
+
+
+def fixFilename(fname: str) -> str:
+ return Path(fname.replace("/", "2F").replace("\\", "5C")).name
+
+
+class Writer:
+ WORDS_FILE_NAME = "words"
+
+ depends = {
+ "marisa_trie": "marisa-trie",
+ }
+
+ @staticmethod
+ def stripFullHtmlError(entry: EntryType, error: str) -> None:
+ log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}")
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+ self._words: list[str] = []
+ self._img_pattern = re.compile(
+ '<img( [^>]*?)?>',
+ re.DOTALL,
+ )
+ # img tag has no closing
+ glos.stripFullHtml(errorHandler=self.stripFullHtmlError)
+
+ def get_prefix(self, word: str) -> str: # noqa: PLR6301
+ if not word:
+ return "11"
+ wo = word[:2].strip().lower()
+ if not wo:
+ return "11"
+ if wo[0] == "\x00":
+ return "11"
+ if len(wo) > 1 and wo[1] == "\x00":
+ wo = wo[:1]
+ if is_cyrillic_char(wo[0]):
+ return wo
+ # if either of the first 2 chars are not unicode letters, return "11"
+ for c in wo:
+ if not unicodedata.category(c).startswith("L"):
+ return "11"
+ return wo.ljust(2, "a")
+
+ def fix_defi(self, defi: str) -> str:
+ # @pgaskin on #219: Kobo supports images in dictionaries,
+ # but these have a lot of gotchas
+ # (see https://pgaskin.net/dictutil/dicthtml/format.html).
+ # Basically, the best way to do it is to encode the images as a
+ # base64 data URL after shrinking it and making it grayscale
+ # (if it's JPG, this is as simple as only keeping the Y channel)
+
+ # for now we just skip data entries and remove '<img' tags
+ defi = self._img_pattern.sub("", defi)
+ return defi
+
+ def write_groups(self) -> Generator[None, EntryType, None]:
+ import gzip
+
+ dataEntryCount = 0
+
+ htmlHeader = '<?xml version="1.0" encoding="utf-8"?><html>\n'
+
+ groupCounter = 0
+ htmlContents = htmlHeader
+
+ def writeGroup(lastPrefix: str) -> None:
+ nonlocal htmlContents
+ group_fname = fixFilename(lastPrefix)
+ htmlContents += ""
+ core.trace(
+ log,
+ f"writeGroup: {lastPrefix!r}, "
+ f"{group_fname!r}, count={groupCounter}",
+ )
+ with gzip.open(group_fname + ".html", mode="wb") as gzipFile:
+ gzipFile.write(htmlContents.encode("utf-8"))
+ htmlContents = htmlHeader
+
+ allWords: list[str] = []
+ # TODO: switch to SQLite, like StarDict writer
+ data: list[tuple[str, bytes]] = []
+
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ dataEntryCount += 1
+ continue
+ l_word = entry.l_word
+ allWords += l_word
+ wordsByPrefix: dict[str, list[str]] = {}
+ for word in l_word:
+ prefix = self.get_prefix(word)
+ if prefix in wordsByPrefix:
+ wordsByPrefix[prefix].append(word)
+ else:
+ wordsByPrefix[prefix] = [word]
+ defi = self.fix_defi(entry.defi)
+ mainHeadword = l_word[0]
+ for prefix, p_words in wordsByPrefix.items():
+ headword, *variants = p_words
+ if headword != mainHeadword:
+ headword = f"{mainHeadword}, {headword}"
+ data.append(
+ (
+ prefix,
+ compress(
+ dumps(
+ (
+ headword,
+ variants,
+ defi,
+ ),
+ ),
+ ),
+ ),
+ )
+ del entry
+
+ log.info("Kobo: sorting entries...")
+ data.sort(key=itemgetter(0))
+
+ log.info("Kobo: writing entries...")
+
+ lastPrefix = ""
+ for prefix, row in data:
+ headword, variants, defi = loads(decompress(row))
+ if lastPrefix and prefix != lastPrefix:
+ writeGroup(lastPrefix)
+ groupCounter = 0
+ lastPrefix = prefix
+
+ htmlVariants = "".join(
+ f'' for v in variants
+ )
+ body = f"{headword}{htmlVariants}
{defi}
"
+ htmlContents += f'{body}\n'
+ groupCounter += 1
+ del data
+
+ if groupCounter > 0:
+ writeGroup(lastPrefix)
+
+ if dataEntryCount > 0:
+ log.warning(
+ f"ignored {dataEntryCount} files (data entries)"
+ " and replaced '<img' tags in definitions",
+ )
+
+ def open(self, filename: str) -> None:
+ try:
+ import marisa_trie # type: ignore # noqa: F401
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install marisa-trie` to install")
+ raise
+ self._filename = filename
+
+ def write(self) -> Generator[None, EntryType, None]:
+ with indir(self._filename, create=True):
+ yield from self.write_groups()
+
+ def finish(self) -> None:
+ import marisa_trie
+
+ with indir(self._filename, create=False):
+ trie = marisa_trie.Trie(self._words)
+ trie.save(self.WORDS_FILE_NAME)
+ self._filename = ""
diff --git a/pyglossary/plugins/ebook_kobo_dictfile/__init__.py b/pyglossary/plugins/ebook_kobo_dictfile/__init__.py
index 7ec327ee3..946b18dfd 100644
--- a/pyglossary/plugins/ebook_kobo_dictfile/__init__.py
+++ b/pyglossary/plugins/ebook_kobo_dictfile/__init__.py
@@ -1,42 +1,15 @@
# -*- coding: utf-8 -*-
-# The MIT License (MIT)
-# Copyright © 2020-2021 Saeed Rasooli
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-from __future__ import annotations
-import os
-from os.path import isdir
-from typing import TYPE_CHECKING
+from __future__ import annotations
-from pyglossary.core import exc_note, log, pip
-from pyglossary.image_utils import extractInlineHtmlImages
-from pyglossary.io_utils import nullTextIO
from pyglossary.option import (
BoolOption,
EncodingOption,
Option,
)
-from pyglossary.text_reader import TextGlossaryReader
-if TYPE_CHECKING:
- import io
- from collections.abc import Generator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .reader import Reader
+from .writer import Writer
__all__ = [
"Reader",
@@ -73,155 +46,3 @@
"encoding": EncodingOption(),
"extract_inline_images": BoolOption(comment="Extract inline images"),
}
-
-
-def fixWord(word: str) -> str:
- return word.replace("\n", " ")
-
-
-def escapeDefi(defi: str) -> str:
- return defi.replace("\n@", "\n @").replace("\n:", "\n :").replace("\n&", "\n &")
-
-
-class Reader(TextGlossaryReader):
- depends = {
- "mistune": "mistune==3.0.1",
- }
-
- _extract_inline_images: bool = True
-
- def __init__(self, glos: GlossaryType) -> None:
- TextGlossaryReader.__init__(self, glos, hasInfo=False)
-
- def open(self, filename: str) -> None:
- try:
- import mistune # type: ignore # noqa: F401
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install mistune` to install")
- raise
- TextGlossaryReader.open(self, filename)
- self._glos.setDefaultDefiFormat("h")
-
- @classmethod
- def isInfoWord(cls, _word: str) -> bool:
- return False
-
- @classmethod
- def fixInfoWord(cls, _word: str) -> str:
- raise NotImplementedError
-
- def fixDefi(
- self,
- defi: str,
- html: bool,
- ) -> tuple[str, list[tuple[str, str]] | None]:
- import mistune
-
- defi = (
- defi.replace("\n @", "\n@")
- .replace("\n :", "\n:")
- .replace("\n &", "\n&")
- .replace("
", "")
- .replace("
", "")
- .replace("", "")
- )
- defi = defi.strip()
- if html:
- pass
- else:
- defi = mistune.html(defi)
- images: list[tuple[str, str]] | None = None
- if self._extract_inline_images:
- defi, images = extractInlineHtmlImages(
- defi,
- self._glos.tmpDataDir,
- fnamePrefix="", # maybe f"{self._pos:06d}-"
- )
- return defi, images
-
- def nextBlock(
- self,
- ) -> tuple[list[str], str, list[tuple[str, str]] | None]:
- words: list[str] = []
- defiLines: list[str] = []
- html = False
-
- while True:
- line = self.readline()
- if not line:
- break
- line = line.rstrip("\n\r")
- if line.startswith("@"):
- if words:
- self._bufferLine = line
- defi, images = self.fixDefi("\n".join(defiLines), html=html)
- return words, defi, images
- words = [line[1:].strip()]
- continue
- if line.startswith(": "):
- defiLines.append(line[2:])
- continue
- if line.startswith("::"):
- continue
- if line.startswith("&"):
- words.append(line[1:].strip())
- continue
- if line.startswith(""):
- line = line[6:]
- html = True
- defiLines.append(line)
-
- if words:
- defi, images = self.fixDefi("\n".join(defiLines), html=html)
- return words, defi, images
-
- raise StopIteration
-
-
-class Writer:
- _encoding: str = "utf-8"
-
- @staticmethod
- def stripFullHtmlError(entry: EntryType, error: str) -> None:
- log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}")
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._file: io.TextIOBase = nullTextIO
- glos.stripFullHtml(errorHandler=self.stripFullHtmlError)
-
- def finish(self) -> None:
- self._file.close()
- if not os.listdir(self._resDir):
- os.rmdir(self._resDir)
-
- def open(self, filename: str) -> None:
- self._file = open(filename, "w", encoding=self._encoding)
- # dictgen's ParseDictFile does not seem to support glossary info / metadata
- self._resDir = filename + "_res"
- if not isdir(self._resDir):
- os.mkdir(self._resDir)
-
- def write(
- self,
- ) -> Generator[None, EntryType, None]:
- fileObj = self._file
- resDir = self._resDir
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- entry.save(resDir)
- continue
- words = entry.l_word
- defi = entry.defi
-
- entry.detectDefiFormat()
- if entry.defiFormat == "h":
- defi = f"{entry.defi}"
-
- fileObj.write(f"@ {fixWord(words[0])}\n")
- for alt in words[1:]:
- fileObj.write(f"& {fixWord(alt)}\n")
- fileObj.write(f"{escapeDefi(defi)}\n\n")
diff --git a/pyglossary/plugins/ebook_kobo_dictfile/reader.py b/pyglossary/plugins/ebook_kobo_dictfile/reader.py
new file mode 100644
index 000000000..131ab6190
--- /dev/null
+++ b/pyglossary/plugins/ebook_kobo_dictfile/reader.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+# The MIT License (MIT)
+# Copyright © 2020-2021 Saeed Rasooli
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pyglossary.core import exc_note, pip
+from pyglossary.image_utils import extractInlineHtmlImages
+from pyglossary.text_reader import TextGlossaryReader
+
+if TYPE_CHECKING:
+ from pyglossary.glossary_types import GlossaryType
+
+
+class Reader(TextGlossaryReader):
+ depends = {
+ "mistune": "mistune==3.0.1",
+ }
+
+ _extract_inline_images: bool = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ TextGlossaryReader.__init__(self, glos, hasInfo=False)
+
+ def open(self, filename: str) -> None:
+ try:
+ import mistune # type: ignore # noqa: F401
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install mistune` to install")
+ raise
+ TextGlossaryReader.open(self, filename)
+ self._glos.setDefaultDefiFormat("h")
+
+ @classmethod
+ def isInfoWord(cls, _word: str) -> bool:
+ return False
+
+ @classmethod
+ def fixInfoWord(cls, _word: str) -> str:
+ raise NotImplementedError
+
+ def fixDefi(
+ self,
+ defi: str,
+ html: bool,
+ ) -> tuple[str, list[tuple[str, str]] | None]:
+ import mistune
+
+ defi = (
+ defi.replace("\n @", "\n@")
+ .replace("\n :", "\n:")
+ .replace("\n &", "\n&")
+ .replace("
", "")
+ .replace("
", "")
+ .replace("", "")
+ )
+ defi = defi.strip()
+ if html:
+ pass
+ else:
+ defi = mistune.html(defi)
+ images: list[tuple[str, str]] | None = None
+ if self._extract_inline_images:
+ defi, images = extractInlineHtmlImages(
+ defi,
+ self._glos.tmpDataDir,
+ fnamePrefix="", # maybe f"{self._pos:06d}-"
+ )
+ return defi, images
+
+ def nextBlock(
+ self,
+ ) -> tuple[list[str], str, list[tuple[str, str]] | None]:
+ words: list[str] = []
+ defiLines: list[str] = []
+ html = False
+
+ while True:
+ line = self.readline()
+ if not line:
+ break
+ line = line.rstrip("\n\r")
+ if line.startswith("@"):
+ if words:
+ self._bufferLine = line
+ defi, images = self.fixDefi("\n".join(defiLines), html=html)
+ return words, defi, images
+ words = [line[1:].strip()]
+ continue
+ if line.startswith(": "):
+ defiLines.append(line[2:])
+ continue
+ if line.startswith("::"):
+ continue
+ if line.startswith("&"):
+ words.append(line[1:].strip())
+ continue
+ if line.startswith(""):
+ line = line[6:]
+ html = True
+ defiLines.append(line)
+
+ if words:
+ defi, images = self.fixDefi("\n".join(defiLines), html=html)
+ return words, defi, images
+
+ raise StopIteration
diff --git a/pyglossary/plugins/ebook_kobo_dictfile/writer.py b/pyglossary/plugins/ebook_kobo_dictfile/writer.py
new file mode 100644
index 000000000..60c9c9651
--- /dev/null
+++ b/pyglossary/plugins/ebook_kobo_dictfile/writer.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+# The MIT License (MIT)
+# Copyright © 2020-2021 Saeed Rasooli
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from __future__ import annotations
+
+import os
+from os.path import isdir
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+from pyglossary.io_utils import nullTextIO
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+def fixWord(word: str) -> str:
+ return word.replace("\n", " ")
+
+
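+# "@", ":" and "&" are only markers at the start of a line, so escapeDefi
+# pushes them off column zero with a space to keep definition text literal.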
+def escapeDefi(defi: str) -> str:
+ return defi.replace("\n@", "\n @").replace("\n:", "\n :").replace("\n&", "\n &")
+
+
+class Writer:
+ _encoding: str = "utf-8"
+
+ @staticmethod
+ def stripFullHtmlError(entry: EntryType, error: str) -> None:
+ log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}")
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._file: io.TextIOBase = nullTextIO
+ glos.stripFullHtml(errorHandler=self.stripFullHtmlError)
+
+ def finish(self) -> None:
+ self._file.close()
+ if not os.listdir(self._resDir):
+ os.rmdir(self._resDir)
+
+ def open(self, filename: str) -> None:
+ self._file = open(filename, "w", encoding=self._encoding)
+ # dictgen's ParseDictFile does not seem to support glossary info / metadata
+ self._resDir = filename + "_res"
+ if not isdir(self._resDir):
+ os.mkdir(self._resDir)
+
+ def write(
+ self,
+ ) -> Generator[None, EntryType, None]:
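+ # pyglossary writers are coroutines: the glossary sends entries in
+ # via send(), and a None entry signals that the input is exhausted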
+ fileObj = self._file
+ resDir = self._resDir
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ entry.save(resDir)
+ continue
+ words = entry.l_word
+ defi = entry.defi
+
+ entry.detectDefiFormat()
+ if entry.defiFormat == "h":
+ defi = f"{entry.defi}"
+
+ fileObj.write(f"@ {fixWord(words[0])}\n")
+ for alt in words[1:]:
+ fileObj.write(f"& {fixWord(alt)}\n")
+ fileObj.write(f"{escapeDefi(defi)}\n\n")
diff --git a/pyglossary/plugins/ebook_mobi/__init__.py b/pyglossary/plugins/ebook_mobi/__init__.py
index 9ac4e18ec..00da1e1ad 100644
--- a/pyglossary/plugins/ebook_mobi/__init__.py
+++ b/pyglossary/plugins/ebook_mobi/__init__.py
@@ -1,33 +1,8 @@
# -*- coding: utf-8 -*-
-# The MIT License (MIT)
-# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
-# Copyright © 2016-2022 Saeed Rasooli
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-from __future__ import annotations
-import os
-from datetime import datetime
-from os.path import join, split
-from typing import TYPE_CHECKING
+from __future__ import annotations
-from pyglossary.core import log
-from pyglossary.ebook_base import EbookWriter
from pyglossary.flags import DEFAULT_YES
-from pyglossary.langs import Lang
from pyglossary.option import (
BoolOption,
FileSizeOption,
@@ -36,10 +11,7 @@
StrOption,
)
-if TYPE_CHECKING:
- from collections.abc import Generator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .writer import Writer
__all__ = [
"Writer",
@@ -121,277 +93,3 @@
" for creating Mobipocket e-books.",
),
]
-
-
-class GroupStateBySize:
- def __init__(self, writer: Writer) -> None:
- self.writer = writer
- self.group_index = -1
- self.reset()
-
- def reset(self) -> None:
- self.group_contents: list[str] = []
- self.group_size = 0
-
- def add(self, entry: EntryType) -> None:
- defi = entry.defi
- content = self.writer.format_group_content(
- entry.l_word[0],
- defi,
- variants=entry.l_word[1:],
- )
- self.group_contents.append(content)
- self.group_size += len(content.encode("utf-8"))
-
-
-class Writer(EbookWriter):
- _compress: bool = False
- _keep: bool = False
- _kindlegen_path: str = ""
- _file_size_approx: int = 271360
- _hide_word_index: bool = False
- _spellcheck: bool = True
- _exact: bool = False
- CSS_CONTENTS = b"""@charset "UTF-8";"""
- GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
-<html xmlns="http://www.w3.org/1999/xhtml"
- xmlns:idx="www.mobipocket.com" xmlns:mbp="www.mobipocket.com">
-<head>
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
-<link rel="stylesheet" type="text/css" href="style.css"/>
-</head>
-<body>
-<mbp:frameset>
-{group_contents}
-</mbp:frameset>
-</body>
-</html>"""
-
- GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """<idx:entry scriptable="yes"{spellcheck_str}>
-<idx:orth{value_headword}>{headword_visible}{infl}
-</idx:orth>
-<p>
-{definition}
-</p>
-</idx:entry>
-"""
-
- GROUP_XHTML_WORD_INFL_TEMPLATE = """<idx:infl>
-{iforms_str}
-</idx:infl>"""
-
- GROUP_XHTML_WORD_IFORM_TEMPLATE = """<idx:iform{exact_str} value="{inflword}"/>"""
-
- OPF_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
-<package unique-identifier="uid" xmlns:dc="http://purl.org/dc/elements/1.1/">
-<metadata>
-<dc-metadata>
-<dc:Title>{title}</dc:Title>
-<dc:Language>{sourceLang}</dc:Language>
-<dc:Identifier id="uid">{identifier}</dc:Identifier>
-<dc:Creator>{creator}</dc:Creator>
-<dc:Rights>{copyright}</dc:Rights>
-<dc:Description>{description}</dc:Description>
-<dc:Subject>Dictionaries</dc:Subject>
-<dc:Date>{creationDate}</dc:Date>
-</dc-metadata>
-<x-metadata>
-<output encoding="utf-8"/>
-<DictionaryInLanguage>{sourceLang}</DictionaryInLanguage>
-<DictionaryOutLanguage>{targetLang}</DictionaryOutLanguage>
-{cover}
-</x-metadata>
-</metadata>
-<manifest>
-{manifest}
-</manifest>
-<spine>
-{spine}
-</spine>
-<guide/>
-</package>
-"""
-
- def __init__(self, glos: GlossaryType) -> None:
- import uuid
-
- EbookWriter.__init__(
- self,
- glos,
- )
- glos.setInfo("uuid", str(uuid.uuid4()).replace("-", ""))
- # FIXME: check if full html pages/documents as entry do work
- # glos.stripFullHtml(errorHandler=None)
-
- def get_prefix(self, word: str) -> str:
- if not word:
- return ""
- length = self._group_by_prefix_length
- prefix = word[:length].lower()
- if prefix[0] < "a":
- return "SPECIAL"
- return prefix
-
- def format_group_content(
- self,
- word: str,
- defi: str,
- variants: list[str] | None = None,
- ) -> str:
- hide_word_index = self._hide_word_index
- infl = ""
- if variants:
- iforms_list = [
- self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format(
- inflword=variant,
- exact_str=' exact="yes"' if self._exact else "",
- )
- for variant in variants
- ]
- infl = "\n" + self.GROUP_XHTML_WORD_INFL_TEMPLATE.format(
- iforms_str="\n".join(iforms_list),
- )
-
- headword = self.escape_if_needed(word)
-
- defi = self.escape_if_needed(defi)
-
- if hide_word_index:
- headword_visible = ""
- value_headword = f' value="{headword}"'
- else:
- headword_visible = "\n" + self._glos.wordTitleStr(headword)
- value_headword = ""
-
- return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format(
- spellcheck_str=' spell="yes"' if self._spellcheck else "",
- headword_visible=headword_visible,
- value_headword=value_headword,
- definition=defi,
- infl=infl,
- )
-
- @staticmethod
- def getLangCode(lang: Lang | None) -> str:
- return lang.code if isinstance(lang, Lang) else ""
-
- def get_opf_contents(
- self,
- manifest_contents: str,
- spine_contents: str,
- ) -> bytes:
- cover = ""
- if self.cover:
- cover = self.COVER_TEMPLATE.format(cover=self.cover)
- creationDate = datetime.now().strftime("%Y-%m-%d")
-
- return self.OPF_TEMPLATE.format(
- identifier=self._glos.getInfo("uuid"),
- # use Language code instead name for kindlegen
- sourceLang=self.getLangCode(self._glos.sourceLang),
- targetLang=self.getLangCode(self._glos.targetLang),
- title=self._glos.getInfo("name"),
- creator=self._glos.author,
- copyright=self._glos.getInfo("copyright"),
- description=self._glos.getInfo("description"),
- creationDate=creationDate,
- cover=cover,
- manifest=manifest_contents,
- spine=spine_contents,
- ).encode("utf-8")
-
- def write_groups(self) -> Generator[None, EntryType, None]:
- def add_group(state: GroupStateBySize) -> None:
- if state.group_size <= 0:
- return
- state.group_index += 1
- index = state.group_index + self.GROUP_START_INDEX
- group_xhtml_path = self.get_group_xhtml_file_name_from_index(index)
- self.add_file_manifest(
- "OEBPS/" + group_xhtml_path,
- group_xhtml_path,
- self.GROUP_XHTML_TEMPLATE.format(
- group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join(
- state.group_contents,
- ),
- ).encode("utf-8"),
- "application/xhtml+xml",
- )
-
- state = GroupStateBySize(self)
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- continue
-
- if state.group_size >= self._file_size_approx:
- add_group(state)
- state.reset()
-
- state.add(entry)
-
- add_group(state)
-
- def write(self) -> Generator[None, EntryType, None]:
- import shutil
- import subprocess
-
- filename = self._filename
- kindlegen_path = self._kindlegen_path
-
- yield from EbookWriter.write(self)
-
- # download kindlegen from this page:
- # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211
-
- # run kindlegen
- if not kindlegen_path:
- kindlegen_path = shutil.which("kindlegen") or ""
- if not kindlegen_path:
- log.warning(
- f"Not running kindlegen, the raw files are located in {filename}",
- )
- log.warning(
- "Provide KindleGen path with: --write-options 'kindlegen_path=...'",
- )
- return
-
- # name = self._glos.getInfo("name")
- log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}")
- direc, filename = split(filename)
- cmd = [
- kindlegen_path,
- join(filename, "OEBPS", "content.opf"),
- "-gen_ff_mobi7",
- "-o",
- "content.mobi",
- ]
- proc = subprocess.Popen(
- cmd,
- cwd=direc,
- stdout=subprocess.PIPE,
- stdin=subprocess.PIPE,
- stderr=subprocess.PIPE,
- )
- output = proc.communicate()
- log.info(output[0].decode("utf-8"))
- mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi")
- log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}")
diff --git a/pyglossary/plugins/ebook_mobi/writer.py b/pyglossary/plugins/ebook_mobi/writer.py
new file mode 100644
index 000000000..36484ff8e
--- /dev/null
+++ b/pyglossary/plugins/ebook_mobi/writer.py
@@ -0,0 +1,308 @@
+# -*- coding: utf-8 -*-
+# The MIT License (MIT)
+# Copyright © 2012-2016 Alberto Pettarin (alberto@albertopettarin.it)
+# Copyright © 2016-2022 Saeed Rasooli
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+from __future__ import annotations
+
+import os
+from datetime import datetime
+from os.path import join, split
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+from pyglossary.ebook_base import EbookWriter
+from pyglossary.langs import Lang
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
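+# Buffers the formatted contents of one group XHTML file; write_groups()
+# flushes it whenever group_size reaches the file_size_approx option.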
+class GroupStateBySize:
+ def __init__(self, writer: Writer) -> None:
+ self.writer = writer
+ self.group_index = -1
+ self.reset()
+
+ def reset(self) -> None:
+ self.group_contents: list[str] = []
+ self.group_size = 0
+
+ def add(self, entry: EntryType) -> None:
+ defi = entry.defi
+ content = self.writer.format_group_content(
+ entry.l_word[0],
+ defi,
+ variants=entry.l_word[1:],
+ )
+ self.group_contents.append(content)
+ self.group_size += len(content.encode("utf-8"))
+
+
+class Writer(EbookWriter):
+ _compress: bool = False
+ _keep: bool = False
+ _kindlegen_path: str = ""
+ _file_size_approx: int = 271360
+ _hide_word_index: bool = False
+ _spellcheck: bool = True
+ _exact: bool = False
+ CSS_CONTENTS = b"""@charset "UTF-8";"""
+ GROUP_XHTML_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
+<html xmlns="http://www.w3.org/1999/xhtml"
+ xmlns:idx="www.mobipocket.com" xmlns:mbp="www.mobipocket.com">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+<link rel="stylesheet" type="text/css" href="style.css"/>
+</head>
+<body>
+<mbp:frameset>
+{group_contents}
+</mbp:frameset>
+</body>
+</html>"""
+
+ GROUP_XHTML_WORD_DEFINITION_TEMPLATE = """<idx:entry scriptable="yes"{spellcheck_str}>
+<idx:orth{value_headword}>{headword_visible}{infl}
+</idx:orth>
+<p>
+{definition}
+</p>
+</idx:entry>
+"""
+
+ GROUP_XHTML_WORD_INFL_TEMPLATE = """<idx:infl>
+{iforms_str}
+</idx:infl>"""
+
+ GROUP_XHTML_WORD_IFORM_TEMPLATE = """<idx:iform{exact_str} value="{inflword}"/>"""
+
+ OPF_TEMPLATE = """<?xml version="1.0" encoding="utf-8"?>
+<package unique-identifier="uid" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<metadata>
+<dc-metadata>
+<dc:Title>{title}</dc:Title>
+<dc:Language>{sourceLang}</dc:Language>
+<dc:Identifier id="uid">{identifier}</dc:Identifier>
+<dc:Creator>{creator}</dc:Creator>
+<dc:Rights>{copyright}</dc:Rights>
+<dc:Description>{description}</dc:Description>
+<dc:Subject>Dictionaries</dc:Subject>
+<dc:Date>{creationDate}</dc:Date>
+</dc-metadata>
+<x-metadata>
+<output encoding="utf-8"/>
+<DictionaryInLanguage>{sourceLang}</DictionaryInLanguage>
+<DictionaryOutLanguage>{targetLang}</DictionaryOutLanguage>
+{cover}
+</x-metadata>
+</metadata>
+<manifest>
+{manifest}
+</manifest>
+<spine>
+{spine}
+</spine>
+<guide/>
+</package>
+"""
+
+ def __init__(self, glos: GlossaryType) -> None:
+ import uuid
+
+ EbookWriter.__init__(
+ self,
+ glos,
+ )
+ glos.setInfo("uuid", str(uuid.uuid4()).replace("-", ""))
+ # FIXME: check if full html pages/documents as entry do work
+ # glos.stripFullHtml(errorHandler=None)
+
+ def get_prefix(self, word: str) -> str:
+ if not word:
+ return ""
+ length = self._group_by_prefix_length
+ prefix = word[:length].lower()
+ if prefix[0] < "a":
+ return "SPECIAL"
+ return prefix
+
+ def format_group_content(
+ self,
+ word: str,
+ defi: str,
+ variants: list[str] | None = None,
+ ) -> str:
+ hide_word_index = self._hide_word_index
+ infl = ""
+ if variants:
+ iforms_list = [
+ self.GROUP_XHTML_WORD_IFORM_TEMPLATE.format(
+ inflword=variant,
+ exact_str=' exact="yes"' if self._exact else "",
+ )
+ for variant in variants
+ ]
+ infl = "\n" + self.GROUP_XHTML_WORD_INFL_TEMPLATE.format(
+ iforms_str="\n".join(iforms_list),
+ )
+
+ headword = self.escape_if_needed(word)
+
+ defi = self.escape_if_needed(defi)
+
+ if hide_word_index:
+ headword_visible = ""
+ value_headword = f' value="{headword}"'
+ else:
+ headword_visible = "\n" + self._glos.wordTitleStr(headword)
+ value_headword = ""
+
+ return self.GROUP_XHTML_WORD_DEFINITION_TEMPLATE.format(
+ spellcheck_str=' spell="yes"' if self._spellcheck else "",
+ headword_visible=headword_visible,
+ value_headword=value_headword,
+ definition=defi,
+ infl=infl,
+ )
+
+ @staticmethod
+ def getLangCode(lang: Lang | None) -> str:
+ return lang.code if isinstance(lang, Lang) else ""
+
+ def get_opf_contents(
+ self,
+ manifest_contents: str,
+ spine_contents: str,
+ ) -> bytes:
+ cover = ""
+ if self.cover:
+ cover = self.COVER_TEMPLATE.format(cover=self.cover)
+ creationDate = datetime.now().strftime("%Y-%m-%d")
+
+ return self.OPF_TEMPLATE.format(
+ identifier=self._glos.getInfo("uuid"),
+ # use language code instead of name for kindlegen
+ sourceLang=self.getLangCode(self._glos.sourceLang),
+ targetLang=self.getLangCode(self._glos.targetLang),
+ title=self._glos.getInfo("name"),
+ creator=self._glos.author,
+ copyright=self._glos.getInfo("copyright"),
+ description=self._glos.getInfo("description"),
+ creationDate=creationDate,
+ cover=cover,
+ manifest=manifest_contents,
+ spine=spine_contents,
+ ).encode("utf-8")
+
+ def write_groups(self) -> Generator[None, EntryType, None]:
+ def add_group(state: GroupStateBySize) -> None:
+ if state.group_size <= 0:
+ return
+ state.group_index += 1
+ index = state.group_index + self.GROUP_START_INDEX
+ group_xhtml_path = self.get_group_xhtml_file_name_from_index(index)
+ self.add_file_manifest(
+ "OEBPS/" + group_xhtml_path,
+ group_xhtml_path,
+ self.GROUP_XHTML_TEMPLATE.format(
+ group_contents=self.GROUP_XHTML_WORD_DEFINITION_JOINER.join(
+ state.group_contents,
+ ),
+ ).encode("utf-8"),
+ "application/xhtml+xml",
+ )
+
+ state = GroupStateBySize(self)
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ continue
+
+ if state.group_size >= self._file_size_approx:
+ add_group(state)
+ state.reset()
+
+ state.add(entry)
+
+ add_group(state)
+
+ def write(self) -> Generator[None, EntryType, None]:
+ import shutil
+ import subprocess
+
+ filename = self._filename
+ kindlegen_path = self._kindlegen_path
+
+ yield from EbookWriter.write(self)
+
+ # download kindlegen from this page:
+ # https://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000765211
+
+ # run kindlegen
+ if not kindlegen_path:
+ kindlegen_path = shutil.which("kindlegen") or ""
+ if not kindlegen_path:
+ log.warning(
+ f"Not running kindlegen, the raw files are located in {filename}",
+ )
+ log.warning(
+ "Provide KindleGen path with: --write-options 'kindlegen_path=...'",
+ )
+ return
+
+ # name = self._glos.getInfo("name")
+ log.info(f"Creating .mobi file with kindlegen, using {kindlegen_path!r}")
+ direc, filename = split(filename)
+ cmd = [
+ kindlegen_path,
+ join(filename, "OEBPS", "content.opf"),
+ "-gen_ff_mobi7",
+ "-o",
+ "content.mobi",
+ ]
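+ # kindlegen is run from the parent directory and writes content.mobi
+ # next to content.opf inside the OEBPS directory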
+ proc = subprocess.Popen(
+ cmd,
+ cwd=direc,
+ stdout=subprocess.PIPE,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ output = proc.communicate()
+ log.info(output[0].decode("utf-8"))
+ mobi_path_abs = os.path.join(filename, "OEBPS", "content.mobi")
+ log.info(f"Created .mobi file with kindlegen: {mobi_path_abs}")
diff --git a/pyglossary/plugins/edict2/__init__.py b/pyglossary/plugins/edict2/__init__.py
index f0cb45408..50b9a2466 100644
--- a/pyglossary/plugins/edict2/__init__.py
+++ b/pyglossary/plugins/edict2/__init__.py
@@ -1,23 +1,12 @@
from __future__ import annotations
-from typing import TYPE_CHECKING
-
-from pyglossary.core import log
-from pyglossary.io_utils import nullTextIO
from pyglossary.option import (
BoolOption,
EncodingOption,
Option,
)
-from . import conv
-
-if TYPE_CHECKING:
- import io
- from collections.abc import Iterator
-
- from pyglossary.glossary_types import EntryType, GlossaryType
-
+from .reader import Reader
__all__ = [
"Reader",
@@ -71,78 +60,3 @@
comment="Set to false to disable tones coloring",
),
}
-
-
-class Reader:
- depends = {
- "lxml": "lxml",
- }
-
- _encoding: str = "utf-8"
- _traditional_title: bool = False
- _colorize_tones: bool = True
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self.file: io.TextIOBase = nullTextIO
- self._fileSize = 0
-
- def open(self, filename: str) -> None:
- # self._glos.sourceLangName = "Chinese"
- # self._glos.targetLangName = "English"
-
- cfile = self.file = open(filename, encoding=self._encoding)
-
- if cfile.seekable():
- cfile.seek(0, 2)
- self._fileSize = cfile.tell()
- cfile.seek(0)
- # self._glos.setInfo("input_file_size", f"{self._fileSize}")
- else:
- log.warning("EDICT2 Reader: file is not seekable")
-
- def close(self) -> None:
- self.file.close()
- self.file = nullTextIO
-
- def __len__(self) -> int:
- return 0
-
- def __iter__(self) -> Iterator[EntryType]:
- file = self.file
- fileSize = self._fileSize
- glos = self._glos
-
- render_syllables = (
- conv.render_syllables_color
- if self._colorize_tones
- else conv.render_syllables_no_color
- )
- parse_line = (
- conv.parse_line_trad if self._traditional_title else conv.parse_line_simp
- )
-
- while True:
- line = file.readline()
- if not line:
- break
- line = line.rstrip("\n")
- if not line:
- continue
- if line.startswith("#"):
- continue
- parts = parse_line(line)
- if parts is None:
- log.warning(f"bad line: {line!r}")
- continue
- names, article_text = conv.render_article(
- render_syllables,
- conv.Article(*parts),
- )
- entry = glos.newEntry(
- names,
- article_text,
- defiFormat="h",
- byteProgress=(file.tell(), fileSize) if fileSize else None,
- )
- yield entry
diff --git a/pyglossary/plugins/edict2/reader.py b/pyglossary/plugins/edict2/reader.py
new file mode 100644
index 000000000..378cc0251
--- /dev/null
+++ b/pyglossary/plugins/edict2/reader.py
@@ -0,0 +1,89 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+from pyglossary.io_utils import nullTextIO
+
+from . import conv
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ depends = {
+ "lxml": "lxml",
+ }
+
+ _encoding: str = "utf-8"
+ _traditional_title: bool = False
+ _colorize_tones: bool = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self.file: io.TextIOBase = nullTextIO
+ self._fileSize = 0
+
+ def open(self, filename: str) -> None:
+ # self._glos.sourceLangName = "Chinese"
+ # self._glos.targetLangName = "English"
+
+ cfile = self.file = open(filename, encoding=self._encoding)
+
+ if cfile.seekable():
+ cfile.seek(0, 2)
+ self._fileSize = cfile.tell()
+ cfile.seek(0)
+ # self._glos.setInfo("input_file_size", f"{self._fileSize}")
+ else:
+ log.warning("EDICT2 Reader: file is not seekable")
+
+ def close(self) -> None:
+ self.file.close()
+ self.file = nullTextIO
+
+ def __len__(self) -> int:
+ return 0
+
+ def __iter__(self) -> Iterator[EntryType]:
+ file = self.file
+ fileSize = self._fileSize
+ glos = self._glos
+
+ render_syllables = (
+ conv.render_syllables_color
+ if self._colorize_tones
+ else conv.render_syllables_no_color
+ )
+ parse_line = (
+ conv.parse_line_trad if self._traditional_title else conv.parse_line_simp
+ )
+
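+ # each data line follows the EDICT2 / CC-CEDICT convention, e.g.
+ #   傳統 传统 [chuan2 tong3] /tradition/traditional/
+ # parse_line_* picks the headword and conv renders the HTML article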
+ while True:
+ line = file.readline()
+ if not line:
+ break
+ line = line.rstrip("\n")
+ if not line:
+ continue
+ if line.startswith("#"):
+ continue
+ parts = parse_line(line)
+ if parts is None:
+ log.warning(f"bad line: {line!r}")
+ continue
+ names, article_text = conv.render_article(
+ render_syllables,
+ conv.Article(*parts),
+ )
+ entry = glos.newEntry(
+ names,
+ article_text,
+ defiFormat="h",
+ byteProgress=(file.tell(), fileSize) if fileSize else None,
+ )
+ yield entry
diff --git a/pyglossary/plugins/edlin/__init__.py b/pyglossary/plugins/edlin/__init__.py
index fc5e428f8..6f6664762 100644
--- a/pyglossary/plugins/edlin/__init__.py
+++ b/pyglossary/plugins/edlin/__init__.py
@@ -1,45 +1,15 @@
# -*- coding: utf-8 -*-
-# edlin.py
-#
-# Copyright © 2016-2019 Saeed Rasooli (ilius)
-# This file is part of PyGlossary project, https://github.com/ilius/pyglossary
-#
-# This program is a free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program. Or on Debian systems, from /usr/share/common-licenses/GPL
-# If not, see .
from __future__ import annotations
-import os
-from os.path import dirname, isdir, isfile, join
-from typing import TYPE_CHECKING
-
-from pyglossary.core import log
from pyglossary.option import (
BoolOption,
EncodingOption,
Option,
)
-from pyglossary.text_utils import (
- escapeNTB,
- splitByBarUnescapeNTB,
- unescapeNTB,
-)
-
-if TYPE_CHECKING:
- from collections.abc import Generator, Iterator
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .reader import Reader
+from .writer import Writer
__all__ = [
"Reader",
@@ -72,241 +42,3 @@
"encoding": EncodingOption(),
"prev_link": BoolOption(comment="Enable link to previous entry"),
}
-
-
-def makeDir(direc: str) -> None:
- if not isdir(direc):
- os.makedirs(direc)
-
-
-class Reader:
- _encoding: str = "utf-8"
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def close(self) -> None:
- self._clear()
-
- def _clear(self) -> None:
- self._filename = ""
- self._prev_link = True
- self._wordCount = None
- self._rootPath = None
- self._resDir = ""
- self._resFileNames: list[str] = []
-
- def open(self, filename: str) -> None:
- from pyglossary.json_utils import jsonToData
-
- if isdir(filename):
- infoFname = join(filename, "info.json")
- elif isfile(filename):
- infoFname = filename
- filename = dirname(filename)
- else:
- raise ValueError(
- f"error while opening {filename!r}: no such file or directory",
- )
- self._filename = filename
-
- with open(infoFname, encoding=self._encoding) as infoFp:
- info = jsonToData(infoFp.read())
- self._wordCount = info.pop("wordCount")
- self._prev_link = info.pop("prev_link")
- self._rootPath = info.pop("root")
- for key, value in info.items():
- self._glos.setInfo(key, value)
-
- self._resDir = join(filename, "res")
- if isdir(self._resDir):
- self._resFileNames = os.listdir(self._resDir)
- else:
- self._resDir = ""
- self._resFileNames = []
-
- def __len__(self) -> int:
- if self._wordCount is None:
- log.error("called len() on a reader which is not open")
- return 0
- return self._wordCount + len(self._resFileNames)
-
- def __iter__(self) -> Iterator[EntryType]:
- if not self._rootPath:
- raise RuntimeError("iterating over a reader while it's not open")
-
- wordCount = 0
- nextPath = self._rootPath
- while nextPath != "END":
- wordCount += 1
- # before or after reading word and defi
- # (and skipping empty entry)? FIXME
-
- with open(
- join(self._filename, nextPath),
- encoding=self._encoding,
- ) as _file:
- header = _file.readline().rstrip()
- if self._prev_link:
- _prevPath, nextPath = header.split(" ")
- else:
- nextPath = header
- word = _file.readline()
- if not word:
- yield None # update progressbar
- continue
- defi = _file.read()
- if not defi:
- log.warning(
- f"Edlin Reader: no definition for word {word!r}, skipping",
- )
- yield None # update progressbar
- continue
- word = word.rstrip()
- defi = defi.rstrip()
-
- if self._glos.alts:
- word = splitByBarUnescapeNTB(word)
- if len(word) == 1:
- word = word[0]
- else:
- word = unescapeNTB(word, bar=False)
-
- # defi = unescapeNTB(defi)
- yield self._glos.newEntry(word, defi)
-
- if wordCount != self._wordCount:
- log.warning(
- f"{wordCount} words found, "
- f"wordCount in info.json was {self._wordCount}",
- )
- self._wordCount = wordCount
-
- resDir = self._resDir
- for fname in self._resFileNames:
- with open(join(resDir, fname), "rb") as _file:
- yield self._glos.newDataEntry(
- fname,
- _file.read(),
- )
-
-
-class Writer:
- _encoding: str = "utf-8"
- _prev_link: bool = True
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._clear()
-
- def finish(self) -> None:
- self._clear()
-
- def open(self, filename: str) -> None:
- self._filename = filename
- self._resDir = join(filename, "res")
- os.makedirs(filename)
- os.mkdir(self._resDir)
-
- def _clear(self) -> None:
- self._filename = ""
- self._resDir = ""
- self._encoding = "utf-8"
- self._hashSet: set[str] = set()
- # self._wordCount = None
-
- @staticmethod
- def hashToPath(h: str) -> str:
- return h[:2] + "/" + h[2:]
-
- def getEntryHash(self, entry: EntryType) -> str:
- """
- Return hash string for given entry
- don't call it twice for one entry, if you do you will get a
- different hash string.
- """
- from hashlib import sha1
-
- hash_ = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] # noqa: S324
- if hash_ not in self._hashSet:
- self._hashSet.add(hash_)
- return hash_
- index = 0
- while True:
- tmp_hash = hash_ + f"{index:x}"
- if tmp_hash not in self._hashSet:
- self._hashSet.add(tmp_hash)
- return tmp_hash
- index += 1
-
- def saveEntry(
- self,
- thisEntry: EntryType,
- thisHash: str,
- prevHash: str | None,
- nextHash: str | None,
- ) -> None:
- dpath = join(self._filename, thisHash[:2])
- makeDir(dpath)
- with open(
- join(dpath, thisHash[2:]),
- "w",
- encoding=self._encoding,
- ) as toFile:
- nextPath = self.hashToPath(nextHash) if nextHash else "END"
- if self._prev_link:
- prevPath = self.hashToPath(prevHash) if prevHash else "START"
- header = prevPath + " " + nextPath
- else:
- header = nextPath
- toFile.write(
- "\n".join(
- [
- header,
- escapeNTB(thisEntry.s_word, bar=False),
- thisEntry.defi,
- ],
- ),
- )
-
- def write(self) -> Generator[None, EntryType, None]:
- from pyglossary.json_utils import dataToPrettyJson
-
- thisEntry = yield
- if thisEntry is None:
- raise ValueError("glossary is empty")
-
- count = 1
- rootHash = thisHash = self.getEntryHash(thisEntry)
- prevHash = None
-
- while True:
- nextEntry = yield
- if nextEntry is None:
- break
- if nextEntry.isData():
- nextEntry.save(self._resDir)
- continue
- nextHash = self.getEntryHash(nextEntry)
- self.saveEntry(thisEntry, thisHash, prevHash, nextHash)
- thisEntry = nextEntry
- prevHash, thisHash = thisHash, nextHash
- count += 1
- self.saveEntry(thisEntry, thisHash, prevHash, None)
-
- with open(
- join(self._filename, "info.json"),
- "w",
- encoding=self._encoding,
- ) as toFile:
- info = {}
- info["name"] = self._glos.getInfo("name")
- info["root"] = self.hashToPath(rootHash)
- info["prev_link"] = self._prev_link
- info["wordCount"] = count
- # info["modified"] =
-
- info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"])
-
- toFile.write(dataToPrettyJson(info))
diff --git a/pyglossary/plugins/edlin/reader.py b/pyglossary/plugins/edlin/reader.py
new file mode 100644
index 000000000..8fcdf4007
--- /dev/null
+++ b/pyglossary/plugins/edlin/reader.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+import os
+from os.path import dirname, isdir, isfile, join
+from typing import TYPE_CHECKING
+
+from pyglossary.core import log
+from pyglossary.text_utils import (
+ splitByBarUnescapeNTB,
+ unescapeNTB,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ _encoding: str = "utf-8"
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def close(self) -> None:
+ self._clear()
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._prev_link = True
+ self._wordCount = None
+ self._rootPath = None
+ self._resDir = ""
+ self._resFileNames: list[str] = []
+
+ def open(self, filename: str) -> None:
+ from pyglossary.json_utils import jsonToData
+
+ if isdir(filename):
+ infoFname = join(filename, "info.json")
+ elif isfile(filename):
+ infoFname = filename
+ filename = dirname(filename)
+ else:
+ raise ValueError(
+ f"error while opening {filename!r}: no such file or directory",
+ )
+ self._filename = filename
+
+ with open(infoFname, encoding=self._encoding) as infoFp:
+ info = jsonToData(infoFp.read())
+ self._wordCount = info.pop("wordCount")
+ self._prev_link = info.pop("prev_link")
+ self._rootPath = info.pop("root")
+ for key, value in info.items():
+ self._glos.setInfo(key, value)
+
+ self._resDir = join(filename, "res")
+ if isdir(self._resDir):
+ self._resFileNames = os.listdir(self._resDir)
+ else:
+ self._resDir = ""
+ self._resFileNames = []
+
+ def __len__(self) -> int:
+ if self._wordCount is None:
+ log.error("called len() on a reader which is not open")
+ return 0
+ return self._wordCount + len(self._resFileNames)
+
+ def __iter__(self) -> Iterator[EntryType]:
+ if not self._rootPath:
+ raise RuntimeError("iterating over a reader while it's not open")
+
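+ # entries form an on-disk linked list: each file starts with a header
+ # line holding the path of the next entry, and "END" terminates it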
+ wordCount = 0
+ nextPath = self._rootPath
+ while nextPath != "END":
+ wordCount += 1
+ # before or after reading word and defi
+ # (and skipping empty entry)? FIXME
+
+ with open(
+ join(self._filename, nextPath),
+ encoding=self._encoding,
+ ) as _file:
+ header = _file.readline().rstrip()
+ if self._prev_link:
+ _prevPath, nextPath = header.split(" ")
+ else:
+ nextPath = header
+ word = _file.readline()
+ if not word:
+ yield None # update progressbar
+ continue
+ defi = _file.read()
+ if not defi:
+ log.warning(
+ f"Edlin Reader: no definition for word {word!r}, skipping",
+ )
+ yield None # update progressbar
+ continue
+ word = word.rstrip()
+ defi = defi.rstrip()
+
+ if self._glos.alts:
+ word = splitByBarUnescapeNTB(word)
+ if len(word) == 1:
+ word = word[0]
+ else:
+ word = unescapeNTB(word, bar=False)
+
+ # defi = unescapeNTB(defi)
+ yield self._glos.newEntry(word, defi)
+
+ if wordCount != self._wordCount:
+ log.warning(
+ f"{wordCount} words found, "
+ f"wordCount in info.json was {self._wordCount}",
+ )
+ self._wordCount = wordCount
+
+ resDir = self._resDir
+ for fname in self._resFileNames:
+ with open(join(resDir, fname), "rb") as _file:
+ yield self._glos.newDataEntry(
+ fname,
+ _file.read(),
+ )
diff --git a/pyglossary/plugins/edlin/writer.py b/pyglossary/plugins/edlin/writer.py
new file mode 100644
index 000000000..10b77b85a
--- /dev/null
+++ b/pyglossary/plugins/edlin/writer.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+import os
+from os.path import isdir, join
+from typing import TYPE_CHECKING
+
+from pyglossary.text_utils import (
+ escapeNTB,
+)
+
+if TYPE_CHECKING:
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+def makeDir(direc: str) -> None:
+ if not isdir(direc):
+ os.makedirs(direc)
+
+
+class Writer:
+ _encoding: str = "utf-8"
+ _prev_link: bool = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._clear()
+
+ def finish(self) -> None:
+ self._clear()
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+ self._resDir = join(filename, "res")
+ os.makedirs(filename)
+ os.mkdir(self._resDir)
+
+ def _clear(self) -> None:
+ self._filename = ""
+ self._resDir = ""
+ self._encoding = "utf-8"
+ self._hashSet: set[str] = set()
+ # self._wordCount = None
+
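+ # entries are sharded by the first two hex digits of their hash,
+ # e.g. hashToPath("3f2a9c01") -> "3f/2a9c01"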
+ @staticmethod
+ def hashToPath(h: str) -> str:
+ return h[:2] + "/" + h[2:]
+
+ def getEntryHash(self, entry: EntryType) -> str:
+ """
+ Return a hash string for the given entry.
+ Don't call it twice for one entry; a second call would return
+ a different hash string.
+ """
+ from hashlib import sha1
+
+ hash_ = sha1(entry.s_word.encode("utf-8")).hexdigest()[:8] # noqa: S324
+ if hash_ not in self._hashSet:
+ self._hashSet.add(hash_)
+ return hash_
+ index = 0
+ while True:
+ tmp_hash = hash_ + f"{index:x}"
+ if tmp_hash not in self._hashSet:
+ self._hashSet.add(tmp_hash)
+ return tmp_hash
+ index += 1
+
+ def saveEntry(
+ self,
+ thisEntry: EntryType,
+ thisHash: str,
+ prevHash: str | None,
+ nextHash: str | None,
+ ) -> None:
+ dpath = join(self._filename, thisHash[:2])
+ makeDir(dpath)
+ with open(
+ join(dpath, thisHash[2:]),
+ "w",
+ encoding=self._encoding,
+ ) as toFile:
+ nextPath = self.hashToPath(nextHash) if nextHash else "END"
+ if self._prev_link:
+ prevPath = self.hashToPath(prevHash) if prevHash else "START"
+ header = prevPath + " " + nextPath
+ else:
+ header = nextPath
+ toFile.write(
+ "\n".join(
+ [
+ header,
+ escapeNTB(thisEntry.s_word, bar=False),
+ thisEntry.defi,
+ ],
+ ),
+ )
+
+ def write(self) -> Generator[None, EntryType, None]:
+ from pyglossary.json_utils import dataToPrettyJson
+
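+ # hold one entry back so that each saved file can record the hash
+ # (and thus the path) of the entry that follows it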
+ thisEntry = yield
+ if thisEntry is None:
+ raise ValueError("glossary is empty")
+
+ count = 1
+ rootHash = thisHash = self.getEntryHash(thisEntry)
+ prevHash = None
+
+ while True:
+ nextEntry = yield
+ if nextEntry is None:
+ break
+ if nextEntry.isData():
+ nextEntry.save(self._resDir)
+ continue
+ nextHash = self.getEntryHash(nextEntry)
+ self.saveEntry(thisEntry, thisHash, prevHash, nextHash)
+ thisEntry = nextEntry
+ prevHash, thisHash = thisHash, nextHash
+ count += 1
+ self.saveEntry(thisEntry, thisHash, prevHash, None)
+
+ with open(
+ join(self._filename, "info.json"),
+ "w",
+ encoding=self._encoding,
+ ) as toFile:
+ info = {}
+ info["name"] = self._glos.getInfo("name")
+ info["root"] = self.hashToPath(rootHash)
+ info["prev_link"] = self._prev_link
+ info["wordCount"] = count
+ # info["modified"] =
+
+ info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"])
+
+ toFile.write(dataToPrettyJson(info))
diff --git a/pyglossary/plugins/gettext_po/__init__.py b/pyglossary/plugins/gettext_po/__init__.py
index 978b7c455..cd6dd9887 100644
--- a/pyglossary/plugins/gettext_po/__init__.py
+++ b/pyglossary/plugins/gettext_po/__init__.py
@@ -2,23 +2,13 @@
from __future__ import annotations
-import os
-from os.path import isdir
-from typing import TYPE_CHECKING
-
-from pyglossary.core import exc_note, log, pip
-from pyglossary.io_utils import nullTextIO
from pyglossary.option import (
BoolOption,
Option,
)
-from pyglossary.text_utils import splitByBar
-
-if TYPE_CHECKING:
- import io
- from collections.abc import Generator, Iterator
- from pyglossary.glossary_types import EntryType, GlossaryType
+from .reader import Reader
+from .writer import Writer
__all__ = [
"Reader",
@@ -52,166 +42,3 @@
optionsProp: dict[str, Option] = {
"resources": BoolOption(comment="Enable resources / data files"),
}
-
-
-class Reader:
- depends = {
- "polib": "polib",
- }
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._alts = glos.alts
- self.clear()
-
- def clear(self) -> None:
- self._filename = ""
- self._file: io.TextIOBase = nullTextIO
- self._wordCount: int | None = None
- self._resDir = ""
- self._resFileNames: list[str] = []
-
- def open(self, filename: str) -> None:
- self._filename = filename
- self._file = open(filename, encoding="utf-8")
- self._resDir = filename + "_res"
- if isdir(self._resDir):
- self._resFileNames = os.listdir(self._resDir)
- else:
- self._resDir = ""
- self._resFileNames = []
-
- def close(self) -> None:
- self._file.close()
- self._file = nullTextIO
- self.clear()
-
- def __len__(self) -> int:
- from pyglossary.file_utils import fileCountLines
-
- if self._wordCount is None:
- log.debug("Try not to use len(reader) as it takes extra time")
- self._wordCount = fileCountLines(
- self._filename,
- newline=b"\nmsgid",
- )
- return self._wordCount
-
- def makeEntry(self, word: str, defi: str) -> EntryType:
- if self._alts:
- return self._glos.newEntry(splitByBar(word), defi)
- return self._glos.newEntry(word, defi)
-
- def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912
- try:
- from polib import unescape as po_unescape
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install polib` to install")
- raise
-
- file = self._file
-
- word = ""
- defi = ""
- msgstr = False
- wordCount = 0
- for line_ in file:
- line = line_.strip() # noqa: PLW2901
- if not line:
- continue
- if line.startswith("#"):
- continue
- if line.startswith("msgid "):
- if word:
- yield self.makeEntry(word, defi)
- wordCount += 1
- word = ""
- defi = ""
- else:
- pass
- # TODO: parse defi and set glos info?
- # but this should be done in self.open
- word = po_unescape(line[6:])
- if word.startswith('"'):
- if len(word) < 2 or word[-1] != '"':
- raise ValueError("invalid po line: line")
- word = word[1:-1]
- msgstr = False
- continue
- if line.startswith("msgstr "):
- if msgstr:
- log.error("msgid omitted!")
- defi = po_unescape(line[7:])
- if defi.startswith('"'):
- if len(defi) < 2 or defi[-1] != '"':
- raise ValueError("invalid po line: line")
- defi = defi[1:-1]
- msgstr = True
- continue
-
- line = po_unescape(line)
- if line.startswith('"'):
- if len(line) < 2 or line[-1] != '"':
- raise ValueError("invalid po line: line")
- line = line[1:-1]
-
- if msgstr:
- defi += line
- else:
- word += line
- if word:
- yield self.makeEntry(word, defi)
- wordCount += 1
- self._wordCount = wordCount
-
-
-class Writer:
- depends = {
- "polib": "polib",
- }
-
- _resources: bool = True
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
- self._file: io.TextIOBase = nullTextIO
- glos.preventDuplicateWords()
-
- def open(self, filename: str) -> None:
- try:
- from polib import escape as po_escape
- except ModuleNotFoundError as e:
- exc_note(e, f"Run `{pip} install polib` to install")
- raise
-
- self._filename = filename
- self._file = file = open(filename, mode="w", encoding="utf-8")
- file.write('#\nmsgid ""\nmsgstr ""\n')
- for key, value in self._glos.iterInfo():
- file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n')
-
- def finish(self) -> None:
- self._filename = ""
- self._file.close()
- self._file = nullTextIO
-
- def write(self) -> Generator[None, EntryType, None]:
- from polib import escape as po_escape
-
- file = self._file
-
- resources = self._resources
- filename = self._filename
- while True:
- entry = yield
- if entry is None:
- break
- if entry.isData():
- if resources:
- entry.save(filename + "_res")
- continue
- file.write(
- f'msgid "{po_escape(entry.s_word)}"\n'
- f'msgstr "{po_escape(entry.defi)}"\n\n',
- )
diff --git a/pyglossary/plugins/gettext_po/reader.py b/pyglossary/plugins/gettext_po/reader.py
new file mode 100644
index 000000000..126288488
--- /dev/null
+++ b/pyglossary/plugins/gettext_po/reader.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+import os
+from os.path import isdir
+from typing import TYPE_CHECKING
+
+from pyglossary.core import exc_note, log, pip
+from pyglossary.io_utils import nullTextIO
+from pyglossary.text_utils import splitByBar
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Iterator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Reader:
+ depends = {
+ "polib": "polib",
+ }
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._alts = glos.alts
+ self.clear()
+
+ def clear(self) -> None:
+ self._filename = ""
+ self._file: io.TextIOBase = nullTextIO
+ self._wordCount: int | None = None
+ self._resDir = ""
+ self._resFileNames: list[str] = []
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+ self._file = open(filename, encoding="utf-8")
+ self._resDir = filename + "_res"
+ if isdir(self._resDir):
+ self._resFileNames = os.listdir(self._resDir)
+ else:
+ self._resDir = ""
+ self._resFileNames = []
+
+ def close(self) -> None:
+ self._file.close()
+ self._file = nullTextIO
+ self.clear()
+
+ def __len__(self) -> int:
+ from pyglossary.file_utils import fileCountLines
+
+ if self._wordCount is None:
+ log.debug("Try not to use len(reader) as it takes extra time")
+ self._wordCount = fileCountLines(
+ self._filename,
+ newline=b"\nmsgid",
+ )
+ return self._wordCount
+
+ def makeEntry(self, word: str, defi: str) -> EntryType:
+ if self._alts:
+ return self._glos.newEntry(splitByBar(word), defi)
+ return self._glos.newEntry(word, defi)
+
+ def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912
+ try:
+ from polib import unescape as po_unescape
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install polib` to install")
+ raise
+
+ file = self._file
+
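+ # minimal PO parser: 'msgid "..."' opens a headword, 'msgstr "..."'
+ # opens its definition, and bare '"..."' lines continue whichever of
+ # the two was read last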
+ word = ""
+ defi = ""
+ msgstr = False
+ wordCount = 0
+ for line_ in file:
+ line = line_.strip() # noqa: PLW2901
+ if not line:
+ continue
+ if line.startswith("#"):
+ continue
+ if line.startswith("msgid "):
+ if word:
+ yield self.makeEntry(word, defi)
+ wordCount += 1
+ word = ""
+ defi = ""
+ else:
+ pass
+ # TODO: parse defi and set glos info?
+ # but this should be done in self.open
+ word = po_unescape(line[6:])
+ if word.startswith('"'):
+ if len(word) < 2 or word[-1] != '"':
+ raise ValueError("invalid po line: line")
+ word = word[1:-1]
+ msgstr = False
+ continue
+ if line.startswith("msgstr "):
+ if msgstr:
+ log.error("msgid omitted!")
+ defi = po_unescape(line[7:])
+ if defi.startswith('"'):
+ if len(defi) < 2 or defi[-1] != '"':
+ raise ValueError("invalid po line: line")
+ defi = defi[1:-1]
+ msgstr = True
+ continue
+
+ line = po_unescape(line)
+ if line.startswith('"'):
+ if len(line) < 2 or line[-1] != '"':
+ raise ValueError("invalid po line: line")
+ line = line[1:-1]
+
+ if msgstr:
+ defi += line
+ else:
+ word += line
+ if word:
+ yield self.makeEntry(word, defi)
+ wordCount += 1
+ self._wordCount = wordCount
diff --git a/pyglossary/plugins/gettext_po/writer.py b/pyglossary/plugins/gettext_po/writer.py
new file mode 100644
index 000000000..685a447ee
--- /dev/null
+++ b/pyglossary/plugins/gettext_po/writer.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pyglossary.core import exc_note, pip
+from pyglossary.io_utils import nullTextIO
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import EntryType, GlossaryType
+
+
+class Writer:
+ depends = {
+ "polib": "polib",
+ }
+
+ _resources: bool = True
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+ self._file: io.TextIOBase = nullTextIO
+ glos.preventDuplicateWords()
+
+ def open(self, filename: str) -> None:
+ try:
+ from polib import escape as po_escape
+ except ModuleNotFoundError as e:
+ exc_note(e, f"Run `{pip} install polib` to install")
+ raise
+
+ self._filename = filename
+ self._file = file = open(filename, mode="w", encoding="utf-8")
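+ # PO convention: glossary metadata lives in the msgstr of a leading
+ # empty msgid, one '"key: value\n"' continuation line per info item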
+ file.write('#\nmsgid ""\nmsgstr ""\n')
+ for key, value in self._glos.iterInfo():
+ file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n')
+
+ def finish(self) -> None:
+ self._filename = ""
+ self._file.close()
+ self._file = nullTextIO
+
+ def write(self) -> Generator[None, EntryType, None]:
+ from polib import escape as po_escape
+
+ file = self._file
+
+ resources = self._resources
+ filename = self._filename
+ while True:
+ entry = yield
+ if entry is None:
+ break
+ if entry.isData():
+ if resources:
+ entry.save(filename + "_res")
+ continue
+ file.write(
+ f'msgid "{po_escape(entry.s_word)}"\n'
+ f'msgstr "{po_escape(entry.defi)}"\n\n',
+ )
diff --git a/pyglossary/plugins/html_dir/__init__.py b/pyglossary/plugins/html_dir/__init__.py
index 8931a0697..d47850759 100644
--- a/pyglossary/plugins/html_dir/__init__.py
+++ b/pyglossary/plugins/html_dir/__init__.py
@@ -1,24 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
-import html
-import os
-import re
-import time
-from functools import lru_cache
-from os.path import isdir, isfile, join
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
- import io
- from collections.abc import Generator
-
- from pyglossary.glossary_types import (
- EntryType,
- GlossaryType,
- )
-
-from pyglossary.core import log
from pyglossary.option import (
BoolOption,
EncodingOption,
@@ -26,10 +8,8 @@
Option,
StrOption,
)
-from pyglossary.text_utils import (
- escapeNTB,
- unescapeNTB,
-)
+
+from .writer import Writer
__all__ = [
"Writer",
@@ -80,469 +60,3 @@
comment="Add headwords title to beginning of definition",
),
}
-
-nbsp = "\xa0"
-# nbsp = " "
-
-darkStyle = """
-body {{
- background-color: #373737;
- color: #eee;
-}}
-a {{ color: #aaaaff; }}
-a.broken {{ color: #e0c0c0; }}
-a.no_ul {{ text-decoration: none; }}
-b.headword {{ font-size: 1.5em; color: #c7ffb9; }}
-h1 {{ font-size: 1.5em; color: #c7ffb9;}}
-h2 {{ font-size: 1.3em;}}
-h3 {{ font-size: 1.0em;}}
-h4 {{ font-size: 1.0em;}}
-h5 {{ font-size: 1.0em;}}
-h6 {{ font-size: 1.0em;}}
-"""
-
-
-class Writer:
- _encoding: str = "utf-8"
- _resources: bool = True
- _max_file_size: int = 102400
- _filename_format: str = "{n:05d}.html"
- _escape_defi: bool = False
- _dark: bool = True
- _css: str = ""
- _word_title: bool = True
-
- @staticmethod
- def stripFullHtmlError(entry: EntryType, error: str) -> None:
- log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}")
-
- def __init__(self, glos: GlossaryType) -> None:
- self._glos = glos
- self._filename = ""
- self._fileObj: io.IOBase | None = None
- self._encoding = "utf-8"
- self._filename_format = "{n:05d}.html"
- self._tail = ""
- self._filenameList: list[str] = []
- glos.stripFullHtml(errorHandler=self.stripFullHtmlError)
-
- self._resSrcPattern = re.compile(' src="([^"]*)"')
-
- def open(self, filename: str) -> None:
- self._filename = filename
- self._resDir = resDir = join(filename, "res")
- if not isdir(filename):
- os.mkdir(filename)
- if not isdir(resDir):
- os.mkdir(resDir)
- if self._css:
- self.copyCSS(self._css)
-
- def copyCSS(self, cssPath: str) -> None:
- import shutil
-
- shutil.copy(cssPath, join(self._filename, "style.css"))
-
- def finish(self) -> None:
- pass
-
- def getNextFilename(self) -> str:
- return self._filename_format.format(
- n=len(self._filenameList),
- )
-
- def nextFile(self) -> io.TextIOBase:
- if self._fileObj:
- self._fileObj.write(self._tail)
- self._fileObj.close()
- filename = self.getNextFilename()
- self._filenameList.append(filename)
- self._fileObj = open(
- join(
- self._filename,
- filename,
- ),
- mode="w",
- encoding=self._encoding,
- )
- return self._fileObj
-
- def fixLinks(self, linkTargetSet: set[str]) -> None: # noqa: PLR0912
- import gc
-
- gc.collect()
- dirn = self._filename
-
- filenameList = self._filenameList
-
- fileByWord: dict[str, list[tuple[str, int]]] = {}
- for line in open(join(dirn, "index.txt"), encoding="utf-8"):
- line = line.rstrip("\n") # noqa: PLW2901
- if not line:
- continue
- entryIndexStr, wordEsc, filename, _ = line.split("\t")
- entryIndex = int(entryIndexStr)
- # entryId = f"entry{entryIndex}"
- word = unescapeNTB(wordEsc)
- if word not in linkTargetSet:
- continue
- if word in fileByWord:
- fileByWord[word].append((filename, entryIndex))
- else:
- fileByWord[word] = [(filename, entryIndex)]
-
- # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile:
- # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t")
-
- @lru_cache(maxsize=10)
- def getLinksByFile(fileIndex: int) -> io.TextIOBase:
- return open(
- join(dirn, f"links{fileIndex}"),
- mode="a",
- encoding="utf-8",
- )
-
- log.info("")
- for line in open(join(dirn, "links.txt"), encoding="utf-8"):
- line = line.rstrip("\n") # noqa: PLW2901
- if not line:
- continue
- target, fileIndexStr, x_start, x_size = line.split("\t")
- target = unescapeNTB(target)
- if target not in fileByWord:
- targetNew = ""
- else:
- targetFilename, targetEntryIndex = fileByWord[target][0]
- if targetFilename == filename:
- continue
- targetNew = f"{targetFilename}#entry{targetEntryIndex}"
- file = getLinksByFile(int(fileIndexStr))
- file.write(
- f"{x_start}\t{x_size}\t{targetNew}\n",
- )
- file.flush()
-
- linkTargetSet.clear()
- del fileByWord, linkTargetSet
- gc.collect()
-
- if os.sep == "\\":
- time.sleep(0.1)
-
- entry_url_fmt = self._glos.getInfo("entry_url")
-
- re_href = re.compile(
- b' href="[^<>"]*?"',
- re.IGNORECASE,
- )
-
- for fileIndex, filename in enumerate(filenameList):
- if not isfile(join(dirn, f"links{fileIndex}")):
- continue
- with open(join(dirn, filename), mode="rb") as inFile:
- with open(join(dirn, f"{filename}.new"), mode="wb") as outFile:
- for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"):
- outFile.flush()
- (
- b_x_start,
- b_x_size,
- b_target,
- ) = linkLine.rstrip(b"\n").split(b"\t")
- outFile.write(
- inFile.read(
- int(b_x_start, 16) - inFile.tell(),
- ),
- )
- curLink = inFile.read(int(b_x_size, 16))
-
- if b_target:
- outFile.write(
- re_href.sub(
- b' href="./' + b_target + b'"',
- curLink,
- ),
- )
- continue
-
- if not entry_url_fmt:
- outFile.write(
- curLink.replace(
- b' href="#',
- b' class="broken" href="#',
- ),
- )
- continue
-
- st = curLink.decode("utf-8")
- i = st.find('href="#')
- j = st.find('"', i + 7)
- word = st[i + 7 : j]
- url = entry_url_fmt.format(word=word)
- outFile.write(
- (
- st[:i] + f'class="broken" href="{url}"' + st[j + 1 :]
- ).encode("utf-8"),
- )
-
- outFile.write(inFile.read())
-
- os.remove(join(dirn, filename))
- os.rename(join(dirn, f"{filename}.new"), join(dirn, filename))
- os.remove(join(dirn, f"links{fileIndex}"))
-
- def writeInfo(self, filename: str, header: str) -> None:
- glos = self._glos
- title = glos.getInfo("name")
- customStyle = (
- "table, th, td {border: 1px solid black; "
- "border-collapse: collapse; padding: 5px;}"
- )
- infoHeader = header.format(
- pageTitle=f"Info: {title}",
- customStyle=customStyle,
- )
- with open(
- join(filename, "info.html"),
- mode="w",
- encoding=self._encoding,
- ) as _file:
- _file.write(
- infoHeader + ""
- ""
- 'Key | '
- 'Value | '
- "
\n",
- )
- for key, value in glos.iterInfo():
- _file.write(
- f"{key} | {value} |
\n",
- )
- _file.write("
")
-
- @staticmethod
- def _subResSrc(m: re.Match) -> str:
- url = m.group(1)
- if "://" in url:
- return m.group(0)
- url = "res/" + url
- return f' src="{url}"'
-
- def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912
- encoding = self._encoding
- resources = self._resources
- max_file_size = self._max_file_size
- filename_format = self._filename_format
- escape_defi = self._escape_defi
-
- wordSep = ' | '
-
- initFileSizeMax = 100
-
- glos = self._glos
-
- filename = self._filename
- self._encoding = encoding
- self._filename_format = filename_format
-
- entry_url_fmt = glos.getInfo("entry_url")
-
- def getEntryWebLink(entry: EntryType) -> str:
- if not entry_url_fmt:
- return ""
- url = entry_url_fmt.format(word=html.escape(entry.l_word[0]))
- return f'{nbsp}<a class="no_ul" href="{url}">🌏</a>'
-
- # from math import log2, ceil
- # maxPosHexLen = int(ceil(log2(max_file_size) / 4))
-
- indexTxtFileObj = open(
- join(filename, "index.txt"),
- mode="w",
- encoding="utf-8",
- )
- linksTxtFileObj = open(
- join(filename, "links.txt"),
- mode="w",
- encoding="utf-8",
- )
-
- title = glos.getInfo("name")
- style = ""
- if self._dark:
- style = darkStyle
-
- cssLink = '<link rel="stylesheet" href="style.css"/>' if self._css else ""
-
- header = (
- "<!DOCTYPE html>\n"
- "<html><head>"
- "<title>{pageTitle}</title>"
- f'<meta charset="{encoding}"/><style>{style}{{customStyle}}</style>'
- f"{cssLink}"
- "</head><body>\n"
- )
-
- def pageHeader(n: int) -> str:
- return header.format(
- pageTitle=f"Page {n} of {title}",
- customStyle="",
- )
-
- def navBar() -> str:
- links: list[str] = []
- if len(self._filenameList) > 1:
- links.append(f'<a href="./{self._filenameList[-2]}">◀</a>')
- links.extend(
- [
- f'<a href="./{self.getNextFilename()}">▶</a>',
- '<a href="./info.html">ℹ️</a>', # noqa: RUF001
- ],
- )
- return (
- "<nav>" + f"{nbsp}{nbsp}{nbsp}".join(links) + "</nav>"
- )
-
- tailSize = len(self._tail.encode(encoding))
-
- if max_file_size < len(header) + tailSize:
- raise ValueError(f"{max_file_size=} is too small")
-
- max_file_size -= tailSize
-
- if not isdir(self._filename):
- os.mkdir(self._filename)
-
- fileObj = self.nextFile()
- fileObj.write(pageHeader(0))
- fileObj.write(navBar())
-
- re_fixed_link = re.compile(
- r'<a (?:[^<>]*? )?href="#([^<>"]+?)">[^<>]+?</a>',
- re.IGNORECASE,
- )
-
- linkTargetSet = set()
-
- def replaceBword(text: str) -> str:
- return text.replace(
- ' href="bword://',
- ' href="#',
- )
-
- def addLinks(text: str, pos: int) -> None:
- for m in re_fixed_link.finditer(text):
- if ' class="entry_link"' in m.group(0):
- continue
- if m.group(0).count("href=") != 1:
- log.error(f"unexpected match: {m.group(0)}")
- target = html.unescape(m.group(1))
- linkTargetSet.add(target)
- start = m.start()
- b_start = len(text[:start].encode(encoding))
- b_size = len(text[start : m.end()].encode(encoding))
- linksTxtFileObj.write(
- f"{escapeNTB(target)}\t"
- f"{len(self._filenameList) - 1}\t"
- f"{pos + b_start:x}\t"
- f"{b_size:x}\n",
- )
- linksTxtFileObj.flush()
-
- self.writeInfo(filename, header)
-
- word_title = self._word_title
-
- resDir = self._resDir
- entryIndex = -1
- while True:
- entryIndex += 1
- entry = yield
- if entry is None:
- break
- if entry.isData():
- if resources:
- entry.save(resDir)
- continue
-
- entry.detectDefiFormat()
- defi = entry.defi
- defiFormat = entry.defiFormat
-
- if defi.startswith("") and defiFormat != "h":
- log.error(f"bad {defiFormat=}")
- defiFormat = "h"
-
- if defiFormat == "m":
- defi = html.escape(defi)
- if "\n" in defi:
- # could be markdown or unformatted plaintext
- # FIXME: this changes the font to a monospace
- defi = f"{defi}
"
- elif defiFormat == "h":
- defi = self._resSrcPattern.sub(self._subResSrc, defi)
- if escape_defi:
- defi = html.escape(defi)
-
- entryId = f"entry{entryIndex}"
-
- if word_title:
- words = [html.escape(word) for word in entry.l_word]
- title = glos.wordTitleStr(
- wordSep.join(words),
- sample=entry.l_word[0],
- class_="headword",
- )
-
- if not title:
- title = f"Entry {entryIndex}"
-
- # entry_link_sym = "¶"
- entry_link_sym = "🔗"
- text = (
- f'<div id="{entryId}">{title}{nbsp}{nbsp}'
- f'<a class="entry_link" href="#{entryId}">'
- f"{entry_link_sym}</a>"
- f"{getEntryWebLink(entry)}"
- f"<br/>\n{defi}"
- "</div>\n"
- "<hr/>\n"
- )
- pos = fileObj.tell()
- if pos > initFileSizeMax and pos > max_file_size - len(
- text.encode(encoding),
- ):
- fileObj = self.nextFile()
- fileObj.write(
- pageHeader(
- len(self._filenameList) - 1,
- ),
- )
- fileObj.write(navBar())
- pos = fileObj.tell()
- tmpFilename = escapeNTB(self._filenameList[-1])
- for word in entry.l_word:
- indexTxtFileObj.write(
- f"{entryIndex}\t"
- f"{escapeNTB(word)}\t"
- f"{tmpFilename}\t"
- f"{pos}\n",
- )
- del tmpFilename
- text = replaceBword(text)
- addLinks(text, pos)
- fileObj.write(text)
-
- fileObj.close()
- self._fileObj = None
- indexTxtFileObj.close()
-
- linksTxtFileObj.close()
-
- if linkTargetSet:
- log.info(f"{len(linkTargetSet)} link targets found")
- log.info("Fixing links, please wait...")
- self.fixLinks(linkTargetSet)
-
- os.remove(join(filename, "links.txt"))
diff --git a/pyglossary/plugins/html_dir/writer.py b/pyglossary/plugins/html_dir/writer.py
new file mode 100644
index 000000000..6451f09ce
--- /dev/null
+++ b/pyglossary/plugins/html_dir/writer.py
@@ -0,0 +1,491 @@
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import html
+import os
+import re
+import time
+from functools import lru_cache
+from os.path import isdir, isfile, join
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ import io
+ from collections.abc import Generator
+
+ from pyglossary.glossary_types import (
+ EntryType,
+ GlossaryType,
+ )
+
+from pyglossary.core import log
+from pyglossary.text_utils import (
+ escapeNTB,
+ unescapeNTB,
+)
+
+nbsp = "\xa0"
+# nbsp = "&nbsp;"
+
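+# NOTE: braces are doubled because this CSS ends up inside the page header,
+# which is later passed through str.format() to substitute {customStyle}.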
+darkStyle = """
+body {{
+ background-color: #373737;
+ color: #eee;
+}}
+a {{ color: #aaaaff; }}
+a.broken {{ color: #e0c0c0; }}
+a.no_ul {{ text-decoration: none; }}
+b.headword {{ font-size: 1.5em; color: #c7ffb9; }}
+h1 {{ font-size: 1.5em; color: #c7ffb9;}}
+h2 {{ font-size: 1.3em;}}
+h3 {{ font-size: 1.0em;}}
+h4 {{ font-size: 1.0em;}}
+h5 {{ font-size: 1.0em;}}
+h6 {{ font-size: 1.0em;}}
+"""
+
+
+class Writer:
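+	# default values for the plugin's write options; user-supplied
+	# options override these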
+ _encoding: str = "utf-8"
+ _resources: bool = True
+ _max_file_size: int = 102400
+ _filename_format: str = "{n:05d}.html"
+ _escape_defi: bool = False
+ _dark: bool = True
+ _css: str = ""
+ _word_title: bool = True
+
+ @staticmethod
+ def stripFullHtmlError(entry: EntryType, error: str) -> None:
+ log.error(f"error in stripFullHtml: {error}, words={entry.l_word!r}")
+
+ def __init__(self, glos: GlossaryType) -> None:
+ self._glos = glos
+ self._filename = ""
+ self._fileObj: io.IOBase | None = None
+ self._encoding = "utf-8"
+ self._filename_format = "{n:05d}.html"
+ self._tail = ""
+ self._filenameList: list[str] = []
+ glos.stripFullHtml(errorHandler=self.stripFullHtmlError)
+
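+		# captures the value of src="..." attributes, so resource
+		# references inside definitions can be rewritten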
+ self._resSrcPattern = re.compile(' src="([^"]*)"')
+
+ def open(self, filename: str) -> None:
+ self._filename = filename
+ self._resDir = resDir = join(filename, "res")
+ if not isdir(filename):
+ os.mkdir(filename)
+ if not isdir(resDir):
+ os.mkdir(resDir)
+ if self._css:
+ self.copyCSS(self._css)
+
+ def copyCSS(self, cssPath: str) -> None:
+ import shutil
+
+ shutil.copy(cssPath, join(self._filename, "style.css"))
+
+ def finish(self) -> None:
+ pass
+
+ def getNextFilename(self) -> str:
+ return self._filename_format.format(
+ n=len(self._filenameList),
+ )
+
+ def nextFile(self) -> io.TextIOBase:
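+		# finish the current page (append the tail, close it) and open
+		# the next sequentially numbered page file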
+ if self._fileObj:
+ self._fileObj.write(self._tail)
+ self._fileObj.close()
+ filename = self.getNextFilename()
+ self._filenameList.append(filename)
+ self._fileObj = open(
+ join(
+ self._filename,
+ filename,
+ ),
+ mode="w",
+ encoding=self._encoding,
+ )
+ return self._fileObj
+
+ def fixLinks(self, linkTargetSet: set[str]) -> None: # noqa: PLR0912
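+		# Two passes over the temp files written during write():
+		# 1. index.txt maps each headword to the page/entry where it landed;
+		#    links.txt records the byte range of every internal href.
+		# 2. Each page is rewritten in binary mode, patching those byte
+		#    ranges to point at the resolved "page#entry" target.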
+ import gc
+
+ gc.collect()
+ dirn = self._filename
+
+ filenameList = self._filenameList
+
+ fileByWord: dict[str, list[tuple[str, int]]] = {}
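+		# index.txt columns: entry index, escaped headword, page filename,
+		# byte offset within that page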
+ for line in open(join(dirn, "index.txt"), encoding="utf-8"):
+ line = line.rstrip("\n") # noqa: PLW2901
+ if not line:
+ continue
+ entryIndexStr, wordEsc, filename, _ = line.split("\t")
+ entryIndex = int(entryIndexStr)
+ # entryId = f"entry{entryIndex}"
+ word = unescapeNTB(wordEsc)
+ if word not in linkTargetSet:
+ continue
+ if word in fileByWord:
+ fileByWord[word].append((filename, entryIndex))
+ else:
+ fileByWord[word] = [(filename, entryIndex)]
+
+ # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile:
+ # json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t")
+
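+		# keep up to 10 per-page patch files ("links{fileIndex}") open
+		# in append mode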
+ @lru_cache(maxsize=10)
+ def getLinksByFile(fileIndex: int) -> io.TextIOBase:
+ return open(
+ join(dirn, f"links{fileIndex}"),
+ mode="a",
+ encoding="utf-8",
+ )
+
+ log.info("")
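+		# links.txt columns: escaped target word, page index,
+		# link start offset (hex), link length (hex)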
+ for line in open(join(dirn, "links.txt"), encoding="utf-8"):
+ line = line.rstrip("\n") # noqa: PLW2901
+ if not line:
+ continue
+ target, fileIndexStr, x_start, x_size = line.split("\t")
+ target = unescapeNTB(target)
+ if target not in fileByWord:
+ targetNew = ""
+ else:
+ targetFilename, targetEntryIndex = fileByWord[target][0]
+ if targetFilename == filename:
+ continue
+ targetNew = f"{targetFilename}#entry{targetEntryIndex}"
+ file = getLinksByFile(int(fileIndexStr))
+ file.write(
+ f"{x_start}\t{x_size}\t{targetNew}\n",
+ )
+ file.flush()
+
+ linkTargetSet.clear()
+ del fileByWord, linkTargetSet
+ gc.collect()
+
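+		# on Windows, pause briefly so pending writes to the patch files
+		# settle before they are reopened below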
+ if os.sep == "\\":
+ time.sleep(0.1)
+
+ entry_url_fmt = self._glos.getInfo("entry_url")
+
+ re_href = re.compile(
+ b' href="[^<>"]*?"',
+ re.IGNORECASE,
+ )
+
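+		# rewrite each page: copy bytes verbatim up to each recorded link,
+		# then emit the link with its href replaced; unresolved targets are
+		# marked class="broken" (and pointed at entry_url when available)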
+ for fileIndex, filename in enumerate(filenameList):
+ if not isfile(join(dirn, f"links{fileIndex}")):
+ continue
+ with open(join(dirn, filename), mode="rb") as inFile:
+ with open(join(dirn, f"{filename}.new"), mode="wb") as outFile:
+ for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"):
+ outFile.flush()
+ (
+ b_x_start,
+ b_x_size,
+ b_target,
+ ) = linkLine.rstrip(b"\n").split(b"\t")
+ outFile.write(
+ inFile.read(
+ int(b_x_start, 16) - inFile.tell(),
+ ),
+ )
+ curLink = inFile.read(int(b_x_size, 16))
+
+ if b_target:
+ outFile.write(
+ re_href.sub(
+ b' href="./' + b_target + b'"',
+ curLink,
+ ),
+ )
+ continue
+
+ if not entry_url_fmt:
+ outFile.write(
+ curLink.replace(
+ b' href="#',
+ b' class="broken" href="#',
+ ),
+ )
+ continue
+
+ st = curLink.decode("utf-8")
+ i = st.find('href="#')
+ j = st.find('"', i + 7)
+ word = st[i + 7 : j]
+ url = entry_url_fmt.format(word=word)
+ outFile.write(
+ (
+ st[:i] + f'class="broken" href="{url}"' + st[j + 1 :]
+ ).encode("utf-8"),
+ )
+
+ outFile.write(inFile.read())
+
+ os.remove(join(dirn, filename))
+ os.rename(join(dirn, f"{filename}.new"), join(dirn, filename))
+ os.remove(join(dirn, f"links{fileIndex}"))
+
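+	# writes info.html: the glossary metadata as a two-column key/value table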
+ def writeInfo(self, filename: str, header: str) -> None:
+ glos = self._glos
+ title = glos.getInfo("name")
+ customStyle = (
+ "table, th, td {border: 1px solid black; "
+ "border-collapse: collapse; padding: 5px;}"
+ )
+ infoHeader = header.format(
+ pageTitle=f"Info: {title}",
+ customStyle=customStyle,
+ )
+ with open(
+ join(filename, "info.html"),
+ mode="w",
+ encoding=self._encoding,
+ ) as _file:
+ _file.write(
+				infoHeader + "<table>"
+				"<tr>"
+				'<th width="%10">Key</th>'
+				'<th width="%90">Value</th>'
+				"</tr>\n",
+ )
+ for key, value in glos.iterInfo():
+ _file.write(
+					f"<tr><td>{key}</td><td>{value}</td></tr>\n",
+ )
+			_file.write("</table></body></html>")