From 273ec55679297e489947e3e373180c68042f366f Mon Sep 17 00:00:00 2001 From: John Parton Date: Wed, 4 Oct 2023 22:23:15 -0500 Subject: [PATCH] Move mapping logic of non-standard encoding names to rust code. --- python/chardetng_py/shortcuts.py | 24 ++---------------------- src/lib.rs | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/python/chardetng_py/shortcuts.py b/python/chardetng_py/shortcuts.py index dfcb4a1..775342a 100644 --- a/python/chardetng_py/shortcuts.py +++ b/python/chardetng_py/shortcuts.py @@ -1,21 +1,9 @@ """Functions for dealing with byte strings of unknown encoding.""" -from typing import Dict, Final, Union +from typing import Union from chardetng_py.detector import EncodingDetector -ALIASES: Final[Dict[str, str]] = { - "windows-874": "cp874", -} -"""Python prefers to use "cpXXX" for legacy encodings, while :code:`encoding_rs` -and :code:`chardetng` use whatwg names. - -References ----------- -https://docs.python.org/3/library/codecs.html#standard-encodings -https://encoding.spec.whatwg.org/#legacy-single-byte-encodings -""" - def detect( byte_str: Union[bytes, bytearray], @@ -49,12 +37,4 @@ def detect( encoding_detector = EncodingDetector() encoding_detector.feed(byte_str, last=True) - encoding: str = encoding_detector.guess(tld=tld, allow_utf8=allow_utf8) - - # chardetng uses 'windows-874' as an encoding, which Python does not understand - # I believe that windows-874 and cp874 are basically the same encoding - if encoding in ALIASES: - # TODO Log/warn? - return ALIASES[encoding] - - return encoding + return encoding_detector.guess(tld=tld, allow_utf8=allow_utf8) diff --git a/src/lib.rs b/src/lib.rs index 1f35dd2..2d20442 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,21 @@ use chardetng::EncodingDetector; use pyo3::prelude::*; +// See https://github.com/john-parton/chardetng-py/issues/11 +// Python prefers to use "cpXXX" for legacy encodings, while :code:`encoding_rs` +// and :code:`chardetng` use whatwg names. + +// References +// ---------- +// https://docs.python.org/3/library/codecs.html#standard-encodings +// https://encoding.spec.whatwg.org/#legacy-single-byte-encodings +fn _fix_encoding_name(encoding: &str) -> &str { + if encoding == "windows-874" { + "cp874" + } else { + encoding + } +} #[doc = include_str!("../chardetng_docs/EncodingDetector.md")] #[pyclass(name="EncodingDetector")] @@ -27,7 +42,9 @@ impl EncodingDetectorWrapper { #[pyo3(signature=(*, tld, allow_utf8))] fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static str { - self.encoding_detector.guess(tld, allow_utf8).name() + _fix_encoding_name( + self.encoding_detector.guess(tld, allow_utf8).name() + ) } #[doc = include_str!("../chardetng_docs/guess_assess.md")] @@ -37,7 +54,7 @@ impl EncodingDetectorWrapper { let (encoding, higher_score) = self.encoding_detector.guess_assess(tld, allow_utf8); ( - encoding.name(), + _fix_encoding_name(encoding.name()), higher_score ) }