Skip to content

Commit

Permalink
Move mapping logic of non-standard encoding names to rust code.
Browse files Browse the repository at this point in the history
  • Loading branch information
john-parton committed Oct 5, 2023
1 parent 65e0a5a commit 273ec55
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 24 deletions.
24 changes: 2 additions & 22 deletions python/chardetng_py/shortcuts.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,9 @@
"""Functions for dealing with byte strings of unknown encoding."""

from typing import Dict, Final, Union
from typing import Union

from chardetng_py.detector import EncodingDetector

ALIASES: Final[Dict[str, str]] = {
"windows-874": "cp874",
}
"""Python prefers to use "cpXXX" for legacy encodings, while :code:`encoding_rs`
and :code:`chardetng` use whatwg names.
References
----------
https://docs.python.org/3/library/codecs.html#standard-encodings
https://encoding.spec.whatwg.org/#legacy-single-byte-encodings
"""


def detect(
byte_str: Union[bytes, bytearray],
Expand Down Expand Up @@ -49,12 +37,4 @@ def detect(
encoding_detector = EncodingDetector()
encoding_detector.feed(byte_str, last=True)

encoding: str = encoding_detector.guess(tld=tld, allow_utf8=allow_utf8)

# chardetng uses 'windows-874' as an encoding, which Python does not understand
# I believe that windows-874 and cp874 are basically the same encoding
if encoding in ALIASES:
# TODO Log/warn?
return ALIASES[encoding]

return encoding
return encoding_detector.guess(tld=tld, allow_utf8=allow_utf8)
21 changes: 19 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
use chardetng::EncodingDetector;
use pyo3::prelude::*;

/// Translates an encoding name reported by `chardetng` into the spelling
/// Python's codec registry understands.
///
/// `encoding_rs` and `chardetng` use WHATWG names, while Python prefers
/// "cpXXX" for legacy encodings. Only `windows-874` is rewritten (to
/// `cp874`); every other name is handed back untouched. The comparison is
/// exact and case-sensitive.
///
/// See <https://github.com/john-parton/chardetng-py/issues/11>.
///
/// # References
///
/// - <https://docs.python.org/3/library/codecs.html#standard-encodings>
/// - <https://encoding.spec.whatwg.org/#legacy-single-byte-encodings>
fn _fix_encoding_name(encoding: &str) -> &str {
    match encoding {
        "windows-874" => "cp874",
        unchanged => unchanged,
    }
}

#[doc = include_str!("../chardetng_docs/EncodingDetector.md")]
#[pyclass(name="EncodingDetector")]
Expand All @@ -27,7 +42,9 @@ impl EncodingDetectorWrapper {
#[pyo3(signature=(*, tld, allow_utf8))]
fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static str {

self.encoding_detector.guess(tld, allow_utf8).name()
_fix_encoding_name(
self.encoding_detector.guess(tld, allow_utf8).name()
)
}

#[doc = include_str!("../chardetng_docs/guess_assess.md")]
Expand All @@ -37,7 +54,7 @@ impl EncodingDetectorWrapper {
let (encoding, higher_score) = self.encoding_detector.guess_assess(tld, allow_utf8);

(
encoding.name(),
_fix_encoding_name(encoding.name()),
higher_score
)
}
Expand Down

0 comments on commit 273ec55

Please sign in to comment.