Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move mapping logic of non-standard encoding names to Rust code. #112

Merged
merged 1 commit into from
Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 2 additions & 22 deletions python/chardetng_py/shortcuts.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,9 @@
"""Functions for dealing with byte strings of unknown encoding."""

from typing import Dict, Final, Union
from typing import Union

from chardetng_py.detector import EncodingDetector

ALIASES: Final[Dict[str, str]] = {
"windows-874": "cp874",
}
"""Python prefers to use "cpXXX" for legacy encodings, while :code:`encoding_rs`
and :code:`chardetng` use whatwg names.

References
----------
https://docs.python.org/3/library/codecs.html#standard-encodings
https://encoding.spec.whatwg.org/#legacy-single-byte-encodings
"""


def detect(
byte_str: Union[bytes, bytearray],
Expand Down Expand Up @@ -49,12 +37,4 @@ def detect(
encoding_detector = EncodingDetector()
encoding_detector.feed(byte_str, last=True)

encoding: str = encoding_detector.guess(tld=tld, allow_utf8=allow_utf8)

# chardetng uses 'windows-874' as an encoding, which Python does not understand
# I believe that windows-874 and cp874 are basically the same encoding
if encoding in ALIASES:
# TODO Log/warn?
return ALIASES[encoding]

return encoding
return encoding_detector.guess(tld=tld, allow_utf8=allow_utf8)
21 changes: 19 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
use chardetng::EncodingDetector;
use pyo3::prelude::*;

/// Translates an encoding name reported by `chardetng` into the name Python prefers.
///
/// `encoding_rs` and `chardetng` use WHATWG names, while Python prefers
/// "cpXXX" names for legacy encodings. Only `windows-874` is remapped
/// (to `cp874`); every other name is handed back unchanged.
///
/// See <https://github.com/john-parton/chardetng-py/issues/11>.
///
/// # References
///
/// - <https://docs.python.org/3/library/codecs.html#standard-encodings>
/// - <https://encoding.spec.whatwg.org/#legacy-single-byte-encodings>
fn _fix_encoding_name(encoding: &str) -> &str {
    match encoding {
        "windows-874" => "cp874",
        unchanged => unchanged,
    }
}

#[doc = include_str!("../chardetng_docs/EncodingDetector.md")]
#[pyclass(name="EncodingDetector")]
Expand All @@ -27,7 +42,9 @@ impl EncodingDetectorWrapper {
#[pyo3(signature=(*, tld, allow_utf8))]
fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static str {

self.encoding_detector.guess(tld, allow_utf8).name()
_fix_encoding_name(
self.encoding_detector.guess(tld, allow_utf8).name()
)
}

#[doc = include_str!("../chardetng_docs/guess_assess.md")]
Expand All @@ -37,7 +54,7 @@ impl EncodingDetectorWrapper {
let (encoding, higher_score) = self.encoding_detector.guess_assess(tld, allow_utf8);

(
encoding.name(),
_fix_encoding_name(encoding.name()),
higher_score
)
}
Expand Down