Skip to content

Commit

Permalink
🎨 Refactor the encoding detection feature
Browse files Browse the repository at this point in the history
🔥 Remove chardet legacy support
  • Loading branch information
Ousret committed Sep 13, 2023
1 parent aa8c18e commit c286f8f
Show file tree
Hide file tree
Showing 9 changed files with 44 additions and 137 deletions.
10 changes: 10 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ Release History
3.0.0 (2023-??-??)
-------------------

**Removed**
- Property `apparent_encoding` in favor of a discrete internal inference.
- Support for the legacy `chardet` detector in case it was present in the environment.
Extra `use_chardet_on_py3` is now unavailable.
- **requests.compat** no longer holds a reference to _chardet_.

**Changed**
- Calling the method `json` from `Response` when no encoding was provided no longer relies on internal encoding inference.
We fall back on `charset-normalizer` with a limited set of charsets allowed (UTF-8/16/32 or ASCII).


2.32.1 (2023-09-12)
-------------------
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ def run_tests(self):
extras_require={
"security": [],
"socks": ["PySocks>=1.5.6, !=1.5.7"],
"use_chardet_on_py3": ["chardet>=3.0.2,<6"],
},
project_urls={
"Documentation": "https://niquests.readthedocs.io",
Expand Down
39 changes: 9 additions & 30 deletions src/niquests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,12 @@
import warnings

import urllib3
from charset_normalizer import __version__ as charset_normalizer_version

from .exceptions import RequestsDependencyWarning

try:
from charset_normalizer import __version__ as charset_normalizer_version
except ImportError:
charset_normalizer_version = None

try:
from chardet import __version__ as chardet_version
except ImportError:
chardet_version = None


def check_compatibility(urllib3_version, chardet_version, charset_normalizer_version):
def check_compatibility(urllib3_version, charset_normalizer_version):
urllib3_version = urllib3_version.split(".")
assert urllib3_version != ["dev"] # Verify urllib3 isn't installed from git.

Expand All @@ -72,18 +63,10 @@ def check_compatibility(urllib3_version, chardet_version, charset_normalizer_ver
assert minor >= 21

# Check charset_normalizer for compatibility.
if chardet_version:
major, minor, patch = chardet_version.split(".")[:3]
major, minor, patch = int(major), int(minor), int(patch)
# chardet_version >= 3.0.2, < 6.0.0
assert (3, 0, 2) <= (major, minor, patch) < (6, 0, 0)
elif charset_normalizer_version:
major, minor, patch = charset_normalizer_version.split(".")[:3]
major, minor, patch = int(major), int(minor), int(patch)
# charset_normalizer >= 2.0.0 < 4.0.0
assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0)
else:
raise Exception("You need either charset_normalizer or chardet installed")
major, minor, patch = charset_normalizer_version.split(".")[:3]
major, minor, patch = int(major), int(minor), int(patch)
# charset_normalizer >= 2.0.0 < 4.0.0
assert (2, 0, 0) <= (major, minor, patch) < (4, 0, 0)


def _check_cryptography(cryptography_version):
Expand All @@ -102,15 +85,11 @@ def _check_cryptography(cryptography_version):

# Check imported dependencies for compatibility.
try:
check_compatibility(
urllib3.__version__, chardet_version, charset_normalizer_version
)
check_compatibility(urllib3.__version__, charset_normalizer_version)
except (AssertionError, ValueError):
warnings.warn(
"urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
"version!".format(
urllib3.__version__, chardet_version, charset_normalizer_version
),
"urllib3 ({}) or charset_normalizer ({}) doesn't match a supported "
"version!".format(urllib3.__version__, charset_normalizer_version),
RequestsDependencyWarning,
)

Expand Down
5 changes: 0 additions & 5 deletions src/niquests/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,6 @@
compatibility until the next major version.
"""

try:
import chardet
except ImportError:
import charset_normalizer as chardet

import sys

# -------
Expand Down
21 changes: 3 additions & 18 deletions src/niquests/help.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,12 @@
import ssl
import sys

import charset_normalizer
import idna
import urllib3

from . import __version__ as requests_version

try:
import charset_normalizer
except ImportError:
charset_normalizer = None

try:
import chardet
except ImportError:
chardet = None

try:
from urllib3.contrib import pyopenssl
except (ImportError, AttributeError):
Expand Down Expand Up @@ -81,12 +72,8 @@ def info():

implementation_info = _implementation()
urllib3_info = {"version": urllib3.__version__}
charset_normalizer_info = {"version": None}
chardet_info = {"version": None}
if charset_normalizer:
charset_normalizer_info = {"version": charset_normalizer.__version__}
if chardet:
chardet_info = {"version": chardet.__version__}

charset_normalizer_info = {"version": charset_normalizer.__version__}

pyopenssl_info = {
"version": None,
Expand All @@ -112,10 +99,8 @@ def info():
"implementation": implementation_info,
"system_ssl": system_ssl_info,
"using_pyopenssl": pyopenssl is not None,
"using_charset_normalizer": chardet is None,
"pyOpenSSL": pyopenssl_info,
"urllib3": urllib3_info,
"chardet": chardet_info,
"charset_normalizer": charset_normalizer_info,
"cryptography": cryptography_info,
"idna": idna_info,
Expand Down
32 changes: 21 additions & 11 deletions src/niquests/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import encodings.idna # noqa: F401
from io import UnsupportedOperation

from charset_normalizer import from_bytes
from urllib3.exceptions import (
DecodeError,
LocationParseError,
Expand All @@ -32,7 +33,6 @@
Mapping,
basestring,
builtin_str,
chardet,
cookielib,
)
from .compat import json as complexjson
Expand All @@ -57,7 +57,6 @@
check_header_validity,
get_auth_from_url,
guess_filename,
guess_json_utf,
iter_slices,
parse_header_links,
requote_uri,
Expand Down Expand Up @@ -792,11 +791,6 @@ def next(self):
"""Returns a PreparedRequest for the next request in a redirect chain, if there is one."""
return self._next

@property
def apparent_encoding(self):
"""The apparent encoding, provided by the charset_normalizer or chardet libraries."""
return chardet.detect(self.content)["encoding"]

def iter_content(self, chunk_size=1, decode_unicode=False):
"""Iterates over the response data. When stream=True is set on the
request, this avoids reading the content at once into memory for
Expand Down Expand Up @@ -912,7 +906,7 @@ def text(self):
"""Content of the response, in unicode.
If Response.encoding is None, encoding will be guessed using
``charset_normalizer`` or ``chardet``.
``charset_normalizer``.
The encoding of the response content is determined based solely on HTTP
headers, following RFC 2616 to the letter. If you can take advantage of
Expand All @@ -929,7 +923,8 @@ def text(self):

# Fallback to auto-detected encoding.
if self.encoding is None:
encoding = self.apparent_encoding
guesses = from_bytes(self.content)
encoding = guesses.best().encoding if guesses else "utf-8"

# Decode unicode from given encoding.
try:
Expand All @@ -953,12 +948,27 @@ def json(self, **kwargs):
contain valid json.
"""

if not self.encoding and self.content and len(self.content) > 3:
if not self.encoding and self.content:
# No encoding set. JSON RFC 4627 section 3 states we should expect
# UTF-8, -16 or -32. Detect which one to use; If the detection or
# decoding fails, fall back to `self.text` (using charset_normalizer to make
# a best guess).
encoding = guess_json_utf(self.content)
guesses = from_bytes(
self.content,
cp_isolation=[
"ascii",
"utf-8",
"utf-16",
"utf-32",
"utf-16-le",
"utf-16-be",
"utf-32-le",
"utf-32-be",
],
)

encoding = guesses.best().encoding if guesses else "utf-8"

if encoding is not None:
try:
return complexjson.loads(self.content.decode(encoding), **kwargs)
Expand Down
5 changes: 1 addition & 4 deletions src/niquests/packages.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import sys

try:
import chardet
except ImportError:
import charset_normalizer as chardet
import charset_normalizer as chardet

# This code exists for backwards compatibility reasons.
# I don't like it either. Just look the other way. :)
Expand Down
32 changes: 0 additions & 32 deletions src/niquests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,38 +949,6 @@ def parse_header_links(value):
_null3 = _null * 3


def guess_json_utf(data):
"""
:rtype: str
"""
# JSON always starts with two ASCII characters, so detection is as
# easy as counting the nulls and from their location and count
# determine the encoding. Also detect a BOM, if present.
sample = data[:4]
if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
return "utf-32" # BOM included
if sample[:3] == codecs.BOM_UTF8:
return "utf-8-sig" # BOM included, MS style (discouraged)
if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
return "utf-16" # BOM included
nullcount = sample.count(_null)
if nullcount == 0:
return "utf-8"
if nullcount == 2:
if sample[::2] == _null2: # 1st and 3rd are null
return "utf-16-be"
if sample[1::2] == _null2: # 2nd and 4th are null
return "utf-16-le"
# Did not detect 2 valid UTF-16 ascii-range characters
if nullcount == 3:
if sample[:3] == _null3:
return "utf-32-be"
if sample[1:] == _null3:
return "utf-32-le"
# Did not detect a valid UTF-32 ascii-range character
return None


def prepend_scheme_if_needed(url, new_scheme):
"""Given a URL that may or may not have a scheme, prepend the given scheme.
Does not replace a present scheme with the one provided as an argument.
Expand Down
36 changes: 0 additions & 36 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
get_encodings_from_content,
get_environ_proxies,
guess_filename,
guess_json_utf,
is_ipv4_address,
is_valid_cidr,
iter_slices,
Expand Down Expand Up @@ -373,41 +372,6 @@ def test_precedence(self):
assert get_encodings_from_content(content) == ["HTML5", "HTML4", "XML"]


class TestGuessJSONUTF:
@pytest.mark.parametrize(
"encoding",
(
"utf-32",
"utf-8-sig",
"utf-16",
"utf-8",
"utf-16-be",
"utf-16-le",
"utf-32-be",
"utf-32-le",
),
)
def test_encoded(self, encoding):
data = "{}".encode(encoding)
assert guess_json_utf(data) == encoding

def test_bad_utf_like_encoding(self):
assert guess_json_utf(b"\x00\x00\x00\x00") is None

@pytest.mark.parametrize(
("encoding", "expected"),
(
("utf-16-be", "utf-16"),
("utf-16-le", "utf-16"),
("utf-32-be", "utf-32"),
("utf-32-le", "utf-32"),
),
)
def test_guess_by_bom(self, encoding, expected):
data = "\ufeff{}".encode(encoding)
assert guess_json_utf(data) == expected


USER = PASSWORD = "%!*'();:@&=+$,/?#[] "
ENCODED_USER = compat.quote(USER, "")
ENCODED_PASSWORD = compat.quote(PASSWORD, "")
Expand Down

0 comments on commit c286f8f

Please sign in to comment.