Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce default_encoding parameter to [set|autodetect] the encoding if the charset is missing from the headers #284

Merged
merged 2 commits into from
Apr 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 13 additions & 14 deletions .github/workflows/build-and-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ jobs:
name: Lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- run: |
Expand All @@ -31,15 +31,15 @@ jobs:
name: Build sdist wheel
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- run: |
make preprocess
pipx run build --sdist
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
with:
path: ./dist/*.tar.gz

- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- run: |
Expand All @@ -52,51 +52,50 @@ jobs:
matrix:
os: [ubuntu-22.04, macos-12, macos-14, windows-2019]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: '3.10'

- if: runner.os == 'Linux'
uses: docker/setup-qemu-action@v2
uses: docker/setup-qemu-action@v3
with:
platforms: all

# macOS make is too old
- if: runner.os == 'macOS'
run: |
brew install make automake libtool
which pipx || brew install pipx && pipx ensurepath

- name: Build and test wheels
uses: pypa/cibuildwheel@v2.16.5
uses: pypa/cibuildwheel@v2.17.0

# - name: Setup tmate session
# uses: mxschmitt/action-tmate@v3

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
with:
path: ./wheelhouse/*.whl

upload_all:
needs: [bdist, sdist]
runs-on: ubuntu-latest
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v3 # https://github.com/actions/upload-artifact/issues/478
if: startsWith(github.ref, 'refs/tags/')
with:
name: artifact
path: dist

- uses: pypa/gh-action-pypi-publish@v1.5.0
- uses: pypa/gh-action-pypi-publish@v1.8.14
if: startsWith(github.ref, 'refs/tags/')
with:
password: ${{ secrets.PYPI_TOKEN }}

- name: Upload release files
if: startsWith(github.ref, 'refs/tags/')
uses: softprops/action-gh-release@v1
uses: softprops/action-gh-release@v2
with:
files: |
./dist/*.whl
Expand Down
4 changes: 4 additions & 0 deletions curl_cffi/requests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def request(
impersonate: Optional[Union[str, BrowserType]] = None,
thread: Optional[ThreadType] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
curl_options: Optional[dict] = None,
http_version: Optional[CurlHttpVersion] = None,
debug: bool = False,
Expand Down Expand Up @@ -90,6 +91,8 @@ def request(
impersonate: which browser version to impersonate.
thread: work with other thread implementations. choices: eventlet, gevent.
default_headers: whether to set default browser headers.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.
curl_options: extra curl options to use.
http_version: limiting http version, http2 will be tried by default.
debug: print extra curl debug info.
Expand Down Expand Up @@ -122,6 +125,7 @@ def request(
content_callback=content_callback,
impersonate=impersonate,
default_headers=default_headers,
default_encoding=default_encoding,
http_version=http_version,
interface=interface,
multipart=multipart,
Expand Down
67 changes: 59 additions & 8 deletions curl_cffi/requests/models.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import queue
import re
import warnings
from concurrent.futures import Future
from functools import cached_property
from json import loads
from typing import Any, Awaitable, Dict, List, Optional
from typing import Any, Awaitable, Callable, Dict, List, Optional, Union

from .. import Curl
from .cookies import Cookies
from .errors import RequestsError
from .headers import Headers

# Extracts the charset parameter from a Content-Type header value.
# HTTP parameter names are case-insensitive and the value may be sent as a
# quoted-string (RFC 9110), so match any letter case and skip an optional
# opening quote. Group 1 is the bare charset name.
CHARSET_RE = re.compile(r'charset="?([\w-]+)', re.IGNORECASE)


def clear_queue(q: queue.Queue):
with q.mutex:
Expand Down Expand Up @@ -41,6 +45,8 @@ class Response:
elapsed: how many seconds the request cost.
encoding: http body encoding.
charset: alias for encoding.
charset_encoding: encoding specified by the Content-Type header.
default_encoding: user-defined encoding used for decoding content if charset is not found in headers.
redirect_count: how many redirects happened.
redirect_url: the final redirected url.
http_version: http version used.
Expand All @@ -58,8 +64,7 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
self.headers = Headers()
self.cookies = Cookies()
self.elapsed = 0.0
self.encoding = "utf-8"
self.charset = self.encoding
self.default_encoding: Union[str, Callable[[bytes], str]] = "utf-8"
self.redirect_count = 0
self.redirect_url = ""
self.http_version = 0
Expand All @@ -70,16 +75,62 @@ def __init__(self, curl: Optional[Curl] = None, request: Optional[Request] = Non
self.astream_task: Optional[Awaitable] = None
self.quit_now = None

@property
def charset(self) -> str:
    """Return the value of ``encoding``; kept as an alias for it."""
    return self.encoding

@property
def encoding(self) -> str:
    """
    Resolve the encoding used to turn the byte content into text.

    Resolution order:
        1. A value already stored in ``_encoding`` (explicitly set, or
           cached by a previous call).
        2. ``charset_encoding``, when it is not None.
        3. ``default_encoding``: used directly when it is a string, or called
           with ``self.content`` when it is a callable.

    If none of these yields a truthy value, "utf-8" is used. The result is
    cached on the instance as ``_encoding``.
    """
    if hasattr(self, "_encoding"):
        return self._encoding

    resolved = self.charset_encoding
    if resolved is None:
        if isinstance(self.default_encoding, str):
            resolved = self.default_encoding
        elif callable(self.default_encoding):
            # The detector receives the raw body bytes.
            resolved = self.default_encoding(self.content)

    self._encoding = resolved or "utf-8"
    return self._encoding

@encoding.setter
def encoding(self, value: str) -> None:
    """
    Store ``value`` as the encoding to use for decoding.

    Raises:
        ValueError: if ``_text`` already exists on the instance, i.e. the
            text has been decoded and cached.
    """
    text_already_cached = hasattr(self, "_text")
    if text_already_cached:
        raise ValueError("Cannot set encoding after text has been accessed")
    self._encoding = value

@property
def charset_encoding(self) -> Optional[str]:
    """
    Return the charset named in the Content-Type header.

    Returns None when the header is missing or empty, or when CHARSET_RE
    finds no charset in it.
    """
    header_value = self.headers.get("Content-Type")
    if not header_value:
        return None
    found = CHARSET_RE.search(header_value)
    if found is None:
        return None
    return found.group(1)

@property
def text(self) -> str:
    """
    Return the content decoded to a string.

    The result is cached in ``_text`` on first access. Falsy content yields
    an empty string without calling ``_decode``.
    """
    if hasattr(self, "_text"):
        return self._text
    self._text = self._decode(self.content) if self.content else ""
    return self._text

def _decode(self, content: bytes) -> str:
    """
    Decode ``content`` using ``self.encoding``.

    Undecodable bytes are replaced rather than raising. If the configured
    encoding cannot be used (for example an unknown codec name raises
    LookupError), fall back to "utf-8-sig".

    Args:
        content: raw bytes to decode.

    Returns:
        The decoded text.
    """
    try:
        return content.decode(self.encoding, errors="replace")
    except (UnicodeDecodeError, LookupError):
        # The fallback must also tolerate invalid bytes; otherwise an unknown
        # charset combined with a non-UTF-8 body would raise from here.
        return content.decode("utf-8-sig", errors="replace")

@property
def text(self) -> str:
return self._decode(self.content)

def raise_for_status(self):
"""Raise an error if status code is not in [200, 400)"""
if not self.ok:
Expand Down
36 changes: 18 additions & 18 deletions curl_cffi/requests/session.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import asyncio
import math
import queue
import re
import threading
import warnings
from concurrent.futures import ThreadPoolExecutor
Expand Down Expand Up @@ -55,7 +54,6 @@ class ProxySpec(TypedDict, total=False):
else:
ProxySpec = Dict[str, str]

CHARSET_RE = re.compile(r"charset=([\w-]+)")
ThreadType = Literal["eventlet", "gevent"]


Expand Down Expand Up @@ -205,6 +203,7 @@ def __init__(
max_redirects: int = -1,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: bool = True,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
curl_options: Optional[dict] = None,
curl_infos: Optional[list] = None,
http_version: Optional[CurlHttpVersion] = None,
Expand All @@ -224,6 +223,7 @@ def __init__(
self.max_redirects = max_redirects
self.impersonate = impersonate
self.default_headers = default_headers
self.default_encoding = default_encoding
self.curl_options = curl_options or {}
self.curl_infos = curl_infos or []
self.http_version = http_version
Expand Down Expand Up @@ -547,7 +547,7 @@ def qput(chunk):

return req, buffer, header_buffer, q, header_recved, quit_now

def _parse_response(self, curl, buffer, header_buffer):
def _parse_response(self, curl, buffer, header_buffer, default_encoding):
c = curl
rsp = Response(c)
rsp.url = cast(bytes, c.getinfo(CurlInfo.EFFECTIVE_URL)).decode()
Expand Down Expand Up @@ -583,13 +583,7 @@ def _parse_response(self, curl, buffer, header_buffer):
rsp.cookies = self.cookies
# print("Cookies after extraction", self.cookies)

content_type = rsp.headers.get("Content-Type", default="")
charset_match = CHARSET_RE.search(content_type)
charset = charset_match.group(1) if charset_match else "utf-8"

rsp.charset = charset
rsp.encoding = charset # TODO use chardet

rsp.default_encoding = default_encoding
rsp.elapsed = cast(float, c.getinfo(CurlInfo.TOTAL_TIME))
rsp.redirect_count = cast(int, c.getinfo(CurlInfo.REDIRECT_COUNT))
rsp.redirect_url = cast(bytes, c.getinfo(CurlInfo.REDIRECT_URL)).decode()
Expand Down Expand Up @@ -639,6 +633,8 @@ def __init__(
max_redirects: max redirect counts, default unlimited(-1).
impersonate: which browser version to impersonate in the session.
interface: which interface to use for requests to the server.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.

Notes:
This class can be used as a context manager.
Expand Down Expand Up @@ -767,6 +763,7 @@ def request(
content_callback: Optional[Callable] = None,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
http_version: Optional[CurlHttpVersion] = None,
interface: Optional[str] = None,
cert: Optional[Union[str, Tuple[str, str]]] = None,
Expand Down Expand Up @@ -825,7 +822,7 @@ def perform():
try:
c.perform()
except CurlError as e:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
cast(queue.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
finally:
Expand All @@ -843,7 +840,7 @@ def cleanup(fut):

# Wait for the first chunk
cast(threading.Event, header_recved).wait()
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
header_parsed.set()

# Raise the exception if something wrong happens when receiving the header.
Expand All @@ -868,11 +865,11 @@ def cleanup(fut):
else:
c.perform()
except CurlError as e:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
raise RequestsError(str(e), e.code, rsp) from e
else:
rsp = self._parse_response(c, buffer, header_buffer)
rsp = self._parse_response(c, buffer, header_buffer, default_encoding)
rsp.request = req
return rsp
finally:
Expand Down Expand Up @@ -919,6 +916,8 @@ def __init__(
allow_redirects: whether to allow redirection.
max_redirects: max redirect counts, default unlimited(-1).
impersonate: which browser version to impersonate in the session.
default_encoding: encoding for decoding response content if charset is not found in headers.
Defaults to "utf-8". Can be set to a callable for automatic detection.

Notes:
This class can be used as a context manager, and it's recommended to use via
Expand Down Expand Up @@ -1043,6 +1042,7 @@ async def request(
content_callback: Optional[Callable] = None,
impersonate: Optional[Union[str, BrowserType]] = None,
default_headers: Optional[bool] = None,
default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
http_version: Optional[CurlHttpVersion] = None,
interface: Optional[str] = None,
cert: Optional[Union[str, Tuple[str, str]]] = None,
Expand Down Expand Up @@ -1093,7 +1093,7 @@ async def perform():
try:
await task
except CurlError as e:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
cast(asyncio.Queue, q).put_nowait(RequestsError(str(e), e.code, rsp))
finally:
Expand All @@ -1113,7 +1113,7 @@ def cleanup(fut):
# Unlike threads, coroutines do not use preemptive scheduling.
# For asyncio, there is no need for a header_parsed event, the
# _parse_response will execute in the foreground, no background tasks running.
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)

first_element = _peek_aio_queue(cast(asyncio.Queue, q))
if isinstance(first_element, RequestsError):
Expand All @@ -1132,11 +1132,11 @@ def cleanup(fut):
await task
# print(curl.getinfo(CurlInfo.CAINFO))
except CurlError as e:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
raise RequestsError(str(e), e.code, rsp) from e
else:
rsp = self._parse_response(curl, buffer, header_buffer)
rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
rsp.request = req
return rsp
finally:
Expand Down
Loading
Loading