Calculate status text manually if it's not returned by the Playwright API
D4Vinci committed Nov 20, 2024
1 parent 4c74d9b commit 19ad82c
Showing 4 changed files with 94 additions and 3 deletions.
8 changes: 7 additions & 1 deletion scrapling/engines/camo.py
@@ -4,6 +4,7 @@
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     get_os_name,
     intercept_route,
     check_type_validity,
@@ -111,12 +112,17 @@ def fetch(self, url: str) -> Response:
         if 'charset=' in content_type.lower():
             encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
 
+        status_text = res.status_text
+        # The Playwright API sometimes gives an empty status text for some reason!
+        if not status_text:
+            status_text = StatusText.get(res.status)
+
         response = Response(
             url=res.url,
             text=page.content(),
             body=page.content().encode('utf-8'),
             status=res.status,
-            reason=res.status_text,
+            reason=status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
8 changes: 7 additions & 1 deletion scrapling/engines/pw.py
@@ -6,6 +6,7 @@
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     js_bypass_path,
     intercept_route,
     generate_headers,
@@ -221,12 +222,17 @@ def fetch(self, url: str) -> Response:
         if 'charset=' in content_type.lower():
             encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
 
+        status_text = res.status_text
+        # The Playwright API sometimes gives an empty status text for some reason!
+        if not status_text:
+            status_text = StatusText.get(res.status)
+
         response = Response(
             url=res.url,
             text=page.content(),
             body=page.content().encode('utf-8'),
             status=res.status,
-            reason=res.status_text,
+            reason=status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
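
Both engine hunks above apply the same fallback: keep Playwright's status_text when it is present, otherwise derive the reason phrase from the numeric status via the new StatusText helper. The snippet below is a minimal sketch of that pattern in isolation, not part of the commit; FakeResponse and resolve_reason are hypothetical stand-ins for Playwright's response object and the inline logic shown above, while the StatusText import relies on the export added to scrapling.engines.toolbelt in this commit.

from dataclasses import dataclass

from scrapling.engines.toolbelt import StatusText


@dataclass
class FakeResponse:
    # Hypothetical stand-in modeling only the two attributes the fallback reads.
    status: int
    status_text: str


def resolve_reason(res: FakeResponse) -> str:
    # Prefer whatever Playwright reported; fall back to the lookup table
    # only when the status text comes back empty.
    status_text = res.status_text
    if not status_text:
        status_text = StatusText.get(res.status)
    return status_text


print(resolve_reason(FakeResponse(status=200, status_text="OK")))  # -> OK
print(resolve_reason(FakeResponse(status=404, status_text="")))    # -> Not Found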
1 change: 1 addition & 0 deletions scrapling/engines/toolbelt/__init__.py
@@ -6,6 +6,7 @@
 from .custom import (
     Response,
     do_nothing,
+    StatusText,
     BaseFetcher,
     get_variable_name,
     check_type_validity,
80 changes: 79 additions & 1 deletion scrapling/engines/toolbelt/custom.py
@@ -4,8 +4,9 @@
 import inspect
 import logging
 
-from scrapling.core.utils import setup_basic_logging
+from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
+from scrapling.core.utils import setup_basic_logging, cache
 from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
 
@@ -67,6 +68,83 @@ def __init__(
         self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
 
+class StatusText:
+    """A class that gets the status text of a response status code.
+    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+    """
+    _phrases = MappingProxyType({
+        100: "Continue",
+        101: "Switching Protocols",
+        102: "Processing",
+        103: "Early Hints",
+        200: "OK",
+        201: "Created",
+        202: "Accepted",
+        203: "Non-Authoritative Information",
+        204: "No Content",
+        205: "Reset Content",
+        206: "Partial Content",
+        207: "Multi-Status",
+        208: "Already Reported",
+        226: "IM Used",
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        400: "Bad Request",
+        401: "Unauthorized",
+        402: "Payment Required",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required"
+    })
+
+    @classmethod
+    @cache(maxsize=128)
+    def get(cls, status_code: int) -> str:
+        """Get the phrase for a given HTTP status code."""
+        return cls._phrases.get(status_code, "Unknown Status Code")
+
+
 def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
     """This function checks if the passed engine can be used by a Fetcher-type class or not.
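
For reference, a short usage sketch of the new helper (not part of the commit), assuming StatusText is imported from scrapling.engines.toolbelt as exported above:

from scrapling.engines.toolbelt import StatusText

print(StatusText.get(200))  # "OK"
print(StatusText.get(404))  # "Not Found"
print(StatusText.get(999))  # "Unknown Status Code" -- any code missing from the table

MappingProxyType keeps the phrase table read-only, and the cache(maxsize=128) decorator presumably memoizes repeated lookups, so resolving the same status code more than once costs little beyond a single dictionary access.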
