Calculate status text manually if it's not returned by the Playwright API
D4Vinci committed Nov 20, 2024
1 parent 4c74d9b commit 19ad82c
Showing 4 changed files with 94 additions and 3 deletions.
8 changes: 7 additions & 1 deletion scrapling/engines/camo.py
@@ -4,6 +4,7 @@
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     get_os_name,
     intercept_route,
     check_type_validity,
@@ -111,12 +112,17 @@ def fetch(self, url: str) -> Response:
         if 'charset=' in content_type.lower():
             encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
 
+        status_text = res.status_text
+        # The Playwright API sometimes gives an empty status text for some reason!
+        if not status_text:
+            status_text = StatusText.get(res.status)
+
         response = Response(
             url=res.url,
             text=page.content(),
             body=page.content().encode('utf-8'),
             status=res.status,
-            reason=res.status_text,
+            reason=status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
8 changes: 7 additions & 1 deletion scrapling/engines/pw.py
@@ -6,6 +6,7 @@
 from scrapling.engines.toolbelt import (
     Response,
     do_nothing,
+    StatusText,
     js_bypass_path,
     intercept_route,
     generate_headers,
@@ -221,12 +222,17 @@ def fetch(self, url: str) -> Response:
         if 'charset=' in content_type.lower():
             encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
 
+        status_text = res.status_text
+        # The Playwright API sometimes gives an empty status text for some reason!
+        if not status_text:
+            status_text = StatusText.get(res.status)
+
         response = Response(
             url=res.url,
             text=page.content(),
             body=page.content().encode('utf-8'),
             status=res.status,
-            reason=res.status_text,
+            reason=status_text,
             encoding=encoding,
             cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
             headers=res.all_headers(),
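
Both engine hunks above apply the same fallback: keep Playwright's status_text when it is present, otherwise derive the reason phrase from the numeric status via the new StatusText helper. The snippet below is a minimal sketch of that pattern in isolation, not part of the commit; FakeResponse and resolve_reason are hypothetical stand-ins for Playwright's response object and the inline logic shown above, while the StatusText import relies on the export added to scrapling.engines.toolbelt in this commit.

from dataclasses import dataclass

from scrapling.engines.toolbelt import StatusText


@dataclass
class FakeResponse:
    # Hypothetical stand-in modeling only the two attributes the fallback reads.
    status: int
    status_text: str


def resolve_reason(res: FakeResponse) -> str:
    # Prefer whatever Playwright reported; fall back to the lookup table
    # only when the status text comes back empty.
    status_text = res.status_text
    if not status_text:
        status_text = StatusText.get(res.status)
    return status_text


print(resolve_reason(FakeResponse(status=200, status_text="OK")))  # -> OK
print(resolve_reason(FakeResponse(status=404, status_text="")))    # -> Not Found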
1 change: 1 addition & 0 deletions scrapling/engines/toolbelt/__init__.py
@@ -6,6 +6,7 @@
 from .custom import (
     Response,
     do_nothing,
+    StatusText,
     BaseFetcher,
     get_variable_name,
     check_type_validity,
80 changes: 79 additions & 1 deletion scrapling/engines/toolbelt/custom.py
@@ -4,8 +4,9 @@
 import inspect
 import logging
 
-from scrapling.core.utils import setup_basic_logging
+from scrapling.core.custom_types import MappingProxyType
 from scrapling.parser import Adaptor, SQLiteStorageSystem
+from scrapling.core.utils import setup_basic_logging, cache
 from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
 
@@ -67,6 +68,83 @@ def __init__(
         self.adaptor_arguments.update({'automatch_domain': automatch_domain})
 
 
+class StatusText:
+    """A class that gets the status text of a response status code.
+    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
+    """
+    _phrases = MappingProxyType({
+        100: "Continue",
+        101: "Switching Protocols",
+        102: "Processing",
+        103: "Early Hints",
+        200: "OK",
+        201: "Created",
+        202: "Accepted",
+        203: "Non-Authoritative Information",
+        204: "No Content",
+        205: "Reset Content",
+        206: "Partial Content",
+        207: "Multi-Status",
+        208: "Already Reported",
+        226: "IM Used",
+        300: "Multiple Choices",
+        301: "Moved Permanently",
+        302: "Found",
+        303: "See Other",
+        304: "Not Modified",
+        305: "Use Proxy",
+        307: "Temporary Redirect",
+        308: "Permanent Redirect",
+        400: "Bad Request",
+        401: "Unauthorized",
+        402: "Payment Required",
+        403: "Forbidden",
+        404: "Not Found",
+        405: "Method Not Allowed",
+        406: "Not Acceptable",
+        407: "Proxy Authentication Required",
+        408: "Request Timeout",
+        409: "Conflict",
+        410: "Gone",
+        411: "Length Required",
+        412: "Precondition Failed",
+        413: "Payload Too Large",
+        414: "URI Too Long",
+        415: "Unsupported Media Type",
+        416: "Range Not Satisfiable",
+        417: "Expectation Failed",
+        418: "I'm a teapot",
+        421: "Misdirected Request",
+        422: "Unprocessable Entity",
+        423: "Locked",
+        424: "Failed Dependency",
+        425: "Too Early",
+        426: "Upgrade Required",
+        428: "Precondition Required",
+        429: "Too Many Requests",
+        431: "Request Header Fields Too Large",
+        451: "Unavailable For Legal Reasons",
+        500: "Internal Server Error",
+        501: "Not Implemented",
+        502: "Bad Gateway",
+        503: "Service Unavailable",
+        504: "Gateway Timeout",
+        505: "HTTP Version Not Supported",
+        506: "Variant Also Negotiates",
+        507: "Insufficient Storage",
+        508: "Loop Detected",
+        510: "Not Extended",
+        511: "Network Authentication Required"
+    })
+
+    @classmethod
+    @cache(maxsize=128)
+    def get(cls, status_code: int) -> str:
+        """Get the phrase for a given HTTP status code."""
+        return cls._phrases.get(status_code, "Unknown Status Code")
+
+
 def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
     """This function checks if the passed engine can be used by a Fetcher-type class or not.
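
For reference, a short usage sketch of the new helper (not part of the commit), assuming StatusText is imported from scrapling.engines.toolbelt as exported above:

from scrapling.engines.toolbelt import StatusText

print(StatusText.get(200))  # "OK"
print(StatusText.get(404))  # "Not Found"
print(StatusText.get(999))  # "Unknown Status Code" -- any code missing from the table

MappingProxyType keeps the phrase table read-only, and the cache(maxsize=128) decorator presumably memoizes repeated lookups, so resolving the same status code more than once costs little beyond a single dictionary access.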
