Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta committed Nov 29, 2023
2 parents a1a040d + 5b254e4 commit baf4f57
Show file tree
Hide file tree
Showing 28 changed files with 522 additions and 177 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.31
current_version = 0.0.33
commit = True
tag = True

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ jobs:
TOXENV: pylint

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python 3
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ jobs:
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

Expand Down
49 changes: 40 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class AwesomeSpider(scrapy.Spider):
meta={"playwright": True},
)

def parse(self, response):
def parse(self, response, **kwargs):
# 'response' contains the page as seen by the browser
return {"url": response.url}
```
Expand Down Expand Up @@ -138,6 +138,37 @@ PLAYWRIGHT_LAUNCH_OPTIONS = {
}
```

### `PLAYWRIGHT_CDP_URL`
Type `Optional[str]`, default `None`

The endpoint of a remote Chromium browser to connect using the
[Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/),
via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp).
If this setting is used:
* all non-persistent contexts will be created on the connected remote browser
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
* the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value different from "chromium"

```python
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
```

### `PLAYWRIGHT_CDP_KWARGS`
Type `dict[str, Any]`, default `{}`

Additional keyword arguments to be passed to
[`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp)
when using `PLAYWRIGHT_CDP_URL`. The `endpoint_url` key is always ignored,
`PLAYWRIGHT_CDP_URL` is used instead.

```python
PLAYWRIGHT_CDP_KWARGS = {
"slow_mo": 1000,
"timeout": 10 * 1000
}
```


### `PLAYWRIGHT_CONTEXTS`
Type `dict[str, dict]`, default `{}`

Expand Down Expand Up @@ -412,7 +443,7 @@ def start_requests(self):
meta={"playwright": True, "playwright_include_page": True},
)

def parse(self, response):
def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
yield scrapy.Request(
url="https://httpbin.org/headers",
Expand Down Expand Up @@ -449,7 +480,7 @@ about the give response. Only available for HTTPS requests. Could be accessed
in the callback via `response.meta['playwright_security_details']`

```python
def parse(self, response):
def parse(self, response, **kwargs):
print(response.meta["playwright_security_details"])
# {'issuer': 'DigiCert TLS RSA SHA256 2020 CA1', 'protocol': 'TLS 1.3', 'subjectName': 'www.example.org', 'validFrom': 1647216000, 'validTo': 1678838399}
```
Expand Down Expand Up @@ -597,7 +628,7 @@ you can access a context though the corresponding [`Page.context`](https://playw
attribute, and await [`close`](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-close) on it.

```python
def parse(self, response):
def parse(self, response, **kwargs):
yield scrapy.Request(
url="https://example.org",
callback=self.parse_in_new_context,
Expand Down Expand Up @@ -660,7 +691,7 @@ class ProxySpider(Spider):
def start_requests(self):
yield Request("http://httpbin.org/get", meta={"playwright": True})

def parse(self, response):
def parse(self, response, **kwargs):
print(response.text)
```

Expand Down Expand Up @@ -729,7 +760,7 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
screenshot = response.meta["playwright_page_methods"][0]
# screenshot.result contains the image's bytes
```
Expand All @@ -742,7 +773,7 @@ def start_requests(self):
meta={"playwright": True, "playwright_include_page": True},
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
screenshot = await page.screenshot(path="example.png", full_page=True)
# screenshot contains the image's bytes
Expand Down Expand Up @@ -834,7 +865,7 @@ class ClickAndSavePdfSpider(scrapy.Spider):
),
)

def parse(self, response):
def parse(self, response, **kwargs):
pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result
with open("iana.pdf", "wb") as fp:
fp.write(pdf_bytes)
Expand All @@ -861,7 +892,7 @@ class ScrollSpider(scrapy.Spider):
),
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
await page.screenshot(path="quotes.png", full_page=True)
await page.close()
Expand Down
10 changes: 10 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# scrapy-playwright changelog


### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19)

* Handle downloads as binary responses (#228)


### [v0.0.32](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.32) (2023-09-04)

* Connect to browser using CDP (#227)


### [v0.0.31](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.31) (2023-08-28)

* Do not fail when getting referer header for debug log messages (#225)
Expand Down
2 changes: 2 additions & 0 deletions examples/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.png
*.pdf
2 changes: 1 addition & 1 deletion examples/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def start_requests(self):
dont_filter=True,
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
context_name = response.meta["playwright_context"]
storage_state = await page.context.storage_state()
Expand Down
31 changes: 31 additions & 0 deletions examples/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pathlib import Path

from scrapy import Spider, Request


class DownloadSpider(Spider):
    """Example spider showing how scrapy-playwright handles file downloads.

    Requests one regular HTML page and one PDF; downloaded (non-HTML)
    responses carry a ``playwright_suggested_filename`` meta key, and their
    bytes are saved next to this script.
    """

    name = "download"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        # One ordinary page and one direct file download, both via Playwright.
        urls = (
            "https://example.org",
            "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        )
        for url in urls:
            yield Request(url=url, meta={"playwright": True})

    def parse(self, response, **kwargs):
        # Present only for download responses (e.g. the PDF), absent for HTML.
        filename = response.meta.get("playwright_suggested_filename")
        if filename:
            # Persist the downloaded bytes alongside this example script.
            (Path(__file__).parent / filename).write_bytes(response.body)
        yield {
            "url": response.url,
            "response_cls": response.__class__.__name__,
            "first_bytes": response.body[:60],
            "filename": filename,
        }
2 changes: 1 addition & 1 deletion examples/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@ async def handle_dialog(self, dialog: Dialog) -> None:
async def handle_response(self, response: PlaywrightResponse) -> None:
self.logger.info(f"Received response with URL {response.url}")

def parse(self, response):
def parse(self, response, **kwargs):
return {"url": response.url}
2 changes: 1 addition & 1 deletion examples/exception_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,6 @@ def start_requests(self):
meta={"playwright": True},
)

def parse(self, response):
def parse(self, response, **kwargs):
logging.info("Received response for %s", response.url)
yield {"url": response.url}
2 changes: 1 addition & 1 deletion examples/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ def start_requests(self):
cookies={"foo": "bar"},
)

def parse(self, response):
def parse(self, response, **kwargs):
headers = json.loads(response.css("pre::text").get())["headers"]
yield {"url": response.url, "headers": headers}
2 changes: 1 addition & 1 deletion examples/init_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
json_str = response.css("pre::text").get()
print(json_str)
return {"data": json.loads(json_str)}
2 changes: 1 addition & 1 deletion examples/max_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def start_requests(self):
meta={"playwright": True, "playwright_context": "b"},
)

def parse(self, response):
def parse(self, response, **kwargs):
return {"url": response.url}

async def errback(self, failure):
Expand Down
2 changes: 1 addition & 1 deletion examples/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
yield {"url": response.url}
2 changes: 1 addition & 1 deletion examples/scroll.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
return {"url": response.url, "count": len(response.css("div.quote"))}
2 changes: 1 addition & 1 deletion examples/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def start_requests(self):
},
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
storage_state = await page.context.storage_state()
await page.close()
Expand Down
1 change: 1 addition & 0 deletions pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ disable=
duplicate-code,
import-outside-toplevel,
protected-access,
too-many-public-methods,
unnecessary-dunder-call,


Expand Down
2 changes: 1 addition & 1 deletion scrapy_playwright/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.31"
__version__ = "0.0.33"
Loading

0 comments on commit baf4f57

Please sign in to comment.