Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
Browse files Browse the repository at this point in the history
  • Loading branch information
elacuesta committed Nov 29, 2023
2 parents a1a040d + 5b254e4 commit baf4f57
Show file tree
Hide file tree
Showing 28 changed files with 522 additions and 177 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.31
current_version = 0.0.33
commit = True
tag = True

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ jobs:
TOXENV: pylint

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python 3
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ jobs:
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

Expand Down
49 changes: 40 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class AwesomeSpider(scrapy.Spider):
meta={"playwright": True},
)

def parse(self, response):
def parse(self, response, **kwargs):
# 'response' contains the page as seen by the browser
return {"url": response.url}
```
Expand Down Expand Up @@ -138,6 +138,37 @@ PLAYWRIGHT_LAUNCH_OPTIONS = {
}
```

### `PLAYWRIGHT_CDP_URL`
Type `Optional[str]`, default `None`

The endpoint of a remote Chromium browser to connect using the
[Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/),
via [`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp).
If this setting is used:
* all non-persistent contexts will be created on the connected remote browser
* the `PLAYWRIGHT_LAUNCH_OPTIONS` setting is ignored
* the `PLAYWRIGHT_BROWSER_TYPE` setting must not be set to a value different from "chromium"

```python
PLAYWRIGHT_CDP_URL = "http://localhost:9222"
```

### `PLAYWRIGHT_CDP_KWARGS`
Type `dict[str, Any]`, default `{}`

Additional keyword arguments to be passed to
[`BrowserType.connect_over_cdp`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-connect-over-cdp)
when using `PLAYWRIGHT_CDP_URL`. The `endpoint_url` key is always ignored,
`PLAYWRIGHT_CDP_URL` is used instead.

```python
PLAYWRIGHT_CDP_KWARGS = {
"slow_mo": 1000,
"timeout": 10 * 1000
}
```


### `PLAYWRIGHT_CONTEXTS`
Type `dict[str, dict]`, default `{}`

Expand Down Expand Up @@ -412,7 +443,7 @@ def start_requests(self):
meta={"playwright": True, "playwright_include_page": True},
)

def parse(self, response):
def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
yield scrapy.Request(
url="https://httpbin.org/headers",
Expand Down Expand Up @@ -449,7 +480,7 @@ about the give response. Only available for HTTPS requests. Could be accessed
in the callback via `response.meta['playwright_security_details']`

```python
def parse(self, response):
def parse(self, response, **kwargs):
print(response.meta["playwright_security_details"])
# {'issuer': 'DigiCert TLS RSA SHA256 2020 CA1', 'protocol': 'TLS 1.3', 'subjectName': 'www.example.org', 'validFrom': 1647216000, 'validTo': 1678838399}
```
Expand Down Expand Up @@ -597,7 +628,7 @@ you can access a context though the corresponding [`Page.context`](https://playw
attribute, and await [`close`](https://playwright.dev/python/docs/api/class-browsercontext#browser-context-close) on it.

```python
def parse(self, response):
def parse(self, response, **kwargs):
yield scrapy.Request(
url="https://example.org",
callback=self.parse_in_new_context,
Expand Down Expand Up @@ -660,7 +691,7 @@ class ProxySpider(Spider):
def start_requests(self):
yield Request("http://httpbin.org/get", meta={"playwright": True})

def parse(self, response):
def parse(self, response, **kwargs):
print(response.text)
```

Expand Down Expand Up @@ -729,7 +760,7 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
screenshot = response.meta["playwright_page_methods"][0]
# screenshot.result contains the image's bytes
```
Expand All @@ -742,7 +773,7 @@ def start_requests(self):
meta={"playwright": True, "playwright_include_page": True},
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
screenshot = await page.screenshot(path="example.png", full_page=True)
# screenshot contains the image's bytes
Expand Down Expand Up @@ -834,7 +865,7 @@ class ClickAndSavePdfSpider(scrapy.Spider):
),
)

def parse(self, response):
def parse(self, response, **kwargs):
pdf_bytes = response.meta["playwright_page_methods"]["pdf"].result
with open("iana.pdf", "wb") as fp:
fp.write(pdf_bytes)
Expand All @@ -861,7 +892,7 @@ class ScrollSpider(scrapy.Spider):
),
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
await page.screenshot(path="quotes.png", full_page=True)
await page.close()
Expand Down
10 changes: 10 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# scrapy-playwright changelog


### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19)

* Handle downloads as binary responses (#228)


### [v0.0.32](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.32) (2023-09-04)

* Connect to browser using CDP (#227)


### [v0.0.31](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.31) (2023-08-28)

* Do not fail when getting referer header for debug log messages (#225)
Expand Down
2 changes: 2 additions & 0 deletions examples/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*.png
*.pdf
2 changes: 1 addition & 1 deletion examples/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def start_requests(self):
dont_filter=True,
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
context_name = response.meta["playwright_context"]
storage_state = await page.context.storage_state()
Expand Down
31 changes: 31 additions & 0 deletions examples/download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from pathlib import Path

from scrapy import Spider, Request


class DownloadSpider(Spider):
    """Example spider showing how scrapy-playwright handles file downloads.

    Requests one regular HTML page and one PDF; downloaded (non-HTML)
    responses carry a ``playwright_suggested_filename`` meta key, and their
    bytes are saved next to this script.
    """

    name = "download"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            # "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        # One ordinary page and one direct file download, both via Playwright.
        urls = (
            "https://example.org",
            "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
        )
        for url in urls:
            yield Request(url=url, meta={"playwright": True})

    def parse(self, response, **kwargs):
        # Present only for download responses (e.g. the PDF), absent for HTML.
        filename = response.meta.get("playwright_suggested_filename")
        if filename:
            # Persist the downloaded bytes alongside this example script.
            (Path(__file__).parent / filename).write_bytes(response.body)
        yield {
            "url": response.url,
            "response_cls": response.__class__.__name__,
            "first_bytes": response.body[:60],
            "filename": filename,
        }
2 changes: 1 addition & 1 deletion examples/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,5 @@ async def handle_dialog(self, dialog: Dialog) -> None:
async def handle_response(self, response: PlaywrightResponse) -> None:
self.logger.info(f"Received response with URL {response.url}")

def parse(self, response):
def parse(self, response, **kwargs):
return {"url": response.url}
2 changes: 1 addition & 1 deletion examples/exception_middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,6 @@ def start_requests(self):
meta={"playwright": True},
)

def parse(self, response):
def parse(self, response, **kwargs):
logging.info("Received response for %s", response.url)
yield {"url": response.url}
2 changes: 1 addition & 1 deletion examples/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ def start_requests(self):
cookies={"foo": "bar"},
)

def parse(self, response):
def parse(self, response, **kwargs):
headers = json.loads(response.css("pre::text").get())["headers"]
yield {"url": response.url, "headers": headers}
2 changes: 1 addition & 1 deletion examples/init_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
json_str = response.css("pre::text").get()
print(json_str)
return {"data": json.loads(json_str)}
2 changes: 1 addition & 1 deletion examples/max_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def start_requests(self):
meta={"playwright": True, "playwright_context": "b"},
)

def parse(self, response):
def parse(self, response, **kwargs):
return {"url": response.url}

async def errback(self, failure):
Expand Down
2 changes: 1 addition & 1 deletion examples/post.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
yield {"url": response.url}
2 changes: 1 addition & 1 deletion examples/scroll.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,5 @@ def start_requests(self):
},
)

def parse(self, response):
def parse(self, response, **kwargs):
return {"url": response.url, "count": len(response.css("div.quote"))}
2 changes: 1 addition & 1 deletion examples/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def start_requests(self):
},
)

async def parse(self, response):
async def parse(self, response, **kwargs):
page = response.meta["playwright_page"]
storage_state = await page.context.storage_state()
await page.close()
Expand Down
1 change: 1 addition & 0 deletions pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ disable=
duplicate-code,
import-outside-toplevel,
protected-access,
too-many-public-methods,
unnecessary-dunder-call,


Expand Down
2 changes: 1 addition & 1 deletion scrapy_playwright/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.31"
__version__ = "0.0.33"
Loading

0 comments on commit baf4f57

Please sign in to comment.