Skip to content

Commit

Permalink
Compose action (#40)
Browse files Browse the repository at this point in the history
* Added `Compose` action in actions.py. Some formatting.

* Fix closing context

* Update for pyppeteer and playwright.

* Doc and example

* Typing issue

* Fixed stand-alone `Compose`.

* Changes?

* Fixed example spider

* Playwright and Pyppeteer fixes

* Better example

* Standard execution method

* increment version + minor fix

---------

Co-authored-by: Max Varlamov <[email protected]>
  • Loading branch information
MatthewZMSU and mxsnq authored Oct 9, 2024
1 parent a35eaa0 commit 98704eb
Show file tree
Hide file tree
Showing 8 changed files with 155 additions and 13 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ Here is the list of available actions:
- `GoForward(options)` - navigate forward in history
- `GoBack(options)` - navigate back in history
- `Click(selector, click_options, wait_options)` - click on element on page
- `Compose(*actions)` - composition of several puppeteer actions
- `Scroll(selector, wait_options)` - scroll page
- `Screenshot(options)` - take screenshot
- `Har()` - to get the HAR file, pass the `har_recording=True` argument to `PuppeteerRequest` at the start of execution.
Expand Down
57 changes: 57 additions & 0 deletions examples/spiders/compose.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from logging import ERROR

import scrapy
from scrapy.utils.log import failure_to_exc_info
from twisted.python.failure import Failure

from scrapypuppeteer import (
PuppeteerRequest,
PuppeteerResponse,
PuppeteerScreenshotResponse,
)
from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll


class ComposeSpider(scrapy.Spider):
    """
    Example spider demonstrating the `Compose` action: several puppeteer
    actions are combined into a single request to the service, and the
    response corresponds to the last action in the sequence (Screenshot).
    """

    name = "compose"

    custom_settings = {
        "DOWNLOADER_MIDDLEWARES": {
            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
    }

    def start_requests(self):
        goto = GoTo("https://pptr.dev")
        click_1 = Click(
            "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)"
        )
        click_2 = Click(
            "#__docusaurus_skipToContent_fallback > div > div > aside > div > "
            "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a"
        )
        # Nested compositions are flattened by `Compose`, so wrapping the two
        # clicks here is equivalent to listing them directly below.
        click = Compose(click_1, click_2)
        scroll = Scroll()
        screenshot = Screenshot(options={"full_page": True, "type": "jpeg"})

        compose_action = Compose(
            goto,
            click,
            scroll,
            screenshot,
        )

        yield PuppeteerRequest(
            compose_action,
            callback=self.parse,
            errback=self.errback,
            close_page=True,
        )

    def parse(self, response: PuppeteerResponse):
        # Explicit check instead of `assert`: asserts are stripped under -O,
        # which would silently disable this validation.
        if not isinstance(response, PuppeteerScreenshotResponse):
            raise TypeError(
                f"Expected PuppeteerScreenshotResponse, got {type(response)}"
            )
        self.log("Spider worked fine!")

    def errback(self, failure: Failure):
        # Removed a stray debugging `print(failure)`; the spider log is the
        # single place failures are reported.
        self.log(failure_to_exc_info(failure), level=ERROR)
44 changes: 41 additions & 3 deletions scrapypuppeteer/actions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from abc import ABC, abstractmethod
from typing import List, Tuple


class PuppeteerServiceAction(ABC):
content_type = "application/json"

@property
@abstractmethod
def endpoint(self): ...

content_type = "application/json"

@abstractmethod
def payload(self): ...

Expand Down Expand Up @@ -291,7 +292,8 @@ class RecaptchaSolver(PuppeteerServiceAction):
Response for this action is PuppeteerJsonResponse. You can get the return values
via self.data['recaptcha_data'].
You can visit https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
You can visit
https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
to get information about return value.
"""

Expand Down Expand Up @@ -334,3 +336,39 @@ def __init__(self, js_action: str):

def payload(self):
return self.js_action


class Compose(PuppeteerServiceAction):
    """
    Combine several scrapy-puppeteer actions into one action that is sent
    to the service in a single request.

    Nested `Compose` instances are flattened, so the stored sequence only
    contains non-composite actions. The response to this action is the
    response to the last action of the sequence.

    :raises ValueError: if the flattened sequence of actions is empty.
    """

    endpoint = "compose"

    def __init__(self, *actions: PuppeteerServiceAction):
        self.actions = self.__flatten(actions)

    @staticmethod
    def __flatten(
        actions: Tuple[PuppeteerServiceAction, ...],
    ) -> List[PuppeteerServiceAction]:
        # A nested Compose already holds a flat list (flattened at its own
        # construction), so a single pass is sufficient here.
        flat: List[PuppeteerServiceAction] = []
        for item in actions:
            flat.extend(item.actions if isinstance(item, Compose) else [item])
        if not flat:
            raise ValueError("No actions provided in `Compose`.")
        return flat

    def payload(self):
        serialized_actions = [
            {"endpoint": action.endpoint, "body": action.payload()}
            for action in self.actions
        ]
        return {"actions": serialized_actions}
10 changes: 10 additions & 0 deletions scrapypuppeteer/browser_managers/playwright_browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __init__(self):
self.action_map = {
"goto": self.goto,
"click": self.click,
"compose": self.compose,
"back": self.go_back,
"forward": self.go_forward,
"scroll": self.scroll,
Expand Down Expand Up @@ -357,6 +358,15 @@ async def async_fill_form():

return syncer.sync(async_fill_form())

def compose(self, request: PuppeteerRequest):
    """
    Execute the sub-actions of a `Compose` request sequentially on the same
    page, returning the response produced by the last sub-action.
    """
    _, context_id, page_id = self.get_page_from_request(request)
    # Pin the resolved context/page so every sub-action runs on the same page.
    request.context_id = context_id
    request.page_id = page_id

    for sub_action in request.action.actions:
        handler = self.action_map[sub_action.endpoint]
        response = handler(request.replace(action=sub_action))
    # Only the last sub-action's response is kept, re-bound to the
    # original composite request.
    return response.replace(puppeteer_request=request)

def action(self, request: PuppeteerRequest):
    """Custom JS actions are only supported by the remote service, not local mode."""
    raise ValueError("CustomJsAction is not available in local mode")

Expand Down
14 changes: 14 additions & 0 deletions scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def __init__(self):
self.action_map = {
"goto": self.goto,
"click": self.click,
"compose": self.compose,
"back": self.go_back,
"forward": self.go_forward,
"scroll": self.scroll,
Expand Down Expand Up @@ -316,6 +317,19 @@ async def async_fill_form():

return syncer.sync(async_fill_form())

def compose(self, request: PuppeteerRequest):
    """
    Execute the sub-actions of a `Compose` request sequentially on the same
    page, returning the response produced by the last sub-action.
    """
    context_id, page_id = syncer.sync(
        self.context_manager.check_context_and_page(
            request.context_id, request.page_id
        )
    )
    # Pin the resolved context/page so every sub-action runs on the same page.
    request.context_id = context_id
    request.page_id = page_id

    for sub_action in request.action.actions:
        handler = self.action_map[sub_action.endpoint]
        response = handler(request.replace(action=sub_action))
    # Only the last sub-action's response is kept, re-bound to the
    # original composite request.
    return response.replace(puppeteer_request=request)

def action(self, request: PuppeteerRequest):
    """Custom JS actions are only supported by the remote service, not local mode."""
    raise ValueError("CustomJsAction is not available in local mode")

Expand Down
33 changes: 26 additions & 7 deletions scrapypuppeteer/browser_managers/service_browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from scrapypuppeteer.actions import (
Click,
Compose,
FillForm,
GoBack,
GoForward,
Expand Down Expand Up @@ -98,9 +99,7 @@ def _encode_service_params(request):
def _serialize_body(self, action, request):
payload = action.payload()
if action.content_type == "application/json":
if isinstance(payload, dict):
# disallow null values in top-level request parameters
payload = {k: v for k, v in payload.items() if v is not None}
payload = self.__clean_payload(payload)
proxy = request.meta.get("proxy")
if proxy:
payload["proxy"] = proxy
Expand All @@ -119,6 +118,18 @@ def _serialize_body(self, action, request):
return json.dumps(payload)
return str(payload)

def __clean_payload(self, payload):
    """
    Recursively drop None values from request parameters.

    Dicts lose None-valued entries, lists lose None elements; any other
    value (including nested scalars) is returned unchanged.
    """
    if isinstance(payload, list):
        return [self.__clean_payload(item) for item in payload if item is not None]
    if isinstance(payload, dict):
        return {
            key: self.__clean_payload(value)
            for key, value in payload.items()
            if value is not None
        }
    return payload

def close_used_contexts(self, spider):
contexts = list(self.used_contexts.pop(id(spider), set()))
if contexts:
Expand Down Expand Up @@ -168,7 +179,7 @@ def process_response(self, middleware, request, response, spider):
)
context_id = response_data.get("contextId")
if context_id:
middleware.used_contexts[id(spider)].add(context_id)
self.used_contexts[id(spider)].add(context_id)
return response

response_cls = self._get_response_class(puppeteer_request.action)
Expand All @@ -183,7 +194,13 @@ def process_response(self, middleware, request, response, spider):
)

def _form_response(
self, response_cls, response_data, url, request, puppeteer_request, spider
self,
response_cls,
response_data,
url,
request,
puppeteer_request,
spider,
):
context_id = response_data.pop("contextId", puppeteer_request.context_id)
page_id = response_data.pop("pageId", puppeteer_request.page_id)
Expand All @@ -198,8 +215,7 @@ def _form_response(
**response_data,
)

@staticmethod
def _get_response_class(request_action):
def _get_response_class(self, request_action):
if isinstance(
request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm)
):
Expand All @@ -210,4 +226,7 @@ def _get_response_class(request_action):
return PuppeteerHarResponse
if isinstance(request_action, RecaptchaSolver):
return PuppeteerRecaptchaSolverResponse
if isinstance(request_action, Compose):
# Response class is a last action's response class
return self._get_response_class(request_action.actions[-1])
return PuppeteerJsonResponse
7 changes: 5 additions & 2 deletions scrapypuppeteer/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from scrapy.http import Headers, Request

from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction


class ActionRequest(Request):
Expand Down Expand Up @@ -88,11 +88,14 @@ def __init__(
)
elif isinstance(action, GoTo):
url = action.url
elif isinstance(action, Compose):
if isinstance(action.actions[0], GoTo):
url = action.actions[0].url
elif not isinstance(action, PuppeteerServiceAction):
raise ValueError("Undefined browser action")
if url is None:
raise ValueError(
"Request is not a goto-request and does not follow a response"
"Request is not a goto-containing request and does not follow a response"
)
super().__init__(url, action, **kwargs)
self.context_id = context_id
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def read_long_description(file_path):

setup(
name="scrapy-puppeteer-client",
version="0.3.7",
version="0.3.8",
description="A library to use Puppeteer-managed browser in Scrapy spiders",
long_description=read_long_description("README.md"),
long_description_content_type="text/markdown",
Expand Down

0 comments on commit 98704eb

Please sign in to comment.