From 98704eb54b90288716d84129cc9c8bc4c1f49c1f Mon Sep 17 00:00:00 2001
From: Matvey
Date: Wed, 9 Oct 2024 17:46:38 +0300
Subject: [PATCH] Compose action (#40)

* Added `Compose` action in actions.py. Some formatting.

* Fix closing context

* Update for pyppeteer and playwright.

* Doc and example

* Typing issue

* Fixed stand-alone `Compose`.

* Changes?

* Fixed example spider

* Playwright and Pyppeteer fixes

* Better example

* Standard execution method

* increment version + minor fix

---------

Co-authored-by: Max Varlamov
---
 README.md                                    |  1 +
 examples/spiders/compose.py                  | 57 +++++++++++++++++++
 scrapypuppeteer/actions.py                   | 44 +++++++++++++-
 .../playwright_browser_manager.py            | 10 ++++
 .../pyppeteer_browser_manager.py             | 14 +++++
 .../service_browser_manager.py               | 33 ++++++++---
 scrapypuppeteer/request.py                   |  7 ++-
 setup.py                                     |  2 +-
 8 files changed, 155 insertions(+), 13 deletions(-)
 create mode 100644 examples/spiders/compose.py

diff --git a/README.md b/README.md
index 3984e9c..53769d4 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,7 @@ Here is the list of available actions:
 - `GoForward(options)` - navigate forward in history
 - `GoBack(options)` - navigate back in history
 - `Click(selector, click_options, wait_options)` - click on element on page
+- `Compose(*actions)` - composition of several puppeteer actions
 - `Scroll(selector, wait_options)` - scroll page
 - `Screenshot(options)` - take screenshot
 - `Har()` - to get the HAR file, pass the `har_recording=True` argument to `PuppeteerRequest` at the start of execution.
diff --git a/examples/spiders/compose.py b/examples/spiders/compose.py
new file mode 100644
index 0000000..b0af7ad
--- /dev/null
+++ b/examples/spiders/compose.py
@@ -0,0 +1,57 @@
+from logging import ERROR
+
+import scrapy
+from scrapy.utils.log import failure_to_exc_info
+from twisted.python.failure import Failure
+
+from scrapypuppeteer import (
+    PuppeteerRequest,
+    PuppeteerResponse,
+    PuppeteerScreenshotResponse,
+)
+from scrapypuppeteer.actions import Click, Compose, GoTo, Screenshot, Scroll
+
+
+class ComposeSpider(scrapy.Spider):
+    name = "compose"
+
+    custom_settings = {
+        "DOWNLOADER_MIDDLEWARES": {
+            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
+        },
+    }
+
+    def start_requests(self):
+        goto = GoTo("https://pptr.dev")
+        click_1 = Click(
+            "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)"
+        )
+        click_2 = Click(
+            "#__docusaurus_skipToContent_fallback > div > div > aside > div > "
+            "div > nav > ul > li:nth-child(1) > ul > li:nth-child(3) > a"
+        )
+        click = Compose(click_1, click_2)
+        scroll = Scroll()
+        screenshot = Screenshot(options={"full_page": True, "type": "jpeg"})
+
+        compose_action = Compose(
+            goto,
+            click,
+            scroll,
+            screenshot,
+        )
+
+        yield PuppeteerRequest(
+            compose_action,
+            callback=self.parse,
+            errback=self.errback,
+            close_page=True,
+        )
+
+    def parse(self, response: PuppeteerResponse):
+        assert isinstance(response, PuppeteerScreenshotResponse)
+        self.log("Spider worked fine!")
+
+    def errback(self, failure: Failure):
+        print(failure)
+        self.log(failure_to_exc_info(failure), level=ERROR)
diff --git a/scrapypuppeteer/actions.py b/scrapypuppeteer/actions.py
index 4e56588..b871b1d 100644
--- a/scrapypuppeteer/actions.py
+++ b/scrapypuppeteer/actions.py
@@ -1,13 +1,14 @@
 from abc import ABC, abstractmethod
+from typing import List, Tuple
 
 
 class PuppeteerServiceAction(ABC):
+    content_type = "application/json"
+
     @property
     @abstractmethod
     def endpoint(self): ...
 
-    content_type = "application/json"
-
     @abstractmethod
     def payload(self): ...
 
@@ -291,7 +292,8 @@ class RecaptchaSolver(PuppeteerServiceAction):
     Response for this action is PuppeteerJsonResponse.
     You can get the return values via self.data['recaptcha_data'].
 
-    You can visit https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
+    You can visit
+    https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-recaptcha#result-object
     to get information about return value.
     """
 
@@ -334,3 +336,39 @@ def __init__(self, js_action: str):
 
     def payload(self):
         return self.js_action
+
+
+class Compose(PuppeteerServiceAction):
+    """
+    Compose several scrapy-puppeteer actions into one action and send it to the service.
+
+    Response for this action is the PuppeteerResponse to the last action in the sequence.
+
+    """
+
+    endpoint = "compose"
+
+    def __init__(self, *actions: PuppeteerServiceAction):
+        self.actions = self.__flatten(actions)
+
+    @staticmethod
+    def __flatten(
+        actions: Tuple[PuppeteerServiceAction, ...],
+    ) -> List[PuppeteerServiceAction]:
+        flatten_actions = []
+        for action in actions:
+            if isinstance(action, Compose):
+                flatten_actions.extend(action.actions)
+            else:
+                flatten_actions.append(action)
+        if not flatten_actions:
+            raise ValueError("No actions provided in `Compose`.")
+        return flatten_actions
+
+    def payload(self):
+        return {
+            "actions": [
+                {"endpoint": action.endpoint, "body": action.payload()}
+                for action in self.actions
+            ]
+        }
diff --git a/scrapypuppeteer/browser_managers/playwright_browser_manager.py b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
index 1e1efc3..1228e29 100644
--- a/scrapypuppeteer/browser_managers/playwright_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/playwright_browser_manager.py
@@ -63,6 +63,7 @@ def __init__(self):
         self.action_map = {
             "goto": self.goto,
             "click": self.click,
+            "compose": self.compose,
             "back": self.go_back,
             "forward": self.go_forward,
             "scroll": self.scroll,
@@ -357,6 +358,15 @@ async def async_fill_form():
 
         return syncer.sync(async_fill_form())
 
+    def compose(self, request: PuppeteerRequest):
+        _, context_id, page_id = self.get_page_from_request(request)
+        request.page_id = page_id
+        request.context_id = context_id
+
+        for action in request.action.actions:
+            response = self.action_map[action.endpoint](request.replace(action=action))
+        return response.replace(puppeteer_request=request)
+
     def action(self, request: PuppeteerRequest):
         raise ValueError("CustomJsAction is not available in local mode")
diff --git a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
index bc465f5..6998e0c 100644
--- a/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/pyppeteer_browser_manager.py
@@ -59,6 +59,7 @@ def __init__(self):
         self.action_map = {
             "goto": self.goto,
             "click": self.click,
+            "compose": self.compose,
             "back": self.go_back,
             "forward": self.go_forward,
             "scroll": self.scroll,
@@ -316,6 +317,19 @@ async def async_fill_form():
 
         return syncer.sync(async_fill_form())
 
+    def compose(self, request: PuppeteerRequest):
+        context_id, page_id = syncer.sync(
+            self.context_manager.check_context_and_page(
+                request.context_id, request.page_id
+            )
+        )
+        request.page_id = page_id
+        request.context_id = context_id
+
+        for action in request.action.actions:
+            response = self.action_map[action.endpoint](request.replace(action=action))
+        return response.replace(puppeteer_request=request)
+
     def action(self, request: PuppeteerRequest):
         raise ValueError("CustomJsAction is not available in local mode")
diff --git a/scrapypuppeteer/browser_managers/service_browser_manager.py b/scrapypuppeteer/browser_managers/service_browser_manager.py
index 7829c72..f016f14 100644
--- a/scrapypuppeteer/browser_managers/service_browser_manager.py
+++ b/scrapypuppeteer/browser_managers/service_browser_manager.py
@@ -10,6 +10,7 @@
 
 from scrapypuppeteer.actions import (
     Click,
+    Compose,
     FillForm,
     GoBack,
     GoForward,
@@ -98,9 +99,7 @@ def _encode_service_params(request):
     def _serialize_body(self, action, request):
         payload = action.payload()
         if action.content_type == "application/json":
-            if isinstance(payload, dict):
-                # disallow null values in top-level request parameters
-                payload = {k: v for k, v in payload.items() if v is not None}
+            payload = self.__clean_payload(payload)
             proxy = request.meta.get("proxy")
             if proxy:
                 payload["proxy"] = proxy
@@ -119,6 +118,18 @@ def _serialize_body(self, action, request):
             return json.dumps(payload)
         return str(payload)
 
+    def __clean_payload(self, payload):
+        """
+        disallow null values in request parameters
+        """
+        if isinstance(payload, dict):
+            payload = {
+                k: self.__clean_payload(v) for k, v in payload.items() if v is not None
+            }
+        elif isinstance(payload, list):
+            payload = [self.__clean_payload(v) for v in payload if v is not None]
+        return payload
+
     def close_used_contexts(self, spider):
         contexts = list(self.used_contexts.pop(id(spider), set()))
         if contexts:
@@ -168,7 +179,7 @@ def process_response(self, middleware, request, response, spider):
             )
             context_id = response_data.get("contextId")
             if context_id:
-                middleware.used_contexts[id(spider)].add(context_id)
+                self.used_contexts[id(spider)].add(context_id)
             return response
 
         response_cls = self._get_response_class(puppeteer_request.action)
@@ -183,7 +194,13 @@ def process_response(self, middleware, request, response, spider):
         )
 
     def _form_response(
-        self, response_cls, response_data, url, request, puppeteer_request, spider
+        self,
+        response_cls,
+        response_data,
+        url,
+        request,
+        puppeteer_request,
+        spider,
     ):
         context_id = response_data.pop("contextId", puppeteer_request.context_id)
         page_id = response_data.pop("pageId", puppeteer_request.page_id)
@@ -198,8 +215,7 @@ def _form_response(
             **response_data,
         )
 
-    @staticmethod
-    def _get_response_class(request_action):
+    def _get_response_class(self, request_action):
         if isinstance(
             request_action, (GoTo, GoForward, GoBack, Click, Scroll, FillForm)
         ):
@@ -210,4 +226,7 @@ def _get_response_class(request_action):
             return PuppeteerHarResponse
         if isinstance(request_action, RecaptchaSolver):
             return PuppeteerRecaptchaSolverResponse
+        if isinstance(request_action, Compose):
+            # Response class is the last action's response class
+            return self._get_response_class(request_action.actions[-1])
         return PuppeteerJsonResponse
diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py
index 1f89453..b64f69d 100644
--- a/scrapypuppeteer/request.py
+++ b/scrapypuppeteer/request.py
@@ -3,7 +3,7 @@
 
 from scrapy.http import Headers, Request
 
-from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction
+from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction
 
 
 class ActionRequest(Request):
@@ -88,11 +88,14 @@ def __init__(
             )
         elif isinstance(action, GoTo):
             url = action.url
+        elif isinstance(action, Compose):
+            if isinstance(action.actions[0], GoTo):
+                url = action.actions[0].url
         elif not isinstance(action, PuppeteerServiceAction):
             raise ValueError("Undefined browser action")
         if url is None:
             raise ValueError(
-                "Request is not a goto-request and does not follow a response"
+                "Request is not a goto-containing request and does not follow a response"
             )
         super().__init__(url, action, **kwargs)
         self.context_id = context_id
diff --git a/setup.py b/setup.py
index 638ce36..b9b7750 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def read_long_description(file_path):
 
 setup(
     name="scrapy-puppeteer-client",
-    version="0.3.7",
+    version="0.3.8",
     description="A library to use Puppeteer-managed browser in Scrapy spiders",
     long_description=read_long_description("README.md"),
     long_description_content_type="text/markdown",
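
A minimal usage sketch of the new `Compose` action, based on the `Compose` class added to `scrapypuppeteer/actions.py` in this patch; the URL and selectors below are placeholders, not part of the change:

```python
from scrapypuppeteer.actions import Click, Compose, GoTo

# Nested Compose instances are flattened on construction,
# so wrapping two clicks in an inner Compose is equivalent to
# passing them directly.
composed = Compose(
    GoTo("https://example.com"),                  # placeholder URL
    Compose(Click("#first"), Click("#second")),   # placeholder selectors
)
print([action.endpoint for action in composed.actions])  # ['goto', 'click', 'click']

# payload() bundles each sub-action's endpoint and body for the
# service's compose endpoint; the response corresponds to the last
# action in the sequence.
print(composed.payload())
```

In a spider, a composed action is passed to `PuppeteerRequest` like any single action; see `examples/spiders/compose.py` above for a complete example.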