From f3cdaa9b2e8509f11394563ea5f76d98ae51d786 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 11:05:47 +0300 Subject: [PATCH 01/14] Follow/follow_all --- scrapypuppeteer/request.py | 4 ++- scrapypuppeteer/response.py | 54 +++++++++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/scrapypuppeteer/request.py b/scrapypuppeteer/request.py index 1f89453..491882b 100644 --- a/scrapypuppeteer/request.py +++ b/scrapypuppeteer/request.py @@ -89,7 +89,9 @@ def __init__( elif isinstance(action, GoTo): url = action.url elif not isinstance(action, PuppeteerServiceAction): - raise ValueError("Undefined browser action") + raise TypeError( + f"Undefined browser action: `{type(action)}`. `Expected PuppeteerServiceAction`" + ) if url is None: raise ValueError( "Request is not a goto-request and does not follow a response" diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 51dee13..a4cd8ec 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -1,8 +1,11 @@ +from typing import Tuple, Union, Generator import warnings -from typing import Tuple, Union from scrapy.exceptions import ScrapyDeprecationWarning -from scrapy.http import TextResponse +from scrapy.http import HtmlResponse, TextResponse +from scrapy.http.response.text import _url_from_selector +from scrapy.link import Link +import parsel from scrapypuppeteer import PuppeteerRequest from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction @@ -38,7 +41,7 @@ def __init__( def follow( self, - action: Union[str, PuppeteerServiceAction], + action: Union[str, parsel.Selector, Link, PuppeteerServiceAction], close_page=True, accumulate_meta: bool = False, **kwargs, @@ -55,6 +58,10 @@ def follow( page_id = None if self.puppeteer_request.close_page else self.page_id if isinstance(action, str): action = self.urljoin(action) + elif isinstance(action, parsel.Selector): + action = _url_from_selector(action) + elif isinstance(action, Link): + action = self.urljoin(action.url) elif isinstance(action, GoTo): action.url = self.urljoin(action.url) else: @@ -70,14 +77,51 @@ def follow( **kwargs, ) + def follow_all( + self, + actions=None, + close_page: bool = True, + accumulate_meta: bool = False, + **kwargs, + ) -> Generator[PuppeteerRequest, None, None]: + arguments = [ + x + for x in (actions, kwargs.get("css"), kwargs.get("xpath")) + if x is not None + ] + if len(arguments) != 1: + raise ValueError( + "Please supply exactly one of the following arguments: actions, css, xpath" + ) + if not actions: + if kwargs.get("css"): + actions = self.css(kwargs["css"]) + if kwargs.get("xpath"): + actions = self.xpath(kwargs["xpath"]) + + if isinstance(actions, parsel.SelectorList): + selectors = actions + actions = [] + for sel in selectors: + actions.append(_url_from_selector(sel)) + + return ( + self.follow( + action, close_page=close_page, accumulate_meta=accumulate_meta, **kwargs + ) + for action in actions + ) -class PuppeteerHtmlResponse(PuppeteerResponse): + +class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse): """ scrapy.TextResponse capturing state of a page in browser. Additionally, exposes received html and cookies via corresponding attributes. """ - attributes: Tuple[str, ...] = PuppeteerResponse.attributes + ("html", "cookies") + attributes: Tuple[str, ...] = tuple( + set(PuppeteerResponse.attributes + HtmlResponse.attributes) + ) + ("html", "cookies") """ A tuple of :class:`str` objects containing the name of all public attributes of the class that are also keyword parameters of the From 64ee649f8f7624d29110a148c2149a2d73978030 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 11:21:47 +0300 Subject: [PATCH 02/14] Documentation --- README.md | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 3984e9c..63f4451 100644 --- a/README.md +++ b/README.md @@ -23,29 +23,18 @@ DOWNLOADER_MIDDLEWARES = { 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 } -PUPPETEER_SERVICE_URL = 'http://localhost:3000' +PUPPETEER_SERVICE_URL = "http://localhost:3000" # Not necessary in other execution methods # To change the execution method, you must add the corresponding setting: EXECUTION_METHOD = "Puppeteer" ``` Available methods: `Puppeteer`, `Pyppeteer`, `Playwright` -The `Pyppeteer` and `Playwright` methods do not require a running service. They use the pyppeteer and playwright libraries for Python to interact with the browser. Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods. +`Pyppeteer` and `Playwright` methods do not require a running service. +They use the pyppeteer and playwright libraries for Python to interact with the browser. +Actions such as `CustomJsAction`, `RecaptchaSolver`, and `Har` are not available when using these methods. -To use the `Pyppeteer` or `Playwright` methods you need to install Chromium. - - -## Configuration - -You should have [scrapy-puppeteer-service](https://github.com/ispras/scrapy-puppeteer-service) started. -Then add its URL to `settings.py` and enable puppeteer downloader middleware: -```python -DOWNLOADER_MIDDLEWARES = { - 'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042 -} - -PUPPETEER_SERVICE_URL = 'http://localhost:3000' -``` +To use `Pyppeteer` or `Playwright` methods you need to install Chromium. ## Basic usage @@ -129,6 +118,8 @@ class MySpider(scrapy.Spider): ) ``` +You may also use `follow_all` method to continue interacting. + On your first request service will create new incognito browser context and new page in it. Their ids will be in returned in response object as `context_id` and `page_id` attributes. Following such response means passing context and page ids to next request. From 6fe9ce2fea9c7561d8ae36bd69ca5c1cb025ef48 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 11:22:11 +0300 Subject: [PATCH 03/14] Formatter and linter --- scrapypuppeteer/response.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index a4cd8ec..71d3edf 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -1,11 +1,11 @@ -from typing import Tuple, Union, Generator import warnings +from typing import Generator, Tuple, Union +import parsel from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http import HtmlResponse, TextResponse from scrapy.http.response.text import _url_from_selector from scrapy.link import Link -import parsel from scrapypuppeteer import PuppeteerRequest from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction From 199d0f4a2f037e5dca9a2bb8cebaa52650ed1115 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 12:49:01 +0300 Subject: [PATCH 04/14] Priority fix --- scrapypuppeteer/response.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 71d3edf..6026e49 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -59,7 +59,7 @@ def follow( if isinstance(action, str): action = self.urljoin(action) elif isinstance(action, parsel.Selector): - action = _url_from_selector(action) + action = self.urljoin(_url_from_selector(action)) elif isinstance(action, Link): action = self.urljoin(action.url) elif isinstance(action, GoTo): @@ -95,9 +95,9 @@ def follow_all( ) if not actions: if kwargs.get("css"): - actions = self.css(kwargs["css"]) + actions = self.css(kwargs.pop("css")) if kwargs.get("xpath"): - actions = self.xpath(kwargs["xpath"]) + actions = self.xpath(kwargs.pop("xpath")) if isinstance(actions, parsel.SelectorList): selectors = actions @@ -107,9 +107,13 @@ def follow_all( return ( self.follow( - action, close_page=close_page, accumulate_meta=accumulate_meta, **kwargs + action, + close_page=(close_page if ind == len(actions) - 1 else False), + accumulate_meta=accumulate_meta, + priority=(-1 if ind == len(actions) - 1 else kwargs.pop("priority", 0)), + **kwargs, ) - for action in actions + for ind, action in enumerate(actions) ) From cb2f0a8324ad235f3510b3348127400aae6270b9 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 13:00:35 +0300 Subject: [PATCH 05/14] Comments --- scrapypuppeteer/response.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 6026e49..de1933f 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -108,9 +108,13 @@ def follow_all( return ( self.follow( action, - close_page=(close_page if ind == len(actions) - 1 else False), + close_page=( + close_page if ind == len(actions) - 1 else False + ), # close_page on last request accumulate_meta=accumulate_meta, - priority=(-1 if ind == len(actions) - 1 else kwargs.pop("priority", 0)), + priority=( + -1 if ind == len(actions) - 1 else kwargs.pop("priority", 0) + ), # to execute close_page request "definitely" last **kwargs, ) for ind, action in enumerate(actions) From a5a70b9521224940820b118f3eb9c23163ee24f5 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 15:11:55 +0300 Subject: [PATCH 06/14] Example and fix --- examples/spiders/follow.py | 49 ++++++++++++++++++++++++++++++++++ scrapypuppeteer/response.py | 52 ++++++++++++++++++++----------------- 2 files changed, 77 insertions(+), 24 deletions(-) create mode 100644 examples/spiders/follow.py diff --git a/examples/spiders/follow.py b/examples/spiders/follow.py new file mode 100644 index 0000000..3986e99 --- /dev/null +++ b/examples/spiders/follow.py @@ -0,0 +1,49 @@ +from scrapy import Spider +from scrapy.http import Response + +from scrapypuppeteer import GoTo, PuppeteerRequest, PuppeteerResponse + + +class FollowSpider(Spider): + name = "follow" + + start_urls = ["http://quotes.toscrape.com/page/1/"] + + def start_requests(self): + for url in self.start_urls: + yield PuppeteerRequest( + GoTo(url), + close_page=False, + callback=self.goto_about, + errback=self.errback, + ) + + def goto_about(self, response: PuppeteerResponse): + # yield response.follow( + # response.css("div.quote span a")[0], + # callback=self.parse, + # errback=self.errback, + # close_page=False, + # ) + + # Or: + yield from response.follow_all( + response.css("div.quote span a"), + callback=self.parse, + errback=self.errback, + close_page=True, + ) + + # Or: + # yield from response.follow_all( + # css="div.quote span a", + # callback=self.parse, + # errback=self.errback, + # close_page=False, + # ) + + def parse(self, response: Response, **kwargs): + self.log(response.url.split("/")[-1]) + + def errback(self, failure): + self.log(failure) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index de1933f..96f275d 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -82,44 +82,48 @@ def follow_all( actions=None, close_page: bool = True, accumulate_meta: bool = False, + css=None, + xpath=None, **kwargs, ) -> Generator[PuppeteerRequest, None, None]: - arguments = [ - x - for x in (actions, kwargs.get("css"), kwargs.get("xpath")) - if x is not None - ] + """ + Execute actions in the same context but in other browser pages. + Only one of `actions`, `css`, or `xpath` must be specified.` + Note that original page from which the method was called lasts unaffected. + + :param actions: iterable of PuppeteerActions or selectors + :param css: selector + :param xpath: selector + :return: Iterable[PuppeteerRequest] + """ + + # Probably, we should ban any PuppeteerAction in `actions` except GoTo + arguments = [x for x in (actions, css, xpath) if x is not None] if len(arguments) != 1: raise ValueError( "Please supply exactly one of the following arguments: actions, css, xpath" ) if not actions: - if kwargs.get("css"): - actions = self.css(kwargs.pop("css")) - if kwargs.get("xpath"): - actions = self.xpath(kwargs.pop("xpath")) - - if isinstance(actions, parsel.SelectorList): - selectors = actions - actions = [] - for sel in selectors: - actions.append(_url_from_selector(sel)) - - return ( + if css: + actions = self.css(css) + if xpath: + actions = self.xpath(xpath) + + page_id = self.page_id + self.page_id = None + + yield from ( self.follow( action, - close_page=( - close_page if ind == len(actions) - 1 else False - ), # close_page on last request + close_page=close_page, accumulate_meta=accumulate_meta, - priority=( - -1 if ind == len(actions) - 1 else kwargs.pop("priority", 0) - ), # to execute close_page request "definitely" last **kwargs, ) - for ind, action in enumerate(actions) + for action in actions ) + self.page_id = page_id + class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse): """ From 56e8f0f547989b2d57eebc7934fdd215769f0305 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 15:17:14 +0300 Subject: [PATCH 07/14] Docstring --- scrapypuppeteer/response.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 96f275d..cc9dc19 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -92,6 +92,8 @@ def follow_all( Note that original page from which the method was called lasts unaffected. :param actions: iterable of PuppeteerActions or selectors + :param close_page: whether to close page after request completion + :param accumulate_meta: whether to accumulate meta from response :param css: selector :param xpath: selector :return: Iterable[PuppeteerRequest] From 5a820bca3109fa807771492cf8f5b73458b316f8 Mon Sep 17 00:00:00 2001 From: matthew Date: Thu, 12 Sep 2024 15:18:46 +0300 Subject: [PATCH 08/14] Docstring --- scrapypuppeteer/response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index cc9dc19..cd95006 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -111,7 +111,7 @@ def follow_all( if xpath: actions = self.xpath(xpath) - page_id = self.page_id + page_id = self.page_id # Substitution of page_id in order to create new page self.page_id = None yield from ( From dd6d02866129f35a649d2565ce055c978138ff4a Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 14 Oct 2024 16:31:07 +0300 Subject: [PATCH 09/14] ban any action except GoTo --- scrapypuppeteer/response.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index cd95006..e63786b 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -99,7 +99,6 @@ def follow_all( :return: Iterable[PuppeteerRequest] """ - # Probably, we should ban any PuppeteerAction in `actions` except GoTo arguments = [x for x in (actions, css, xpath) if x is not None] if len(arguments) != 1: raise ValueError( @@ -110,6 +109,11 @@ def follow_all( actions = self.css(css) if xpath: actions = self.xpath(xpath) + else: + # Ban any PuppeteerAction except GoTo + for action in actions: + if not isinstance(action, GoTo): + raise TypeError(f"Expected GoTo, got {type(action)}") page_id = self.page_id # Substitution of page_id in order to create new page self.page_id = None From abf780c1e44f56819be997c6fa29672a0ee902af Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 14 Oct 2024 16:34:31 +0300 Subject: [PATCH 10/14] fix page_id = None --- scrapypuppeteer/response.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index e63786b..57aee7c 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -115,21 +115,17 @@ def follow_all( if not isinstance(action, GoTo): raise TypeError(f"Expected GoTo, got {type(action)}") - page_id = self.page_id # Substitution of page_id in order to create new page - self.page_id = None - yield from ( self.follow( action, close_page=close_page, accumulate_meta=accumulate_meta, + page_id=None, # Substitution of page_id in order to create new page **kwargs, ) for action in actions ) - self.page_id = page_id - class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse): """ From 3f9f4618bc1f0945b36544035549ae49f09018cc Mon Sep 17 00:00:00 2001 From: matthew Date: Mon, 14 Oct 2024 16:45:40 +0300 Subject: [PATCH 11/14] Add Compose to except --- scrapypuppeteer/response.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 57aee7c..8b187d6 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -8,7 +8,7 @@ from scrapy.link import Link from scrapypuppeteer import PuppeteerRequest -from scrapypuppeteer.actions import GoTo, PuppeteerServiceAction +from scrapypuppeteer.actions import Compose, GoTo, PuppeteerServiceAction class PuppeteerResponse(TextResponse): @@ -110,8 +110,10 @@ def follow_all( if xpath: actions = self.xpath(xpath) else: - # Ban any PuppeteerAction except GoTo + # Ban any PuppeteerAction except GoTo and GoTo-like Compose for action in actions: + if isinstance(action, Compose): + action = action.actions[0] if not isinstance(action, GoTo): raise TypeError(f"Expected GoTo, got {type(action)}") From 07472d5994eec4e2d7a6ae36b1bdbfe617858a40 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 15 Oct 2024 11:31:30 +0300 Subject: [PATCH 12/14] Fix action validation --- scrapypuppeteer/response.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index 8b187d6..ed26f20 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -112,10 +112,11 @@ def follow_all( else: # Ban any PuppeteerAction except GoTo and GoTo-like Compose for action in actions: - if isinstance(action, Compose): - action = action.actions[0] - if not isinstance(action, GoTo): - raise TypeError(f"Expected GoTo, got {type(action)}") + if isinstance(action, PuppeteerServiceAction): + if isinstance(action, Compose): + action = action.actions[0] + if not isinstance(action, GoTo): + raise TypeError(f"Expected GoTo, got {type(action)}") yield from ( self.follow( From 93161fd1503fcf8c7f9bddf2d8bb42e1462261f4 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 15 Oct 2024 12:01:20 +0300 Subject: [PATCH 13/14] Fix action validation --- scrapypuppeteer/response.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index ed26f20..c88b4cd 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -118,16 +118,17 @@ def follow_all( if not isinstance(action, GoTo): raise TypeError(f"Expected GoTo, got {type(action)}") - yield from ( - self.follow( + page_id = self.page_id + for action in actions: + self.page_id = None # Substitution of page_id in order to create new page + next_request = self.follow( action, close_page=close_page, accumulate_meta=accumulate_meta, - page_id=None, # Substitution of page_id in order to create new page **kwargs, ) - for action in actions - ) + self.page_id = page_id + yield next_request class PuppeteerHtmlResponse(PuppeteerResponse, HtmlResponse): From 055216cca6d115028d7c32d3e52957c56b1ae912 Mon Sep 17 00:00:00 2001 From: matthew Date: Tue, 15 Oct 2024 12:22:03 +0300 Subject: [PATCH 14/14] Response's state is saved now --- scrapypuppeteer/response.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/scrapypuppeteer/response.py b/scrapypuppeteer/response.py index c88b4cd..20e3843 100644 --- a/scrapypuppeteer/response.py +++ b/scrapypuppeteer/response.py @@ -121,13 +121,15 @@ def follow_all( page_id = self.page_id for action in actions: self.page_id = None # Substitution of page_id in order to create new page - next_request = self.follow( - action, - close_page=close_page, - accumulate_meta=accumulate_meta, - **kwargs, - ) - self.page_id = page_id + try: + next_request = self.follow( + action, + close_page=close_page, + accumulate_meta=accumulate_meta, + **kwargs, + ) + finally: # To save the original state of response + self.page_id = page_id yield next_request