From c1ba9c0c44d41f571a007aff41a2f6ea6b3e24e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Ruiz?= Date: Mon, 9 Sep 2024 15:15:24 +0200 Subject: [PATCH] Add LocationSessionConfig (#215) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves Co-authored-by: Andrey Rakhmatullin --- docs/usage/session.rst | 9 ++ scrapy_zyte_api/__init__.py | 1 + scrapy_zyte_api/_session.py | 41 ++++++ tests/test_sessions.py | 255 ++++++++++++++++++++++++++++++++++++ 4 files changed, 306 insertions(+) diff --git a/docs/usage/session.rst b/docs/usage/session.rst index 09a7d792..8e3aad61 100644 --- a/docs/usage/session.rst +++ b/docs/usage/session.rst @@ -338,6 +338,15 @@ To define a different session config for a given URL pattern, install .. autofunction:: scrapy_zyte_api.session_config +If you only need to override the :meth:`SessionConfig.check +` or :meth:`SessionConfig.params +` methods for scenarios involving a +location, you may subclass :class:`~scrapy_zyte_api.LocationSessionConfig` +instead: + +.. autoclass:: scrapy_zyte_api.LocationSessionConfig + :members: location_check, location_params + If in a session config implementation or in any other Scrapy component you need to tell whether a request is a :ref:`session initialization request ` or not, use :func:`~scrapy_zyte_api.is_session_init_request`: diff --git a/scrapy_zyte_api/__init__.py b/scrapy_zyte_api/__init__.py index de1ff8e1..88bec2fb 100644 --- a/scrapy_zyte_api/__init__.py +++ b/scrapy_zyte_api/__init__.py @@ -17,6 +17,7 @@ ) from ._session import SESSION_DEFAULT_RETRY_POLICY as _SESSION_DEFAULT_RETRY_POLICY from ._session import ( + LocationSessionConfig, ScrapyZyteAPISessionDownloaderMiddleware, SessionConfig, is_session_init_request, diff --git a/scrapy_zyte_api/_session.py b/scrapy_zyte_api/_session.py index cac137c5..eb674492 100644 --- a/scrapy_zyte_api/_session.py +++ b/scrapy_zyte_api/_session.py @@ -348,6 +348,8 @@ def params(self, request: Request) -> Dict[str, Any]: The returned parameters do not need to include :http:`request:url`. If missing, it is picked from the request :ref:`triggering a session initialization request `. + + .. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig` """ if location := self.location(request): return { @@ -372,6 +374,8 @@ def check(self, response: Response, request: Request) -> bool: If you need to tell whether *request* is a :ref:`session initialization request ` or not, use :func:`~scrapy_zyte_api.is_session_init_request`. + + .. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig` """ if self._checker: return self._checker.check(response, request) @@ -966,3 +970,40 @@ async def process_exception( spider=spider, reason=reason, ) + + +class LocationSessionConfig(SessionConfig): + """:class:`~scrapy_zyte_api.SessionConfig` subclass to minimize boilerplate + when implementing location-specific session configs, i.e. session configs + where the default values should be used unless a location is set. + + Provides counterparts to some :class:`~scrapy_zyte_api.SessionConfig` + methods that are only called when a location is set, and get that location + as a parameter. + """ + + def params(self, request: Request) -> Dict[str, Any]: + if not (location := self.location(request)): + return super().params(request) + return self.location_params(request, location) + + def check(self, response: Response, request: Request) -> bool: + if not (location := self.location(request)): + return super().check(response, request) + return self.location_check(response, request, location) + + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + """Like :class:`SessionConfig.params + `, but it is only called when a + location is set, and gets that *location* as a parameter.""" + return super().params(request) + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + """Like :class:`SessionConfig.check + `, but it is only called when a + location is set, and gets that *location* as a parameter.""" + return super().check(response, request) diff --git a/tests/test_sessions.py b/tests/test_sessions.py index 282034e6..cc249d3c 100644 --- a/tests/test_sessions.py +++ b/tests/test_sessions.py @@ -17,6 +17,7 @@ from scrapy_zyte_api import ( SESSION_AGGRESSIVE_RETRY_POLICY, SESSION_DEFAULT_RETRY_POLICY, + LocationSessionConfig, SessionConfig, is_session_init_request, session_config, @@ -2080,6 +2081,260 @@ class CustomSessionConfig(SessionConfig): pass +@ensureDeferred +async def test_location_session_config(mockserver): + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001.example", + "postal-code-10001-fail.example", + "postal-code-10001-alternative.example", + ] + ) + class CustomSessionConfig(LocationSessionConfig): + + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + assert location == {"postalCode": "10002"} + return { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + } + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + assert location == {"postalCode": "10002"} + domain = urlparse_cached(request).netloc + return "fail" not in domain + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-alternative.example": + return "postal-code-10001.example" + return domain + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + # We set a location to force the location-specific methods of the + # session config class to be called, but we set the wrong location so + # that the test would not pass were it not for our custom + # implementation which ignores the input location and instead sets the + # right one. + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001.example", + "https://postal-code-10001-alternative.example", + "https://postal-code-10001-fail.example", + ] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1, + } + + +@ensureDeferred +async def test_location_session_config_no_methods(mockserver): + """If no location_* methods are defined, LocationSessionConfig works the + same as SessionConfig.""" + pytest.importorskip("web_poet") + + @session_config( + [ + "postal-code-10001.example", + "postal-code-10001-alternative.example", + ] + ) + class CustomSessionConfig(LocationSessionConfig): + + def pool(self, request: Request) -> str: + domain = urlparse_cached(request).netloc + if domain == "postal-code-10001-alternative.example": + return "postal-code-10001.example" + return domain + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"}, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = [ + "https://postal-code-10001.example", + "https://postal-code-10001-alternative.example", + ] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2, + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + +@ensureDeferred +async def test_location_session_config_no_location(mockserver): + """If no location is configured, the methods are never called.""" + pytest.importorskip("web_poet") + + @session_config(["postal-code-10001.example", "a.example"]) + class CustomSessionConfig(LocationSessionConfig): + + def location_params( + self, request: Request, location: Dict[str, Any] + ) -> Dict[str, Any]: + assert False + + def location_check( + self, response: Response, request: Request, location: Dict[str, Any] + ) -> bool: + assert False + + settings = { + "RETRY_TIMES": 0, + "ZYTE_API_URL": mockserver.urljoin("/"), + "ZYTE_API_SESSION_ENABLED": True, + "ZYTE_API_SESSION_MAX_BAD_INITS": 1, + } + + class TestSpider(Spider): + name = "test" + start_urls = ["https://postal-code-10001.example", "https://a.example"] + + def start_requests(self): + for url in self.start_urls: + yield Request( + url, + meta={ + "zyte_api_automap": { + "actions": [ + { + "action": "setLocation", + "address": {"postalCode": "10001"}, + } + ] + }, + }, + ) + + def parse(self, response): + pass + + crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False) + await crawler.crawl() + + session_stats = { + k: v + for k, v in crawler.stats.get_stats().items() + if k.startswith("scrapy-zyte-api/sessions") + } + assert session_stats == { + "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1, + "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1, + "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1, + } + + # Clean up the session config registry, and check it, otherwise we could + # affect other tests. + + session_config_registry.__init__() # type: ignore[misc] + + @ensureDeferred async def test_session_refresh(mockserver): """If a response does not pass a session validity check, the session is