Skip to content

Commit

Permalink
Add LocationSessionConfig (#215)
Browse files Browse the repository at this point in the history
Co-authored-by: Adrián Chaves <[email protected]>
Co-authored-by: Andrey Rakhmatullin <[email protected]>
  • Loading branch information
3 people authored Sep 9, 2024
1 parent cf55072 commit c1ba9c0
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 0 deletions.
9 changes: 9 additions & 0 deletions docs/usage/session.rst
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,15 @@ To define a different session config for a given URL pattern, install

.. autofunction:: scrapy_zyte_api.session_config

If you only need to override the :meth:`SessionConfig.check
<scrapy_zyte_api.SessionConfig.check>` or :meth:`SessionConfig.params
<scrapy_zyte_api.SessionConfig.params>` methods for scenarios involving a
location, you may subclass :class:`~scrapy_zyte_api.LocationSessionConfig`
instead:

.. autoclass:: scrapy_zyte_api.LocationSessionConfig
    :members: location_check, location_params

If in a session config implementation or in any other Scrapy component you need
to tell whether a request is a :ref:`session initialization request
<session-init>` or not, use :func:`~scrapy_zyte_api.is_session_init_request`:
Expand Down
1 change: 1 addition & 0 deletions scrapy_zyte_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from ._session import SESSION_DEFAULT_RETRY_POLICY as _SESSION_DEFAULT_RETRY_POLICY
from ._session import (
LocationSessionConfig,
ScrapyZyteAPISessionDownloaderMiddleware,
SessionConfig,
is_session_init_request,
Expand Down
41 changes: 41 additions & 0 deletions scrapy_zyte_api/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,8 @@ def params(self, request: Request) -> Dict[str, Any]:
The returned parameters do not need to include :http:`request:url`. If
missing, it is picked from the request :ref:`triggering a session
initialization request <pool-size>`.
.. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig`
"""
if location := self.location(request):
return {
Expand All @@ -372,6 +374,8 @@ def check(self, response: Response, request: Request) -> bool:
If you need to tell whether *request* is a :ref:`session initialization
request <session-init>` or not, use
:func:`~scrapy_zyte_api.is_session_init_request`.
.. seealso:: :class:`~scrapy_zyte_api.LocationSessionConfig`
"""
if self._checker:
return self._checker.check(response, request)
Expand Down Expand Up @@ -966,3 +970,40 @@ async def process_exception(
spider=spider,
reason=reason,
)


class LocationSessionConfig(SessionConfig):
    """:class:`~scrapy_zyte_api.SessionConfig` subclass that reduces the
    boilerplate of location-specific session configs, i.e. session configs
    that should behave like the default one unless a location is set.

    :meth:`params` and :meth:`check` look up the location of the request
    first. If there is none, they fall back to the parent implementation.
    Otherwise, they delegate to :meth:`location_params` and
    :meth:`location_check` respectively, passing them that location.
    """

    def params(self, request: Request) -> Dict[str, Any]:
        """Return :meth:`location_params` if *request* has a location, or the
        parent class parameters otherwise."""
        location = self.location(request)
        if location:
            return self.location_params(request, location)
        return super().params(request)

    def check(self, response: Response, request: Request) -> bool:
        """Return :meth:`location_check` if *request* has a location, or the
        result of the parent class check otherwise."""
        location = self.location(request)
        if location:
            return self.location_check(response, request, location)
        return super().check(response, request)

    def location_params(
        self, request: Request, location: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Like :meth:`SessionConfig.params
        <scrapy_zyte_api.SessionConfig.params>`, but it is only called when a
        location is set, and gets that *location* as a parameter.

        The default implementation ignores *location* and returns the parent
        class parameters.
        """
        return super().params(request)

    def location_check(
        self, response: Response, request: Request, location: Dict[str, Any]
    ) -> bool:
        """Like :meth:`SessionConfig.check
        <scrapy_zyte_api.SessionConfig.check>`, but it is only called when a
        location is set, and gets that *location* as a parameter.

        The default implementation ignores *location* and returns the result
        of the parent class check.
        """
        return super().check(response, request)
255 changes: 255 additions & 0 deletions tests/test_sessions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from scrapy_zyte_api import (
SESSION_AGGRESSIVE_RETRY_POLICY,
SESSION_DEFAULT_RETRY_POLICY,
LocationSessionConfig,
SessionConfig,
is_session_init_request,
session_config,
Expand Down Expand Up @@ -2080,6 +2081,260 @@ class CustomSessionConfig(SessionConfig):
pass


@ensureDeferred
async def test_location_session_config(mockserver):
    """If a location is set, the ``location_*`` methods of a
    LocationSessionConfig subclass are used, and they get that location as a
    parameter."""
    pytest.importorskip("web_poet")

    @session_config(
        [
            "postal-code-10001.example",
            "postal-code-10001-fail.example",
            "postal-code-10001-alternative.example",
        ]
    )
    class CustomSessionConfig(LocationSessionConfig):

        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            # The location received must be the one from the settings, even
            # though a different one is sent in the returned parameters.
            assert location == {"postalCode": "10002"}
            return {
                "actions": [
                    {
                        "action": "setLocation",
                        "address": {"postalCode": "10001"},
                    }
                ]
            }

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            assert location == {"postalCode": "10002"}
            domain = urlparse_cached(request).netloc
            return "fail" not in domain

        def pool(self, request: Request) -> str:
            # The alternative domain shares the pool of the main domain.
            domain = urlparse_cached(request).netloc
            if domain == "postal-code-10001-alternative.example":
                return "postal-code-10001.example"
            return domain

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        # We set a location to force the location-specific methods of the
        # session config class to be called, but we set the wrong location so
        # that the test would not pass were it not for our custom
        # implementation which ignores the input location and instead sets the
        # right one.
        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10002"},
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = [
            "https://postal-code-10001.example",
            "https://postal-code-10001-alternative.example",
            "https://postal-code-10001-fail.example",
        ]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/check-failed": 1,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. Done in a ``finally`` clause so that it also happens if
        # the crawl or the assertion above fail.
        session_config_registry.__init__()  # type: ignore[misc]

    # Check the clean up: without the custom session config, the wrong
    # location from the settings is used and no session can be initialized.

    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
    await crawler.crawl()

    session_stats = {
        k: v
        for k, v in crawler.stats.get_stats().items()
        if k.startswith("scrapy-zyte-api/sessions")
    }
    assert session_stats == {
        "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
        "scrapy-zyte-api/sessions/pools/postal-code-10001-alternative.example/init/failed": 1,
        "scrapy-zyte-api/sessions/pools/postal-code-10001-fail.example/init/failed": 1,
    }


@ensureDeferred
async def test_location_session_config_no_methods(mockserver):
    """If no location_* methods are defined, LocationSessionConfig works the
    same as SessionConfig."""
    pytest.importorskip("web_poet")

    @session_config(
        [
            "postal-code-10001.example",
            "postal-code-10001-alternative.example",
        ]
    )
    class CustomSessionConfig(LocationSessionConfig):

        def pool(self, request: Request) -> str:
            # The alternative domain shares the pool of the main domain.
            domain = urlparse_cached(request).netloc
            if domain == "postal-code-10001-alternative.example":
                return "postal-code-10001.example"
            return domain

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        "ZYTE_API_SESSION_LOCATION": {"postalCode": "10001"},
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = [
            "https://postal-code-10001.example",
            "https://postal-code-10001-alternative.example",
        ]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/check-passed": 2,
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/use/check-passed": 2,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. Done in a ``finally`` clause so that it also happens if
        # the crawl or the assertion above fail.
        session_config_registry.__init__()  # type: ignore[misc]


@ensureDeferred
async def test_location_session_config_no_location(mockserver):
    """If no location is configured, the methods are never called."""
    pytest.importorskip("web_poet")

    @session_config(["postal-code-10001.example", "a.example"])
    class CustomSessionConfig(LocationSessionConfig):

        # Explicit raises instead of ``assert False``, which is removed when
        # Python runs with ``-O``.

        def location_params(
            self, request: Request, location: Dict[str, Any]
        ) -> Dict[str, Any]:
            raise AssertionError("location_params called without a location")

        def location_check(
            self, response: Response, request: Request, location: Dict[str, Any]
        ) -> bool:
            raise AssertionError("location_check called without a location")

    # No ZYTE_API_SESSION_LOCATION here, unlike in the tests above.
    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_ENABLED": True,
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        start_urls = ["https://postal-code-10001.example", "https://a.example"]

        def start_requests(self):
            for url in self.start_urls:
                yield Request(
                    url,
                    meta={
                        "zyte_api_automap": {
                            "actions": [
                                {
                                    "action": "setLocation",
                                    "address": {"postalCode": "10001"},
                                }
                            ]
                        },
                    },
                )

        def parse(self, response):
            pass

    try:
        crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
        await crawler.crawl()

        session_stats = {
            k: v
            for k, v in crawler.stats.get_stats().items()
            if k.startswith("scrapy-zyte-api/sessions")
        }
        assert session_stats == {
            "scrapy-zyte-api/sessions/pools/postal-code-10001.example/init/failed": 1,
            "scrapy-zyte-api/sessions/pools/a.example/init/check-passed": 1,
            "scrapy-zyte-api/sessions/pools/a.example/use/check-passed": 1,
        }
    finally:
        # Clean up the session config registry, otherwise we could affect
        # other tests. Done in a ``finally`` clause so that it also happens if
        # the crawl or the assertion above fail.
        session_config_registry.__init__()  # type: ignore[misc]


@ensureDeferred
async def test_session_refresh(mockserver):
"""If a response does not pass a session validity check, the session is
Expand Down

0 comments on commit c1ba9c0

Please sign in to comment.