Skip to content

Commit

Permalink
Implement SessionConfig.enabled (#206)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Jun 25, 2024
1 parent 748be07 commit beaf8ca
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 14 deletions.
7 changes: 6 additions & 1 deletion docs/usage/session.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ Enabling session management
To enable session management for all requests, set
:setting:`ZYTE_API_SESSION_ENABLED` to ``True``. You can also toggle session
management on or off for specific requests using the
:reqmeta:`zyte_api_session_enabled` request metadata key.
:reqmeta:`zyte_api_session_enabled` request metadata key, or override the
:meth:`~scrapy_zyte_api.SessionConfig.enabled` method of a :ref:`session config
override <session-configs>`.

By default, scrapy-zyte-api will maintain up to 8 sessions per domain, each
initialized with a :ref:`browser request <zyte-api-browser>` targeting the URL
Expand Down Expand Up @@ -360,3 +362,6 @@ The following stats exist for scrapy-zyte-api session management:
``scrapy-zyte-api/sessions/pools/{pool}/use/failed``
Number of times that a request that used a session from pool ``{pool}``
got an :ref:`unsuccessful response <zyte-api-unsuccessful-responses>`.

``scrapy-zyte-api/sessions/use/disabled``
Number of processed requests for which session management was disabled.
42 changes: 30 additions & 12 deletions scrapy_zyte_api/_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,17 @@ def __init__(self, crawler):
else:
self._checker = None

self._enabled = crawler.settings.getbool("ZYTE_API_SESSION_ENABLED", False)

def enabled(self, request: Request) -> bool:
    """Whether *request* should take part in :ref:`session management
    <session>`.

    The per-request ``zyte_api_session_enabled`` metadata key, when set,
    takes precedence; otherwise the value derived from the
    :setting:`ZYTE_API_SESSION_ENABLED` setting is used (see
    :ref:`enable-sessions`).
    """
    meta = request.meta
    if "zyte_api_session_enabled" in meta:
        return meta["zyte_api_session_enabled"]
    return self._enabled

def pool(self, request: Request) -> str:
"""Return the ID of the session pool to use for *request*.
Expand All @@ -202,7 +213,9 @@ def pool(self, request: Request) -> str:
https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html.
scrapy-zyte-api can maintain multiple session pools, each pool with up
to :setting:`ZYTE_API_SESSION_POOL_SIZE` sessions.
to :setting:`ZYTE_API_SESSION_POOL_SIZE` sessions
(:setting:`ZYTE_API_SESSION_POOL_SIZES` allows configuring
pool-specific sizes).
"""
return urlparse_cached(request).netloc

Expand Down Expand Up @@ -655,7 +668,11 @@ async def check(self, response: Response, request: Request) -> bool:
"""Check the response for signs of session expiration, update the
internal session pool accordingly, and return ``False`` if the session
has expired or ``True`` if the session passed validation."""
if self.is_init_request(request):
return True
session_config = self._get_session_config(request)
if not session_config.enabled(request):
return True
pool = self._get_pool(request)
try:
passed = session_config.check(response, request)
Expand All @@ -681,6 +698,12 @@ async def check(self, response: Response, request: Request) -> bool:

async def assign(self, request: Request):
"""Assign a working session to *request*."""
if self.is_init_request(request):
return
session_config = self._get_session_config(request)
if not session_config.enabled(request):
self._crawler.stats.inc_value("scrapy-zyte-api/sessions/use/disabled")
return
session_id = await self._next(request)
# Note: If there is a session set already (e.g. a request being
# retried), it is overridden.
Expand All @@ -702,6 +725,10 @@ async def assign(self, request: Request):
request.meta[meta_key]["session"] = {"id": session_id}
request.meta.setdefault("dont_merge_cookies", True)

def is_enabled(self, request: Request) -> bool:
    """Return ``True`` if the session config that applies to *request*
    has session management enabled for it, ``False`` otherwise."""
    return self._get_session_config(request).enabled(request)

def handle_error(self, request: Request):
pool = self._get_pool(request)
self._crawler.stats.inc_value(
Expand Down Expand Up @@ -755,27 +782,18 @@ def from_crawler(cls, crawler: Crawler):
return cls(crawler)

def __init__(self, crawler: Crawler):
    """Build the middleware from *crawler*.

    NOTE(review): this is a rendered diff; the ``_enabled`` assignment
    below appears to be *removed* by this commit (the setting is read by
    the session config instead) — confirm against the merged file.
    """
    # Toggle read from the ZYTE_API_SESSION_ENABLED setting (off by default).
    self._enabled = crawler.settings.getbool("ZYTE_API_SESSION_ENABLED", False)
    self._crawler = crawler
    # Session bookkeeping (pools, assignment, expiration checks) lives here.
    self._sessions = _SessionManager(crawler)
    # Context manager used around session operations; presumably it turns
    # fatal session errors into a crawl stop — confirm in its definition.
    self._fatal_error_handler = FatalErrorHandler(crawler)

async def process_request(self, request: Request, spider: Spider) -> None:
    """Assign a session to *request* before it is downloaded.

    NOTE(review): rendered diff — the guard below appears to be the
    pre-commit code that this commit moves into ``_SessionManager.assign``;
    confirm against the merged file.
    """
    # Skip requests with session management turned off (per-request meta
    # key overrides the setting-derived default) and the session-init
    # requests issued by the session manager itself.
    if not request.meta.get(
        "zyte_api_session_enabled", self._enabled
    ) or self._sessions.is_init_request(request):
        return
    # Errors raised during assignment are routed through the fatal-error
    # handler (behavior defined elsewhere in this module).
    async with self._fatal_error_handler:
        await self._sessions.assign(request)

async def process_response(
self, request: Request, response: Response, spider: Spider
) -> Union[Request, Response, None]:
if (
isinstance(response, DummyResponse)
or not request.meta.get("zyte_api_session_enabled", self._enabled)
or self._sessions.is_init_request(request)
):
if isinstance(response, DummyResponse):
return response
async with self._fatal_error_handler:
passed = await self._sessions.check(response, request)
Expand All @@ -795,8 +813,8 @@ async def process_exception(
) -> Union[Request, None]:
if (
not isinstance(exception, RequestError)
or not request.meta.get("zyte_api_session_enabled", self._enabled)
or self._sessions.is_init_request(request)
or not self._sessions.is_enabled(request)
):
return None

Expand Down
46 changes: 45 additions & 1 deletion tests/test_sessions.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ def parse(self, response):
"scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 1,
}
else:
assert session_stats == {}
assert session_stats == {
"scrapy-zyte-api/sessions/use/disabled": 1,
}


@pytest.mark.parametrize(
Expand Down Expand Up @@ -1205,6 +1207,47 @@ def parse(self, response):
}


@ensureDeferred
async def test_session_config_enabled(mockserver):
    """Overriding ``SessionConfig.enabled`` toggles session management per
    request — here, per domain — as reflected in the session stats."""
    # session_config overrides require web-poet; skip the test otherwise.
    pytest.importorskip("web_poet")

    @session_config(["enabled.example", "disabled.example"])
    class CustomSessionConfig(SessionConfig):

        def enabled(self, request: Request):
            # Sessions on for enabled.example, off for disabled.example.
            return "enabled" in urlparse_cached(request).netloc

    settings = {
        "RETRY_TIMES": 0,
        "ZYTE_API_URL": mockserver.urljoin("/"),
        "ZYTE_API_SESSION_MAX_BAD_INITS": 1,
    }

    class TestSpider(Spider):
        name = "test"
        # One URL per domain: one should use sessions, the other should not.
        start_urls = ["https://enabled.example", "https://disabled.example"]

        def parse(self, response):
            pass

    crawler = await get_crawler(settings, spider_cls=TestSpider, setup_engine=False)
    await crawler.crawl()

    # Only session-related stats matter for this assertion.
    session_stats = {
        k: v
        for k, v in crawler.stats.get_stats().items()
        if k.startswith("scrapy-zyte-api/sessions")
    }
    # disabled.example increments the "disabled" counter; enabled.example
    # goes through session init and use checks.
    assert session_stats == {
        "scrapy-zyte-api/sessions/use/disabled": 1,
        "scrapy-zyte-api/sessions/pools/enabled.example/init/check-passed": 1,
        "scrapy-zyte-api/sessions/pools/enabled.example/use/check-passed": 1,
    }

    # Clean up the session config registry.
    session_config_registry.__init__()  # type: ignore[misc]


@ensureDeferred
async def test_session_config_location(mockserver):
"""A custom session config can be used to customize the params for
Expand Down Expand Up @@ -1739,6 +1782,7 @@ def parse4(self, response):
assert session_stats == {
"scrapy-zyte-api/sessions/pools/example.com/init/check-passed": 2,
"scrapy-zyte-api/sessions/pools/example.com/use/check-passed": 2,
"scrapy-zyte-api/sessions/use/disabled": 2,
}

assert tracker.cookies == [
Expand Down

0 comments on commit beaf8ca

Please sign in to comment.