feat!: Enable additional status codes arguments to PlaywrightCrawler #959

Merged: 7 commits, Feb 19, 2025
@@ -3,8 +3,6 @@
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
# Arguments relevant only for PlaywrightCrawler
playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
# Arguments relevant only for BeautifulSoupCrawler
static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]},
# Common arguments relevant to all crawlers
max_crawl_depth=5,
)
@@ -3,8 +3,6 @@
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
# Arguments relevant only for PlaywrightCrawler
playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
# Arguments relevant only for ParselCrawler
static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]},
# Common arguments relevant to all crawlers
max_crawl_depth=5,
)
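In both updated examples the `static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}` entry disappears because those options are no longer specific to the static sub-crawler. A minimal sketch of how the same behavior could now be expressed, assuming the option is accepted as a common `BasicCrawlerOptions` keyword argument (this exact snippet is illustrative, not taken from the updated docs):

```python
from crawlee.crawlers import AdaptivePlaywrightCrawler

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
    # Arguments relevant only for PlaywrightCrawler
    playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
    # Common arguments relevant to all crawlers, now including the status-code options
    additional_http_error_status_codes=[204],
    max_crawl_depth=5,
)
```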
4 changes: 4 additions & 0 deletions docs/upgrading/upgrading_to_v0x.md
@@ -9,6 +9,10 @@ This page summarizes the breaking changes between Crawlee for Python zero-based

This section summarizes the breaking changes between v0.5.x and v0.6.0.

### HttpCrawlerOptions

- Removed `HttpCrawlerOptions`, which combined the options from `BasicCrawlerOptions` with the unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both unique options have been moved to `BasicCrawlerOptions` instead.
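A minimal migration sketch, assuming an `HttpCrawler` (any crawler that forwards `BasicCrawlerOptions` accepts the same keyword arguments after this change):

```python
from crawlee.crawlers import HttpCrawler

# The options formerly unique to HttpCrawlerOptions are now ordinary
# BasicCrawlerOptions entries, passed straight to the constructor.
crawler = HttpCrawler(
    additional_http_error_status_codes=[204],  # treat 204 as an error and retry
    ignore_http_error_status_codes=[404],      # treat 404 as a successful response
)
```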

### Configuration

The `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`.
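A hedged sketch of the replacement for `verbose_log` (the `log_level` field name comes from the note above; the string-literal form of the value is an assumption):

```python
from crawlee.configuration import Configuration

# Instead of the removed verbose_log flag, raise the log level explicitly.
config = Configuration(log_level='DEBUG')
```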
3 changes: 1 addition & 2 deletions src/crawlee/crawlers/__init__.py
@@ -1,7 +1,7 @@
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -51,7 +51,6 @@
'BeautifulSoupParserType',
'ContextPipeline',
'HttpCrawler',
'HttpCrawlerOptions',
'HttpCrawlingContext',
'HttpCrawlingResult',
'ParsedHttpCrawlingContext',
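For code that imported the removed public symbol, a hedged migration sketch (typing is shown loosely; the exact generic parameters depend on the crawling context in use):

```python
# Before (v0.5.x):
#     from crawlee.crawlers import HttpCrawlerOptions
# After this change the export is gone; BasicCrawlerOptions now carries the
# two status-code options instead.
from typing_extensions import Unpack

from crawlee.crawlers import BasicCrawlerOptions, HttpCrawler


def make_crawler(**kwargs: Unpack[BasicCrawlerOptions]) -> HttpCrawler:
    # Forward the common options, including additional_http_error_status_codes
    # and ignore_http_error_status_codes, to the concrete crawler.
    return HttpCrawler(**kwargs)
```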
3 changes: 1 addition & 2 deletions src/crawlee/crawlers/_abstract_http/__init__.py
@@ -1,10 +1,9 @@
from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
from ._abstract_http_crawler import AbstractHttpCrawler
from ._abstract_http_parser import AbstractHttpParser
from ._http_crawling_context import ParsedHttpCrawlingContext

__all__ = [
'AbstractHttpCrawler',
'AbstractHttpParser',
'HttpCrawlerOptions',
'ParsedHttpCrawlingContext',
]
34 changes: 7 additions & 27 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -5,7 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable, Generic

from pydantic import ValidationError
from typing_extensions import NotRequired, TypedDict, TypeVar
from typing_extensions import TypeVar

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
@@ -19,7 +19,7 @@
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable, Iterable
from collections.abc import AsyncGenerator, Awaitable

from typing_extensions import Unpack

@@ -31,26 +31,6 @@
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


class _HttpCrawlerAdditionalOptions(TypedDict):
additional_http_error_status_codes: NotRequired[Iterable[int]]
"""Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

ignore_http_error_status_codes: NotRequired[Iterable[int]]
"""HTTP status codes that are typically considered errors but should be treated as successful responses."""


@docs_group('Data structures')
class HttpCrawlerOptions(
Generic[TCrawlingContext, TStatisticsState],
_HttpCrawlerAdditionalOptions,
BasicCrawlerOptions[TCrawlingContext, StatisticsState],
):
"""Arguments for the `AbstractHttpCrawler` constructor.

It is intended for typing forwarded `__init__` arguments in the subclasses.
"""


@docs_group('Abstract classes')
class AbstractHttpCrawler(
Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
@@ -73,18 +53,18 @@ def __init__(
self,
*,
parser: AbstractHttpParser[TParseResult, TSelectResult],
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
) -> None:
self._parser = parser
self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
kwargs.setdefault('additional_http_error_status_codes', ())
kwargs.setdefault('ignore_http_error_status_codes', ())

kwargs.setdefault(
'http_client',
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
additional_http_error_status_codes=kwargs['additional_http_error_status_codes'],
ignore_http_error_status_codes=kwargs['ignore_http_error_status_codes'],
),
)

@@ -115,7 +95,7 @@ class _ParsedHttpCrawler(
def __init__(
self,
parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
**kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
) -> None:
kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
super().__init__(
@@ -50,9 +50,6 @@

from typing_extensions import Unpack

from crawlee.crawlers._abstract_http._abstract_http_crawler import (
_HttpCrawlerAdditionalOptions,
)
from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions

@@ -130,7 +127,6 @@ def __init__(
rendering_type_predictor: RenderingTypePredictor | None = None,
result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None,
playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None,
**kwargs: Unpack[_BasicCrawlerOptions],
@@ -164,7 +160,6 @@

# Sub crawlers related.
playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
static_crawler_specific_kwargs = static_crawler_specific_kwargs or {}

# Each sub crawler will use a custom logger.
static_logger = getLogger('Subcrawler_static')
@@ -181,7 +176,6 @@
static_crawler = static_crawler_class(
parser=static_parser,
statistics=_NonPersistentStatistics(),
**static_crawler_specific_kwargs,
**basic_crawler_kwargs_for_static_crawler,
)
playwright_crawler = PlaywrightCrawler(
@@ -221,7 +215,6 @@ def with_beautifulsoup_static_parser(
result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
parser_type: BeautifulSoupParserType = 'lxml',
static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None,
playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
statistics: Statistics[StatisticsState] | None = None,
**kwargs: Unpack[_BasicCrawlerOptions],
@@ -236,7 +229,6 @@ def with_beautifulsoup_static_parser(
result_checker=result_checker,
result_comparator=result_comparator,
static_parser=BeautifulSoupParser(parser=parser_type),
static_crawler_specific_kwargs=static_crawler_specific_kwargs,
playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
statistics=adaptive_statistics,
**kwargs,
@@ -248,7 +240,6 @@ def with_parsel_static_parser(
rendering_type_predictor: RenderingTypePredictor | None = None,
result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None,
playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
statistics: Statistics[StatisticsState] | None = None,
**kwargs: Unpack[_BasicCrawlerOptions],
@@ -263,7 +254,6 @@
result_checker=result_checker,
result_comparator=result_comparator,
static_parser=ParselParser(),
static_crawler_specific_kwargs=static_crawler_specific_kwargs,
playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
statistics=adaptive_statistics,
**kwargs,
42 changes: 38 additions & 4 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -7,7 +7,7 @@
import sys
import tempfile
from asyncio import CancelledError
from collections.abc import AsyncGenerator, Awaitable, Sequence
from collections.abc import AsyncGenerator, Awaitable, Iterable, Sequence
from contextlib import AsyncExitStack, suppress
from datetime import timedelta
from functools import partial
@@ -135,6 +135,12 @@ class _BasicCrawlerOptions(TypedDict):
keep_alive: NotRequired[bool]
"""Flag that can keep crawler running even when there are no requests in queue."""

additional_http_error_status_codes: NotRequired[Iterable[int]]
"""Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

ignore_http_error_status_codes: NotRequired[Iterable[int]]
"""HTTP status codes that are typically considered errors but should be treated as successful responses."""

_additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]
"""Additional context managers used throughout the crawler lifecycle. Intended for use by
subclasses rather than direct instantiation of `BasicCrawler`."""
@@ -214,6 +220,8 @@ def __init__(
max_crawl_depth: int | None = None,
use_session_pool: bool = True,
retry_on_blocked: bool = True,
additional_http_error_status_codes: Iterable[int] | None = None,
ignore_http_error_status_codes: Iterable[int] | None = None,
concurrency_settings: ConcurrencySettings | None = None,
request_handler_timeout: timedelta = timedelta(minutes=1),
statistics: Statistics[TStatisticsState] | None = None,
@@ -249,6 +257,10 @@ def __init__(
from those requests. If not set, crawling continues without depth restrictions.
use_session_pool: Enable the use of a session pool for managing sessions during crawling.
retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
additional_http_error_status_codes: Additional HTTP status codes to treat as errors,
triggering automatic retries when encountered.
ignore_http_error_status_codes: HTTP status codes that are typically considered errors but should be treated
as successful responses.
concurrency_settings: Settings to fine-tune concurrency levels.
request_handler_timeout: Maximum duration allowed for a single request handler to run.
statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
@@ -276,7 +288,29 @@ def __init__(
self._request_manager = request_manager
self._session_pool = session_pool or SessionPool()
self._proxy_configuration = proxy_configuration
self._http_client = http_client or HttpxHttpClient()

self._additional_http_error_status_codes = (
set(additional_http_error_status_codes) if additional_http_error_status_codes else set()
)
self._ignore_http_error_status_codes = (
set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()
)

self._http_client = http_client or HttpxHttpClient(
additional_http_error_status_codes=self._additional_http_error_status_codes,
ignore_http_error_status_codes=self._ignore_http_error_status_codes,
)

if self._http_client.additional_blocked_status_codes != self._additional_http_error_status_codes:
raise ValueError(
'Used `additional_blocked_status_codes` argument does not match with '
f'{self._http_client.additional_blocked_status_codes=}. They have to be the same.'
)
if self._http_client.ignore_http_error_status_codes != self._ignore_http_error_status_codes:
raise ValueError(
'Used `ignore_http_error_status_codes` argument does not match with '
f'{self._http_client.ignore_http_error_status_codes=}. They have to be the same.'
)
Comment on lines +292 to +313

Collaborator: Couldn't we just keep them only in the http_client instance? (PW Crawler has an HTTP client as well.)

@Pijukatel (Contributor, Author), Feb 11, 2025: I was considering that option, but it felt like misuse to me, especially when it comes to PlaywrightCrawler. PlaywrightCrawler does not use the HTTP client for page.navigate, so it would be really strange if it used some attribute of this unrelated component to decide whether the response status code of page.navigate is OK or not. (Mentioned: #953 (comment))

But I see it looks like unnecessary code duplication, so I am not 100% happy with this either.

@vdusek (Collaborator), Feb 17, 2025: Yeah, I got it... However, having it duplicated seems like a worse option to me.

@janbuchar Your opinion please?

Collaborator: I think we can consider taking this logic out of the HTTP client and putting it in the BasicCrawler. Then it will work uniformly for any crawler and we will avoid code duplication.

@janbuchar (Collaborator), Feb 19, 2025: I agree - in the long run, we want to have this logic factored out of the HTTP client. I believe there was an issue to track that, but I only found #830.

It's probably fine to duplicate now and make an issue for refactoring this later.

Collaborator: In that case, I don't see a problem if we keep the duplication of code at this point. It will be solved during refactoring.

@Pijukatel (Contributor, Author): Yes, maybe it will be solved at the same time as #830, but if not, here is the issue: #998.
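The duplication under discussion, in a simplified sketch (these stand-in classes are not library code; they only show that the crawler and its HTTP client end up holding the same two sets):

```python
class HttpClientSketch:
    """Stand-in for an HTTP client that remembers its status-code settings."""

    def __init__(self, additional: set[int], ignore: set[int]) -> None:
        self.additional_blocked_status_codes = additional
        self.ignore_http_error_status_codes = ignore


class BasicCrawlerSketch:
    """Stand-in for BasicCrawler after this PR."""

    def __init__(self, additional: set[int], ignore: set[int], http_client=None) -> None:
        # The crawler keeps its own copies of the sets...
        self._additional_http_error_status_codes = additional
        self._ignore_http_error_status_codes = ignore
        # ...and the default client is seeded with the same information again,
        # which is the duplication the reviewers propose to factor out later
        # (see #830 and #998).
        self._http_client = http_client or HttpClientSketch(additional, ignore)
```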


# Request router setup
self._router: Router[TCrawlingContext] | None = None
@@ -1155,6 +1189,6 @@ def _is_session_blocked_status_code(self, session: Session | None, status_code:
"""
return session is not None and session.is_blocked_status_code(
status_code=status_code,
additional_blocked_status_codes=self._http_client.additional_blocked_status_codes,
ignore_http_error_status_codes=self._http_client.ignore_http_error_status_codes,
additional_blocked_status_codes=self._additional_http_error_status_codes,
ignore_http_error_status_codes=self._ignore_http_error_status_codes,
)
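A hedged usage sketch of the new consistency check (the `crawlee.http_clients` import path is an assumption, and the first call assumes the client normalizes its arguments to the same sets the crawler stores; the ValueError behavior comes directly from the diff above):

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import HttpxHttpClient

# Matching crawler-level and client-level sets: accepted.
crawler = HttpCrawler(
    additional_http_error_status_codes=[204],
    http_client=HttpxHttpClient(additional_http_error_status_codes=[204]),
)

# Mismatched sets: the BasicCrawler constructor raises ValueError, because
# the crawler and its HTTP client must agree on how status codes are handled.
try:
    HttpCrawler(
        additional_http_error_status_codes=[204],
        http_client=HttpxHttpClient(additional_http_error_status_codes=[500]),
    )
except ValueError as error:
    print(error)
```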
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
@@ -5,7 +5,7 @@
from bs4 import BeautifulSoup, Tag

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions
from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions

from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ def __init__(
self,
*,
parser: BeautifulSoupParserType = 'lxml',
**kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
) -> None:
"""A default constructor.

6 changes: 4 additions & 2 deletions src/crawlee/crawlers/_http/_http_crawler.py
@@ -3,13 +3,15 @@
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions, ParsedHttpCrawlingContext
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, ParsedHttpCrawlingContext

from ._http_parser import NoParser

if TYPE_CHECKING:
from typing_extensions import Unpack

from crawlee.crawlers import BasicCrawlerOptions


@docs_group('Classes')
class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]):
@@ -46,7 +48,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:

def __init__(
self,
**kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],
) -> None:
"""A default constructor.

4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_parsel/_parsel_crawler.py
@@ -5,7 +5,7 @@
from parsel import Selector

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions
from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions

from ._parsel_crawling_context import ParselCrawlingContext
from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:

def __init__(
self,
**kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
) -> None:
"""A default constructor.

2 changes: 1 addition & 1 deletion tests/unit/crawlers/_http/test_http_crawler.py
@@ -189,7 +189,7 @@ async def test_handles_server_error(
],
)
async def test_stores_cookies(http_client_class: type[HttpClient], httpbin: URL) -> None:
http_client = http_client_class()
http_client = http_client_class(ignore_http_error_status_codes=[401])
visit = Mock()
track_session_usage = Mock()
