feat!: Enable additional status codes arguments to PlaywrightCrawler #959

Merged: 7 commits, Feb 19, 2025
@@ -3,8 +3,6 @@
crawler = AdaptivePlaywrightCrawler.with_beautifulsoup_static_parser(
# Arguments relevant only for PlaywrightCrawler
playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
# Arguments relevant only for BeautifulSoupCrawler
static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]},
# Common arguments relevant to all crawlers
max_crawl_depth=5,
)
@@ -3,8 +3,6 @@
crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
# Arguments relevant only for PlaywrightCrawler
playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
# Arguments relevant only for ParselCrawler
static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]},
# Common arguments relevant to all crawlers
max_crawl_depth=5,
)
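In both updated examples the `static_crawler_specific_kwargs={'additional_http_error_status_codes': [204]}` entry disappears because those options are no longer specific to the static sub-crawler. A minimal sketch of how the same behavior could now be expressed, assuming the option is accepted as a common `BasicCrawlerOptions` keyword argument (this exact snippet is illustrative, not taken from the updated docs):

```python
from crawlee.crawlers import AdaptivePlaywrightCrawler

crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser(
    # Arguments relevant only for PlaywrightCrawler
    playwright_crawler_specific_kwargs={'headless': False, 'browser_type': 'chromium'},
    # Common arguments relevant to all crawlers, now including the status-code options
    additional_http_error_status_codes=[204],
    max_crawl_depth=5,
)
```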
4 changes: 4 additions & 0 deletions docs/upgrading/upgrading_to_v0x.md
@@ -9,6 +9,10 @@ This page summarizes the breaking changes between Crawlee for Python zero-based

This section summarizes the breaking changes between v0.5.x and v0.6.0.

### HttpCrawlerOptions

- Removed `HttpCrawlerOptions`, which combined the options from `BasicCrawlerOptions` with the unique options `additional_http_error_status_codes` and `ignore_http_error_status_codes`. Both unique options have been moved to `BasicCrawlerOptions` instead.
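A minimal migration sketch, assuming an `HttpCrawler` (any crawler that forwards `BasicCrawlerOptions` accepts the same keyword arguments after this change):

```python
from crawlee.crawlers import HttpCrawler

# The options formerly unique to HttpCrawlerOptions are now ordinary
# BasicCrawlerOptions entries, passed straight to the constructor.
crawler = HttpCrawler(
    additional_http_error_status_codes=[204],  # treat 204 as an error and retry
    ignore_http_error_status_codes=[404],      # treat 404 as a successful response
)
```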

### Configuration

The `Configuration` fields `chrome_executable_path`, `xvfb`, and `verbose_log` have been removed. The `chrome_executable_path` and `xvfb` fields were unused, while `verbose_log` can be replaced by setting `log_level` to `DEBUG`.
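A hedged sketch of the replacement for `verbose_log` (the `log_level` field name comes from the note above; the string-literal form of the value is an assumption):

```python
from crawlee.configuration import Configuration

# Instead of the removed verbose_log flag, raise the log level explicitly.
config = Configuration(log_level='DEBUG')
```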
3 changes: 1 addition & 2 deletions src/crawlee/crawlers/__init__.py
@@ -1,7 +1,7 @@
from crawlee._utils.try_import import install_import_hook as _install_import_hook
from crawlee._utils.try_import import try_import as _try_import

from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, HttpCrawlerOptions, ParsedHttpCrawlingContext
from ._abstract_http import AbstractHttpCrawler, AbstractHttpParser, ParsedHttpCrawlingContext
from ._basic import BasicCrawler, BasicCrawlerOptions, BasicCrawlingContext, ContextPipeline
from ._http import HttpCrawler, HttpCrawlingContext, HttpCrawlingResult

@@ -51,7 +51,6 @@
'BeautifulSoupParserType',
'ContextPipeline',
'HttpCrawler',
'HttpCrawlerOptions',
'HttpCrawlingContext',
'HttpCrawlingResult',
'ParsedHttpCrawlingContext',
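For code that imported the removed public symbol, a hedged migration sketch (typing is shown loosely; the exact generic parameters depend on the crawling context in use):

```python
# Before (v0.5.x):
#     from crawlee.crawlers import HttpCrawlerOptions
# After this change the export is gone; BasicCrawlerOptions now carries the
# two status-code options instead.
from typing_extensions import Unpack

from crawlee.crawlers import BasicCrawlerOptions, HttpCrawler


def make_crawler(**kwargs: Unpack[BasicCrawlerOptions]) -> HttpCrawler:
    # Forward the common options, including additional_http_error_status_codes
    # and ignore_http_error_status_codes, to the concrete crawler.
    return HttpCrawler(**kwargs)
```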
3 changes: 1 addition & 2 deletions src/crawlee/crawlers/_abstract_http/__init__.py
@@ -1,10 +1,9 @@
from ._abstract_http_crawler import AbstractHttpCrawler, HttpCrawlerOptions
from ._abstract_http_crawler import AbstractHttpCrawler
from ._abstract_http_parser import AbstractHttpParser
from ._http_crawling_context import ParsedHttpCrawlingContext

__all__ = [
'AbstractHttpCrawler',
'AbstractHttpParser',
'HttpCrawlerOptions',
'ParsedHttpCrawlingContext',
]
34 changes: 7 additions & 27 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -5,7 +5,7 @@
from typing import TYPE_CHECKING, Any, Callable, Generic

from pydantic import ValidationError
from typing_extensions import NotRequired, TypedDict, TypeVar
from typing_extensions import TypeVar

from crawlee import EnqueueStrategy, RequestTransformAction
from crawlee._request import Request, RequestOptions
@@ -19,7 +19,7 @@
from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult

if TYPE_CHECKING:
from collections.abc import AsyncGenerator, Awaitable, Iterable
from collections.abc import AsyncGenerator, Awaitable

from typing_extensions import Unpack

@@ -31,26 +31,6 @@
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)


class _HttpCrawlerAdditionalOptions(TypedDict):
additional_http_error_status_codes: NotRequired[Iterable[int]]
"""Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

ignore_http_error_status_codes: NotRequired[Iterable[int]]
"""HTTP status codes that are typically considered errors but should be treated as successful responses."""


@docs_group('Data structures')
class HttpCrawlerOptions(
Generic[TCrawlingContext, TStatisticsState],
_HttpCrawlerAdditionalOptions,
BasicCrawlerOptions[TCrawlingContext, StatisticsState],
):
"""Arguments for the `AbstractHttpCrawler` constructor.

It is intended for typing forwarded `__init__` arguments in the subclasses.
"""


@docs_group('Abstract classes')
class AbstractHttpCrawler(
Generic[TCrawlingContext, TParseResult, TSelectResult], BasicCrawler[TCrawlingContext, StatisticsState], ABC
@@ -73,18 +53,18 @@ def __init__(
self,
*,
parser: AbstractHttpParser[TParseResult, TSelectResult],
additional_http_error_status_codes: Iterable[int] = (),
ignore_http_error_status_codes: Iterable[int] = (),
**kwargs: Unpack[BasicCrawlerOptions[TCrawlingContext, StatisticsState]],
) -> None:
self._parser = parser
self._pre_navigation_hooks: list[Callable[[BasicCrawlingContext], Awaitable[None]]] = []
kwargs.setdefault('additional_http_error_status_codes', ())
kwargs.setdefault('ignore_http_error_status_codes', ())

kwargs.setdefault(
'http_client',
HttpxHttpClient(
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
additional_http_error_status_codes=kwargs['additional_http_error_status_codes'],
ignore_http_error_status_codes=kwargs['ignore_http_error_status_codes'],
),
)

@@ -115,7 +95,7 @@ class _ParsedHttpCrawler(
def __init__(
self,
parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser,
**kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[TParseResult]]],
) -> None:
kwargs['_context_pipeline'] = self._create_static_content_crawler_pipeline()
super().__init__(
@@ -50,9 +50,6 @@

from typing_extensions import Unpack

from crawlee.crawlers._abstract_http._abstract_http_crawler import (
_HttpCrawlerAdditionalOptions,
)
from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions
from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions

@@ -130,7 +127,6 @@ def __init__(
rendering_type_predictor: RenderingTypePredictor | None = None,
result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None,
playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
statistics: Statistics[AdaptivePlaywrightCrawlerStatisticState] | None = None,
**kwargs: Unpack[_BasicCrawlerOptions],
@@ -164,7 +160,6 @@

# Sub crawlers related.
playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {}
static_crawler_specific_kwargs = static_crawler_specific_kwargs or {}

# Each sub crawler will use a custom logger.
static_logger = getLogger('Subcrawler_static')
@@ -181,7 +176,6 @@
static_crawler = static_crawler_class(
parser=static_parser,
statistics=_NonPersistentStatistics(),
**static_crawler_specific_kwargs,
**basic_crawler_kwargs_for_static_crawler,
)
playwright_crawler = PlaywrightCrawler(
@@ -221,7 +215,6 @@ def with_beautifulsoup_static_parser(
result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
parser_type: BeautifulSoupParserType = 'lxml',
static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None,
playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
statistics: Statistics[StatisticsState] | None = None,
**kwargs: Unpack[_BasicCrawlerOptions],
@@ -236,7 +229,6 @@ def with_beautifulsoup_static_parser(
result_checker=result_checker,
result_comparator=result_comparator,
static_parser=BeautifulSoupParser(parser=parser_type),
static_crawler_specific_kwargs=static_crawler_specific_kwargs,
playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
statistics=adaptive_statistics,
**kwargs,
@@ -248,7 +240,6 @@ def with_parsel_static_parser(
rendering_type_predictor: RenderingTypePredictor | None = None,
result_checker: Callable[[RequestHandlerRunResult], bool] | None = None,
result_comparator: Callable[[RequestHandlerRunResult, RequestHandlerRunResult], bool] | None = None,
static_crawler_specific_kwargs: _HttpCrawlerAdditionalOptions | None = None,
playwright_crawler_specific_kwargs: _PlaywrightCrawlerAdditionalOptions | None = None,
statistics: Statistics[StatisticsState] | None = None,
**kwargs: Unpack[_BasicCrawlerOptions],
@@ -263,7 +254,6 @@
result_checker=result_checker,
result_comparator=result_comparator,
static_parser=ParselParser(),
static_crawler_specific_kwargs=static_crawler_specific_kwargs,
playwright_crawler_specific_kwargs=playwright_crawler_specific_kwargs,
statistics=adaptive_statistics,
**kwargs,
42 changes: 38 additions & 4 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -7,7 +7,7 @@
import sys
import tempfile
from asyncio import CancelledError
from collections.abc import AsyncGenerator, Awaitable, Sequence
from collections.abc import AsyncGenerator, Awaitable, Iterable, Sequence
from contextlib import AsyncExitStack, suppress
from datetime import timedelta
from functools import partial
@@ -135,6 +135,12 @@ class _BasicCrawlerOptions(TypedDict):
keep_alive: NotRequired[bool]
"""Flag that can keep crawler running even when there are no requests in queue."""

additional_http_error_status_codes: NotRequired[Iterable[int]]
"""Additional HTTP status codes to treat as errors, triggering automatic retries when encountered."""

ignore_http_error_status_codes: NotRequired[Iterable[int]]
"""HTTP status codes that are typically considered errors but should be treated as successful responses."""

_additional_context_managers: NotRequired[Sequence[AbstractAsyncContextManager]]
"""Additional context managers used throughout the crawler lifecycle. Intended for use by
subclasses rather than direct instantiation of `BasicCrawler`."""
@@ -214,6 +220,8 @@ def __init__(
max_crawl_depth: int | None = None,
use_session_pool: bool = True,
retry_on_blocked: bool = True,
additional_http_error_status_codes: Iterable[int] | None = None,
ignore_http_error_status_codes: Iterable[int] | None = None,
concurrency_settings: ConcurrencySettings | None = None,
request_handler_timeout: timedelta = timedelta(minutes=1),
statistics: Statistics[TStatisticsState] | None = None,
@@ -249,6 +257,10 @@ def __init__(
from those requests. If not set, crawling continues without depth restrictions.
use_session_pool: Enable the use of a session pool for managing sessions during crawling.
retry_on_blocked: If True, the crawler attempts to bypass bot protections automatically.
additional_http_error_status_codes: Additional HTTP status codes to treat as errors,
triggering automatic retries when encountered.
ignore_http_error_status_codes: HTTP status codes that are typically considered errors but should be treated
as successful responses.
concurrency_settings: Settings to fine-tune concurrency levels.
request_handler_timeout: Maximum duration allowed for a single request handler to run.
statistics: A custom `Statistics` instance, allowing the use of non-default configuration.
@@ -276,7 +288,29 @@ def __init__(
self._request_manager = request_manager
self._session_pool = session_pool or SessionPool()
self._proxy_configuration = proxy_configuration
self._http_client = http_client or HttpxHttpClient()

self._additional_http_error_status_codes = (
set(additional_http_error_status_codes) if additional_http_error_status_codes else set()
)
self._ignore_http_error_status_codes = (
set(ignore_http_error_status_codes) if ignore_http_error_status_codes else set()
)

self._http_client = http_client or HttpxHttpClient(
additional_http_error_status_codes=self._additional_http_error_status_codes,
ignore_http_error_status_codes=self._ignore_http_error_status_codes,
)

if self._http_client.additional_blocked_status_codes != self._additional_http_error_status_codes:
raise ValueError(
'Used `additional_blocked_status_codes` argument does not match with '
f'{self._http_client.additional_blocked_status_codes=}. They have to be the same.'
)
if self._http_client.ignore_http_error_status_codes != self._ignore_http_error_status_codes:
raise ValueError(
'Used `ignore_http_error_status_codes` argument does not match with '
f'{self._http_client.ignore_http_error_status_codes=}. They have to be the same.'
)
Comment on lines +292 to +313

Collaborator: Couldn't we just keep them only in the http_client instance? (PW Crawler has an HTTP client as well.)

@Pijukatel (Contributor, Author), Feb 11, 2025: I was considering that option, but it felt like misuse to me, especially when it comes to PlaywrightCrawler. PlaywrightCrawler does not use the HTTP client for page.navigate, so it would be really strange if it used some attribute of this unrelated component to decide whether the response status code of page.navigate is OK or not. (Mentioned: #953 (comment))

But I see it looks like unnecessary code duplication, so I am not 100% happy with this either.

@vdusek (Collaborator), Feb 17, 2025: Yeah, I got it... However, having it duplicated seems like a worse option to me.

@janbuchar Your opinion please?

Collaborator: I think we can consider taking this logic out of the HTTP client and putting it in the BasicCrawler. Then it will work uniformly for any crawler and we will avoid code duplication.

@janbuchar (Collaborator), Feb 19, 2025: I agree - in the long run, we want to have this logic factored out of the HTTP client. I believe there was an issue to track that, but I only found #830.

It's probably fine to duplicate now and make an issue for refactoring this later.

Collaborator: In that case, I don't see a problem if we keep the duplication of code at this point. It will be solved during refactoring.

@Pijukatel (Contributor, Author): Yes, maybe it will be solved at the same time as #830, but if not, here is the issue: #998.
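The duplication under discussion, in a simplified sketch (these stand-in classes are not library code; they only show that the crawler and its HTTP client end up holding the same two sets):

```python
class HttpClientSketch:
    """Stand-in for an HTTP client that remembers its status-code settings."""

    def __init__(self, additional: set[int], ignore: set[int]) -> None:
        self.additional_blocked_status_codes = additional
        self.ignore_http_error_status_codes = ignore


class BasicCrawlerSketch:
    """Stand-in for BasicCrawler after this PR."""

    def __init__(self, additional: set[int], ignore: set[int], http_client=None) -> None:
        # The crawler keeps its own copies of the sets...
        self._additional_http_error_status_codes = additional
        self._ignore_http_error_status_codes = ignore
        # ...and the default client is seeded with the same information again,
        # which is the duplication the reviewers propose to factor out later
        # (see #830 and #998).
        self._http_client = http_client or HttpClientSketch(additional, ignore)
```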


# Request router setup
self._router: Router[TCrawlingContext] | None = None
@@ -1155,6 +1189,6 @@ def _is_session_blocked_status_code(self, session: Session | None, status_code:
"""
return session is not None and session.is_blocked_status_code(
status_code=status_code,
additional_blocked_status_codes=self._http_client.additional_blocked_status_codes,
ignore_http_error_status_codes=self._http_client.ignore_http_error_status_codes,
additional_blocked_status_codes=self._additional_http_error_status_codes,
ignore_http_error_status_codes=self._ignore_http_error_status_codes,
)
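A hedged usage sketch of the new consistency check (the `crawlee.http_clients` import path is an assumption, and the first call assumes the client normalizes its arguments to the same sets the crawler stores; the ValueError behavior comes directly from the diff above):

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import HttpxHttpClient

# Matching crawler-level and client-level sets: accepted.
crawler = HttpCrawler(
    additional_http_error_status_codes=[204],
    http_client=HttpxHttpClient(additional_http_error_status_codes=[204]),
)

# Mismatched sets: the BasicCrawler constructor raises ValueError, because
# the crawler and its HTTP client must agree on how status codes are handled.
try:
    HttpCrawler(
        additional_http_error_status_codes=[204],
        http_client=HttpxHttpClient(additional_http_error_status_codes=[500]),
    )
except ValueError as error:
    print(error)
```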
4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_crawler.py
@@ -5,7 +5,7 @@
from bs4 import BeautifulSoup, Tag

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions
from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions

from ._beautifulsoup_crawling_context import BeautifulSoupCrawlingContext
from ._beautifulsoup_parser import BeautifulSoupParser, BeautifulSoupParserType
@@ -58,7 +58,7 @@ def __init__(
self,
*,
parser: BeautifulSoupParserType = 'lxml',
**kwargs: Unpack[HttpCrawlerOptions[BeautifulSoupCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[BeautifulSoupCrawlingContext]],
) -> None:
"""A default constructor.

6 changes: 4 additions & 2 deletions src/crawlee/crawlers/_http/_http_crawler.py
@@ -3,13 +3,15 @@
from typing import TYPE_CHECKING

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions, ParsedHttpCrawlingContext
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, ParsedHttpCrawlingContext

from ._http_parser import NoParser

if TYPE_CHECKING:
from typing_extensions import Unpack

from crawlee.crawlers import BasicCrawlerOptions


@docs_group('Classes')
class HttpCrawler(AbstractHttpCrawler[ParsedHttpCrawlingContext[bytes], bytes, bytes]):
@@ -46,7 +48,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:

def __init__(
self,
**kwargs: Unpack[HttpCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],
**kwargs: Unpack[BasicCrawlerOptions[ParsedHttpCrawlingContext[bytes]]],
) -> None:
"""A default constructor.

4 changes: 2 additions & 2 deletions src/crawlee/crawlers/_parsel/_parsel_crawler.py
@@ -5,7 +5,7 @@
from parsel import Selector

from crawlee._utils.docs import docs_group
from crawlee.crawlers._abstract_http import AbstractHttpCrawler, HttpCrawlerOptions
from crawlee.crawlers import AbstractHttpCrawler, BasicCrawlerOptions

from ._parsel_crawling_context import ParselCrawlingContext
from ._parsel_parser import ParselParser
@@ -56,7 +56,7 @@ async def request_handler(context: ParselCrawlingContext) -> None:

def __init__(
self,
**kwargs: Unpack[HttpCrawlerOptions[ParselCrawlingContext]],
**kwargs: Unpack[BasicCrawlerOptions[ParselCrawlingContext]],
) -> None:
"""A default constructor.

2 changes: 1 addition & 1 deletion tests/unit/crawlers/_http/test_http_crawler.py
@@ -189,7 +189,7 @@ async def test_handles_server_error(
],
)
async def test_stores_cookies(http_client_class: type[HttpClient], httpbin: URL) -> None:
http_client = http_client_class()
http_client = http_client_class(ignore_http_error_status_codes=[401])
visit = Mock()
track_session_usage = Mock()
