Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track in stats which fields from Zyte API automatic extraction are not overridden #202

Merged
merged 11 commits into from
Jul 25, 2024
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changes
=======

N.N.N (YYYY-MM-DD)
------------------

* ``scrapy-zyte-api[provider]`` now requires zyte-common-items >= 0.20.0.
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

* Added the :setting:`ZYTE_API_AUTO_FIELD_STATS` setting.

0.18.4 (2024-06-10)
-------------------

Expand Down
31 changes: 31 additions & 0 deletions docs/reference/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,37 @@ Settings

:ref:`Settings <topics-settings>` for scrapy-zyte-api.

.. setting:: ZYTE_API_AUTO_FIELD_STATS

ZYTE_API_AUTO_FIELD_STATS
=========================

Default: ``False``

Enables stats that indicate which requested fields :ref:`obtained through
scrapy-poet integration <scrapy-poet>` come directly from
:ref:`zyte-api-extract`.

If, for any request, no page object class is used to override
:ref:`zyte-api-extract` fields for a given item type, the following stat is
set:

.. code-block:: python

"scrapy-zyte-api/auto_fields/<item class import path>": "<space-separated field list>"
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

If, for any request, a custom page object class is used to override some
:ref:`zyte-api-extract` fields, the following stat is set:

.. code-block:: python

"scrapy-zyte-api/auto_fields/<override class import path>": (
"<space-separated list of fields not overridden>"
)

.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
whether a field has been overridden or not.

.. setting:: ZYTE_API_AUTOMAP_PARAMS

ZYTE_API_AUTOMAP_PARAMS
Expand Down
97 changes: 83 additions & 14 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from typing import Any, Callable, Dict, List, Optional, Sequence, Set
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast

import attrs
from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.utils.defer import maybe_deferred_to_future
from scrapy_poet import PageObjectInputProvider
from scrapy_poet import InjectionMiddleware, PageObjectInputProvider
from web_poet import (
AnyResponse,
BrowserHtml,
Expand All @@ -13,16 +14,26 @@
HttpResponseHeaders,
)
from web_poet.annotated import AnnotatedInstance
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items import (
Article,
ArticleList,
ArticleNavigation,
AutoArticleListPage,
AutoArticleNavigationPage,
AutoArticlePage,
AutoJobPostingPage,
AutoProductListPage,
AutoProductNavigationPage,
AutoProductPage,
Item,
JobPosting,
Product,
ProductList,
ProductNavigation,
)
from zyte_common_items.fields import is_auto_field

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
from scrapy_zyte_api._annotations import _ActionResult
Expand All @@ -35,6 +46,26 @@
NO_CALLBACK = None


# Maps each supported zyte-common-items item class to the keyword that
# identifies it in Zyte API parameters. The provider also derives the
# matching options key from it as f"{keyword}Options".
_ITEM_KEYWORDS: Dict[type, str] = {
Product: "product",
ProductList: "productList",
ProductNavigation: "productNavigation",
Article: "article",
ArticleList: "articleList",
ArticleNavigation: "articleNavigation",
JobPosting: "jobPosting",
}
# Auto* page object classes imported from zyte-common-items.
# NOTE(review): no use of this set is visible in this part of the file;
# confirm where it is consumed before relying on it.
_AUTO_PAGES: Set[type] = {
AutoArticlePage,
AutoArticleListPage,
AutoArticleNavigationPage,
AutoJobPostingPage,
AutoProductPage,
AutoProductListPage,
AutoProductNavigationPage,
}


class ZyteApiProvider(PageObjectInputProvider):
name = "zyte_api"

Expand All @@ -54,9 +85,55 @@ class ZyteApiProvider(PageObjectInputProvider):
Screenshot,
}

def __init__(self, *args, **kwargs):
    """Initialize the provider and its auto-field tracking state.

    All positional and keyword arguments are forwarded unchanged to the
    parent provider class.
    """
    super().__init__(*args, **kwargs)
    # Classes whose auto-field stat has already been set, so that each
    # class is reported at most once.
    self._tracked_auto_fields = set()
    # Cached value of the ZYTE_API_AUTO_FIELD_STATS setting; None until
    # the setting is first read.
    self._should_track_auto_fields = None
    # Cached scrapy-poet InjectionMiddleware instance; None until it is
    # first looked up.
    self._injection_mw = None

def is_provided(self, type_: Callable) -> bool:
    """Return whether this provider can supply *type_*.

    The ``Annotated`` wrapper, if any, is stripped first, and the decision
    is then delegated to the parent class using the underlying type.
    """
    bare_type = strip_annotated(type_)
    return super().is_provided(bare_type)

def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
    """Record in the crawler stats which fields of an item come from Zyte API.

    Does nothing unless the ``ZYTE_API_AUTO_FIELD_STATS`` setting is enabled
    and *cls*, after stripping any ``Annotated`` wrapper, is one of the item
    classes in ``_ITEM_KEYWORDS``.

    If the scrapy-poet registry maps the item class to a page object class
    for the request URL, the stat is keyed by that page object class and
    lists the fields for which ``is_auto_field`` returns ``True``. Otherwise
    the stat is keyed by the item class itself and lists all of its attrs
    fields. The field list is sorted and space-separated, and each class is
    reported at most once per provider instance.

    :param crawler: crawler whose settings, middlewares and stats are used.
    :param request: request whose URL is used to look up the page object
        class.
    :param cls: dependency type being provided, optionally ``Annotated``.
    :raises RuntimeError: if no ``InjectionMiddleware`` instance can be
        found among the enabled downloader middlewares.
    """
    # Item types may be requested as ``Annotated[Item, ...]``. is_provided()
    # and __call__() both work on the stripped type, so do the same here;
    # otherwise annotated item requests would never be tracked.
    cls = strip_annotated(cls)
    if cls not in _ITEM_KEYWORDS:
        return
    if self._should_track_auto_fields is None:
        # Read the setting only once and cache the result.
        self._should_track_auto_fields = crawler.settings.getbool(
            "ZYTE_API_AUTO_FIELD_STATS", False
        )
    if not self._should_track_auto_fields:
        return
    if self._injection_mw is None:
        try:
            self._injection_mw = crawler.get_downloader_middleware(
                InjectionMiddleware
            )
        except AttributeError:
            # The crawler object has no get_downloader_middleware()
            # (presumably an older Scrapy version), so scan the enabled
            # downloader middlewares manually.
            for component in crawler.engine.downloader.middleware.middlewares:
                if isinstance(component, InjectionMiddleware):
                    self._injection_mw = component
                    break
        if self._injection_mw is None:
            raise RuntimeError(
                "Could not find the InjectionMiddleware among enabled "
                "downloader middlewares. Please, ensure you have properly "
                "configured scrapy-poet."
            )
    # Prefer the page object class registered for this URL and item type;
    # fall back to the item class when there is none.
    cls = self._injection_mw.registry.page_cls_for_item(request.url, cls) or cls
    if cls in self._tracked_auto_fields:
        return
    self._tracked_auto_fields.add(cls)
    if cls in _ITEM_KEYWORDS:
        # No page object class is involved: every item field is automatic.
        auto_fields = set(attrs.fields_dict(cls))
    else:
        auto_fields = {
            field_name
            for field_name in get_fields_dict(cls)
            if is_auto_field(cls, field_name)
        }
    cls_fqn = get_fq_class_name(cls)
    field_list = " ".join(sorted(auto_fields))
    crawler.stats.set_value(f"scrapy-zyte-api/auto_fields/{cls_fqn}", field_list)

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
Expand All @@ -66,6 +143,7 @@ async def __call__( # noqa: C901
http_response = None
screenshot_requested = Screenshot in to_provide
for cls in list(to_provide):
self._track_auto_fields(crawler, request, cast(type, cls))
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
item = self.injector.weak_cache.get(request, {}).get(cls)
if item:
results.append(item)
Expand All @@ -89,15 +167,6 @@ async def __call__( # noqa: C901
return results

html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
item_keywords: Dict[type, str] = {
Product: "product",
ProductList: "productList",
ProductNavigation: "productNavigation",
Article: "article",
ArticleList: "articleList",
ArticleNavigation: "articleNavigation",
JobPosting: "jobPosting",
}

zyte_api_meta = {
**crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"),
Expand Down Expand Up @@ -135,7 +204,7 @@ async def __call__( # noqa: C901
}
)
continue
kw = item_keywords.get(cls_stripped)
kw = _ITEM_KEYWORDS.get(cls_stripped)
if not kw:
continue
item_requested = True
Expand Down Expand Up @@ -165,7 +234,7 @@ async def __call__( # noqa: C901
)

extract_from = None # type: ignore[assignment]
for item_type, kw in item_keywords.items():
for item_type, kw in _ITEM_KEYWORDS.items():
options_name = f"{kw}Options"
if item_type not in to_provide_stripped and options_name in zyte_api_meta:
del zyte_api_meta[options_name]
Expand Down Expand Up @@ -271,7 +340,7 @@ async def __call__( # noqa: C901
result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined]
results.append(result)
continue
kw = item_keywords.get(cls_stripped)
kw = _ITEM_KEYWORDS.get(cls_stripped)
if not kw:
continue
assert issubclass(cls_stripped, Item)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_version():
"andi>=0.6.0",
"scrapy-poet>=0.22.3",
"web-poet>=0.17.0",
"zyte-common-items>=0.8.0",
"zyte-common-items>=0.20.0",
]
},
classifiers=[
Expand Down
Loading