scrapy-plugins · Gallaecio · Jul 25, 2024 · Jun 11, 2024 · Jun 11, 2024 · Jun 12, 2024
diff --git a/docs/reference/settings.rst b/docs/reference/settings.rst
@@ -6,6 +6,38 @@ Settings
 
 :ref:`Settings <topics-settings>` for scrapy-zyte-api.
 
+.. setting:: ZYTE_API_AUTO_FIELD_STATS
+
+ZYTE_API_AUTO_FIELD_STATS
+=========================
+
+Default: ``False``
+
+Enables stats that indicate which requested fields come directly from
+:ref:`zyte-api-extract`.
+
+If for any request no page object class is used to override
+:ref:`zyte-api-extract` fields for a given item type, the following stat is
+set:
+
+.. code-block:: python
+
+    "scrapy-zyte-api/auto_fields/<item class import path>": "<space-separated field list>"
+
+If for any request a custom page object class is used to override some
+:ref:`zyte-api-extract` fields, the following stat is set:
+
+.. code-block:: python
+
+    "scrapy-zyte-api/auto_fields/<override class import path>": (
+        "<space-separated list of fields not overridden>"
+    )
+
+.. note:: If the custom page object class is not a subclass of an
+    ``Auto``-prefixed class from
+    :doc:`zyte-common-items <zyte-common-items:index>`, all fields are assumed
+    to have been overridden, i.e. the stat value is always an empty string.
+
 .. setting:: ZYTE_API_AUTOMAP_PARAMS
 
 ZYTE_API_AUTOMAP_PARAMS

diff --git a/scrapy_zyte_api/providers.py b/scrapy_zyte_api/providers.py
@@ -1,10 +1,11 @@
-from typing import Any, Callable, Dict, List, Optional, Sequence, Set
+from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast
 
+import attrs
 from andi.typeutils import is_typing_annotated, strip_annotated
 from scrapy import Request
 from scrapy.crawler import Crawler
 from scrapy.utils.defer import maybe_deferred_to_future
-from scrapy_poet import PageObjectInputProvider
+from scrapy_poet import InjectionMiddleware, PageObjectInputProvider
 from web_poet import (
     AnyResponse,
     BrowserHtml,
@@ -13,10 +14,19 @@
     HttpResponseHeaders,
 )
 from web_poet.annotated import AnnotatedInstance
+from web_poet.fields import get_fields_dict
+from web_poet.utils import get_fq_class_name
 from zyte_common_items import (
     Article,
     ArticleList,
     ArticleNavigation,
+    AutoArticleListPage,
+    AutoArticleNavigationPage,
+    AutoArticlePage,
+    AutoJobPostingPage,
+    AutoProductListPage,
+    AutoProductNavigationPage,
+    AutoProductPage,
     Item,
     JobPosting,
     Product,
@@ -35,6 +45,35 @@
     NO_CALLBACK = None
 
 
+# Maps each supported item class to its keyword string. The provider uses the
+# keyword to recognize item classes it handles and to derive the matching
+# "<keyword>Options" parameter name.
+_ITEM_KEYWORDS: Dict[type, str] = {
+    Product: "product",
+    ProductList: "productList",
+    ProductNavigation: "productNavigation",
+    Article: "article",
+    ArticleList: "articleList",
+    ArticleNavigation: "articleNavigation",
+    JobPosting: "jobPosting",
+}
+# ``Auto``-prefixed page classes. When a custom page object class has one of
+# these in its MRO, the fields still defined by that ancestor are the ones
+# reported as not overridden in the auto-field stats.
+_AUTO_PAGES: Set[type] = {
+    AutoArticlePage,
+    AutoArticleListPage,
+    AutoArticleNavigationPage,
+    AutoJobPostingPage,
+    AutoProductPage,
+    AutoProductListPage,
+    AutoProductNavigationPage,
+}
+
+
+# https://stackoverflow.com/a/25959545
+def _field_cls(page_cls, field_name):
+    """Return the class in the MRO of *page_cls* that defines *field_name*.
+
+    The MRO is walked in order, so the most specific class whose own
+    ``__dict__`` contains *field_name* is returned.
+
+    :param page_cls: class whose MRO is searched.
+    :param field_name: attribute name to look for.
+    :raises AssertionError: if no class in the MRO defines *field_name*.
+    """
+    for cls in page_cls.__mro__:
+        if field_name in cls.__dict__:
+            return cls
+    # Only used with fields known to exist. Raise explicitly rather than with
+    # ``assert False``: assert statements are removed under ``python -O``,
+    # which would turn this into a silent ``return None``.
+    raise AssertionError(
+        f"{field_name!r} is not defined by any class in the MRO of {page_cls!r}"
+    )
+
+
 class ZyteApiProvider(PageObjectInputProvider):
     name = "zyte_api"
 
@@ -54,9 +93,62 @@ class ZyteApiProvider(PageObjectInputProvider):
         Screenshot,
     }
 
+    def __init__(self, *args, **kwargs):
+        """Initialize the provider and the state used for auto-field stats."""
+        super().__init__(*args, **kwargs)
+        # Classes for which the auto-field stat has already been set.
+        self._tracked_auto_fields = set()
+        # Both are resolved on first use: the InjectionMiddleware instance and
+        # the cached value of the ZYTE_API_AUTO_FIELD_STATS setting.
+        self._injection_mw = self._should_track_auto_fields = None
+
     def is_provided(self, type_: Callable) -> bool:
+        """Return whether *type_* is provided, checked after ``strip_annotated``."""
         return super().is_provided(strip_annotated(type_))
 
+    def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
+        """Set the ``scrapy-zyte-api/auto_fields/<class>`` stat for *cls* once.
+
+        Nothing happens unless *cls* is a key of ``_ITEM_KEYWORDS`` and the
+        ``ZYTE_API_AUTO_FIELD_STATS`` setting is enabled. The class reported is
+        the page object class that the scrapy-poet registry returns for *cls*
+        and the request URL, or *cls* itself when the registry returns nothing.
+        Each reported class is processed at most once per provider instance.
+
+        The stat value is a space-separated, sorted list of field names: every
+        attrs field when the reported class is an item class, otherwise the
+        fields whose defining class is the first ``_AUTO_PAGES`` member in the
+        page class MRO (an empty string when there is no such ancestor).
+
+        :raises RuntimeError: if no ``InjectionMiddleware`` instance is found.
+        """
+        if cls not in _ITEM_KEYWORDS:
+            return
+        if self._should_track_auto_fields is None:
+            # Read the setting only on the first relevant call.
+            enabled = crawler.settings.getbool("ZYTE_API_AUTO_FIELD_STATS", False)
+            self._should_track_auto_fields = enabled
+        if self._should_track_auto_fields is False:
+            return
+
+        if self._injection_mw is None:
+            try:
+                self._injection_mw = crawler.get_downloader_middleware(
+                    InjectionMiddleware
+                )
+            except AttributeError:
+                # Presumably the crawler lacks get_downloader_middleware();
+                # fall back to scanning the enabled downloader middlewares.
+                enabled_mws = crawler.engine.downloader.middleware.middlewares
+                self._injection_mw = next(
+                    (mw for mw in enabled_mws if isinstance(mw, InjectionMiddleware)),
+                    None,
+                )
+            if self._injection_mw is None:
+                raise RuntimeError(
+                    "Could not find the InjectionMiddleware among enabled "
+                    "downloader middlewares. Please, ensure you have properly "
+                    "configured scrapy-poet."
+                )
+
+        registry = self._injection_mw.registry
+        target = registry.page_cls_for_item(request.url, cls) or cls
+        if target in self._tracked_auto_fields:
+            return
+        self._tracked_auto_fields.add(target)
+
+        if target in _ITEM_KEYWORDS:
+            # No page object class overrides this item type.
+            auto_fields = set(attrs.fields_dict(target))
+        else:
+            auto_page = next(
+                (base for base in target.__mro__ if base in _AUTO_PAGES), None
+            )
+            auto_fields = set()
+            if auto_page:
+                # A field counts as "auto" when the class defining it is the
+                # Auto page ancestor itself, i.e. no subclass redefined it.
+                auto_fields = {
+                    name
+                    for name in get_fields_dict(target)
+                    if _field_cls(target, name) is auto_page
+                }
+
+        stat_key = f"scrapy-zyte-api/auto_fields/{get_fq_class_name(target)}"
+        crawler.stats.set_value(stat_key, " ".join(sorted(auto_fields)))
+
     async def __call__(  # noqa: C901
         self, to_provide: Set[Callable], request: Request, crawler: Crawler
     ) -> Sequence[Any]:
@@ -66,6 +158,7 @@ async def __call__(  # noqa: C901
         http_response = None
         screenshot_requested = Screenshot in to_provide
         for cls in list(to_provide):
+            self._track_auto_fields(crawler, request, cast(type, cls))
             item = self.injector.weak_cache.get(request, {}).get(cls)
             if item:
                 results.append(item)
@@ -89,15 +182,6 @@ async def __call__(  # noqa: C901
             return results
 
         html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
-        item_keywords: Dict[type, str] = {
-            Product: "product",
-            ProductList: "productList",
-            ProductNavigation: "productNavigation",
-            Article: "article",
-            ArticleList: "articleList",
-            ArticleNavigation: "articleNavigation",
-            JobPosting: "jobPosting",
-        }
 
         zyte_api_meta = {
             **crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"),
@@ -135,7 +219,7 @@ async def __call__(  # noqa: C901
                         }
                     )
                 continue
-            kw = item_keywords.get(cls_stripped)
+            kw = _ITEM_KEYWORDS.get(cls_stripped)
             if not kw:
                 continue
             item_requested = True
@@ -165,7 +249,7 @@ async def __call__(  # noqa: C901
         )
 
         extract_from = None  # type: ignore[assignment]
-        for item_type, kw in item_keywords.items():
+        for item_type, kw in _ITEM_KEYWORDS.items():
             options_name = f"{kw}Options"
             if item_type not in to_provide_stripped and options_name in zyte_api_meta:
                 del zyte_api_meta[options_name]
@@ -271,7 +355,7 @@ async def __call__(  # noqa: C901
                 result = AnnotatedInstance(Actions(actions_result), cls.__metadata__)  # type: ignore[attr-defined]
                 results.append(result)
                 continue
-            kw = item_keywords.get(cls_stripped)
+            kw = _ITEM_KEYWORDS.get(cls_stripped)
             if not kw:
                 continue
             assert issubclass(cls_stripped, Item)

diff --git a/setup.py b/setup.py
@@ -33,7 +33,7 @@ def get_version():
             "andi>=0.6.0",
             "scrapy-poet>=0.22.3",
             "web-poet>=0.17.0",
-            "zyte-common-items>=0.8.0",
+            "zyte-common-items>=0.19.0",
         ]
     },
     classifiers=[