Skip to content

Commit

Permalink
Track in stats which fields from Zyte API automatic extraction are no…
Browse files Browse the repository at this point in the history
…t overridden (#202)
  • Loading branch information
Gallaecio authored Jul 25, 2024
1 parent 42e81f6 commit 055a5a6
Show file tree
Hide file tree
Showing 6 changed files with 702 additions and 17 deletions.
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changes
=======

N.N.N (YYYY-MM-DD)
------------------

* ``scrapy-zyte-api[provider]`` now requires zyte-common-items >= 0.20.0.

* Added the :setting:`ZYTE_API_AUTO_FIELD_STATS` setting.

0.21.0 (2024-07-02)
-------------------

Expand Down
34 changes: 34 additions & 0 deletions docs/reference/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,40 @@ Settings

:ref:`Settings <topics-settings>` for scrapy-zyte-api.

.. setting:: ZYTE_API_AUTO_FIELD_STATS

ZYTE_API_AUTO_FIELD_STATS
=========================

Default: ``False``

Enables stats that indicate which requested fields :ref:`obtained through
scrapy-poet integration <scrapy-poet>` come directly from
:ref:`zyte-api-extract`.

If for any request no page object class is used to override
:ref:`zyte-api-extract` fields for a given item type, the following stat is
set:

.. code-block:: python

    "scrapy-zyte-api/auto_fields/<item class import path>": "(all fields)"

.. note:: A literal ``(all fields)`` string is used as value, not a list with
    all fields.

If for any request a custom page object class is used to override some
:ref:`zyte-api-extract` fields, the following stat is set:

.. code-block:: python

    "scrapy-zyte-api/auto_fields/<override class import path>": (
        "<space-separated list of fields not overridden>"
    )

.. note:: :func:`zyte_common_items.fields.is_auto_field` is used to determine
    whether a field has been overridden or not.

.. setting:: ZYTE_API_AUTOMAP_PARAMS

ZYTE_API_AUTOMAP_PARAMS
Expand Down
77 changes: 64 additions & 13 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Callable, Dict, List, Optional, Sequence, Set
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast

from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
Expand All @@ -13,16 +13,26 @@
HttpResponseHeaders,
)
from web_poet.annotated import AnnotatedInstance
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items import (
Article,
ArticleList,
ArticleNavigation,
AutoArticleListPage,
AutoArticleNavigationPage,
AutoArticlePage,
AutoJobPostingPage,
AutoProductListPage,
AutoProductNavigationPage,
AutoProductPage,
Item,
JobPosting,
Product,
ProductList,
ProductNavigation,
)
from zyte_common_items.fields import is_auto_field

from scrapy_zyte_api import Actions, ExtractFrom, Geolocation, Screenshot
from scrapy_zyte_api._annotations import _ActionResult
Expand All @@ -35,6 +45,26 @@
NO_CALLBACK = None


# Maps each supported zyte-common-items item class to the keyword string used
# for it by the provider: the keyword is looked up per requested class in
# ``ZyteApiProvider.__call__`` and is also the prefix of the matching
# ``<keyword>Options`` entry in the Zyte API request parameters.
_ITEM_KEYWORDS: Dict[type, str] = {
    Product: "product",
    ProductList: "productList",
    ProductNavigation: "productNavigation",
    Article: "article",
    ArticleList: "articleList",
    ArticleNavigation: "articleNavigation",
    JobPosting: "jobPosting",
}
# The ``Auto*Page`` page object classes imported from zyte_common_items.
# NOTE(review): no use of this set is visible in this part of the file;
# confirm where (or whether) it is consumed.
_AUTO_PAGES: Set[type] = {
    AutoArticlePage,
    AutoArticleListPage,
    AutoArticleNavigationPage,
    AutoJobPostingPage,
    AutoProductPage,
    AutoProductListPage,
    AutoProductNavigationPage,
}


class ZyteApiProvider(PageObjectInputProvider):
name = "zyte_api"

Expand All @@ -54,9 +84,38 @@ class ZyteApiProvider(PageObjectInputProvider):
Screenshot,
}

def __init__(self, *args, **kwargs):
    """Initialise the provider and the state used for auto-field stats.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super().__init__(*args, **kwargs)
    # Classes whose auto-field stat has already been written, so that each
    # class is reported at most once.
    self._tracked_auto_fields: Set[type] = set()
    # Cached value of the ZYTE_API_AUTO_FIELD_STATS setting; None means the
    # setting has not been read yet.
    self._should_track_auto_fields: Optional[bool] = None

def is_provided(self, type_: Callable) -> bool:
    """Return whether this provider can provide *type_*.

    The type is passed through ``strip_annotated`` first, and the result is
    what gets checked by the parent class implementation.
    """
    bare_type = strip_annotated(type_)
    return super().is_provided(bare_type)

def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
    """Record in the crawler stats which fields of *cls* are automatic.

    Nothing happens unless *cls* is a key of ``_ITEM_KEYWORDS`` and the
    ``ZYTE_API_AUTO_FIELD_STATS`` setting is enabled. The setting is read
    once and cached on the instance.

    The stat ``scrapy-zyte-api/auto_fields/<fully qualified class name>`` is
    set at most once per class. Its value is the literal string
    ``"(all fields)"`` when the class being reported is itself an item
    class, and otherwise a space-separated, sorted list of the field names
    for which ``is_auto_field`` returns a true value.
    """
    if cls not in _ITEM_KEYWORDS:
        return

    # Resolve the setting lazily, on the first call that reaches this point.
    enabled = self._should_track_auto_fields
    if enabled is None:
        enabled = crawler.settings.getbool("ZYTE_API_AUTO_FIELD_STATS", False)
        self._should_track_auto_fields = enabled
    if enabled is False:
        return

    # Prefer whatever the registry returns for this URL and item class;
    # fall back to the item class itself when that result is falsy.
    # NOTE(review): presumably this is the page object class registered to
    # produce the item for the URL -- confirm against scrapy-poet.
    reported_cls = self.injector.registry.page_cls_for_item(request.url, cls) or cls
    if reported_cls in self._tracked_auto_fields:
        return
    self._tracked_auto_fields.add(reported_cls)

    if reported_cls in _ITEM_KEYWORDS:
        stat_value = "(all fields)"
    else:
        stat_value = " ".join(
            sorted(
                name
                for name in get_fields_dict(reported_cls)
                if is_auto_field(reported_cls, name)  # type: ignore[arg-type]
            )
        )
    stat_key = f"scrapy-zyte-api/auto_fields/{get_fq_class_name(reported_cls)}"
    crawler.stats.set_value(stat_key, stat_value)

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
Expand All @@ -66,6 +125,7 @@ async def __call__( # noqa: C901
http_response = None
screenshot_requested = Screenshot in to_provide
for cls in list(to_provide):
self._track_auto_fields(crawler, request, cast(type, cls))
item = self.injector.weak_cache.get(request, {}).get(cls)
if item:
results.append(item)
Expand All @@ -89,15 +149,6 @@ async def __call__( # noqa: C901
return results

html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
item_keywords: Dict[type, str] = {
Product: "product",
ProductList: "productList",
ProductNavigation: "productNavigation",
Article: "article",
ArticleList: "articleList",
ArticleNavigation: "articleNavigation",
JobPosting: "jobPosting",
}

zyte_api_meta = {
**crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"),
Expand Down Expand Up @@ -135,7 +186,7 @@ async def __call__( # noqa: C901
}
)
continue
kw = item_keywords.get(cls_stripped)
kw = _ITEM_KEYWORDS.get(cls_stripped)
if not kw:
continue
item_requested = True
Expand Down Expand Up @@ -165,7 +216,7 @@ async def __call__( # noqa: C901
)

extract_from = None # type: ignore[assignment]
for item_type, kw in item_keywords.items():
for item_type, kw in _ITEM_KEYWORDS.items():
options_name = f"{kw}Options"
if item_type not in to_provide_stripped and options_name in zyte_api_meta:
del zyte_api_meta[options_name]
Expand Down Expand Up @@ -271,7 +322,7 @@ async def __call__( # noqa: C901
result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined]
results.append(result)
continue
kw = item_keywords.get(cls_stripped)
kw = _ITEM_KEYWORDS.get(cls_stripped)
if not kw:
continue
assert issubclass(cls_stripped, Item)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_version():
"andi>=0.6.0",
"scrapy-poet>=0.22.3",
"web-poet>=0.17.0",
"zyte-common-items>=0.8.0",
"zyte-common-items>=0.20.0",
]
},
classifiers=[
Expand Down
Loading

0 comments on commit 055a5a6

Please sign in to comment.