Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track in stats which fields from Zyte API automatic extraction are not overridden #202

Merged
merged 11 commits into from
Jul 25, 2024
105 changes: 91 additions & 14 deletions scrapy_zyte_api/providers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from typing import Any, Callable, Dict, List, Optional, Sequence, Set
from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Type, cast

import attrs
from andi.typeutils import is_typing_annotated, strip_annotated
from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.utils.defer import maybe_deferred_to_future
from scrapy_poet import PageObjectInputProvider
from scrapy_poet import InjectionMiddleware, PageObjectInputProvider
from web_poet import (
AnyResponse,
BrowserHtml,
Expand All @@ -13,10 +14,19 @@
HttpResponseHeaders,
)
from web_poet.annotated import AnnotatedInstance
from web_poet.fields import get_fields_dict
from web_poet.utils import get_fq_class_name
from zyte_common_items import (
Article,
ArticleList,
ArticleNavigation,
AutoArticleListPage,
AutoArticleNavigationPage,
AutoArticlePage,
AutoJobPostingPage,
AutoProductListPage,
AutoProductNavigationPage,
AutoProductPage,
Item,
JobPosting,
Product,
Expand All @@ -35,6 +45,35 @@
NO_CALLBACK = None


# Maps each supported zyte-common-items item class to its keyword string.
# The provider looks item classes up here to decide whether a requested type
# is an item, and derives the matching "<keyword>Options" parameter name from
# the keyword.
_ITEM_KEYWORDS: Dict[type, str] = {
Product: "product",
ProductList: "productList",
ProductNavigation: "productNavigation",
Article: "article",
ArticleList: "articleList",
ArticleNavigation: "articleNavigation",
JobPosting: "jobPosting",
}
# Auto* page object classes from zyte-common-items. When tracking auto fields,
# the MRO of a page class is searched for the first ancestor found in this set.
_AUTO_PAGES: Set[type] = {
AutoArticlePage,
AutoArticleListPage,
AutoArticleNavigationPage,
AutoJobPostingPage,
AutoProductPage,
AutoProductListPage,
AutoProductNavigationPage,
}


# https://stackoverflow.com/a/25959545
def _field_cls(page_cls, field_name):
for cls in page_cls.__mro__:
if field_name in cls.__dict__:
return cls
# Only used with fields known to exist
assert False # noqa: B011


class ZyteApiProvider(PageObjectInputProvider):
name = "zyte_api"

Expand All @@ -54,9 +93,55 @@ class ZyteApiProvider(PageObjectInputProvider):
Screenshot,
}

def __init__(self, *args, **kwargs):
    """Initialize the provider and its auto-field tracking state.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initializer.
    """
    super().__init__(*args, **kwargs)
    # Classes whose auto fields have already been reported to the stats.
    self._tracked_auto_fields = set()
    # InjectionMiddleware instance; resolved on first use by
    # _track_auto_fields().
    self._injection_mw = None

def is_provided(self, type_: Callable) -> bool:
    """Return whether the parent class reports *type_* as provided.

    *type_* is passed through ``strip_annotated()`` first, and the result is
    what gets checked by the parent implementation.
    """
    stripped_type = strip_annotated(type_)
    return super().is_provided(stripped_type)

def _track_auto_fields(self, crawler: Crawler, request: Request, cls: Type):
    """Record in the crawler stats which fields of *cls* are auto fields.

    Nothing happens unless *cls* is one of the item classes in
    ``_ITEM_KEYWORDS``. The scrapy-poet registry is asked for the page class
    that produces *cls* for the request URL; when there is none, *cls* itself
    is used. Each resulting class is reported only once per provider.

    For an item class, every attrs field is reported. For a page class, only
    the fields whose winning definition lives on its first ``_AUTO_PAGES``
    ancestor are reported. The result is stored as a space-separated, sorted
    string under ``scrapy-zyte-api/auto_fields/<fully qualified class name>``.

    Args:
        crawler: Crawler used to find the InjectionMiddleware and to write
            the stat.
        request: Request whose URL is used for the registry lookup.
        cls: The requested type.

    Raises:
        RuntimeError: If no InjectionMiddleware can be found.
    """
    if cls not in _ITEM_KEYWORDS:
        return

    if self._injection_mw is None:
        try:
            self._injection_mw = crawler.get_downloader_middleware(
                InjectionMiddleware
            )
        except AttributeError:
            # The crawler has no get_downloader_middleware(); scan the
            # enabled downloader middlewares instead.
            self._injection_mw = next(
                (
                    mw
                    for mw in crawler.engine.downloader.middleware.middlewares
                    if isinstance(mw, InjectionMiddleware)
                ),
                None,
            )
        if self._injection_mw is None:
            raise RuntimeError(
                "Could not find the InjectionMiddleware among enabled "
                "downloader middlewares. Please, ensure you have properly "
                "configured scrapy-poet."
            )

    registry = self._injection_mw.registry
    target_cls = registry.page_cls_for_item(request.url, cls) or cls
    if target_cls in self._tracked_auto_fields:
        return
    self._tracked_auto_fields.add(target_cls)

    if target_cls in _ITEM_KEYWORDS:
        # No page class is registered for this item: report all item fields.
        field_names = set(attrs.fields_dict(target_cls))
    else:
        auto_base = next(
            (klass for klass in target_cls.__mro__ if klass in _AUTO_PAGES),
            None,
        )
        # A field counts as "auto" only while the auto page ancestor still
        # provides its winning definition.
        field_names = (
            {
                name
                for name in get_fields_dict(target_cls)
                if _field_cls(target_cls, name) is auto_base
            }
            if auto_base
            else set()
        )

    stat_key = f"scrapy-zyte-api/auto_fields/{get_fq_class_name(target_cls)}"
    crawler.stats.set_value(stat_key, " ".join(sorted(field_names)))

async def __call__( # noqa: C901
self, to_provide: Set[Callable], request: Request, crawler: Crawler
) -> Sequence[Any]:
Expand All @@ -66,6 +151,7 @@ async def __call__( # noqa: C901
http_response = None
screenshot_requested = Screenshot in to_provide
for cls in list(to_provide):
self._track_auto_fields(crawler, request, cast(type, cls))
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved
item = self.injector.weak_cache.get(request, {}).get(cls)
if item:
results.append(item)
Expand All @@ -89,15 +175,6 @@ async def __call__( # noqa: C901
return results

html_requested = BrowserResponse in to_provide or BrowserHtml in to_provide
item_keywords: Dict[type, str] = {
Product: "product",
ProductList: "productList",
ProductNavigation: "productNavigation",
Article: "article",
ArticleList: "articleList",
ArticleNavigation: "articleNavigation",
JobPosting: "jobPosting",
}

zyte_api_meta = {
**crawler.settings.getdict("ZYTE_API_PROVIDER_PARAMS"),
Expand Down Expand Up @@ -135,7 +212,7 @@ async def __call__( # noqa: C901
}
)
continue
kw = item_keywords.get(cls_stripped)
kw = _ITEM_KEYWORDS.get(cls_stripped)
if not kw:
continue
item_requested = True
Expand Down Expand Up @@ -165,7 +242,7 @@ async def __call__( # noqa: C901
)

extract_from = None # type: ignore[assignment]
for item_type, kw in item_keywords.items():
for item_type, kw in _ITEM_KEYWORDS.items():
options_name = f"{kw}Options"
if item_type not in to_provide_stripped and options_name in zyte_api_meta:
del zyte_api_meta[options_name]
Expand Down Expand Up @@ -271,7 +348,7 @@ async def __call__( # noqa: C901
result = AnnotatedInstance(Actions(actions_result), cls.__metadata__) # type: ignore[attr-defined]
results.append(result)
continue
kw = item_keywords.get(cls_stripped)
kw = _ITEM_KEYWORDS.get(cls_stripped)
if not kw:
continue
assert issubclass(cls_stripped, Item)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def get_version():
"andi>=0.6.0",
"scrapy-poet>=0.22.3",
"web-poet>=0.17.0",
"zyte-common-items>=0.8.0",
"zyte-common-items>=0.19.0",
]
},
classifiers=[
Expand Down
Loading