From 766ed5a1536f191c11635323976652af14c0e1a6 Mon Sep 17 00:00:00 2001 From: Felipe Tiago Date: Wed, 27 Mar 2024 19:53:48 -0300 Subject: [PATCH 1/2] feat: add article spider, docs and tests --- docs/index.rst | 1 + docs/templates/article.rst | 24 + docs/templates/index.rst | 3 + .../test_article_navigation_heuristics.py | 147 ++++ .../test_product_navigation_heuristics.py | 14 +- tests/test_article.py | 639 ++++++++++++++++++ tests/test_heuristics.py | 130 ++-- zyte_spider_templates/__init__.py | 1 + zyte_spider_templates/heuristics.py | 148 ++-- .../page_objects/__init__.py | 2 +- .../article_navigation_heuristics.py | 1 + zyte_spider_templates/pages/__init__.py | 1 + .../pages/article_navigation_heuristics.py | 75 ++ .../pages/product_navigation_heuristics.py | 37 +- zyte_spider_templates/spiders/article.py | 249 +++++++ 15 files changed, 1352 insertions(+), 120 deletions(-) create mode 100644 docs/templates/article.rst create mode 100644 tests/pages/test_article_navigation_heuristics.py create mode 100644 tests/test_article.py create mode 100644 zyte_spider_templates/page_objects/article_navigation_heuristics.py create mode 100644 zyte_spider_templates/pages/article_navigation_heuristics.py create mode 100644 zyte_spider_templates/spiders/article.py diff --git a/docs/index.rst b/docs/index.rst index d344faa..2829d03 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -18,6 +18,7 @@ zyte-spider-templates documentation templates/index E-commerce + Article .. toctree:: :caption: Customization diff --git a/docs/templates/article.rst b/docs/templates/article.rst new file mode 100644 index 0000000..d7d4f17 --- /dev/null +++ b/docs/templates/article.rst @@ -0,0 +1,24 @@ +.. _article: + +========================================== +Article spider template (``article``) +========================================== + +Basic use +========= + +.. code-block:: shell + + scrapy crawl article -a url="https://quotes.toscrape.com/" + +Parameters +========== + +.. autopydantic_model:: zyte_spider_templates.spiders.article.ArticleSpiderParams + :inherited-members: BaseModel + +.. autoenum:: zyte_spider_templates.spiders.article.ArticleCrawlStrategy + +.. autoenum:: zyte_spider_templates.spiders.base.ExtractFrom + +.. autoenum:: zyte_spider_templates.spiders.base.Geolocation diff --git a/docs/templates/index.rst b/docs/templates/index.rst index c70a7de..afaed04 100644 --- a/docs/templates/index.rst +++ b/docs/templates/index.rst @@ -29,3 +29,6 @@ Spider template list :ref:`E-commerce ` Get products from an e-commerce website. + +:ref:`Article
` + Get articles from an article website. diff --git a/tests/pages/test_article_navigation_heuristics.py b/tests/pages/test_article_navigation_heuristics.py new file mode 100644 index 0000000..9352829 --- /dev/null +++ b/tests/pages/test_article_navigation_heuristics.py @@ -0,0 +1,147 @@ +import pytest +from pytest_twisted import ensureDeferred +from web_poet import AnyResponse, HttpResponse, PageParams, RequestUrl +from zyte_common_items import ArticleNavigation, ProbabilityRequest + +from zyte_spider_templates.pages.article_navigation_heuristics import ( + HeuristicsArticleNavigationPage, +) + + +@ensureDeferred +async def test_unknown_article_page(): + body = b""" + + +
+            <h1>Categories</h1>
+            <div>
+                <a href="https://example.com/category/news">News</a>
+                <a href="https://example.com/category/sports">Sports</a>
+            </div>
+        </div>
+        <div>
+            <h1>Articles</h1>
+            <div>
+                <a href="https://example.com/article?id=breaking-news">Breaking News</a>
+                <a href="https://example.com/article?id=latest-scores">Latest Scores</a>
+                <a href="https://example.com/page-2">Next Page</a>
+            </div>
+        </div>
+        <div>
+ Probably Relevant? + + + + """ + response = AnyResponse(HttpResponse("https://example.com", body)) + navigation = ArticleNavigation.from_dict( + { + "url": "https://example.com", + "subCategories": [ + {"url": "https://example.com/category/news", "name": "News"}, + {"url": "https://example.com/category/sports", "name": "Sports"}, + ], + "items": [ + { + "url": "https://example.com/article?id=breaking-news", + "name": "Breaking News", + }, + { + "url": "https://example.com/article?id=latest-scores", + "name": "Latest Scores", + }, + ], + "nextPage": { + "url": "https://example.com/page-2", + "name": "Next Page", + }, + "metadata": {"dateDownloaded": "2024-01-09T14:37:58Z"}, + } + ) + all_valid_urls = [ + "https://example.com/category/news", + "https://example.com/category/sports", + "https://example.com/article?id=breaking-news", + "https://example.com/article?id=latest-scores", + "https://example.com/page-2", + ] + urls_subcategories = [ + ProbabilityRequest.from_dict( + {"url": "https://example.com/category/news", "name": "News"} + ), + ProbabilityRequest.from_dict( + {"url": "https://example.com/category/sports", "name": "Sports"} + ), + ] + + # Heuristics turned OFF + request_url = RequestUrl(response.url) + page_params = PageParams({"allow_domains": "example.com"}) + page = HeuristicsArticleNavigationPage( + request_url, navigation, response, page_params + ) + item = await page.to_item() + + assert item.subCategories == urls_subcategories + assert page._urls_for_navigation() == all_valid_urls + + # Heuristics turned ON + page_params = PageParams({"full_domain": "example.com"}) + page = HeuristicsArticleNavigationPage( + request_url, navigation, response, page_params + ) + item = await page.to_item() + + assert item.subCategories == urls_subcategories + [ + ProbabilityRequest.from_dict( + { + "url": "https://example.com/category/probably-relevant", + "name": "[heuristics] Probably Relevant?", + "metadata": {"probability": 0.1}, + } + ) + ] + assert page._urls_for_navigation() == all_valid_urls + + +@ensureDeferred +async def test_crawl_nofollow_links(): + page_params = PageParams({"full_domain": "example.com"}) + body = b""" + + + + + + """ + url = "https://example.com" + response = AnyResponse(HttpResponse(url, body)) + request_url = RequestUrl(response.url) + navigation = ArticleNavigation(url=url) + + page = HeuristicsArticleNavigationPage( + request_url, navigation, response, page_params + ) + assert [req.url for req in page.subCategories] == ["https://example.com/can-follow"] + + +def test_deprecated_page_objects(): + with pytest.warns(DeprecationWarning, match="page_objects"): + from zyte_spider_templates.page_objects import ( # noqa: F401 + HeuristicsArticleNavigationPage, + ) + + # We cannot test the warning again because duplicate warnings are ignored, + # but we still want to ensure that we can import the class. 
+ from zyte_spider_templates.page_objects.article_navigation_heuristics import ( # noqa: F401, F811 + HeuristicsArticleNavigationPage, + ) diff --git a/tests/pages/test_product_navigation_heuristics.py b/tests/pages/test_product_navigation_heuristics.py index 9fd4250..f4a516d 100644 --- a/tests/pages/test_product_navigation_heuristics.py +++ b/tests/pages/test_product_navigation_heuristics.py @@ -82,7 +82,7 @@ async def test_unknown_product_page(): item = await page.to_item() assert item.subCategories == urls_subcategories - assert page._urls_for_category() == all_valid_urls + assert page._urls_for_navigation() == all_valid_urls # Heuristics turned ON page_params = PageParams({"full_domain": "example.com"}) @@ -100,7 +100,7 @@ async def test_unknown_product_page(): } ) ] - assert page._urls_for_category() == all_valid_urls + assert page._urls_for_navigation() == all_valid_urls @ensureDeferred @@ -129,10 +129,12 @@ async def test_crawl_nofollow_links(): def test_deprecated_page_objects(): - with pytest.warns(DeprecationWarning, match="page_objects"): - from zyte_spider_templates.page_objects import ( # noqa: F401 - HeuristicsProductNavigationPage, - ) + + # We cannot test this warning as it will be ignored after the test run for aticles + # with pytest.warns(DeprecationWarning, match="page_objects"): + # from zyte_spider_templates.page_objects import ( # noqa: F401 + # HeuristicsProductNavigationPage, + # ) # We cannot test the warning again because duplicate warnings are ignored, # but we still want to ensure that we can import the class. diff --git a/tests/test_article.py b/tests/test_article.py new file mode 100644 index 0000000..8f1a871 --- /dev/null +++ b/tests/test_article.py @@ -0,0 +1,639 @@ +import logging +import re +from unittest.mock import MagicMock, call + +import pytest +import scrapy +from pydantic import ValidationError +from scrapy_poet import DummyResponse +from scrapy_spider_metadata import get_spider_metadata +from zyte_common_items import ProbabilityRequest, Article, ArticleNavigation, Request + +from zyte_spider_templates import BaseSpiderParams +from zyte_spider_templates._geolocations import ( + GEOLOCATION_OPTIONS, + GEOLOCATION_OPTIONS_WITH_CODE, + Geolocation, +) +from zyte_spider_templates.spiders.article import ( + ArticleCrawlStrategy, + ArticleSpider, +) + +from . 
import get_crawler +from .test_utils import URL_TO_DOMAIN + + +def test_parameters(): + with pytest.raises(ValidationError): + ArticleSpider() + + ArticleSpider(url="https://example.com") + ArticleSpider( + url="https://example.com", crawl_strategy=ArticleCrawlStrategy.full + ) + ArticleSpider(url="https://example.com", crawl_strategy="full") + + with pytest.raises(ValidationError): + ArticleSpider(url="https://example.com", crawl_strategy="unknown") + + +def test_start_requests(): + crawler = get_crawler() + url = "https://example.com" + spider = ArticleSpider.from_crawler(crawler, url=url) + requests = list(spider.start_requests()) + assert len(requests) == 1 + assert requests[0].url == url + assert requests[0].callback == spider.parse_navigation + + +def test_crawl(): + subcategory_urls = [ + "https://example.com/category/news", + "https://example.com/category/sports", + ] + nextpage_url = "https://example.com/category/news?p=2" + article_urls = [ + "https://example.com/article?id=breaking-news", + "https://example.com/article?id=latest-scores", + ] + request = scrapy.Request("https://example.com") + response = DummyResponse(url=subcategory_urls[0], request=request) + + subcategories = { + "subCategories": [ + {"url": subcategory_urls[0], "metadata": {"probability": 0.95}}, + {"url": subcategory_urls[1], "metadata": {"probability": 0.78}}, + ], + } + nextpage = {"nextPage": {"url": nextpage_url}} + articles = { + "items": [ + {"url": article_urls[0], "metadata": {"probability": 0.99}}, + {"url": article_urls[1], "metadata": {"probability": 0.83}}, + ], + } + + url = subcategory_urls[0] + spider = ArticleSpider(url="https://example.com/") + + # no links found + navigation = ArticleNavigation.from_dict({"url": url}) + requests = list(spider.parse_navigation(response, navigation)) + assert len(requests) == 0 + + # subcategories only + navigation = ArticleNavigation.from_dict({"url": url, **subcategories}) + requests = list(spider.parse_navigation(response, navigation)) + assert len(requests) == 2 + assert requests[0].url == subcategory_urls[0] + assert requests[0].callback == spider.parse_navigation + assert requests[0].priority == 95 + assert requests[1].url == subcategory_urls[1] + assert requests[1].callback == spider.parse_navigation + assert requests[1].priority == 78 + + # subcategories + nextpage + navigation = ArticleNavigation.from_dict( + { + "url": url, + **subcategories, + **nextpage, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + assert len(requests) == 2 + urls = {request.url for request in requests} + assert urls == {*subcategory_urls} + assert all(request.callback == spider.parse_navigation for request in requests) + assert [request.priority for request in requests] == [95, 78] + + # subcategories + nextpage + articles + navigation = ArticleNavigation.from_dict( + { + "url": url, + **subcategories, + **nextpage, + **articles, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + urls = {request.url for request in requests} + assert urls == {*article_urls, *subcategory_urls, nextpage_url} + for request in requests: + if request.url in article_urls: + assert request.callback == spider.parse_article + else: + assert request.callback == spider.parse_navigation + assert [request.priority for request in requests] == [199, 183, 100, 95, 78] + + # nextpage + articles + navigation = ArticleNavigation.from_dict( + { + "url": url, + **nextpage, + **articles, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + 
assert len(requests) == 3 + assert requests[0].url == article_urls[0] + assert requests[0].callback == spider.parse_article + assert requests[1].url == article_urls[1] + assert requests[1].callback == spider.parse_article + assert requests[2].url == nextpage_url + assert requests[2].callback == spider.parse_navigation + assert [request.priority for request in requests] == [199, 183, 100] + + # subcategories + articles + navigation = ArticleNavigation.from_dict( + { + "url": url, + **subcategories, + **articles, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + assert len(requests) == 4 + assert requests[0].url == article_urls[0] + assert requests[0].callback == spider.parse_article + assert requests[1].url == article_urls[1] + assert requests[1].callback == spider.parse_article + assert requests[2].url == subcategory_urls[0] + assert requests[2].callback == spider.parse_navigation + assert requests[3].url == subcategory_urls[1] + assert requests[3].callback == spider.parse_navigation + assert [request.priority for request in requests] == [199, 183, 95, 78] + + # nextpage + navigation = ArticleNavigation.from_dict( + { + "url": url, + **nextpage, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + assert len(requests) == 0 + + # articles + navigation = ArticleNavigation.from_dict( + { + "url": url, + **articles, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + assert len(requests) == 2 + assert requests[0].url == article_urls[0] + assert requests[0].callback == spider.parse_article + assert requests[1].url == article_urls[1] + assert requests[1].callback == spider.parse_article + assert [request.priority for request in requests] == [199, 183] + + # Test parse_navigation() behavior on pagination_only crawl strategy. + spider = ArticleSpider( + url="https://example.com/", crawl_strategy="pagination_only" + ) + + # nextpage + articles + navigation = ArticleNavigation.from_dict( + { + "url": url, + **subcategories, + **nextpage, + **articles, + } + ) + requests = list(spider.parse_navigation(response, navigation)) + urls = {request.url for request in requests} + assert urls == {*article_urls, nextpage_url} + for request in requests: + if request.url in article_urls: + assert request.callback == spider.parse_article + else: + assert request.callback == spider.parse_navigation + + +@pytest.mark.parametrize( + "probability,has_article,article_drop", + ((0.9, True, False), (0.09, False, True), (0.1, True, False), (None, True, False)), +) +def test_parse_article(probability, has_article, article_drop, caplog): + caplog.clear() + + article_url = "https://example.com/article?id=breaking-news" + article = ProbabilityRequest.from_dict( + {"url": article_url, "metadata": {"probability": probability}} + ) + response = DummyResponse(article_url) + spider = ArticleSpider(url="https://example.com") + mock_crawler = MagicMock() + spider.crawler = mock_crawler + logging.getLogger().setLevel(logging.INFO) + articles = list(spider.parse_article(response, article)) + if article_drop: + assert mock_crawler.method_calls == [ + call.stats.inc_value("drop_item/article/low_probability") + ] + + if has_article: + assert len(articles) == 1 + assert articles[0] == article + assert caplog.text == "" + else: + assert len(articles) == 0 + assert str(article) in caplog.text + + +def test_arguments(): + # Ensure passing no arguments works. + crawler = get_crawler() + + # Needed since it's a required argument. 
+ base_kwargs = {"url": "https://example.com"} + + ArticleSpider.from_crawler(crawler, **base_kwargs) + + for param, arg, setting, old_setting_value, getter_name, new_setting_value in ( + ("max_requests", "123", "ZYTE_API_MAX_REQUESTS", None, "getint", 123), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_AUTOMAP_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + {"geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"browserHtml": true}', + "getdict", + {"browserHtml": True, "geolocation": "DE"}, + ), + ( + "geolocation", + "DE", + "ZYTE_API_PROVIDER_PARAMS", + '{"geolocation": "IE"}', + "getdict", + {"geolocation": "DE"}, + ), + ( + "extract_from", + "browserHtml", + "ZYTE_API_PROVIDER_PARAMS", + None, + "getdict", + { + "articleOptions": {"extractFrom": "browserHtml"}, + "articleNavigationOptions": {"extractFrom": "browserHtml"}, + }, + ), + ( + "extract_from", + "httpResponseBody", + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + { + "articleOptions": {"extractFrom": "httpResponseBody"}, + "articleNavigationOptions": {"extractFrom": "httpResponseBody"}, + "geolocation": "US", + }, + ), + ( + "extract_from", + None, + "ZYTE_API_PROVIDER_PARAMS", + {"geolocation": "US"}, + "getdict", + {"geolocation": "US"}, + ), + ): + kwargs = {param: arg} + settings = {} + if old_setting_value is not None: + settings[setting] = old_setting_value + crawler = get_crawler(settings=settings) + spider = ArticleSpider.from_crawler(crawler, **kwargs, **base_kwargs) + getter = getattr(crawler.settings, getter_name) + assert getter(setting) == new_setting_value + assert spider.allowed_domains == ["example.com"] + + +def test_metadata(): + metadata = get_spider_metadata(ArticleSpider, normalize=True) + assert metadata == { + "template": True, + "title": "Article", + "description": "Template for spiders that extract article data from article websites.", + "param_schema": { + "properties": { + "crawl_strategy": { + "default": "full", + "title": "Crawl strategy", + "description": "Determines how the start URL and follow-up URLs are crawled.", + "type": "string", + "enum": ["full", "navigation", "pagination_only"], + "enumMeta": { + "full": { + "description": "Follow most links within the domain of URL in an attempt to discover and extract as many articles as possible.", + "title": "Full", + }, + "navigation": { + "description": ( + "Follow pagination, subcategories, and " + "article detail pages. Pagination Only is a " + "better choice if the target URL does not " + "have subcategories, or if Zyte API is " + "misidentifying some URLs as subcategories." + ), + "title": "Navigation", + }, + "pagination_only": { + "description": ( + "Follow pagination and article detail pages. Subcategory links are ignored." + ), + "title": "Pagination Only", + }, + }, + }, + "extract_from": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, + "title": "Extraction source", + "description": ( + "Whether to perform extraction using a browser request " + "(browserHtml) or an HTTP request (httpResponseBody)." 
+ ), + "enum": ["httpResponseBody", "browserHtml"], + "enumMeta": { + "httpResponseBody": { + "title": "httpResponseBody", + "description": "Use HTTP responses. Cost-efficient and fast extraction method, which works well on many websites.", + }, + "browserHtml": { + "title": "browserHtml", + "description": "Use browser rendering. Often provides the best quality.", + }, + }, + }, + "geolocation": { + "anyOf": [ + {"type": "string"}, + {"type": "null"}, + ], + "default": None, + "title": "Geolocation", + "description": "ISO 3166-1 alpha-2 2-character string specified in " + "https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/geolocation.", + "enum": list( + sorted(GEOLOCATION_OPTIONS, key=GEOLOCATION_OPTIONS.__getitem__) + ), + "enumMeta": { + code: { + "title": GEOLOCATION_OPTIONS_WITH_CODE[code], + } + for code in Geolocation + }, + }, + "max_requests": { + "anyOf": [{"type": "integer"}, {"type": "null"}], + "default": 100, + "title": "Max Requests", + "description": ( + "The maximum number of Zyte API requests allowed for the crawl.\n" + "\n" + "Requests with error responses that cannot be retried or exceed " + "their retry limit also count here, but they incur in no costs " + "and do not increase the request count in Scrapy Cloud." + ), + "widget": "request-limit", + }, + "url": { + "type": "string", + "title": "URL", + "description": ( + "Initial URL for the crawl. Enter the full URL including http(s), " + "you can copy and paste it from your browser. Example: https://toscrape.com/" + ), + "pattern": r"^https?://[^:/\s]+(:\d{1,5})?(/[^\s]*)*(#[^\s]*)?$", + }, + }, + "required": ["url"], + "title": "ArticleSpiderParams", + "type": "object", + }, + } + geolocation = metadata["param_schema"]["properties"]["geolocation"] + assert geolocation["enum"][0] == "AF" + assert geolocation["enumMeta"]["UY"] == {"title": "Uruguay (UY)"} + assert set(geolocation["enum"]) == set(geolocation["enumMeta"]) + + +@pytest.mark.parametrize( + "valid,url", + [ + (False, ""), + (False, "http://"), + (False, "http:/example.com"), + (False, "ftp://example.com"), + (False, "example.com"), + (False, "//example.com"), + (False, "http://foo:bar@example.com"), + (False, " http://example.com"), + (False, "http://example.com "), + (False, "http://examp le.com"), + (False, "https://example.com:232323"), + (True, "http://example.com"), + (True, "http://bücher.example"), + (True, "http://xn--bcher-kva.example"), + (True, "https://i❤.ws"), + (True, "https://example.com"), + (True, "https://example.com/"), + (True, "https://example.com:2323"), + (True, "https://example.com:2323/"), + (True, "https://example.com:2323/foo"), + (True, "https://example.com/f"), + (True, "https://example.com/foo"), + (True, "https://example.com/foo/"), + (True, "https://example.com/foo/bar"), + (True, "https://example.com/foo/bar/"), + (True, "https://example.com/foo/bar?baz"), + (True, "https://example.com/foo/bar/?baz"), + (True, "https://example.com?foo"), + (True, "https://example.com?foo=bar"), + (True, "https://example.com/?foo=bar&baz"), + (True, "https://example.com/?foo=bar&baz#"), + (True, "https://example.com/?foo=bar&baz#frag"), + (True, "https://example.com#"), + (True, "https://example.com/#"), + (True, "https://example.com/&"), + (True, "https://example.com/&#"), + ], +) +def test_validation_url(url, valid): + url_re = BaseSpiderParams.model_fields["url"].metadata[0].pattern + assert bool(re.match(url_re, url)) == valid + + +def test_get_parse_article_request(): + base_kwargs = { + "url": 
"https://example.com", + } + crawler = get_crawler() + + # Crawls articles outside of domains by default + spider = ArticleSpider.from_crawler(crawler, **base_kwargs) + request = ProbabilityRequest(url="https://example.com") + scrapy_request = spider.get_parse_article_request(request) + assert scrapy_request.meta.get("allow_offsite") is True + + +def test_get_subcategory_request(): + url = "https://example.com" + + # Normal request but with mostly empty values + request = Request(url) + spider = ArticleSpider(url="https://example.com") + parse_navigation = lambda _: None + spider.parse_navigation = parse_navigation # type: ignore + + scrapy_request = spider.get_subcategory_request(request) + assert isinstance(scrapy_request, scrapy.Request) + assert scrapy_request.callback == parse_navigation + assert scrapy_request.priority == 0 + assert scrapy_request.meta == { + "page_params": {}, + "crawling_logs": { + "name": "", + "probability": None, + "page_type": "subCategories", + }, + } + + # Non-Heuristics request + request = ProbabilityRequest.from_dict( + {"url": url, "name": "Some request", "metadata": {"probability": 0.98}} + ) + spider = ArticleSpider(url="https://example.com") + parse_navigation = lambda _: None + spider.parse_navigation = parse_navigation # type: ignore + page_params = {"full_domain": "example.com"} + + scrapy_request = spider.get_subcategory_request(request, page_params=page_params) + assert isinstance(scrapy_request, scrapy.Request) + assert scrapy_request.callback == parse_navigation + assert scrapy_request.priority == 98 + assert scrapy_request.meta == { + "page_params": {}, + "crawling_logs": { + "name": "Some request", + "probability": 0.98, + "page_type": "subCategories", + }, + } + + # Heuristics request + request = ProbabilityRequest.from_dict( + { + "url": url, + "name": "[heuristics] Some request", + "metadata": {"probability": 0.1}, + } + ) + spider = ArticleSpider(url="https://example.com") + parse_navigation = lambda _: None + spider.parse_navigation = parse_navigation # type: ignore + page_params = {"full_domain": "example.com"} + + scrapy_request = spider.get_subcategory_request(request, page_params=page_params) + assert isinstance(scrapy_request, scrapy.Request) + assert scrapy_request.callback == parse_navigation + assert scrapy_request.priority == 10 + assert scrapy_request.meta == { + "page_params": page_params, + "crawling_logs": { + "name": "Some request", + "probability": 0.1, + "page_type": "articleNavigation-heuristics", + }, + } + + +def test_get_nextpage_request(): + url = "https://example.com" + + # Minimal Args + request = Request(url) + spider = ArticleSpider(url="https://example.com") + parse_navigation = lambda _: None + spider.parse_navigation = parse_navigation # type: ignore + + scrapy_request = spider.get_nextpage_request(request) + assert isinstance(scrapy_request, scrapy.Request) + assert scrapy_request.callback == parse_navigation + assert scrapy_request.priority == 100 + assert scrapy_request.meta == { + "page_params": {}, + "crawling_logs": {"name": "", "probability": None, "page_type": "nextPage"}, + } + + +def test_get_parse_navigation_request(): + url = "https://example.com" + + # Minimal args + request = Request(url) + spider = ArticleSpider(url="https://example.com") + parse_navigation = lambda _: None + spider.parse_navigation = parse_navigation # type: ignore + + scrapy_request = spider.get_parse_navigation_request(request) + assert isinstance(scrapy_request, scrapy.Request) + assert scrapy_request.callback == 
parse_navigation + assert scrapy_request.priority == 0 + assert scrapy_request.meta == { + "page_params": {}, + "crawling_logs": { + "name": "", + "probability": None, + "page_type": "articleNavigation", + }, + } + + +@pytest.mark.parametrize("url,allowed_domain", URL_TO_DOMAIN) +def test_set_allowed_domains(url, allowed_domain): + crawler = get_crawler() + + kwargs = {"url": url} + spider = ArticleSpider.from_crawler(crawler, **kwargs) + assert spider.allowed_domains == [allowed_domain] diff --git a/tests/test_heuristics.py b/tests/test_heuristics.py index fda92bd..4c2df8e 100644 --- a/tests/test_heuristics.py +++ b/tests/test_heuristics.py @@ -1,52 +1,92 @@ import pytest -from zyte_spider_templates.heuristics import might_be_category +from zyte_spider_templates.heuristics import article_filter, product_filter @pytest.mark.parametrize( - "test_input,expected", - ( - ("", True), - ("https://example.com", True), - ("https://example.com/search", False), - ("https://example.com/search.php", False), - ("https://example.com/articles", False), - ("https://example.com/articles.cgi", False), - ("https://example.com/articles#fragment-here", False), - ("https://example.com/xyz123/articles?q=1", False), - ("https://example.com/xyz123/articles/x?q=1", True), + "content_filter,test_input,expected", + [ + (product_filter, "", True), + (product_filter, "https://example.com", True), + (product_filter, "https://example.com/search", False), + (product_filter, "https://example.com/search.php", False), + (product_filter, "https://example.com/articles", False), + (product_filter, "https://example.com/articles.cgi", False), + (product_filter, "https://example.com/articles#fragment-here", False), + (product_filter, "https://example.com/xyz123/articles?q=1", False), + (product_filter, "https://example.com/xyz123/articles/x?q=1", True), # Regex - ("https://example.com/signin", False), - ("https://example.com/signin.html", False), - ("https://example.com/sign-in", False), - ("https://example.com/sign_in", False), - ("https://example.com/login", False), - ("https://example.com/login.html", False), - ("https://example.com/log-in", False), - ("https://example.com/log_in", False), - ("https://example.com/logout", False), - ("https://example.com/logout.html", False), - ("https://example.com/log-out", False), - ("https://example.com/log_out", False), - ("https://example.com/contact-us", False), - ("https://example.com/contact_us", False), - ("https://example.com/contactus", False), - ("https://example.com/contactus.asp", False), - ("https://example.com/contact", False), - ("https://example.com/contact.html", False), - ("https://example.com/lost_password", False), - ("https://example.com/lost-password", False), - ("https://example.com/forgot_password", False), - ("https://example.com/forgot-password", False), - ("https://example.com/forgot-password.cgi", False), - ("https://example.com/terms-of-use", False), - ("https://example.com/terms-of-use.html", False), - ("https://example.com/terms-of-service", False), - ("https://example.com/terms-of-conditions", False), - ("https://example.com/terms_of_use", False), - ("https://example.com/terms_of_service", False), - ("https://example.com/terms_of_conditions", False), - ), + (product_filter, "https://example.com/signin", False), + (product_filter, "https://example.com/signin.html", False), + (product_filter, "https://example.com/sign-in", False), + (product_filter, "https://example.com/sign_in", False), + (product_filter, "https://example.com/login", False), + (product_filter, 
"https://example.com/login.html", False), + (product_filter, "https://example.com/log-in", False), + (product_filter, "https://example.com/log_in", False), + (product_filter, "https://example.com/logout", False), + (product_filter, "https://example.com/logout.html", False), + (product_filter, "https://example.com/log-out", False), + (product_filter, "https://example.com/log_out", False), + (product_filter, "https://example.com/contact-us", False), + (product_filter, "https://example.com/contact_us", False), + (product_filter, "https://example.com/contactus", False), + (product_filter, "https://example.com/contactus.asp", False), + (product_filter, "https://example.com/contact", False), + (product_filter, "https://example.com/contact.html", False), + (product_filter, "https://example.com/lost_password", False), + (product_filter, "https://example.com/lost-password", False), + (product_filter, "https://example.com/forgot_password", False), + (product_filter, "https://example.com/forgot-password", False), + (product_filter, "https://example.com/forgot-password.cgi", False), + (product_filter, "https://example.com/terms-of-use", False), + (product_filter, "https://example.com/terms-of-use.html", False), + (product_filter, "https://example.com/terms-of-service", False), + (product_filter, "https://example.com/terms-of-conditions", False), + (product_filter, "https://example.com/terms_of_use", False), + (product_filter, "https://example.com/terms_of_service", False), + (product_filter, "https://example.com/terms_of_conditions", False), + (article_filter, "", True), + (article_filter, "https://example.com", True), + (article_filter, "https://example.com/search", True), + (article_filter, "https://example.com/search.php", True), + (article_filter, "https://example.com/articles", True), + (article_filter, "https://example.com/articles.cgi", True), + (article_filter, "https://example.com/articles#fragment-here", True), + (article_filter, "https://example.com/xyz123/articles?q=1", True), + (article_filter, "https://example.com/xyz123/articles/x?q=1", True), + # Regex + (article_filter, "https://example.com/signin", False), + (article_filter, "https://example.com/signin.html", False), + (article_filter, "https://example.com/sign-in", False), + (article_filter, "https://example.com/sign_in", False), + (article_filter, "https://example.com/login", False), + (article_filter, "https://example.com/login.html", False), + (article_filter, "https://example.com/log-in", False), + (article_filter, "https://example.com/log_in", False), + (article_filter, "https://example.com/logout", False), + (article_filter, "https://example.com/logout.html", False), + (article_filter, "https://example.com/log-out", False), + (article_filter, "https://example.com/log_out", False), + (article_filter, "https://example.com/contact-us", False), + (article_filter, "https://example.com/contact_us", False), + (article_filter, "https://example.com/contactus", False), + (article_filter, "https://example.com/contactus.asp", False), + (article_filter, "https://example.com/contact", False), + (article_filter, "https://example.com/contact.html", False), + (article_filter, "https://example.com/lost_password", False), + (article_filter, "https://example.com/lost-password", False), + (article_filter, "https://example.com/forgot_password", False), + (article_filter, "https://example.com/forgot-password", False), + (article_filter, "https://example.com/forgot-password.cgi", False), + (article_filter, "https://example.com/terms-of-use", False), 
+ (article_filter, "https://example.com/terms-of-use.html", False), + (article_filter, "https://example.com/terms-of-service", False), + (article_filter, "https://example.com/terms-of-conditions", False), + (article_filter, "https://example.com/terms_of_use", False), + (article_filter, "https://example.com/terms_of_service", False), + (article_filter, "https://example.com/terms_of_conditions", False), + ], ) -def test_might_be_category(test_input, expected): - assert might_be_category(test_input) == expected +def test_might_be_relevant_content(content_filter, test_input, expected): + assert content_filter.might_be_relevant_content(test_input) == expected diff --git a/zyte_spider_templates/__init__.py b/zyte_spider_templates/__init__.py index e3de8c9..819d495 100644 --- a/zyte_spider_templates/__init__.py +++ b/zyte_spider_templates/__init__.py @@ -1,2 +1,3 @@ +from .spiders.article import ArticleSpider from .spiders.base import BaseSpider, BaseSpiderParams from .spiders.ecommerce import EcommerceSpider diff --git a/zyte_spider_templates/heuristics.py b/zyte_spider_templates/heuristics.py index 432d4ea..e50dd1f 100644 --- a/zyte_spider_templates/heuristics.py +++ b/zyte_spider_templates/heuristics.py @@ -1,58 +1,110 @@ import re from urllib.parse import urlparse -NO_CONTENT_PATHS = ( - "/authenticate", - "/my-account", - "/account", - "/my-wishlist", - "/search", - "/archive", - "/privacy-policy", - "/cookie-policy", - "/terms-conditions", - "/tos", - "/admin", - "/rss.xml", - "/subscribe", - "/newsletter", - "/settings", - "/cart", - "/articles", - "/artykuly", # Polish for articles - "/news", - "/blog", - "/about", - "/about-us", - "/affiliate", - "/press", - "/careers", -) -SUFFIXES = [".html", ".php", ".cgi", ".asp"] +class ContentFilter: + def __init__(self, no_content_paths, no_content_regex, suffixes=None): + self.no_content_paths = no_content_paths + self.no_content_regex = no_content_regex + self.suffixes = ( + suffixes if suffixes is not None else [".html", ".php", ".cgi", ".asp"] + ) -NO_CONTENT_RE = ( - r"/sign[_-]?in", - r"/log[_-]?(in|out)", - r"/contact[_-]?(us)?", - r"/(lost|forgot)[_-]password", - r"/terms[_-]of[_-](service|use|conditions)", -) + def might_be_relevant_content(self, url: str) -> bool: + """Returns True if the given URL might be relevant based on its path and predefined rules.""" + url = url.lower().rstrip("/") + url_path = urlparse(url).path + for suffix in [""] + self.suffixes: + for path in self.no_content_paths: + if url_path.endswith(path + suffix): + return False + for rule in self.no_content_regex: + if re.search(rule + suffix, url): + return False -def might_be_category(url: str) -> bool: - """Returns True if the given url might be a category based on its path.""" + return True - url = url.lower().rstrip("/") - url_path = urlparse(url).path - for suffix in [""] + SUFFIXES: - for path in NO_CONTENT_PATHS: - if url_path.endswith(path + suffix): - return False - for suffix in [""] + SUFFIXES: - for rule in NO_CONTENT_RE: - if re.search(rule + suffix, url): - return False +product_filter = ContentFilter( + no_content_paths=( + "/authenticate", + "/my-account", + "/account", + "/my-wishlist", + "/search", + "/archive", + "/privacy-policy", + "/cookie-policy", + "/terms-conditions", + "/tos", + "/admin", + "/rss.xml", + "/subscribe", + "/newsletter", + "/settings", + "/cart", + "/articles", + "/artykuly", # Polish for articles + "/news", + "/blog", + "/about", + "/about-us", + "/affiliate", + "/press", + "/careers", + ), + no_content_regex=( + 
r"/sign[_-]?in", + r"/log[_-]?(in|out)", + r"/contact[_-]?(us)?", + r"/(lost|forgot)[_-]password", + r"/terms[_-]of[_-](service|use|conditions)", + ), +) - return True +article_filter = ContentFilter( + no_content_paths=( + "/authenticate", + "/my-account", + "/account", + "/my-wishlist", + "/cart", + "/checkout", + "/order", + "/shop", + "/product", + "/products", + "/category", + "/categories", + "/privacy-policy", + "/cookie-policy", + "/terms-conditions", + "/tos", + "/admin", + "/login", + "/signup", + "/subscribe", + "/newsletter", + "/settings", + "/faq", + "/help", + "/support", + "/downloads", + "/careers", + "/jobs", + "/contact", + "/about", + "/about-us", + "/team", + "/testimonials", + "/reviews", + ), + no_content_regex=( + r"/sign[_-]?in", + r"/log[_-]?(in|out)", + r"/contact[_-]?(us)?", + r"/(lost|forgot)[_-]password", + r"/terms[_-]of[_-](service|use|conditions)", + ), +) diff --git a/zyte_spider_templates/page_objects/__init__.py b/zyte_spider_templates/page_objects/__init__.py index b1c941d..a17dbde 100644 --- a/zyte_spider_templates/page_objects/__init__.py +++ b/zyte_spider_templates/page_objects/__init__.py @@ -1,6 +1,6 @@ from warnings import warn -from ..pages import HeuristicsProductNavigationPage +from ..pages import HeuristicsArticleNavigationPage, HeuristicsProductNavigationPage warn( "The zyte_spider_templates.page_objects module is deprecated, use " diff --git a/zyte_spider_templates/page_objects/article_navigation_heuristics.py b/zyte_spider_templates/page_objects/article_navigation_heuristics.py new file mode 100644 index 0000000..7333abb --- /dev/null +++ b/zyte_spider_templates/page_objects/article_navigation_heuristics.py @@ -0,0 +1 @@ +from ..pages import HeuristicsArticleNavigationPage diff --git a/zyte_spider_templates/pages/__init__.py b/zyte_spider_templates/pages/__init__.py index 72b9c1c..a7efe05 100644 --- a/zyte_spider_templates/pages/__init__.py +++ b/zyte_spider_templates/pages/__init__.py @@ -1 +1,2 @@ +from .article_navigation_heuristics import HeuristicsArticleNavigationPage from .product_navigation_heuristics import HeuristicsProductNavigationPage diff --git a/zyte_spider_templates/pages/article_navigation_heuristics.py b/zyte_spider_templates/pages/article_navigation_heuristics.py new file mode 100644 index 0000000..93fd492 --- /dev/null +++ b/zyte_spider_templates/pages/article_navigation_heuristics.py @@ -0,0 +1,75 @@ +from typing import List, Optional + +import attrs +from scrapy.http import TextResponse +from scrapy.linkextractors import LinkExtractor +from web_poet import AnyResponse, PageParams, field, handle_urls +from zyte_common_items import AutoArticleNavigationPage, ProbabilityRequest + +from zyte_spider_templates.heuristics import article_filter + + +@handle_urls("") +@attrs.define +class HeuristicsArticleNavigationPage(AutoArticleNavigationPage): + response: AnyResponse + page_params: PageParams + content_filter = article_filter + + @field + def subCategories(self) -> Optional[List[ProbabilityRequest]]: + if self.page_params.get("full_domain"): + return ( + self.article_navigation.subCategories or [] + ) + self._probably_relevant_links() + return self.article_navigation.subCategories + + def _urls_for_navigation(self) -> List[str]: + """Return a list of all URLs in the navigation item: + - items + - next page + - subcategories + """ + navigation_urls = [] + if self.article_navigation.items: + navigation_urls.extend( + [r.url for r in self.article_navigation.subCategories or []] + ) + navigation_urls.extend([r.url for r in 
self.article_navigation.items or []]) + if self.article_navigation.nextPage: + navigation_urls.append(self.article_navigation.nextPage.url) + return navigation_urls + + def _probably_relevant_links(self) -> List[ProbabilityRequest]: + default_probability = 0.1 + + link_extractor = LinkExtractor( + allow_domains=self.page_params.get("full_domain") + ) + ignore_urls = set(self._urls_for_navigation()) + + links = [] + response = TextResponse( + url=str(self.response.url), body=self.response.text.encode() + ) + for link in link_extractor.extract_links(response): + if link.url in ignore_urls or link.nofollow: + continue + + if ( + self.content_filter + and not self.content_filter.might_be_relevant_content(link.url) + ): + continue + + name = (link.text or "").strip() + request = ProbabilityRequest.from_dict( + { + "url": link.url, + "name": f"[heuristics] {name}", + "metadata": {"probability": default_probability}, + } + ) + links.append(request) + + return links diff --git a/zyte_spider_templates/pages/product_navigation_heuristics.py b/zyte_spider_templates/pages/product_navigation_heuristics.py index bd012ff..6ab1974 100644 --- a/zyte_spider_templates/pages/product_navigation_heuristics.py +++ b/zyte_spider_templates/pages/product_navigation_heuristics.py @@ -6,7 +6,7 @@ from web_poet import AnyResponse, PageParams, field, handle_urls from zyte_common_items import AutoProductNavigationPage, ProbabilityRequest -from zyte_spider_templates.heuristics import might_be_category +from zyte_spider_templates.heuristics import product_filter @handle_urls("") @@ -14,55 +14,52 @@ class HeuristicsProductNavigationPage(AutoProductNavigationPage): response: AnyResponse page_params: PageParams + content_filter = product_filter @field def subCategories(self) -> Optional[List[ProbabilityRequest]]: if self.page_params.get("full_domain"): return ( self.product_navigation.subCategories or [] - ) + self._probably_category_links() + ) + self._probably_relevant_links() return self.product_navigation.subCategories - def _urls_for_category(self) -> List[str]: - """Return a list of all URLs in the ProductNavigation item: + def _urls_for_navigation(self) -> List[str]: + """Return a list of all URLs in the navigation item: - items - next page - subcategories """ - - category_urls = [] + navigation_urls = [] if self.product_navigation.items: - category_urls.extend( + navigation_urls.extend( [r.url for r in self.product_navigation.subCategories or []] ) - category_urls.extend([r.url for r in self.product_navigation.items or []]) + navigation_urls.extend([r.url for r in self.product_navigation.items or []]) if self.product_navigation.nextPage: - category_urls.append(self.product_navigation.nextPage.url) - return category_urls + navigation_urls.append(self.product_navigation.nextPage.url) + return navigation_urls - def _probably_category_links(self) -> List[ProbabilityRequest]: - # TODO: This should be tuned later + def _probably_relevant_links(self) -> List[ProbabilityRequest]: default_probability = 0.1 link_extractor = LinkExtractor( allow_domains=self.page_params.get("full_domain") ) - ignore_urls = set(self._urls_for_category()) + ignore_urls = set(self._urls_for_navigation()) links = [] response = TextResponse( url=str(self.response.url), body=self.response.text.encode() ) for link in link_extractor.extract_links(response): - if link.url in ignore_urls: - continue - - # TODO: Convert to a configurable parameter like 'obey_nofollow_links' - # some time after the MVP launch. 
- if link.nofollow: + if link.url in ignore_urls or link.nofollow: continue - if not might_be_category(link.url): + if ( + self.content_filter + and not self.content_filter.might_be_relevant_content(link.url) + ): continue name = (link.text or "").strip() diff --git a/zyte_spider_templates/spiders/article.py b/zyte_spider_templates/spiders/article.py new file mode 100644 index 0000000..09dcd7f --- /dev/null +++ b/zyte_spider_templates/spiders/article.py @@ -0,0 +1,249 @@ +from enum import Enum +from typing import Any, Callable, Dict, Iterable, Optional, Union + +import scrapy +from pydantic import Field +from scrapy import Request +from scrapy.crawler import Crawler +from scrapy_poet import DummyResponse +from scrapy_spider_metadata import Args +from zyte_common_items import Article, ArticleNavigation, ProbabilityRequest + +from zyte_spider_templates.documentation import document_enum +from zyte_spider_templates.spiders.base import ( + ARG_SETTING_PRIORITY, + BaseSpider, + BaseSpiderParams, +) +from zyte_spider_templates.utils import get_domain + + +@document_enum +class ArticleCrawlStrategy(str, Enum): + full: str = "full" + """Follow most links within the domain of URL in an attempt to discover and + extract as many articles as possible.""" + + navigation: str = "navigation" + """Follow pagination, subcategories, and article detail pages. + + Pagination Only is a better choice if the target URL does not have + subcategories, or if Zyte API is misidentifying some URLs as subcategories. + """ + + pagination_only: str = "pagination_only" + """Follow pagination and article detail pages. Subcategory links are + ignored.""" + + +class ArticleSpiderParams(BaseSpiderParams): + crawl_strategy: ArticleCrawlStrategy = Field( + title="Crawl strategy", + description="Determines how the start URL and follow-up URLs are crawled.", + default=ArticleCrawlStrategy.full, + json_schema_extra={ + "enumMeta": { + ArticleCrawlStrategy.full: { + "title": "Full", + "description": "Follow most links within the domain of URL in an attempt to discover and extract as many articles as possible.", + }, + ArticleCrawlStrategy.navigation: { + "title": "Navigation", + "description": ( + "Follow pagination, subcategories, and article detail " + "pages. Pagination Only is a better choice if the " + "target URL does not have subcategories, or if Zyte " + "API is misidentifying some URLs as subcategories." + ), + }, + ArticleCrawlStrategy.pagination_only: { + "title": "Pagination Only", + "description": ( + "Follow pagination and article detail pages. Subcategory links are ignored." + ), + }, + }, + }, + ) + + +class ArticleSpider(Args[ArticleSpiderParams], BaseSpider): + """Yield articles from an article website. + + See :class:`~zyte_spider_templates.spiders.article.ArticleSpiderParams` + for supported parameters. + + .. seealso:: :ref:`article`. 
+ """ + + name = "article" + + metadata: Dict[str, Any] = { + **BaseSpider.metadata, + "title": "Article", + "description": "Template for spiders that extract article data from article websites.", + } + + @classmethod + def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> scrapy.Spider: + spider = super(ArticleSpider, cls).from_crawler(crawler, *args, **kwargs) + spider.allowed_domains = [get_domain(spider.args.url)] + + if spider.args.extract_from is not None: + spider.settings.set( + "ZYTE_API_PROVIDER_PARAMS", + { + "articleOptions": {"extractFrom": spider.args.extract_from}, + "articleNavigationOptions": { + "extractFrom": spider.args.extract_from + }, + **spider.settings.get("ZYTE_API_PROVIDER_PARAMS", {}), + }, + priority=ARG_SETTING_PRIORITY, + ) + + return spider + + def start_requests(self) -> Iterable[Request]: + page_params = {} + if self.args.crawl_strategy == ArticleCrawlStrategy.full: + page_params = {"full_domain": self.allowed_domains[0]} + + yield Request( + url=self.args.url, + callback=self.parse_navigation, + meta={ + "page_params": page_params, + "crawling_logs": {"page_type": "articleNavigation"}, + }, + ) + + def parse_navigation( + self, response: DummyResponse, navigation: ArticleNavigation + ) -> Iterable[Request]: + page_params = response.meta.get("page_params") + + articles = navigation.items or [] + for request in articles: + yield self.get_parse_article_request(request) + + if navigation.nextPage: + if not articles: + self.logger.info( + f"Ignoring nextPage link {navigation.nextPage} since there " + f"are no article links found in {navigation.url}" + ) + else: + yield self.get_nextpage_request(navigation.nextPage) + + if self.args.crawl_strategy != ArticleCrawlStrategy.pagination_only: + for request in navigation.subCategories or []: + yield self.get_subcategory_request(request, page_params=page_params) + + def parse_article( + self, response: DummyResponse, article: Article + ) -> Iterable[Article]: + probability = article.get_probability() + + # TODO: convert to a configurable parameter later on after the launch + if probability is None or probability >= 0.1: + yield article + else: + self.crawler.stats.inc_value("drop_item/article/low_probability") + self.logger.info( + f"Ignoring item from {response.url} since its probability is " + f"less than threshold of 0.1:\n{article}" + ) + + @staticmethod + def get_parse_navigation_request_priority( + request: Union[ProbabilityRequest, Request] + ) -> int: + if ( + not hasattr(request, "metadata") + or not request.metadata + or request.metadata.probability is None + ): + return 0 + return int(100 * request.metadata.probability) + + def get_parse_navigation_request( + self, + request: Union[ProbabilityRequest, Request], + callback: Optional[Callable] = None, + page_params: Optional[Dict[str, Any]] = None, + priority: Optional[int] = None, + page_type: str = "articleNavigation", + ) -> scrapy.Request: + callback = callback or self.parse_navigation + + return request.to_scrapy( + callback=callback, + priority=priority or self.get_parse_navigation_request_priority(request), + meta={ + "page_params": page_params or {}, + "crawling_logs": { + "name": request.name or "", + "probability": request.get_probability(), + "page_type": page_type, + }, + }, + ) + + def get_subcategory_request( + self, + request: Union[ProbabilityRequest, Request], + callback: Optional[Callable] = None, + page_params: Optional[Dict[str, Any]] = None, + priority: Optional[int] = None, + ) -> scrapy.Request: + page_type = "subCategories" + 
request_name = request.name or "" + if "[heuristics]" not in request_name: + page_params = None + else: + page_type = "articleNavigation-heuristics" + request.name = request_name.replace("[heuristics]", "").strip() + return self.get_parse_navigation_request( + request, + callback, + page_params, + priority, + page_type, + ) + + def get_nextpage_request( + self, + request: Union[ProbabilityRequest, Request], + callback: Optional[Callable] = None, + page_params: Optional[Dict[str, Any]] = None, + ): + return self.get_parse_navigation_request( + request, callback, page_params, self._NEXT_PAGE_PRIORITY, "nextPage" + ) + + def get_parse_article_request_priority(self, request: ProbabilityRequest) -> int: + probability = request.get_probability() or 0 + return int(100 * probability) + self._NEXT_PAGE_PRIORITY + + def get_parse_article_request( + self, request: ProbabilityRequest, callback: Optional[Callable] = None + ) -> scrapy.Request: + callback = callback or self.parse_article + priority = self.get_parse_article_request_priority(request) + + probability = request.get_probability() + + scrapy_request = request.to_scrapy( + callback=callback, + priority=priority, + meta={ + "crawling_logs": { + "name": request.name, + "probability": probability, + "page_type": "article", + } + }, + ) + scrapy_request.meta["allow_offsite"] = True + return scrapy_request From 4edd3312c63c17c83a39388448d962b0d4d6aeb6 Mon Sep 17 00:00:00 2001 From: Felipe Tiago Date: Wed, 27 Mar 2024 19:54:07 -0300 Subject: [PATCH 2/2] lint: tests --- tests/test_article.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_article.py b/tests/test_article.py index 8f1a871..b663301 100644 --- a/tests/test_article.py +++ b/tests/test_article.py @@ -29,9 +29,7 @@ def test_parameters(): ArticleSpider() ArticleSpider(url="https://example.com") - ArticleSpider( - url="https://example.com", crawl_strategy=ArticleCrawlStrategy.full - ) + ArticleSpider(url="https://example.com", crawl_strategy=ArticleCrawlStrategy.full) ArticleSpider(url="https://example.com", crawl_strategy="full") with pytest.raises(ValidationError): @@ -192,9 +190,7 @@ def test_crawl(): assert [request.priority for request in requests] == [199, 183] # Test parse_navigation() behavior on pagination_only crawl strategy. - spider = ArticleSpider( - url="https://example.com/", crawl_strategy="pagination_only" - ) + spider = ArticleSpider(url="https://example.com/", crawl_strategy="pagination_only") # nextpage + articles navigation = ArticleNavigation.from_dict(
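
A quick way to sanity-check the new heuristics outside of a crawl is to call the filters directly. The following is a minimal sketch, not part of the patch itself; it assumes this branch is installed locally (e.g. `pip install -e .`) and only uses the `article_filter` and `product_filter` objects defined in `zyte_spider_templates/heuristics.py` above.

    # Sketch: exercise the ContentFilter instances added in this patch.
    # Assumes this branch of zyte-spider-templates is installed (e.g. `pip install -e .`).
    from zyte_spider_templates.heuristics import article_filter, product_filter

    urls = [
        "https://example.com/2024/03/some-story",
        "https://example.com/blog",
        "https://example.com/checkout",
        "https://example.com/terms-of-use",
    ]

    for url in urls:
        print(
            url,
            "article:", article_filter.might_be_relevant_content(url),
            "product:", product_filter.might_be_relevant_content(url),
        )

    # Expected results:
    #   /2024/03/some-story  article: True   product: True
    #   /blog                article: True   product: False  (/blog is a product no-content path)
    #   /checkout            article: False  product: True   (/checkout is an article no-content path)
    #   /terms-of-use        article: False  product: False  (matched by the shared regex rules)

The output illustrates the intended difference between the two filters: the article filter drops shop-oriented paths such as /checkout, the product filter drops editorial paths such as /blog, and both drop boilerplate pages such as terms-of-use.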