From c20c5a7d9782ad021ebc6cbea883940617c77a84 Mon Sep 17 00:00:00 2001
From: Wesley van Lee
Date: Fri, 11 Oct 2024 14:51:11 +0200
Subject: [PATCH] unit-tests: Extension and middleware base test setup

---
 pyproject.toml                           | 12 ++---
 scrapy_webarchive/extensions.py          | 56 ++++++++++++-----------
 scrapy_webarchive/middleware.py          | 37 ++++++++-------
 tests/data/warc_1_1/README.md            |  1 +
 tests/data/{ => warc_1_1}/quotes.wacz.gz | Bin
 tests/test_downloadermiddlewares.py      |  6 +--
 tests/test_extensions.py                 | 40 ++++++++++++++++
 tests/test_middleware.py                 | 56 +++++++++++++++++++++++
 8 files changed, 154 insertions(+), 54 deletions(-)
 create mode 100644 tests/data/warc_1_1/README.md
 rename tests/data/{ => warc_1_1}/quotes.wacz.gz (100%)
 create mode 100644 tests/test_extensions.py
 create mode 100644 tests/test_middleware.py

diff --git a/pyproject.toml b/pyproject.toml
index 45b3c9e..9e789fc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,12 +6,12 @@ build-backend = "setuptools.build_meta"
 name = "scrapy-webarchive"
 version = "0.0.1"
 dependencies = [
-    "scrapy",
-    "warcio",
-    "smart-open",
-    "warc-knot",
-    "wacz",
-    "cdxj-indexer",
+    "Scrapy==2.11.2",
+    "warcio==1.7.4",
+    "smart-open==7.0.4",
+    "warc-knot==0.2.5",
+    "wacz==0.5.0",
+    "cdxj-indexer==1.4.5",
 ]
 requires-python = ">=3.7"
 authors = []
diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py
index 2734c32..39bf773 100644
--- a/scrapy_webarchive/extensions.py
+++ b/scrapy_webarchive/extensions.py
@@ -7,7 +7,7 @@
 from scrapy.exceptions import NotConfigured
 from scrapy.http.request import Request
 from scrapy.http.response import Response
-from scrapy.pipelines import files
+from scrapy.pipelines.files import FSFilesStore, FTPFilesStore, GCSFilesStore, S3FilesStore
 from scrapy.settings import Settings
 from typing_extensions import Self
 
@@ -20,11 +20,11 @@ class WaczExporter:
     """WACZ exporter extension that writes spider requests/responses as WACZ during a crawl job."""
 
     STORE_SCHEMES = {
-        "": files.FSFilesStore,
-        "file": files.FSFilesStore,
-        "s3": files.S3FilesStore,
-        "gs": files.GCSFilesStore,
-        "ftp": files.FTPFilesStore,
+        "": FSFilesStore,
+        "file": FSFilesStore,
+        "s3": S3FilesStore,
+        "gs": GCSFilesStore,
+        "ftp": FTPFilesStore,
     }
 
     def __init__(self, settings: Settings, crawler: Crawler) -> None:
@@ -36,7 +36,18 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:
 
         self.store = self._get_store()
         self.writer = WarcFileWriter(collection_name=crawler.spider.name)
-        self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])
+
+    def _get_store(self):
+        archive_uri_template = self.settings["ARCHIVE_EXPORT_URI"]
+        uri = archive_uri_template.format(**get_archive_uri_template_variables())
+
+        if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
+            scheme = "file"
+        else:
+            scheme = urlparse(uri).scheme
+
+        store_cls = self.STORE_SCHEMES[scheme]
+        return store_cls(uri)
 
     @classmethod
     def from_crawler(cls, crawler: Crawler) -> Self:
@@ -49,6 +60,7 @@ def from_crawler(cls, crawler: Crawler) -> Self:
 
         crawler.signals.connect(exporter.response_received, signal=signals.response_received)
         crawler.signals.connect(exporter.spider_closed, signal=signals.spider_closed)
+        crawler.signals.connect(exporter.spider_opened, signal=signals.spider_opened)
         return exporter
 
     @classmethod
@@ -74,6 +86,9 @@ def from_settings(cls, settings: Settings, crawler: Crawler):
 
         return cls(settings=settings, crawler=crawler)
 
+    def spider_opened(self) -> None:
+        self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])
+
     def response_received(self, response: Response, request: Request, spider: Spider) -> None:
         request.meta["WARC-Date"] = warc_date()
 
@@ -93,24 +108,13 @@ def spider_closed(self) -> None:
         wacz_creator = WaczFileCreator(warc_fname=self.writer.warc_fname, store=self.store)
         wacz_creator.create_wacz()
 
-    def _get_context_variables(self):
-        current_date = datetime.now()
-
-        return {
-            "year": current_date.strftime("%Y"),
-            "month": current_date.strftime("%m"),
-            "day": current_date.strftime("%d"),
-            "timestamp": current_date.strftime("%Y%m%d%H%M%S"),
-        }
 
-    def _get_store(self):
-        archive_uri_template = self.settings["ARCHIVE_EXPORT_URI"]
-        uri = archive_uri_template.format(**self._get_context_variables())
-
-        if Path(uri).is_absolute():  # to support win32 paths like: C:\\some\dir
-            scheme = "file"
-        else:
-            scheme = urlparse(uri).scheme
+def get_archive_uri_template_variables() -> dict:
+    current_date = datetime.now()
 
-        store_cls = self.STORE_SCHEMES[scheme]
-        return store_cls(uri)
+    return {
+        "year": current_date.strftime("%Y"),
+        "month": current_date.strftime("%m"),
+        "day": current_date.strftime("%d"),
+        "timestamp": current_date.strftime("%Y%m%d%H%M%S"),
+    }
diff --git a/scrapy_webarchive/middleware.py b/scrapy_webarchive/middleware.py
index 8d3c95d..dfdf6f1 100644
--- a/scrapy_webarchive/middleware.py
+++ b/scrapy_webarchive/middleware.py
@@ -52,26 +52,25 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spid
         if not self.crawl:
             for request in start_requests:
                 yield request
+        else:  # ignore original start requests, just yield all responses found
+            for entry in self.wacz.iter_index():
+                url = entry["url"]
 
-        # ignore original start requests, just yield all responses found
-        for entry in self.wacz.iter_index():
-            url = entry["url"]
+                # filter out off-site responses
+                if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
+                    continue
 
-            # filter out off-site responses
-            if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
-                continue
+                # only accept allowed responses if requested by spider
+                if hasattr(spider, "archive_regex") and not re.search(spider.archive_regex, url):
+                    continue
 
-            # only accept whitelisted responses if requested by spider
-            if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
-                continue
+                self.stats.inc_value("wacz/start_request_count", spider=spider)
 
-            self.stats.inc_value("wacz/start_request_count", spider=spider)
-
-            # do not filter to allow all occurences to be handled
-            # since we don't yet get all information for the request, this can be necessary
-            yield record_transformer.request_for_record(
-                entry,
-                flags=["wacz_start_request"],
-                meta={"wacz_index_entry": entry},
-                dont_filter=True,
-            )
+                # do not filter to allow all occurrences to be handled
+                # since we don't yet get all information for the request, this can be necessary
+                yield record_transformer.request_for_record(
+                    entry,
+                    flags=["wacz_start_request"],
+                    meta={"wacz_index_entry": entry},
+                    dont_filter=True,
+                )
diff --git a/tests/data/warc_1_1/README.md b/tests/data/warc_1_1/README.md
new file mode 100644
index 0000000..ec0d42b
--- /dev/null
+++ b/tests/data/warc_1_1/README.md
@@ -0,0 +1 @@
+Example data was generated using https://github.com/webrecorder/archiveweb.page v0.12.8. The data in this folder is based on the WARC Format 1.1 (https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/)
\ No newline at end of file
diff --git a/tests/data/quotes.wacz.gz b/tests/data/warc_1_1/quotes.wacz.gz
similarity index 100%
rename from tests/data/quotes.wacz.gz
rename to tests/data/warc_1_1/quotes.wacz.gz
diff --git a/tests/test_downloadermiddlewares.py b/tests/test_downloadermiddlewares.py
index d243dea..424e59b 100644
--- a/tests/test_downloadermiddlewares.py
+++ b/tests/test_downloadermiddlewares.py
@@ -14,16 +14,16 @@ class TestWaczMiddleware:
     def setup_method(self):
         self.crawler = get_crawler(Spider)
         self.spider = self.crawler._create_spider("quotes")
-    
+
     def _get_settings(self, **new_settings):
         settings = {
-            "WACZ_SOURCE_URL": get_test_data_path("quotes.wacz.gz").as_uri(),
+            "WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
             "WACZ_CRAWL": False,
             "WACZ_TIMEOUT": 60,
         }
         settings.update(new_settings)
         return Settings(settings)
-    
+
     @contextmanager
     def _middleware(self, **new_settings):
         settings = self._get_settings(**new_settings)
diff --git a/tests/test_extensions.py b/tests/test_extensions.py
new file mode 100644
index 0000000..9af120d
--- /dev/null
+++ b/tests/test_extensions.py
@@ -0,0 +1,40 @@
+from unittest import mock
+
+import pytest
+from scrapy.exceptions import NotConfigured
+from scrapy.pipelines.files import FSFilesStore, FTPFilesStore, GCSFilesStore, S3FilesStore
+from scrapy.utils.test import get_crawler
+
+from scrapy_webarchive.extensions import WaczExporter
+
+
+class TestWaczExporterExtension:
+    def test_archive_export_uri_invalid_raises_not_configured(self):
+        crawler = get_crawler(settings_dict={})
+        with pytest.raises(NotConfigured):
+            WaczExporter.from_crawler(crawler)
+
+    @mock.patch('scrapy_webarchive.extensions.S3FilesStore.__init__', return_value=None)
+    @mock.patch('scrapy_webarchive.extensions.GCSFilesStore.__init__', return_value=None)
+    @mock.patch('scrapy_webarchive.extensions.FTPFilesStore.__init__', return_value=None)
+    @mock.patch('scrapy_webarchive.extensions.FSFilesStore.__init__', return_value=None)
+    def test_get_store(self, *args):
+        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"})
+        crawler.spider = crawler._create_spider("quotes")
+        extension = WaczExporter.from_crawler(crawler)
+        assert isinstance(extension.store, FSFilesStore)
+
+        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "s3://scrapy-webarchive/wacz/"})
+        crawler.spider = crawler._create_spider("quotes")
+        extension = WaczExporter.from_crawler(crawler)
+        assert isinstance(extension.store, S3FilesStore)
+
+        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "gs://scrapy-webarchive/wacz/"})
+        crawler.spider = crawler._create_spider("quotes")
+        extension = WaczExporter.from_crawler(crawler)
+        assert isinstance(extension.store, GCSFilesStore)
+
+        crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "ftp://scrapy-webarchive/wacz/"})
+        crawler.spider = crawler._create_spider("quotes")
+        extension = WaczExporter.from_crawler(crawler)
+        assert isinstance(extension.store, FTPFilesStore)
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
new file mode 100644
index 0000000..36edc32
--- /dev/null
+++ b/tests/test_middleware.py
@@ -0,0 +1,56 @@
+from contextlib import contextmanager
+
+from scrapy.http.request import Request
+from scrapy.settings import Settings
+from scrapy.utils.test import get_crawler
+
+from scrapy_webarchive.middleware import WaczCrawlMiddleware
+
+from . import get_test_data_path
+
+
+class TestWaczCrawlMiddlewareWarc11:
+    def setup_method(self):
+        self.crawler = get_crawler()
+        self.spider = self.crawler._create_spider("quotes")
+
+    def _get_settings(self, **new_settings):
+        settings = {
+            "WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
+            "WACZ_TIMEOUT": 60,
+        }
+        settings.update(new_settings)
+        return Settings(settings)
+
+    @contextmanager
+    def _middleware(self, **new_settings):
+        settings = self._get_settings(**new_settings)
+        mw = WaczCrawlMiddleware(settings, self.crawler.stats)
+        mw.spider_opened(self.spider)
+        yield mw
+
+    def test_wacz_archive_is_ignored_follow_original_behaviour(self):
+        request = Request("https://quotes.toscrape.com")
+
+        with self._middleware(WACZ_CRAWL=False) as mw:
+            out = list(mw.process_start_requests([request], self.spider))
+            assert out == [request]
+
+    def test_wacz_archive_iterates_all_records(self):
+        with self._middleware(WACZ_CRAWL=True) as mw:
+            out = list(mw.process_start_requests([], self.spider))
+            assert len(out) == 101
+
+    def test_wacz_archive_filters_allowed_domains(self):
+        setattr(self.spider, "allowed_domains", "quotes.toscrape.com")
+
+        with self._middleware(WACZ_CRAWL=True) as mw:
+            out = list(mw.process_start_requests([], self.spider))
+            assert len(out) == 61
+
+    def test_wacz_archive_filters_archive_regex(self):
+        setattr(self.spider, "archive_regex", r"https://quotes\.toscrape\.com/page/\d+/")
+
+        with self._middleware(WACZ_CRAWL=True) as mw:
+            out = list(mw.process_start_requests([], self.spider))
+            assert len(out) == 9
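
For reference, a minimal sketch of how the new module-level get_archive_uri_template_variables() helper expands an ARCHIVE_EXPORT_URI template before WaczExporter._get_store() resolves a files store from STORE_SCHEMES. The example template and bucket name are hypothetical; only the placeholder keys come from the helper introduced in this patch.

    from scrapy_webarchive.extensions import get_archive_uri_template_variables

    # Hypothetical export URI template; {year}, {month}, {day} and {timestamp}
    # are the keys returned by get_archive_uri_template_variables().
    archive_export_uri = "s3://example-archive-bucket/{year}/{month}/{day}/"

    # Same expansion _get_store() performs before choosing a store class
    # (FSFilesStore, S3FilesStore, ...) based on the resulting URI scheme.
    uri = archive_export_uri.format(**get_archive_uri_template_variables())
    print(uri)  # e.g. s3://example-archive-bucket/2024/10/11/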