unit-tests: Extension and middleware base test setup
Wesley van Lee committed Oct 11, 2024
1 parent 47bd64f commit c20c5a7
Showing 8 changed files with 154 additions and 54 deletions.
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -6,12 +6,12 @@ build-backend = "setuptools.build_meta"
name = "scrapy-webarchive"
version = "0.0.1"
dependencies = [
"scrapy",
"warcio",
"smart-open",
"warc-knot",
"wacz",
"cdxj-indexer",
"Scrapy==2.11.2",
"warcio==1.7.4",
"smart-open==7.0.4",
"warc-knot==0.2.5",
"wacz==0.5.0",
"cdxj-indexer==1.4.5",
]
requires-python = ">=3.7"
authors = []
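The dependency list now pins exact versions. As a quick sanity check after installing the package, the pins can be compared against the environment; a minimal sketch, not part of this commit, assuming Python 3.8+ for importlib.metadata:

# Hypothetical check, not part of this commit: confirm the installed
# distributions match the versions pinned in pyproject.toml.
from importlib.metadata import version

PINNED = {
    "Scrapy": "2.11.2",
    "warcio": "1.7.4",
    "smart-open": "7.0.4",
    "warc-knot": "0.2.5",
    "wacz": "0.5.0",
    "cdxj-indexer": "1.4.5",
}

for dist, expected in PINNED.items():
    installed = version(dist)
    assert installed == expected, f"{dist}: installed {installed}, expected {expected}"
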
56 changes: 30 additions & 26 deletions scrapy_webarchive/extensions.py
@@ -7,7 +7,7 @@
from scrapy.exceptions import NotConfigured
from scrapy.http.request import Request
from scrapy.http.response import Response
from scrapy.pipelines import files
from scrapy.pipelines.files import FSFilesStore, FTPFilesStore, GCSFilesStore, S3FilesStore
from scrapy.settings import Settings
from typing_extensions import Self

@@ -20,11 +20,11 @@ class WaczExporter:
"""WACZ exporter extension that writes spider requests/responses as WACZ during a crawl job."""

STORE_SCHEMES = {
"": files.FSFilesStore,
"file": files.FSFilesStore,
"s3": files.S3FilesStore,
"gs": files.GCSFilesStore,
"ftp": files.FTPFilesStore,
"": FSFilesStore,
"file": FSFilesStore,
"s3": S3FilesStore,
"gs": GCSFilesStore,
"ftp": FTPFilesStore,
}

def __init__(self, settings: Settings, crawler: Crawler) -> None:
@@ -36,7 +36,18 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None:

self.store = self._get_store()
self.writer = WarcFileWriter(collection_name=crawler.spider.name)
self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])

def _get_store(self):
archive_uri_template = self.settings["ARCHIVE_EXPORT_URI"]
uri = archive_uri_template.format(**get_archive_uri_template_variables())

if Path(uri).is_absolute(): # to support win32 paths like: C:\\some\dir
scheme = "file"
else:
scheme = urlparse(uri).scheme

store_cls = self.STORE_SCHEMES[scheme]
return store_cls(uri)

@classmethod
def from_crawler(cls, crawler: Crawler) -> Self:
@@ -49,6 +60,7 @@ def from_crawler(cls, crawler: Crawler) -> Self:

crawler.signals.connect(exporter.response_received, signal=signals.response_received)
crawler.signals.connect(exporter.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(exporter.spider_opened, signal=signals.spider_opened)
return exporter

@classmethod
@@ -74,6 +86,9 @@ def from_settings(cls, settings: Settings, crawler: Crawler):

return cls(settings=settings, crawler=crawler)

def spider_opened(self) -> None:
self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])

def response_received(self, response: Response, request: Request, spider: Spider) -> None:
request.meta["WARC-Date"] = warc_date()

@@ -93,24 +108,13 @@ def spider_closed(self) -> None:
wacz_creator = WaczFileCreator(warc_fname=self.writer.warc_fname, store=self.store)
wacz_creator.create_wacz()

def _get_context_variables(self):
current_date = datetime.now()

return {
"year": current_date.strftime("%Y"),
"month": current_date.strftime("%m"),
"day": current_date.strftime("%d"),
"timestamp": current_date.strftime("%Y%m%d%H%M%S"),
}

def _get_store(self):
archive_uri_template = self.settings["ARCHIVE_EXPORT_URI"]
uri = archive_uri_template.format(**self._get_context_variables())

if Path(uri).is_absolute(): # to support win32 paths like: C:\\some\dir
scheme = "file"
else:
scheme = urlparse(uri).scheme
def get_archive_uri_template_variables() -> dict:
current_date = datetime.now()

store_cls = self.STORE_SCHEMES[scheme]
return store_cls(uri)
return {
"year": current_date.strftime("%Y"),
"month": current_date.strftime("%m"),
"day": current_date.strftime("%d"),
"timestamp": current_date.strftime("%Y%m%d%H%M%S"),
}
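
For context, a minimal sketch of how this exporter might be enabled in a project's settings.py. The extension path, the ARCHIVE_EXPORT_URI setting, and the {year}/{month}/{day}/{timestamp} placeholders follow the code above; the bucket name and priority value are placeholders, not taken from this commit:

# Hypothetical settings.py snippet; not part of this commit.
EXTENSIONS = {
    "scrapy_webarchive.extensions.WaczExporter": 543,  # priority value is arbitrary here
}

# The URI scheme selects the files store via STORE_SCHEMES above:
# "" or "file" -> FSFilesStore, "s3" -> S3FilesStore, "gs" -> GCSFilesStore, "ftp" -> FTPFilesStore.
ARCHIVE_EXPORT_URI = "s3://example-bucket/wacz/{year}/{month}/{day}/"
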
37 changes: 18 additions & 19 deletions scrapy_webarchive/middleware.py
@@ -52,26 +52,25 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spider
if not self.crawl:
for request in start_requests:
yield request
else: # ignore original start requests, just yield all responses found
for entry in self.wacz.iter_index():
url = entry["url"]

# ignore original start requests, just yield all responses found
for entry in self.wacz.iter_index():
url = entry["url"]
# filter out off-site responses
if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
continue

# filter out off-site responses
if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains:
continue
# only accept allowed responses if requested by spider
if hasattr(spider, "archive_regex") and not re.search(spider.archive_regex, url):
continue

# only accept whitelisted responses if requested by spider
if hasattr(spider, "archive_regexp") and not re.search(spider.archive_regexp, url):
continue
self.stats.inc_value("wacz/start_request_count", spider=spider)

self.stats.inc_value("wacz/start_request_count", spider=spider)

# do not filter to allow all occurrences to be handled
# since we don't yet get all information for the request, this can be necessary
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"wacz_index_entry": entry},
dont_filter=True,
)
# do not filter to allow all occurrences to be handled
# since we don't yet get all information for the request, this can be necessary
yield record_transformer.request_for_record(
entry,
flags=["wacz_start_request"],
meta={"wacz_index_entry": entry},
dont_filter=True,
)
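
For context, a spider sketch showing the attributes this middleware inspects when the archive crawl is enabled. The allowed_domains and archive_regex attributes and the WACZ_* setting names mirror this commit and its tests; the spider name, source path, and parse callback are hypothetical:

# Hypothetical spider; only the inspected attributes are meaningful here.
from scrapy import Spider


class QuotesArchiveSpider(Spider):
    name = "quotes_archive"
    allowed_domains = ["quotes.toscrape.com"]
    # only archived URLs matching this pattern are yielded as start requests
    archive_regex = r"https://quotes\.toscrape\.com/page/\d+/"

    custom_settings = {
        "WACZ_SOURCE_URL": "file:///data/warc_1_1/quotes.wacz.gz",  # placeholder path
        "WACZ_CRAWL": True,  # iterate the WACZ index instead of the original start requests
    }

    def parse(self, response):
        yield {"url": response.url}
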
1 change: 1 addition & 0 deletions tests/data/warc_1_1/README.md
@@ -0,0 +1 @@
Example data was generated using https://github.com/webrecorder/archiveweb.page v0.12.8. The data in this folder is based on the WARC Format 1.1 (https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/)
tests/data/quotes.wacz.gz → tests/data/warc_1_1/quotes.wacz.gz: File renamed without changes.
6 changes: 3 additions & 3 deletions tests/test_downloadermiddlewares.py
@@ -14,16 +14,16 @@ class TestWaczMiddleware:
def setup_method(self):
self.crawler = get_crawler(Spider)
self.spider = self.crawler._create_spider("quotes")

def _get_settings(self, **new_settings):
settings = {
"WACZ_SOURCE_URL": get_test_data_path("quotes.wacz.gz").as_uri(),
"WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
"WACZ_CRAWL": False,
"WACZ_TIMEOUT": 60,
}
settings.update(new_settings)
return Settings(settings)

@contextmanager
def _middleware(self, **new_settings):
settings = self._get_settings(**new_settings)
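Both test modules import get_test_data_path from the tests package. Its implementation is not part of this diff; a plausible sketch, assuming the fixtures live under tests/data/:

# Hypothetical tests/__init__.py helper; the actual implementation is not shown in this commit.
from pathlib import Path


def get_test_data_path(*parts: str) -> Path:
    """Return an absolute path to a fixture under tests/data/."""
    return Path(__file__).resolve().parent.joinpath("data", *parts)
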
40 changes: 40 additions & 0 deletions tests/test_extensions.py
@@ -0,0 +1,40 @@
from unittest import mock

import pytest
from scrapy.exceptions import NotConfigured
from scrapy.pipelines.files import FSFilesStore, FTPFilesStore, GCSFilesStore, S3FilesStore
from scrapy.utils.test import get_crawler

from scrapy_webarchive.extensions import WaczExporter


class TestWaczExporterExtension:
def test_archive_export_uri_invalid_raises_not_configured(self):
crawler = get_crawler(settings_dict={})
with pytest.raises(NotConfigured):
WaczExporter.from_crawler(crawler)

@mock.patch('scrapy_webarchive.extensions.S3FilesStore.__init__', return_value=None)
@mock.patch('scrapy_webarchive.extensions.GCSFilesStore.__init__', return_value=None)
@mock.patch('scrapy_webarchive.extensions.FTPFilesStore.__init__', return_value=None)
@mock.patch('scrapy_webarchive.extensions.FSFilesStore.__init__', return_value=None)
def test_get_store(self, *args):
crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"})
crawler.spider = crawler._create_spider("quotes")
extension = WaczExporter.from_crawler(crawler)
assert isinstance(extension.store, FSFilesStore)

crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "s3://scrapy-webarchive/wacz/"})
crawler.spider = crawler._create_spider("quotes")
extension = WaczExporter.from_crawler(crawler)
assert isinstance(extension.store, S3FilesStore)

crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "gs://scrapy-webarchive/wacz/"})
crawler.spider = crawler._create_spider("quotes")
extension = WaczExporter.from_crawler(crawler)
assert isinstance(extension.store, GCSFilesStore)

crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "ftp://scrapy-webarchive/wacz/"})
crawler.spider = crawler._create_spider("quotes")
extension = WaczExporter.from_crawler(crawler)
assert isinstance(extension.store, FTPFilesStore)
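
The four store assertions above could also be written as one parametrized test; an alternative sketch reusing the imports at the top of this file, not what the commit does (the same mock.patch decorators are kept so no real S3/GCS/FTP credentials are needed):

# Alternative formulation, not part of this commit.
@pytest.mark.parametrize(
    "uri, expected_store_cls",
    [
        ("/tmp/scrapy-webarchive/wacz/", FSFilesStore),
        ("s3://scrapy-webarchive/wacz/", S3FilesStore),
        ("gs://scrapy-webarchive/wacz/", GCSFilesStore),
        ("ftp://scrapy-webarchive/wacz/", FTPFilesStore),
    ],
)
@mock.patch('scrapy_webarchive.extensions.S3FilesStore.__init__', return_value=None)
@mock.patch('scrapy_webarchive.extensions.GCSFilesStore.__init__', return_value=None)
@mock.patch('scrapy_webarchive.extensions.FTPFilesStore.__init__', return_value=None)
@mock.patch('scrapy_webarchive.extensions.FSFilesStore.__init__', return_value=None)
def test_get_store_parametrized(mock_fs, mock_ftp, mock_gcs, mock_s3, uri, expected_store_cls):
    crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": uri})
    crawler.spider = crawler._create_spider("quotes")
    extension = WaczExporter.from_crawler(crawler)
    assert isinstance(extension.store, expected_store_cls)
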
56 changes: 56 additions & 0 deletions tests/test_middleware.py
@@ -0,0 +1,56 @@
from contextlib import contextmanager

from scrapy.http.request import Request
from scrapy.settings import Settings
from scrapy.utils.test import get_crawler

from scrapy_webarchive.middleware import WaczCrawlMiddleware

from . import get_test_data_path


class TestWaczCrawlMiddlewareWarc11:
def setup_method(self):
self.crawler = get_crawler()
self.spider = self.crawler._create_spider("quotes")

def _get_settings(self, **new_settings):
settings = {
"WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(),
"WACZ_TIMEOUT": 60,
}
settings.update(new_settings)
return Settings(settings)

@contextmanager
def _middleware(self, **new_settings):
settings = self._get_settings(**new_settings)
mw = WaczCrawlMiddleware(settings, self.crawler.stats)
mw.spider_opened(self.spider)
yield mw

def test_wacz_archive_is_ignored_follow_original_behaviour(self):
request = Request("https://quotes.toscrape.com")

with self._middleware(WACZ_CRAWL=False) as mw:
out = list(mw.process_start_requests([request], self.spider))
assert out == [request]

def test_wacz_archive_iterates_all_records(self):
with self._middleware(WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 101

def test_wacz_archive_filters_allowed_domains(self):
setattr(self.spider, "allowed_domains", "quotes.toscrape.com")

with self._middleware(WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 61

def test_wacz_archive_filters_archive_regex(self):
setattr(self.spider, "archive_regex", r"https://quotes\.toscrape\.com/page/\d+/")

with self._middleware(WACZ_CRAWL=True) as mw:
out = list(mw.process_start_requests([], self.spider))
assert len(out) == 9
