From e4dea396328b5ac1b98926736754d579eacdf2fa Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Tue, 3 Sep 2024 06:04:21 +0000 Subject: [PATCH 1/3] improve ci --- tests/test_cli.py | 68 ++++++++++++++++++++++--------- tests/unit/test_capture_object.py | 6 ++- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index d3c3c05..779802c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,10 +1,24 @@ -from cdx_toolkit.cli import main - import json import sys +import os +import platform import pytest -import requests + +from cdx_toolkit.cli import main + + +def slow_ci(): + ''' + For Github Actions, the windows and macos runners are very slow. + Detect those runners, so that we can cut testing short. + ''' + if os.environ.get('FAKE_GITHUB_ACTION'): + return True + if os.environ.get('GITHUB_ACTION'): + if platform.system() in {'Darwin', 'Windows'}: + return True + def test_basics(capsys): @@ -80,6 +94,8 @@ def test_multi_cc1(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + if slow_ci(): + break def test_multi_cc2(capsys, caplog): @@ -101,9 +117,10 @@ def test_multi_cc2(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + if slow_ci(): + break -@pytest.mark.skip(reason='needs some ratelimit love XXX') def test_multi_ia(capsys, caplog): tests = [ [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, @@ -120,12 +137,11 @@ def test_multi_ia(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + break # XXX minimize IA for ratelimit purposes -def test_multi_misc_notia(capsys, caplog): +def test_multi_misc_not_ia(capsys, caplog): tests = [ - [{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, - {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 
'commoncrawl.org/*'}, {'exception': ValueError}], [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, @@ -142,11 +158,14 @@ def test_multi_misc_notia(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + if slow_ci(): + break -@pytest.mark.skip(reason='needs some ratelimit love XXX') def test_multi_misc_ia(capsys, caplog): tests = [ + [{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'}, {'count': 1, 'is_int': True}], [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'}, @@ -157,35 +176,44 @@ def test_multi_misc_ia(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + break # XXX minimize IA for ratelimit reasons def test_warc(tmpdir, caplog): # crash testing only, so far - base = ' --limit 10 warc commoncrawl.org/*' + base = ' --limit 1 warc commoncrawl.org/*' - prefixes = ('-v -v --cc', '--ia', - '--cc --cc-mirror https://index.commoncrawl.org/', - '--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web') - suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar', - '--prefix EMPTY --size 1 --url-fgrep bar', - '--prefix EMPTY --size 1 --url-fgrepv common') + prefixes = ( # note limit 2 below + '-v -v --cc', # only case run by slow_ci + '--ia', + '--cc --cc-mirror https://index.commoncrawl.org/', + '--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web', + ) + suffixes = ( + '--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar', + '--prefix EMPTY --size 1 --url-fgrep bar', + '--prefix EMPTY --size 1 --url-fgrepv common' + ) with tmpdir.as_cwd(): 
for p in prefixes: - if '--ia' in p or 'archive.org' in p: - # XXX skip - continue cmdline = p + base + if 'cc' in cmdline: + cmdline = cmdline.replace(' 1', ' 2') print(cmdline, file=sys.stderr) args = cmdline.split() main(args=args) + if slow_ci(): + break for s in suffixes: cmdline = prefixes[0] + base + ' ' + s print(cmdline, file=sys.stderr) args = cmdline.split() main(args=args) + if slow_ci(): + break assert True @@ -195,11 +223,11 @@ def one_ia_corner(tmpdir, cmdline): main(args=cmdline.split()) -@pytest.mark.skip(reason='needs some ratelimit love XXX') +@pytest.mark.skip(reason='needs some ratelimit love') def test_warc_ia_corners(tmpdir, caplog): ''' To test these more properly, need to add a --exact-warcname and then postprocess. - For now, these tests show up in the coverage report + For now, these are only crash tests. ''' # revisit vivification diff --git a/tests/unit/test_capture_object.py b/tests/unit/test_capture_object.py index 1aa7c75..ff7892a 100644 --- a/tests/unit/test_capture_object.py +++ b/tests/unit/test_capture_object.py @@ -6,7 +6,7 @@ def test_capture_object(): cdx_cc = cdx_toolkit.CDXFetcher(source='cc') - cdx_ia = cdx_toolkit.CDXFetcher(source='ia') + #XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia') cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG') url = 'example.com' @@ -16,10 +16,12 @@ def test_capture_object(): for obj in cdx_only.iter(url, **kwargs): got_one = True with pytest.raises(ValueError): + # we don't know how to fetch the content in this situation _ = obj.content assert got_one, 'found a capture cdx_only' - for cdx in (cdx_cc, cdx_ia): + #XXX for cdx in (cdx_cc, cdx_ia): + for cdx in (cdx_cc,): got_one = False for obj in cdx.iter(url, **kwargs): got_one = True From ca0a775a7aabd2257d13a927feebcb14614751aa Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Tue, 3 Sep 2024 07:04:52 +0000 Subject: [PATCH 2/3] improve ci --- tests/test_cli.py | 11 ++++++++++- 1 file changed, 
10 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 779802c..2284e56 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,11 +2,14 @@ import sys import os import platform +import logging import pytest from cdx_toolkit.cli import main +LOGGER = logging.getLogger(__name__) + def slow_ci(): ''' @@ -14,11 +17,17 @@ def slow_ci(): Detect those runners, so that we can cut testing short. ''' if os.environ.get('FAKE_GITHUB_ACTION'): + LOGGER.error('limiting pytest because FAKE_GITHUB_ACTION') return True if os.environ.get('GITHUB_ACTION'): if platform.system() in {'Darwin', 'Windows'}: + LOGGER.error('limiting pytest because GITHUB_ACTION') return True - + v = sys.version_info + if os.environ.get('GITHUB_ACTION') and v.major == 3 and v.minor != 12: + LOGGER.error('limiting pytest because GITHUB_ACTION and py != 3.12') + return True + LOGGER.error('full pytest') def test_basics(capsys): From 567722d1df5395e46c48b5909ff7f63d88a8c3f7 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Tue, 3 Sep 2024 17:58:22 +0000 Subject: [PATCH 3/3] change minimum intervals --- cdx_toolkit/myrequests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py index 94fa726..5f8ab07 100644 --- a/cdx_toolkit/myrequests.py +++ b/cdx_toolkit/myrequests.py @@ -27,11 +27,11 @@ def dns_fatal(hostname): }, 'index.commoncrawl.org': { 'next_fetch': 0, - 'minimum_interval': 3.0, + 'minimum_interval': 1.0, }, 'data.commoncrawl.org': { 'next_fetch': 0, - 'minimum_interval': 3.0, + 'minimum_interval': 0.55, }, 'web.archive.org': { 'next_fetch': 0,