From 6578915b4f0a88ffcbe8fa402bfe2438e6778833 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Tue, 3 Sep 2024 20:11:47 +0000 Subject: [PATCH 1/3] feat: lower min intervals, reduce CI work --- cdx_toolkit/myrequests.py | 4 +- tests/test_cli.py | 77 +++++++++++++++++++++++-------- tests/unit/test_capture_object.py | 6 ++- 3 files changed, 63 insertions(+), 24 deletions(-) diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py index 94fa726..5f8ab07 100644 --- a/cdx_toolkit/myrequests.py +++ b/cdx_toolkit/myrequests.py @@ -27,11 +27,11 @@ def dns_fatal(hostname): }, 'index.commoncrawl.org': { 'next_fetch': 0, - 'minimum_interval': 3.0, + 'minimum_interval': 1.0, }, 'data.commoncrawl.org': { 'next_fetch': 0, - 'minimum_interval': 3.0, + 'minimum_interval': 0.55, }, 'web.archive.org': { 'next_fetch': 0, diff --git a/tests/test_cli.py b/tests/test_cli.py index d3c3c05..2284e56 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,10 +1,33 @@ -from cdx_toolkit.cli import main - import json import sys +import os +import platform +import logging import pytest -import requests + +from cdx_toolkit.cli import main + +LOGGER = logging.getLogger(__name__) + + +def slow_ci(): + ''' + For Github Actions, the windows and macos runners are very slow. + Detect those runners, so that we can cut testing short. + ''' + if os.environ.get('FAKE_GITHUB_ACTION'): + LOGGER.error('limiting pytest because FAKE_GITHUB_ACTION') + return True + if os.environ.get('GITHUB_ACTION'): + if platform.system() in {'Darwin', 'Windows'}: + LOGGER.error('limiting pytest because GITHUB_ACTION') + return True + v = sys.version_info + if os.environ.get('GITHUB_ACTION') and v.major == 3 and v.minor != 12: + LOGGER.error('limiting pytest because GITHUB_ACTION and py != 3.12') + return False + LOGGER.error('full pytest') def test_basics(capsys): @@ -80,6 +103,8 @@ def test_multi_cc1(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + if slow_ci(): + break def test_multi_cc2(capsys, caplog): @@ -101,9 +126,10 @@ def test_multi_cc2(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + if slow_ci(): + break -@pytest.mark.skip(reason='needs some ratelimit love XXX') def test_multi_ia(capsys, caplog): tests = [ [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, @@ -120,12 +146,11 @@ def test_multi_ia(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + break # XXX minimize IA for ratelimit purposes -def test_multi_misc_notia(capsys, caplog): +def test_multi_misc_not_ia(capsys, caplog): tests = [ - [{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, - {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'exception': ValueError}], [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, @@ -142,11 +167,14 @@ def test_multi_misc_notia(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + if slow_ci(): + break -@pytest.mark.skip(reason='needs some ratelimit love XXX') def test_multi_misc_ia(capsys, caplog): tests = [ + [{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'}, {'count': 1, 'is_int': True}], [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'}, @@ -157,35 +185,44 @@ def test_multi_misc_ia(capsys, caplog): for t in tests: multi_helper(t, capsys, caplog) + break # XXX minimize IA for ratelimit reasons def test_warc(tmpdir, caplog): # crash testing only, so far - base = ' --limit 10 warc commoncrawl.org/*' + base = ' --limit 1 warc commoncrawl.org/*' - prefixes = ('-v -v --cc', '--ia', - '--cc --cc-mirror https://index.commoncrawl.org/', - '--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web') - suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar', - '--prefix EMPTY --size 1 --url-fgrep bar', - '--prefix EMPTY --size 1 --url-fgrepv common') + prefixes = ( # note limit 2 below + '-v -v --cc', # only case run by slow_cli + '--ia', + '--cc --cc-mirror https://index.commoncrawl.org/', + '--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web', + ) + suffixes = ( + '--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar', + '--prefix EMPTY --size 1 --url-fgrep bar', + '--prefix EMPTY --size 1 --url-fgrepv common' + ) with tmpdir.as_cwd(): for p in prefixes: - if '--ia' in p or 'archive.org' in p: - # XXX skip - continue cmdline = p + base + if 'cc' in cmdline: + cmdline = cmdline.replace(' 1', ' 2') print(cmdline, file=sys.stderr) args = cmdline.split() main(args=args) + if slow_ci(): + break for s in suffixes: cmdline = prefixes[0] + base + ' ' + s print(cmdline, file=sys.stderr) args = cmdline.split() main(args=args) + if slow_ci(): + break assert True @@ -195,11 +232,11 @@ def one_ia_corner(tmpdir, cmdline): main(args=cmdline.split()) -@pytest.mark.skip(reason='needs some ratelimit love XXX') +@pytest.mark.skip(reason='needs some ratelimit love') def test_warc_ia_corners(tmpdir, caplog): ''' To test these more properly, need to add a --exact-warcname and then postprocess. - For now, these tests show up in the coverage report + For now, these are only crash tests. ''' # revisit vivification diff --git a/tests/unit/test_capture_object.py b/tests/unit/test_capture_object.py index 1aa7c75..ff7892a 100644 --- a/tests/unit/test_capture_object.py +++ b/tests/unit/test_capture_object.py @@ -6,7 +6,7 @@ def test_capture_object(): cdx_cc = cdx_toolkit.CDXFetcher(source='cc') - cdx_ia = cdx_toolkit.CDXFetcher(source='ia') + #XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia') cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG') url = 'example.com' @@ -16,10 +16,12 @@ def test_capture_object(): for obj in cdx_only.iter(url, **kwargs): got_one = True with pytest.raises(ValueError): + # we don't know how to fetch the content in this situation _ = obj.content assert got_one, 'found a capture cdx_only' - for cdx in (cdx_cc, cdx_ia): + #XXX for cdx in (cdx_cc, cdx_ia): + for cdx in (cdx_cc,): got_one = False for obj in cdx.iter(url, **kwargs): got_one = True From 2c74dfd49382d809d8c00f25b8b571214a805320 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Wed, 4 Sep 2024 04:40:02 +0000 Subject: [PATCH 2/3] fix: remove unnecessary sleep (#37) --- cdx_toolkit/commoncrawl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index a8608c7..2d6b9e5 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -60,7 +60,6 @@ def get_cc_endpoints(cc_mirror): if r.status_code != 200: raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover set_collinfo_cache(cc_mirror, r.text) - time.sleep(5) # XXX to avoid triggering rate limit col = r.json() endpoints = [x['cdx-api'] for x in col] From 83d1f31d5bb274c595a63e0d9dedbfff114e8c2a Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Wed, 4 Sep 2024 06:31:59 +0000 Subject: [PATCH 3/3] feat: split CI into normal and slow (#38) --- .github/workflows/ci-slow.yaml | 44 ++++++++++++++++++++++++++++++++++ .github/workflows/ci.yaml | 12 ---------- 2 files changed, 44 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/ci-slow.yaml diff --git a/.github/workflows/ci-slow.yaml b/.github/workflows/ci-slow.yaml new file mode 100644 index 0000000..76f8ca2 --- /dev/null +++ b/.github/workflows/ci-slow.yaml @@ -0,0 +1,44 @@ +name: CI-slow + +on: workflow_dispatch + +jobs: + unit-tests: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + #max-parallel: 1 + matrix: + include: + - python-version: '3.11' + os: macos-latest + EXTRA: true + - python-version: '3.12' + os: macos-latest + EXTRA: true + - python-version: '3.7' + os: windows-latest + EXTRA: true + - python-version: '3.12' + os: windows-latest + EXTRA: true + steps: + - name: checkout + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install setuptools on python 3.12+ + if: ${{ matrix.python-version >= '3.12' }} + run: | + pip install setuptools + + - name: Install cdx_toolkit + run: pip install .[test] + + - name: Run tests + run: | + make test_coverage diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index c78705e..4875825 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,18 +25,6 @@ jobs: EXTRA: true env: LOGLEVEL=DEBUG - - python-version: '3.11' - os: macos-latest - EXTRA: true - - python-version: '3.12' - os: macos-latest - EXTRA: true - - python-version: '3.7' - os: windows-latest - EXTRA: true - - python-version: '3.12' - os: windows-latest - EXTRA: true - python-version: '3.7' os: ubuntu-20.04 # oldest version on github actions EXTRA: true