feat: lower min intervals, reduce CI work
wumpus authored Sep 3, 2024
1 parent 666fcaf commit 6578915
Showing 3 changed files with 63 additions and 24 deletions.
4 changes: 2 additions & 2 deletions cdx_toolkit/myrequests.py
@@ -27,11 +27,11 @@ def dns_fatal(hostname):
},
'index.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': 1.0,
},
'data.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': 0.55,
},
'web.archive.org': {
'next_fetch': 0,
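For context, the entries above feed a per-host pacing check before each fetch. Below is a minimal sketch of that idea, assuming a wait_for_host helper and this table shape; it illustrates how minimum_interval throttles requests and is not the module's actual code.

import time

# Illustrative copy of the per-host table, with the new lower intervals.
host_info = {
    'index.commoncrawl.org': {'next_fetch': 0, 'minimum_interval': 1.0},
    'data.commoncrawl.org': {'next_fetch': 0, 'minimum_interval': 0.55},
}

def wait_for_host(hostname):
    '''Hypothetical helper: sleep until this host's next allowed fetch,
    then push next_fetch forward by minimum_interval.'''
    entry = host_info[hostname]
    now = time.time()
    if entry['next_fetch'] > now:
        time.sleep(entry['next_fetch'] - now)
        now = entry['next_fetch']
    entry['next_fetch'] = now + entry['minimum_interval']

wait_for_host('data.commoncrawl.org')  # first call returns immediately
wait_for_host('data.commoncrawl.org')  # second call sleeps about 0.55 s

With the lowered values, back-to-back index queries wait roughly 1 second and data fetches roughly half a second, instead of 3 seconds each.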
77 changes: 57 additions & 20 deletions tests/test_cli.py
@@ -1,10 +1,33 @@
from cdx_toolkit.cli import main

import json
import sys
import os
import platform
import logging

import pytest
import requests

from cdx_toolkit.cli import main

LOGGER = logging.getLogger(__name__)


def slow_ci():
'''
For Github Actions, the windows and macos runners are very slow.
Detect those runners, so that we can cut testing short.
'''
if os.environ.get('FAKE_GITHUB_ACTION'):
LOGGER.error('limiting pytest because FAKE_GITHUB_ACTION')
return True
if os.environ.get('GITHUB_ACTION'):
if platform.system() in {'Darwin', 'Windows'}:
LOGGER.error('limiting pytest because GITHUB_ACTION')
return True
v = sys.version_info
if os.environ.get('GITHUB_ACTION') and v.major == 3 and v.minor != 12:
LOGGER.error('limiting pytest because GITHUB_ACTION and py != 3.12')
return False
LOGGER.error('full pytest')


def test_basics(capsys):
@@ -80,6 +103,8 @@ def test_multi_cc1(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


def test_multi_cc2(capsys, caplog):
@@ -101,9 +126,10 @@ def test_multi_cc2(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


@pytest.mark.skip(reason='needs some ratelimit love XXX')
def test_multi_ia(capsys, caplog):
tests = [
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
@@ -120,12 +146,11 @@ def test_multi_ia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
break # XXX minimize IA for ratelimit purposes


def test_multi_misc_notia(capsys, caplog):
def test_multi_misc_not_ia(capsys, caplog):
tests = [
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
[{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'exception': ValueError}],
[{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
@@ -142,11 +167,14 @@ def test_multi_misc_notia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


@pytest.mark.skip(reason='needs some ratelimit love XXX')
def test_multi_misc_ia(capsys, caplog):
tests = [
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'},
{'count': 1, 'is_int': True}],
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'},
@@ -157,35 +185,44 @@ def test_multi_misc_ia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
break # XXX minimize IA for ratelimit reasons


def test_warc(tmpdir, caplog):
# crash testing only, so far

base = ' --limit 10 warc commoncrawl.org/*'
base = ' --limit 1 warc commoncrawl.org/*'

prefixes = ('-v -v --cc', '--ia',
'--cc --cc-mirror https://index.commoncrawl.org/',
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web')
suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
'--prefix EMPTY --size 1 --url-fgrep bar',
'--prefix EMPTY --size 1 --url-fgrepv common')
prefixes = ( # note limit 2 below
'-v -v --cc', # only case run by slow_ci
'--ia',
'--cc --cc-mirror https://index.commoncrawl.org/',
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web',
)
suffixes = (
'--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
'--prefix EMPTY --size 1 --url-fgrep bar',
'--prefix EMPTY --size 1 --url-fgrepv common'
)

with tmpdir.as_cwd():
for p in prefixes:
if '--ia' in p or 'archive.org' in p:
# XXX skip
continue
cmdline = p + base
if 'cc' in cmdline:
cmdline = cmdline.replace(' 1', ' 2')
print(cmdline, file=sys.stderr)
args = cmdline.split()
main(args=args)
if slow_ci():
break

for s in suffixes:
cmdline = prefixes[0] + base + ' ' + s
print(cmdline, file=sys.stderr)
args = cmdline.split()
main(args=args)
if slow_ci():
break

assert True

@@ -195,11 +232,11 @@ def one_ia_corner(tmpdir, cmdline):
main(args=cmdline.split())


@pytest.mark.skip(reason='needs some ratelimit love XXX')
@pytest.mark.skip(reason='needs some ratelimit love')
def test_warc_ia_corners(tmpdir, caplog):
'''
To test these more properly, need to add a --exact-warcname and then postprocess.
For now, these tests show up in the coverage report
For now, these are only crash tests.
'''

# revisit vivification
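A side note on the new slow_ci() helper: it can be forced on locally via the FAKE_GITHUB_ACTION override it checks first, so the multi_* loops and the warc test break after their first case even off CI. A minimal sketch follows, assuming pytest is installed and the command is run from the repository root; the exact invocation is illustrative.

import os
import subprocess

# Force the slow-runner path that slow_ci() checks before GITHUB_ACTION.
env = dict(os.environ, FAKE_GITHUB_ACTION='1')
subprocess.run(
    ['python', '-m', 'pytest', 'tests/test_cli.py', '-k', 'multi'],
    env=env,
    check=False,  # report failures without raising
)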
6 changes: 4 additions & 2 deletions tests/unit/test_capture_object.py
@@ -6,7 +6,7 @@

def test_capture_object():
cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
#XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG')

url = 'example.com'
@@ -16,10 +16,12 @@ def test_capture_object():
for obj in cdx_only.iter(url, **kwargs):
got_one = True
with pytest.raises(ValueError):
# we don't know how to fetch the content in this situation
_ = obj.content
assert got_one, 'found a capture cdx_only'

for cdx in (cdx_cc, cdx_ia):
#XXX for cdx in (cdx_cc, cdx_ia):
for cdx in (cdx_cc,):
got_one = False
for obj in cdx.iter(url, **kwargs):
got_one = True
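For reference, the pattern this unit test still exercises with the remaining cc fetcher looks roughly like the sketch below; the URL, limit, and printed fields are illustrative rather than taken from the test.

import cdx_toolkit

cdx = cdx_toolkit.CDXFetcher(source='cc')
for obj in cdx.iter('example.com', limit=1):
    print(obj['url'], obj['status'])
    # works for the cc fetcher; the bare --source fetcher above raises ValueError here
    content = obj.content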
