Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve ci #36

Merged
merged 3 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cdx_toolkit/myrequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ def dns_fatal(hostname):
},
'index.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': 1.0,
},
'data.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': 0.55,
},
'web.archive.org': {
'next_fetch': 0,
Expand Down
77 changes: 57 additions & 20 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
from cdx_toolkit.cli import main

import json
import sys
import os
import platform
import logging

import pytest
import requests

from cdx_toolkit.cli import main

LOGGER = logging.getLogger(__name__)


def slow_ci():
    '''
    Decide whether the calling test should cut its work short.

    For GitHub Actions, the windows and macos runners are very slow.
    Detect those runners, so that we can cut testing short. The full
    suite is likewise limited to a single Python version (3.12) when
    running under GitHub Actions.

    Environment variables consulted:
      FAKE_GITHUB_ACTION -- any non-empty value forces the limited mode
      GITHUB_ACTION      -- any non-empty value is taken to mean that we
                            are running inside GitHub Actions

    Returns True when testing should be limited, False for a full run.
    '''
    # NOTE(review): messages are logged at error level, presumably so they
    # show up in default pytest output -- confirm before lowering the level.
    if os.environ.get('FAKE_GITHUB_ACTION'):
        LOGGER.error('limiting pytest because FAKE_GITHUB_ACTION')
        return True

    if os.environ.get('GITHUB_ACTION'):
        if platform.system() in {'Darwin', 'Windows'}:
            LOGGER.error('limiting pytest because GITHUB_ACTION')
            return True

        v = sys.version_info
        if v.major == 3 and v.minor != 12:
            # This branch used to return False, contradicting its own
            # log message and leaving the limit unapplied.
            LOGGER.error('limiting pytest because GITHUB_ACTION and py != 3.12')
            return True

    LOGGER.error('full pytest')
    # Explicit False instead of the previous implicit None.
    return False


def test_basics(capsys):
Expand Down Expand Up @@ -80,6 +103,8 @@ def test_multi_cc1(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


def test_multi_cc2(capsys, caplog):
Expand All @@ -101,9 +126,10 @@ def test_multi_cc2(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


@pytest.mark.skip(reason='needs some ratelimit love XXX')
def test_multi_ia(capsys, caplog):
tests = [
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
Expand All @@ -120,12 +146,11 @@ def test_multi_ia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
break # XXX minimize IA for ratelimit purposes


def test_multi_misc_notia(capsys, caplog):
def test_multi_misc_not_ia(capsys, caplog):
tests = [
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
[{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'exception': ValueError}],
[{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
Expand All @@ -142,11 +167,14 @@ def test_multi_misc_notia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


@pytest.mark.skip(reason='needs some ratelimit love XXX')
def test_multi_misc_ia(capsys, caplog):
tests = [
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'},
{'count': 1, 'is_int': True}],
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'},
Expand All @@ -157,35 +185,44 @@ def test_multi_misc_ia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
break # XXX minimize IA for ratelimit reasons


def test_warc(tmpdir, caplog):
# crash testing only, so far

base = ' --limit 10 warc commoncrawl.org/*'
base = ' --limit 1 warc commoncrawl.org/*'

prefixes = ('-v -v --cc', '--ia',
'--cc --cc-mirror https://index.commoncrawl.org/',
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web')
suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
'--prefix EMPTY --size 1 --url-fgrep bar',
'--prefix EMPTY --size 1 --url-fgrepv common')
prefixes = ( # note limit 2 below
'-v -v --cc', # only case run by slow_ci
'--ia',
'--cc --cc-mirror https://index.commoncrawl.org/',
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web',
)
suffixes = (
'--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
'--prefix EMPTY --size 1 --url-fgrep bar',
'--prefix EMPTY --size 1 --url-fgrepv common'
)

with tmpdir.as_cwd():
for p in prefixes:
if '--ia' in p or 'archive.org' in p:
# XXX skip
continue
cmdline = p + base
if 'cc' in cmdline:
cmdline = cmdline.replace(' 1', ' 2')
print(cmdline, file=sys.stderr)
args = cmdline.split()
main(args=args)
if slow_ci():
break

for s in suffixes:
cmdline = prefixes[0] + base + ' ' + s
print(cmdline, file=sys.stderr)
args = cmdline.split()
main(args=args)
if slow_ci():
break

assert True

Expand All @@ -195,11 +232,11 @@ def one_ia_corner(tmpdir, cmdline):
main(args=cmdline.split())


@pytest.mark.skip(reason='needs some ratelimit love XXX')
@pytest.mark.skip(reason='needs some ratelimit love')
def test_warc_ia_corners(tmpdir, caplog):
'''
To test these more properly, need to add a --exact-warcname and then postprocess.
For now, these tests show up in the coverage report
For now, these are only crash tests.
'''

# revisit vivification
Expand Down
6 changes: 4 additions & 2 deletions tests/unit/test_capture_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_capture_object():
cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
#XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG')

url = 'example.com'
Expand All @@ -16,10 +16,12 @@ def test_capture_object():
for obj in cdx_only.iter(url, **kwargs):
got_one = True
with pytest.raises(ValueError):
# we don't know how to fetch the content in this situation
_ = obj.content
assert got_one, 'found a capture cdx_only'

for cdx in (cdx_cc, cdx_ia):
#XXX for cdx in (cdx_cc, cdx_ia):
for cdx in (cdx_cc,):
got_one = False
for obj in cdx.iter(url, **kwargs):
got_one = True
Expand Down