Skip to content

Commit

Permalink
Merge branch 'main' into cc-crawl
Browse files Browse the repository at this point in the history
  • Loading branch information
Greg Lindahl committed Sep 4, 2024
2 parents f026065 + 83d1f31 commit cbd530b
Show file tree
Hide file tree
Showing 6 changed files with 107 additions and 37 deletions.
44 changes: 44 additions & 0 deletions .github/workflows/ci-slow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Slow CI matrix (macOS / Windows runners), run on demand only.
name: CI-slow

on: workflow_dispatch

jobs:
  unit-tests:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      #max-parallel: 1
      matrix:
        include:
          - python-version: '3.11'
            os: macos-latest
            EXTRA: true
          - python-version: '3.12'
            os: macos-latest
            EXTRA: true
          - python-version: '3.7'
            os: windows-latest
            EXTRA: true
          - python-version: '3.12'
            os: windows-latest
            EXTRA: true
    steps:
      - name: checkout
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # GitHub expressions coerce strings to numbers for >=, so the previous
      # condition (matrix.python-version >= '3.12') also matched '3.7'
      # (numerically 3.7 > 3.12). Compare against an explicit version list.
      - name: Install setuptools on python 3.12+
        if: ${{ contains(fromJSON('["3.12", "3.13"]'), matrix.python-version) }}
        run: |
          pip install setuptools
      - name: Install cdx_toolkit
        run: pip install .[test]

      - name: Run tests
        run: |
          make test_coverage
12 changes: 0 additions & 12 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,6 @@ jobs:
EXTRA: true
env:
LOGLEVEL=DEBUG
- python-version: '3.11'
os: macos-latest
EXTRA: true
- python-version: '3.12'
os: macos-latest
EXTRA: true
- python-version: '3.7'
os: windows-latest
EXTRA: true
- python-version: '3.12'
os: windows-latest
EXTRA: true
- python-version: '3.7'
os: ubuntu-20.04 # oldest version on github actions
EXTRA: true
Expand Down
1 change: 0 additions & 1 deletion cdx_toolkit/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def get_cc_endpoints(cc_mirror):
if r.status_code != 200:
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover
set_collinfo_cache(cc_mirror, r.text)
time.sleep(5) # XXX to avoid triggering rate limit
col = r.json()

endpoints = [x['cdx-api'] for x in col]
Expand Down
4 changes: 2 additions & 2 deletions cdx_toolkit/myrequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ def dns_fatal(hostname):
},
'index.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': 1.0,
},
'data.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
'minimum_interval': 0.55,
},
'web.archive.org': {
'next_fetch': 0,
Expand Down
77 changes: 57 additions & 20 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,33 @@
from cdx_toolkit.cli import main

import json
import sys
import os
import platform
import logging

import pytest
import requests

from cdx_toolkit.cli import main

LOGGER = logging.getLogger(__name__)


def slow_ci():
    '''
    For Github Actions, the windows and macos runners are very slow.
    Detect those runners, so that we can cut testing short.

    Returns True when testing should be cut short, False otherwise.
    '''
    logger = logging.getLogger(__name__)
    if os.environ.get('FAKE_GITHUB_ACTION'):
        # test hook: lets local runs exercise the "limited" code path
        logger.error('limiting pytest because FAKE_GITHUB_ACTION')
        return True
    if os.environ.get('GITHUB_ACTION'):
        if platform.system() in {'Darwin', 'Windows'}:
            logger.error('limiting pytest because GITHUB_ACTION')
            return True
    v = sys.version_info
    if os.environ.get('GITHUB_ACTION') and v.major == 3 and v.minor != 12:
        # NOTE(review): message says "limiting" but this returns False (no
        # limiting happens); preserved as-is — TODO confirm intended return
        logger.error('limiting pytest because GITHUB_ACTION and py != 3.12')
        return False
    logger.error('full pytest')
    # was an implicit None (falsy); make the no-limit result explicit
    return False


def test_basics(capsys):
Expand Down Expand Up @@ -80,6 +103,8 @@ def test_multi_cc1(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


def test_multi_cc2(capsys, caplog):
Expand All @@ -101,9 +126,10 @@ def test_multi_cc2(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


@pytest.mark.skip(reason='needs some ratelimit love XXX')
def test_multi_ia(capsys, caplog):
tests = [
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
Expand All @@ -120,12 +146,11 @@ def test_multi_ia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
break # XXX minimize IA for ratelimit purposes


def test_multi_misc_notia(capsys, caplog):
def test_multi_misc_not_ia(capsys, caplog):
tests = [
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
[{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'exception': ValueError}],
[{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
Expand All @@ -142,11 +167,14 @@ def test_multi_misc_notia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
if slow_ci():
break


@pytest.mark.skip(reason='needs some ratelimit love XXX')
def test_multi_misc_ia(capsys, caplog):
tests = [
[{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
{'count': 10, 'linefgrep': 'commoncrawl.org'}],
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'},
{'count': 1, 'is_int': True}],
[{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'},
Expand All @@ -157,35 +185,44 @@ def test_multi_misc_ia(capsys, caplog):

for t in tests:
multi_helper(t, capsys, caplog)
break # XXX minimize IA for ratelimit reasons


def test_warc(tmpdir, caplog):
# crash testing only, so far

base = ' --limit 10 warc commoncrawl.org/*'
base = ' --limit 1 warc commoncrawl.org/*'

prefixes = ('-v -v --cc', '--ia',
'--cc --cc-mirror https://index.commoncrawl.org/',
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web')
suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
'--prefix EMPTY --size 1 --url-fgrep bar',
'--prefix EMPTY --size 1 --url-fgrepv common')
prefixes = ( # note limit 2 below
'-v -v --cc', # only case run by slow_cli
'--ia',
'--cc --cc-mirror https://index.commoncrawl.org/',
'--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web',
)
suffixes = (
'--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
'--prefix EMPTY --size 1 --url-fgrep bar',
'--prefix EMPTY --size 1 --url-fgrepv common'
)

with tmpdir.as_cwd():
for p in prefixes:
if '--ia' in p or 'archive.org' in p:
# XXX skip
continue
cmdline = p + base
if 'cc' in cmdline:
cmdline = cmdline.replace(' 1', ' 2')
print(cmdline, file=sys.stderr)
args = cmdline.split()
main(args=args)
if slow_ci():
break

for s in suffixes:
cmdline = prefixes[0] + base + ' ' + s
print(cmdline, file=sys.stderr)
args = cmdline.split()
main(args=args)
if slow_ci():
break

assert True

Expand All @@ -195,11 +232,11 @@ def one_ia_corner(tmpdir, cmdline):
main(args=cmdline.split())


@pytest.mark.skip(reason='needs some ratelimit love XXX')
@pytest.mark.skip(reason='needs some ratelimit love')
def test_warc_ia_corners(tmpdir, caplog):
'''
To test these more properly, need to add a --exact-warcname and then postprocess.
For now, these tests show up in the coverage report
For now, these are only crash tests.
'''

# revisit vivification
Expand Down
6 changes: 4 additions & 2 deletions tests/unit/test_capture_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_capture_object():
cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
#XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG')

url = 'example.com'
Expand All @@ -16,10 +16,12 @@ def test_capture_object():
for obj in cdx_only.iter(url, **kwargs):
got_one = True
with pytest.raises(ValueError):
# we don't know how to fetch the content in this situation
_ = obj.content
assert got_one, 'found a capture cdx_only'

for cdx in (cdx_cc, cdx_ia):
#XXX for cdx in (cdx_cc, cdx_ia):
for cdx in (cdx_cc,):
got_one = False
for obj in cdx.iter(url, **kwargs):
got_one = True
Expand Down

0 comments on commit cbd530b

Please sign in to comment.