cocrawler · wumpus · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py
@@ -27,11 +27,11 @@ def dns_fatal(hostname):
     },
     'index.commoncrawl.org': {
         'next_fetch': 0,
-        'minimum_interval': 3.0,
+        'minimum_interval': 1.0,
     },
     'data.commoncrawl.org': {
         'next_fetch': 0,
-        'minimum_interval': 3.0,
+        'minimum_interval': 0.55,
     },
     'web.archive.org': {
         'next_fetch': 0,

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,10 +1,33 @@
-from cdx_toolkit.cli import main
-
 import json
 import sys
+import os
+import platform
+import logging
 
 import pytest
-import requests
+
+from cdx_toolkit.cli import main
+
+LOGGER = logging.getLogger(__name__)
+
+
+def slow_ci():
+    '''
+    Return True, logging why, if tests should be cut short: FAKE_GITHUB_ACTION is
+    set, or GITHUB_ACTION is set on Darwin/Windows (slow runners) or on py != 3.12.
+    '''
+    if os.environ.get('FAKE_GITHUB_ACTION'):
+        LOGGER.error('limiting pytest because FAKE_GITHUB_ACTION')
+        return True
+    if os.environ.get('GITHUB_ACTION'):
+        if platform.system() in {'Darwin', 'Windows'}:
+            LOGGER.error('limiting pytest because GITHUB_ACTION')
+            return True
+        if sys.version_info.major == 3 and sys.version_info.minor != 12:
+            LOGGER.error('limiting pytest because GITHUB_ACTION and py != 3.12')
+            return True  # must agree with the 'limiting' message logged above
+    LOGGER.error('full pytest')
+    return False
 
 
 def test_basics(capsys):
@@ -80,6 +103,8 @@ def test_multi_cc1(capsys, caplog):
 
     for t in tests:
         multi_helper(t, capsys, caplog)
+        if slow_ci():
+            break
 
 
 def test_multi_cc2(capsys, caplog):
@@ -101,9 +126,10 @@ def test_multi_cc2(capsys, caplog):
 
     for t in tests:
         multi_helper(t, capsys, caplog)
+        if slow_ci():
+            break
 
 
-@pytest.mark.skip(reason='needs some ratelimit love XXX')
 def test_multi_ia(capsys, caplog):
     tests = [
         [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
@@ -120,12 +146,11 @@ def test_multi_ia(capsys, caplog):
 
     for t in tests:
         multi_helper(t, capsys, caplog)
+        break  # XXX minimize IA for ratelimit purposes
 
 
-def test_multi_misc_notia(capsys, caplog):
+def test_multi_misc_not_ia(capsys, caplog):
     tests = [
-        [{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
-         {'count': 10, 'linefgrep': 'commoncrawl.org'}],
         [{'service': '-v -v --source https://web.arc4567hive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
          {'exception': ValueError}],
         [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
@@ -142,11 +167,14 @@ def test_multi_misc_notia(capsys, caplog):
 
     for t in tests:
         multi_helper(t, capsys, caplog)
+        if slow_ci():
+            break
 
 
-@pytest.mark.skip(reason='needs some ratelimit love XXX')
 def test_multi_misc_ia(capsys, caplog):
     tests = [
+        [{'service': '--source https://web.archive.org/cdx/search/cdx', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'},
+         {'count': 10, 'linefgrep': 'commoncrawl.org'}],
         [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'},
          {'count': 1, 'is_int': True}],
         [{'service': '--ia', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'},
@@ -157,35 +185,44 @@ def test_multi_misc_ia(capsys, caplog):
 
     for t in tests:
         multi_helper(t, capsys, caplog)
+        break  # XXX minimize IA for ratelimit reasons
 
 
 def test_warc(tmpdir, caplog):
     # crash testing only, so far
 
-    base = ' --limit 10 warc commoncrawl.org/*'
+    base = ' --limit 1 warc commoncrawl.org/*'
 
-    prefixes = ('-v -v --cc', '--ia',
-                '--cc --cc-mirror https://index.commoncrawl.org/',
-                '--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web')
-    suffixes = ('--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
-                '--prefix EMPTY --size 1 --url-fgrep bar',
-                '--prefix EMPTY --size 1 --url-fgrepv common')
+    prefixes = (  # note: --limit is bumped from 1 to 2 for the cc prefixes below
+        '-v -v --cc',  # only case run when slow_ci() is true
+        '--ia',
+        '--cc --cc-mirror https://index.commoncrawl.org/',
+        '--source https://web.archive.org/cdx/search/cdx --wb https://web.archive.org/web',
+    )
+    suffixes = (
+        '--prefix FOO --subprefix BAR --size 1 --creator creator --operator bob --url-fgrep common --url-fgrepv bar',
+        '--prefix EMPTY --size 1 --url-fgrep bar',
+        '--prefix EMPTY --size 1 --url-fgrepv common'
+    )
 
     with tmpdir.as_cwd():
         for p in prefixes:
-            if '--ia' in p or 'archive.org' in p:
-                # XXX skip
-                continue
             cmdline = p + base
+            if 'cc' in cmdline:
+                cmdline = cmdline.replace(' 1', ' 2')
             print(cmdline, file=sys.stderr)
             args = cmdline.split()
             main(args=args)
+            if slow_ci():
+                break
 
         for s in suffixes:
             cmdline = prefixes[0] + base + ' ' + s
             print(cmdline, file=sys.stderr)
             args = cmdline.split()
             main(args=args)
+            if slow_ci():
+                break
 
         assert True
 
@@ -195,11 +232,11 @@ def one_ia_corner(tmpdir, cmdline):
         main(args=cmdline.split())
 
 
-@pytest.mark.skip(reason='needs some ratelimit love XXX')
+@pytest.mark.skip(reason='needs some ratelimit love')
 def test_warc_ia_corners(tmpdir, caplog):
     '''
     To test these more properly, need to add a --exact-warcname and then postprocess.
-    For now, these tests show up in the coverage report
+    For now, these are only crash tests.
     '''
 
     # revisit vivification

diff --git a/tests/unit/test_capture_object.py b/tests/unit/test_capture_object.py
@@ -6,7 +6,7 @@
 
 def test_capture_object():
     cdx_cc = cdx_toolkit.CDXFetcher(source='cc')
-    cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
+    #XXX cdx_ia = cdx_toolkit.CDXFetcher(source='ia')
     cdx_only = cdx_toolkit.CDXFetcher(source='https://web.archive.org/cdx/search/cdx', loglevel='DEBUG')
 
     url = 'example.com'
@@ -16,10 +16,12 @@ def test_capture_object():
     for obj in cdx_only.iter(url, **kwargs):
         got_one = True
         with pytest.raises(ValueError):
+            # we don't know how to fetch the content in this situation
             _ = obj.content
     assert got_one, 'found a capture cdx_only'
 
-    for cdx in (cdx_cc, cdx_ia):
+    #XXX for cdx in (cdx_cc, cdx_ia):
+    for cdx in (cdx_cc,):
         got_one = False
         for obj in cdx.iter(url, **kwargs):
             got_one = True