From a7b7b402b45fe51f3f1d2ea55c380846365acdb2 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 07:58:26 +0000 Subject: [PATCH] feat: cdxt --crawl, plus partial docs --- README.md | 67 +++++++++++++------ cdx_toolkit/__init__.py | 15 +++-- cdx_toolkit/cli.py | 6 +- cdx_toolkit/commoncrawl.py | 131 ++++++++++++++++++++++++------------- tests/test_cli.py | 19 +++++- tests/unit/test_cc.py | 33 ++++++++-- 6 files changed, 190 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 3ce1d26..9032d0f 100644 --- a/README.md +++ b/README.md @@ -3,37 +3,36 @@ [![build](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml) [![coverage](https://codecov.io/gh/cocrawler/cdx_toolkit/graph/badge.svg?token=M1YJB998LE)](https://codecov.io/gh/cocrawler/cdx_toolkit) [![Apache License 2.0](https://img.shields.io/github/license/cocrawler/cdx_toolkit.svg)](LICENSE) cdx_toolkit is a set of tools for working with CDX indices of web -crawls and archives, including those at CommonCrawl and the Internet -Archive's Wayback Machine. +crawls and archives, including those at the Common Crawl Foundation +(CCF) and those at the Internet Archive's Wayback Machine. -CommonCrawl uses Ilya Kreymer's pywb to serve the CDX API, which is -somewhat different from the Internet Archive's CDX API server. cdx_toolkit -hides these differences as best it can. cdx_toolkit also knits -together the monthly Common Crawl CDX indices into a single, virtual -index. +Common Crawl uses Ilya Kreymer's pywb to serve the CDX API, which is +somewhat different from the Internet Archive's CDX API server. +cdx_toolkit hides these differences as best it can. cdx_toolkit also +knits together the monthly Common Crawl CDX indices into a single, +virtual index. Finally, cdx_toolkit allows extracting archived pages from CC and IA -into WARC files. If you're looking to create subsets of CC or IA data -and then process them into WET or WAT files, this is a feature you'll -find useful. +into WARC files. If you're looking to create subsets of CC or IA data +and then further process them, this is a feature you'll find useful. ## Installing -cdx toolkit requires Python 3. - ``` $ pip install cdx_toolkit ``` -or clone this repo and use `python ./setup.py install`. +or clone this repo and use `pip install .` ## Command-line tools ``` $ cdxt --cc size 'commoncrawl.org/*' -$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' +$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' # returns the most recent year +$ cdxt --crawl 3 --limit 10 iter 'commoncrawl.org/*' # returns the most recent 3 crawls $ cdxt --cc --limit 10 --filter '=status:200' iter 'commoncrawl.org/*' -$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' + +$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' # will show the beginning of IA's crawl $ cdxt --ia --limit 10 warc 'commoncrawl.org/*' ``` @@ -41,15 +40,42 @@ cdxt takes a large number of command line switches, controlling the time period and all other CDX query options. cdxt can generate WARC, jsonl, and csv outputs. -** Note that by default, cdxt --cc will iterate over the previous -year of captures. ** +If you don't specify much about the crawls or dates or number of +records you're interested in, some default limits will kick in to +prevent overly-large queries. These default limits include a maximum +of 1000 records (`--limit 1000`) and a limit of 1 year of CC indexes. 
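+
+For example, a query like the following (the URL pattern is just an
+illustration) is capped at 1000 records drawn from roughly the most
+recent year of CC indexes:
+
+```
+$ cdxt --cc iter 'commoncrawl.org/*'
+```
+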
+To exceed these limits, use `--limit` and `--crawl` or `--from` and +`--to`. + +## Selecting particular CCF crawls + +Common Crawl's data is divided into "crawls", which were yearly at the +start, and are currently done monthly. There are over 100 of them. + +XXX -See +Unlike some web archives, CCF doesn't have a single CDX index that +covers all of these crawls. CCF does have a hive-sharded Parquet index +(called the columnar index) that covers all of our indexes. You +can find more information about this index at +[the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format). + +The Internet Archive cdx index is organized as a single crawl that goes +from the very beginning until now. That's why there is no `--crawl` for +`--ia`. Note that cdx queries to `--ia` will default to one year year +and limit 1000 entries if you do not specify `--from`, `--to`, and `--limit`. + +## Selecting by time + +XXX + +## The full syntax for command-line tools ``` $ cdxt --help $ cdxt iter --help $ cdxt warc --help +$ cdxt size --help ``` for full details. Note that argument order really matters; each switch @@ -57,7 +83,10 @@ is valid only either before or after the {iter,warc,size} command. Add -v (or -vv) to see what's going on under the hood. -## Programming example +## Python programming example + +Everything that you can do on the command line, and much more, can +be done by writing a Python program. ``` import cdx_toolkit diff --git a/cdx_toolkit/__init__.py b/cdx_toolkit/__init__.py index e1ba73c..e413a71 100644 --- a/cdx_toolkit/__init__.py +++ b/cdx_toolkit/__init__.py @@ -197,12 +197,14 @@ def __next__(self): LOGGER.debug('getting more in __next__') self.get_more() if len(self.captures) <= 0: + # XXX print out a warning if this hits the default limit of 1000 raise StopIteration class CDXFetcher: - def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None): + def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None): self.source = source + self.crawl = crawl self.cc_sort = cc_sort self.source = source if wb is not None and warc_download_prefix is not None: @@ -211,12 +213,11 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No self.warc_download_prefix = warc_download_prefix if source == 'cc': - self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/' - self.raw_index_list = get_cc_endpoints(self.cc_mirror) if wb is not None: raise ValueError('cannot specify wb= for source=cc') + self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/' + self.raw_index_list = get_cc_endpoints(self.cc_mirror) self.warc_download_prefix = warc_download_prefix or 'https://data.commoncrawl.org' - #https://commoncrawl.s3.amazonaws.com elif source == 'ia': self.index_list = ('https://web.archive.org/cdx/search/cdx',) if self.warc_download_prefix is None and self.wb is None: @@ -230,8 +231,10 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No LOGGER.setLevel(level=loglevel) def customize_index_list(self, params): - if self.source == 'cc' and ('from' in params or 'from_ts' in params or 'to' in params or 'closest' in params): + if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params): LOGGER.info('making a custom cc index list') + if self.crawl and 'crawl' not in params: + params['crawl'] = 
self.crawl return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params) else: return self.index_list @@ -278,7 +281,7 @@ def iter(self, url, **kwargs): params['filter'] = munge_filter(params['filter'], self.source) if self.source == 'cc': - apply_cc_defaults(params) + apply_cc_defaults(params, crawl_present=bool(self.crawl)) index_list = self.customize_index_list(params) return CDXFetcherIter(self, params=params, index_list=index_list) diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py index a1aa528..6ffa393 100644 --- a/cdx_toolkit/cli.py +++ b/cdx_toolkit/cli.py @@ -18,14 +18,14 @@ def main(args=None): parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)') parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs') + parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. Implies --cc') parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback') parser.add_argument('--source', action='store', help='direct the query to this CDX server') parser.add_argument('--wb', action='store', help='direct replays for content to this wayback') parser.add_argument('--limit', type=int, action='store') parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror') parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending') - parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc') - parser.add_argument('--from', action='store') # XXX default for cc + parser.add_argument('--from', action='store') parser.add_argument('--to', action='store') parser.add_argument('--filter', action='append', help='see CDX API documentation for usage') parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration. 
default limit=1000') @@ -103,7 +103,7 @@ def setup(cmd): if cmd.cc_mirror: kwargs['cc_mirror'] = cmd.cc_mirror if cmd.crawl: - kwargs['crawl'] = normalize_crawl(cmd.crawl) + kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list if getattr(cmd, 'warc_download_prefix', None) is not None: kwargs['warc_download_prefix'] = cmd.warc_download_prefix diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index d3f26c3..8ef7b23 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -10,7 +10,7 @@ import logging from .myrequests import myrequests_get -from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special +from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special LOGGER = logging.getLogger(__name__) @@ -22,7 +22,7 @@ def normalize_crawl(crawl): crawls.extend(c.split(',')) else: crawls.append(c) - if len(crawls) > 1 and any(x.isdigit() for x in crawls): + if len(crawls) > 1 and (any(x.isdigit() for x in crawls)): raise ValueError('If you specify an integer, only one crawl is allowed') return crawls @@ -79,44 +79,79 @@ def get_cc_endpoints(cc_mirror): raise ValueError('Surprisingly few endpoints for common crawl index') # pragma: no cover LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints)) - # endpoints arrive sorted oldest to newest, but let's force that anyawy + # endpoints arrive descending, make them ascending endpoints = sorted(endpoints) return endpoints -def apply_cc_defaults(params, now=None): - if 'crawl' in params: - return +def apply_cc_defaults(params, crawl_present=False, now=None): + # closest has needs + # if crawl, do nothing (expect the user to have picked the correct crawls) + # XXX ? check sort order, which happens later? + # if no from or to, set them -/+ 3 months from the closest timestamp + # crawl? nothing + # no crawl? 1 year if not specified - three_months = 3 * 30 * 86400 - year = 365*86400 - if params.get('from_ts') is None: - if params.get('closest') is not None: - closest_t = timestamp_to_time(params['closest']) + if params.get('closest') is not None: + closest_t = timestamp_to_time(params['closest']) + three_months = 3 * 30 * 86400 + if params.get('from_ts') is None: params['from_ts'] = time_to_timestamp(closest_t - three_months) LOGGER.info('no from but closest, setting from=%s', params['from_ts']) + if params.get('to') is None: + params['to'] = time_to_timestamp(closest_t + three_months) + LOGGER.info('no to but closest, setting to=%s', params['to']) + # XXX set sort order to funky? 
which does not exist yet + elif not crawl_present: + # can't check params for 'crawl' because crawl is not ever set in params + year = 365*86400 + if params.get('from_ts') is not None: if params.get('to') is None: - params['to'] = time_to_timestamp(closest_t + three_months) - LOGGER.info('no to but closest, setting to=%s', params['to']) + #from_ts = pad_timestamp(params['from_ts']) + #params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year) + #LOGGER.info('no to, setting to=%s', params['to']) + LOGGER.info('from but no to, not doing anything') elif params.get('to') is not None: - to = pad_timestamp_up(params['to']) - params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year) - LOGGER.info('no from but to, setting from=%s', params['from_ts']) + if params.get('from_ts') is None: + to = pad_timestamp_up(params['to']) + params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year) + LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts']) else: if not now: + # now is passed in by tests. if not set, use actual now. + # XXX could be changed to mock now = time.time() params['from_ts'] = time_to_timestamp(now - year) - LOGGER.info('no from, setting from=%s', params['from_ts']) - if params.get('to') is None: - if params.get('closest') is not None: - closest_t = timestamp_to_time(params['closest']) - # 3 months later - params['to'] = time_to_timestamp(closest_t + three_months) - LOGGER.info('no to but closest, setting from=%s', params['to']) - else: - # no to or closest; from was set above, we will not set to - pass + LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts']) + else: + # crawl -- assume the user picked the right things + pass + + +def match_cc_crawls(crawls, raw_index_list): + # match crawls requested on the command line to actual crawls + # note that from/to are not considered here + # crawls should be normalized so it's supposed to be a list of str + if len(crawls) == 1 and crawls[0].isdigit(): + num = int(crawls[0]) + raw_index_list = raw_index_list[-num:] + else: + selected = set() + used = set() + for asked in crawls: + for available in raw_index_list: + if asked in available: + used.add(asked) + selected.add(available) + if not used: + raise ValueError('No matches for crawls '+','.join(crawls)) + missed = set(crawls).difference(used) + if missed: + LOGGER.warning('No matches for these crawl args: '+','.join(missed)) + raw_index_list = sorted(selected) + LOGGER.info('matched crawls are: '+','.join(raw_index_list)) + return raw_index_list def make_cc_maps(raw_index_list): @@ -146,6 +181,8 @@ def make_cc_maps(raw_index_list): def check_cc_from_to(params): # given caller's time specification, select from and to times; enforce limit on combinations + # closest: both from and to must be present + # otherwise: expect from to exist (due to the cc default 1 year) if 'closest' in params: if 'from_ts' not in params or params['from_ts'] is None: raise ValueError('Cannot happen') @@ -185,24 +222,27 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t): def filter_cc_endpoints(raw_index_list, cc_sort, params={}): - # YYY with --crawl, just check that the list is crawls that exist - # YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here - # YYY we do need to reorder according to cc_sort - # what is the type of raw_index_list -- it is from collinfo.json cdx-api - # "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index" - - # if no --crawl - cc_map, cc_times = make_cc_maps(raw_index_list) 
- - from_ts_t, to_t = check_cc_from_to(params) - - index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t) + crawl_present = False + if 'crawl' in params: + crawl_present = True + crawls = params['crawl'] + del params['crawl'] + index_list = match_cc_crawls(crawls, raw_index_list) - # write the fully-adjusted from and to into params XXX necessasry? - # XXX wut? should we only do this when we've changed or added these ?! - params['from_ts'] = time_to_timestamp(from_ts_t) - if to_t is not None: - params['to'] = time_to_timestamp(to_t) + else: + # date-based selection. if --crawl was specified, raw_index_list has already been narrowed + # YYY this does not yet use collinfo.json from, to + # YYY shouldn't this be skipped if crawl_present? + cc_map, cc_times = make_cc_maps(raw_index_list) + from_ts_t, to_t = check_cc_from_to(params) + index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t) + + # write the fully-adjusted from and to into params XXX necessasry? + # XXX wut? should we only do this when we've changed or added these ?! + # to_t might have been padded. does from_ts ever get padded? + params['from_ts'] = time_to_timestamp(from_ts_t) + if to_t is not None: + params['to'] = time_to_timestamp(to_t) # adjust index_list order based on cc_sort order if 'closest' in params: @@ -219,7 +259,10 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}): raise ValueError('unknown cc_sort arg of '+cc_sort) if index_list: - LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1]) + if crawl_present: + LOGGER.info('using cc crawls '+','.join(index_list)) + else: + LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1]) else: LOGGER.warning('empty cc index range found') diff --git a/tests/test_cli.py b/tests/test_cli.py index 2284e56..8a3be51 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -41,6 +41,16 @@ def test_basics(capsys): # this might be commoncrawl.org./ or commoncrawl.org/ assert 'commoncrawl.org' in line + args = '--crawl 2 --limit 10 iter commoncrawl.org/*'.split() + main(args=args) + out, err = capsys.readouterr() + + split = out.splitlines() + assert len(split) == 10 + for line in out.splitlines(): + # this might be commoncrawl.org./ or commoncrawl.org/ + assert 'commoncrawl.org' in line + def multi_helper(t, capsys, caplog): inputs = t[0] @@ -83,8 +93,8 @@ def test_multi_cc1(capsys, caplog): {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '--cc', 'mods': '--limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'count': 11, 'linefgrep': 'commoncrawl.org'}], -# [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'}, -# {'count': 0}], # should limit to 1 index because it runs slowly! 
+ [{'service': '--crawl 1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'}, + {'count': 0}], # runs slowly if we don't limit crawl to 1 [{'service': '--cc', 'mods': '--cc-mirror https://index.commoncrawl.org/ --limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'count': 11, 'linefgrep': 'commoncrawl.org'}], [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/* --all-fields'}, @@ -156,6 +166,11 @@ def test_multi_misc_not_ia(capsys, caplog): [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'exception': ValueError}], + [{'service': '--crawl 1,1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'exception': ValueError}], + [{'service': '--crawl 1,CC-MAIN-2024', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'exception': ValueError}], + [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'}, {'count': 1, 'is_int': True}], [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'}, diff --git a/tests/unit/test_cc.py b/tests/unit/test_cc.py index 17aa2e6..65bd29f 100644 --- a/tests/unit/test_cc.py +++ b/tests/unit/test_cc.py @@ -40,33 +40,52 @@ def test_apply_cc_defaults(): now = 1524962339.157388 # 20180429003859 tests = [ + [{'crawl': 'foo'}, {}], [{'closest': '20180101'}, {'from_ts': '20171003000000', 'to': '20180401000000'}], [{'closest': '20180101', 'to': '20181201'}, {'from_ts': '20171003000000'}], [{'to': '20180101'}, {'from_ts': '20170131235959'}], - [{}, {'from_ts': '20170429003859'}], # hits both elses, uses now + [{}, {'from_ts': '20170429003859'}], # uses now [{'from_ts': '20100101', 'closest': '20150301'}, {'to': '20150530000000'}], - [{'from_ts': '20100101'}, {}], # hits the second else only + [{'from_ts': '20100101'}, {}], ] for test_in, test_out in tests: + crawl_present = bool(test_in.pop('crawl', None)) test_out.update(test_in) - cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, now=now) + cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, crawl_present=crawl_present, now=now) assert test_in == test_out my_cc_endpoints = [ + # expected to be ascending + 'https://index.commoncrawl.org/CC-MAIN-2008-2009-index', + 'https://index.commoncrawl.org/CC-MAIN-2009-2010-index', + 'https://index.commoncrawl.org/CC-MAIN-2012-index', 'https://index.commoncrawl.org/CC-MAIN-2013-20-index', 'https://index.commoncrawl.org/CC-MAIN-2017-51-index', 'https://index.commoncrawl.org/CC-MAIN-2018-05-index', 'https://index.commoncrawl.org/CC-MAIN-2018-09-index', 'https://index.commoncrawl.org/CC-MAIN-2018-13-index', - # and the specials - 'https://index.commoncrawl.org/CC-MAIN-2012-index', - 'https://index.commoncrawl.org/CC-MAIN-2009-2010-index', - 'https://index.commoncrawl.org/CC-MAIN-2008-2009-index', ] +def test_match_cc_crawls(): + tests = [ + [['CC-MAIN-2013-20'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']], + [['CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2017-51-index']], + [['CC-MAIN-2018'], ['https://index.commoncrawl.org/CC-MAIN-2018-05-index', + 'https://index.commoncrawl.org/CC-MAIN-2018-09-index', + 'https://index.commoncrawl.org/CC-MAIN-2018-13-index']], + [['CC-MAIN-2013', 'CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index', + 'https://index.commoncrawl.org/CC-MAIN-2017-51-index']], + [['CC-MAIN-2013-20', 'no match'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']], # 
.warning + ] + for t in tests: + assert cdx_toolkit.commoncrawl.match_cc_crawls(t[0], my_cc_endpoints) == t[1] + with pytest.raises(ValueError): + cdx_toolkit.commoncrawl.match_cc_crawls(['no match'], my_cc_endpoints) + + def test_make_cc_maps(): cc_map, cc_times = cdx_toolkit.commoncrawl.make_cc_maps(my_cc_endpoints) t = cc_times[0]