
Commit

feat: cdxt --crawl, plus partial docs
Greg Lindahl committed Sep 8, 2024
1 parent cbd530b commit a7b7b40
Showing 6 changed files with 190 additions and 81 deletions.
README.md: 67 changes (48 additions, 19 deletions)
@@ -3,61 +3,90 @@
[![build](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml) [![coverage](https://codecov.io/gh/cocrawler/cdx_toolkit/graph/badge.svg?token=M1YJB998LE)](https://codecov.io/gh/cocrawler/cdx_toolkit) [![Apache License 2.0](https://img.shields.io/github/license/cocrawler/cdx_toolkit.svg)](LICENSE)

cdx_toolkit is a set of tools for working with CDX indices of web
crawls and archives, including those at CommonCrawl and the Internet
Archive's Wayback Machine.
crawls and archives, including those at the Common Crawl Foundation
(CCF) and those at the Internet Archive's Wayback Machine.

CommonCrawl uses Ilya Kreymer's pywb to serve the CDX API, which is
somewhat different from the Internet Archive's CDX API server. cdx_toolkit
hides these differences as best it can. cdx_toolkit also knits
together the monthly Common Crawl CDX indices into a single, virtual
index.
Common Crawl uses Ilya Kreymer's pywb to serve the CDX API, which is
somewhat different from the Internet Archive's CDX API server.
cdx_toolkit hides these differences as best it can. cdx_toolkit also
knits together the monthly Common Crawl CDX indices into a single,
virtual index.

Finally, cdx_toolkit allows extracting archived pages from CC and IA
into WARC files. If you're looking to create subsets of CC or IA data
and then process them into WET or WAT files, this is a feature you'll
find useful.
into WARC files. If you're looking to create subsets of CC or IA data
and then further process them, this is a feature you'll find useful.

## Installing

cdx_toolkit requires Python 3.

```
$ pip install cdx_toolkit
```

or clone this repo and use `python ./setup.py install`.
or clone this repo and use `pip install .`

## Command-line tools

```
$ cdxt --cc size 'commoncrawl.org/*'
$ cdxt --cc --limit 10 iter 'commoncrawl.org/*'
$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' # returns the most recent year
$ cdxt --crawl 3 --limit 10 iter 'commoncrawl.org/*' # returns the most recent 3 crawls
$ cdxt --cc --limit 10 --filter '=status:200' iter 'commoncrawl.org/*'
$ cdxt --ia --limit 10 iter 'commoncrawl.org/*'
$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' # will show the beginning of IA's crawl
$ cdxt --ia --limit 10 warc 'commoncrawl.org/*'
```

cdxt takes a large number of command line switches, controlling
the time period and all other CDX query options. cdxt can generate
WARC, jsonl, and csv outputs.

** Note that by default, cdxt --cc will iterate over the previous
year of captures. **
If you don't specify much about the crawls or dates or number of
records you're interested in, some default limits will kick in to
prevent overly large queries. These default limits include a maximum
of 1000 records (`--limit 1000`) and a limit of 1 year of CC indexes.
To exceed these limits, use `--limit` and `--crawl` or `--from` and
`--to`.
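
The same overrides are available from Python; a minimal sketch, assuming the
`crawl=` keyword added to `CDXFetcher` in this commit accepts the same values
as the `--crawl` flag (the values below are illustrative):

```
import cdx_toolkit

# Select the 3 most recent crawls instead of the default 1-year window,
# and raise the record cap explicitly (illustrative values).
cdx = cdx_toolkit.CDXFetcher(source='cc', crawl=['3'])

for obj in cdx.iter('commoncrawl.org/*', limit=5000):
    print(obj['timestamp'], obj['status'], obj['url'])
```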

## Selecting particular CCF crawls

Common Crawl's data is divided into "crawls", which were yearly at the
start, and are currently done monthly. There are over 100 of them.

XXX

See
Unlike some web archives, CCF doesn't have a single CDX index that
covers all of these crawls. CCF does have a hive-sharded Parquet index
(called the columnar index) that covers all of our crawls. You
can find more information about this index at
[the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format).

The Internet Archive's CDX index is organized as a single crawl that runs
from the very beginning until now. That's why there is no `--crawl` for
`--ia`. Note that CDX queries to `--ia` default to a one-year range and a
limit of 1000 entries if you do not specify `--from`, `--to`, and `--limit`.

## Selecting by time

XXX
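
A hedged sketch of time selection from Python, using the `from_ts`, `to`, and
`closest` parameters (full or truncated CDX timestamps; the values here are
illustrative):

```
import cdx_toolkit

cdx = cdx_toolkit.CDXFetcher(source='cc')
url = 'commoncrawl.org/*'

# Explicit window: only CC indexes overlapping 2023 are consulted.
for obj in cdx.iter(url, from_ts='20230101000000', to='20231231235959', limit=10):
    print(obj['timestamp'], obj['url'])

# closest: from/to default to roughly 3 months on either side of the timestamp.
for obj in cdx.iter(url, closest='20230615000000', limit=10):
    print(obj['timestamp'], obj['url'])
```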

## The full syntax for command-line tools

```
$ cdxt --help
$ cdxt iter --help
$ cdxt warc --help
$ cdxt size --help
```

for full details. Note that argument order matters: each switch is valid
either only before or only after the {iter,warc,size} command.

Add -v (or -vv) to see what's going on under the hood.

## Programming example
## Python programming example

Everything that you can do on the command line, and much more, can
be done by writing a Python program.

```
import cdx_toolkit
```
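
A fuller sketch of that pattern, using the `CDXFetcher` interface shown in
this commit (the specific values are illustrative):

```
import cdx_toolkit

cdx = cdx_toolkit.CDXFetcher(source='cc')
url = 'commoncrawl.org/*'

print(url, 'size estimate', cdx.get_size_estimate(url))

for obj in cdx.iter(url, limit=10):
    # each capture object exposes the CDX fields by name
    print(obj['timestamp'], obj['status'], obj['url'])
```
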
cdx_toolkit/__init__.py: 15 changes (9 additions, 6 deletions)
@@ -197,12 +197,14 @@ def __next__(self):
LOGGER.debug('getting more in __next__')
self.get_more()
if len(self.captures) <= 0:
# XXX print out a warning if this hits the default limit of 1000
raise StopIteration


class CDXFetcher:
def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None):
def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None):
self.source = source
self.crawl = crawl
self.cc_sort = cc_sort
self.source = source
if wb is not None and warc_download_prefix is not None:
@@ -211,12 +213,11 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No
self.warc_download_prefix = warc_download_prefix

if source == 'cc':
self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/'
self.raw_index_list = get_cc_endpoints(self.cc_mirror)
if wb is not None:
raise ValueError('cannot specify wb= for source=cc')
self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/'
self.raw_index_list = get_cc_endpoints(self.cc_mirror)
self.warc_download_prefix = warc_download_prefix or 'https://data.commoncrawl.org'
#https://commoncrawl.s3.amazonaws.com
elif source == 'ia':
self.index_list = ('https://web.archive.org/cdx/search/cdx',)
if self.warc_download_prefix is None and self.wb is None:
@@ -230,8 +231,10 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No
LOGGER.setLevel(level=loglevel)

def customize_index_list(self, params):
if self.source == 'cc' and ('from' in params or 'from_ts' in params or 'to' in params or 'closest' in params):
if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params):
LOGGER.info('making a custom cc index list')
if self.crawl and 'crawl' not in params:
params['crawl'] = self.crawl
return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params)
else:
return self.index_list
@@ -278,7 +281,7 @@ def iter(self, url, **kwargs):
params['filter'] = munge_filter(params['filter'], self.source)

if self.source == 'cc':
apply_cc_defaults(params)
apply_cc_defaults(params, crawl_present=bool(self.crawl))

index_list = self.customize_index_list(params)
return CDXFetcherIter(self, params=params, index_list=index_list)
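
Taken together with the constructor change above, a rough trace of how
`crawl=` bypasses the date defaults; a sketch only, assuming the module paths
and call order shown in this diff:

```
from cdx_toolkit import CDXFetcher
from cdx_toolkit.commoncrawl import apply_cc_defaults

fetcher = CDXFetcher(source='cc', crawl=['CC-MAIN-2024-33'])

params = {}
apply_cc_defaults(params, crawl_present=bool(fetcher.crawl))
# crawl_present=True, so no from_ts/to defaults are injected here

index_list = fetcher.customize_index_list(params)
# customize_index_list() copies fetcher.crawl into params['crawl'] and calls
# filter_cc_endpoints(), which narrows the endpoint list to the named crawl(s);
# iter() performs both of these steps internally.
```
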
cdx_toolkit/cli.py: 6 changes (3 additions, 3 deletions)
@@ -18,14 +18,14 @@ def main(args=None):
parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)')

parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs')
parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. Implies --cc')
parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback')
parser.add_argument('--source', action='store', help='direct the query to this CDX server')
parser.add_argument('--wb', action='store', help='direct replays for content to this wayback')
parser.add_argument('--limit', type=int, action='store')
parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror')
parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending')
parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc')
parser.add_argument('--from', action='store') # XXX default for cc
parser.add_argument('--from', action='store')
parser.add_argument('--to', action='store')
parser.add_argument('--filter', action='append', help='see CDX API documentation for usage')
parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration. default limit=1000')
@@ -103,7 +103,7 @@ def setup(cmd):
if cmd.cc_mirror:
kwargs['cc_mirror'] = cmd.cc_mirror
if cmd.crawl:
kwargs['crawl'] = normalize_crawl(cmd.crawl)
kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list
if getattr(cmd, 'warc_download_prefix', None) is not None:
kwargs['warc_download_prefix'] = cmd.warc_download_prefix

cdx_toolkit/commoncrawl.py: 131 changes (87 additions, 44 deletions)
@@ -10,7 +10,7 @@
import logging

from .myrequests import myrequests_get
from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special
from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special

LOGGER = logging.getLogger(__name__)

@@ -22,7 +22,7 @@ def normalize_crawl(crawl):
crawls.extend(c.split(','))
else:
crawls.append(c)
if len(crawls) > 1 and any(x.isdigit() for x in crawls):
if len(crawls) > 1 and (any(x.isdigit() for x in crawls)):
raise ValueError('If you specify an integer, only one crawl is allowed')
return crawls
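
A few illustrative calls for `normalize_crawl`, based on the hunk above
(assuming it is imported from `cdx_toolkit.commoncrawl`):

```
from cdx_toolkit.commoncrawl import normalize_crawl

normalize_crawl(['CC-MAIN-2024-33'])                  # ['CC-MAIN-2024-33']
normalize_crawl(['CC-MAIN-2024-30,CC-MAIN-2024-33'])  # comma form splits into two names
normalize_crawl(['3'])                                # ['3'], meaning the 3 most recent crawls
normalize_crawl(['3', 'CC-MAIN-2024-33'])             # raises ValueError: an integer must be alone
```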

@@ -79,44 +79,79 @@ def get_cc_endpoints(cc_mirror):
raise ValueError('Surprisingly few endpoints for common crawl index') # pragma: no cover
LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints))

# endpoints arrive sorted oldest to newest, but let's force that anyway
# endpoints arrive descending, make them ascending
endpoints = sorted(endpoints)

return endpoints


def apply_cc_defaults(params, now=None):
if 'crawl' in params:
return
def apply_cc_defaults(params, crawl_present=False, now=None):
# closest has needs
# if crawl, do nothing (expect the user to have picked the correct crawls)
# XXX ? check sort order, which happens later?
# if no from or to, set them -/+ 3 months from the closest timestamp
# crawl? nothing
# no crawl? 1 year if not specified

three_months = 3 * 30 * 86400
year = 365*86400
if params.get('from_ts') is None:
if params.get('closest') is not None:
closest_t = timestamp_to_time(params['closest'])
if params.get('closest') is not None:
closest_t = timestamp_to_time(params['closest'])
three_months = 3 * 30 * 86400
if params.get('from_ts') is None:
params['from_ts'] = time_to_timestamp(closest_t - three_months)
LOGGER.info('no from but closest, setting from=%s', params['from_ts'])
if params.get('to') is None:
params['to'] = time_to_timestamp(closest_t + three_months)
LOGGER.info('no to but closest, setting to=%s', params['to'])
# XXX set sort order to funky? which does not exist yet
elif not crawl_present:
# can't check params for 'crawl' because crawl is not ever set in params
year = 365*86400
if params.get('from_ts') is not None:
if params.get('to') is None:
params['to'] = time_to_timestamp(closest_t + three_months)
LOGGER.info('no to but closest, setting to=%s', params['to'])
#from_ts = pad_timestamp(params['from_ts'])
#params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year)
#LOGGER.info('no to, setting to=%s', params['to'])
LOGGER.info('from but no to, not doing anything')
elif params.get('to') is not None:
to = pad_timestamp_up(params['to'])
params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year)
LOGGER.info('no from but to, setting from=%s', params['from_ts'])
if params.get('from_ts') is None:
to = pad_timestamp_up(params['to'])
params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year)
LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts'])
else:
if not now:
# now is passed in by tests. if not set, use actual now.
# XXX could be changed to mock
now = time.time()
params['from_ts'] = time_to_timestamp(now - year)
LOGGER.info('no from, setting from=%s', params['from_ts'])
if params.get('to') is None:
if params.get('closest') is not None:
closest_t = timestamp_to_time(params['closest'])
# 3 months later
params['to'] = time_to_timestamp(closest_t + three_months)
LOGGER.info('no to but closest, setting from=%s', params['to'])
else:
# no to or closest; from was set above, we will not set to
pass
LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts'])
else:
# crawl -- assume the user picked the right things
pass
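
In summary, the defaulting rules above amount to the following; a sketch with
illustrative timestamps (results depend on the current date):

```
from cdx_toolkit.commoncrawl import apply_cc_defaults

p = {'closest': '20240115000000'}
apply_cc_defaults(p)
# -> from_ts/to are set roughly 3 months before/after the closest timestamp

p = {}
apply_cc_defaults(p, crawl_present=True)
# -> left untouched; the chosen crawls control what is searched

p = {}
apply_cc_defaults(p)
# -> from_ts is set to about 1 year before now; to stays unset
```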


def match_cc_crawls(crawls, raw_index_list):
# match crawls requested on the command line to actual crawls
# note that from/to are not considered here
# crawls should already be normalized, so it is expected to be a list of str
if len(crawls) == 1 and crawls[0].isdigit():
num = int(crawls[0])
raw_index_list = raw_index_list[-num:]
else:
selected = set()
used = set()
for asked in crawls:
for available in raw_index_list:
if asked in available:
used.add(asked)
selected.add(available)
if not used:
raise ValueError('No matches for crawls '+','.join(crawls))
missed = set(crawls).difference(used)
if missed:
LOGGER.warning('No matches for these crawl args: '+','.join(missed))
raw_index_list = sorted(selected)
LOGGER.info('matched crawls are: '+','.join(raw_index_list))
return raw_index_list
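
Illustrative calls for `match_cc_crawls`, using endpoint URLs of the form
shown in the comments above:

```
from cdx_toolkit.commoncrawl import match_cc_crawls

endpoints = [
    'https://index.commoncrawl.org/CC-MAIN-2024-26-index',
    'https://index.commoncrawl.org/CC-MAIN-2024-30-index',
    'https://index.commoncrawl.org/CC-MAIN-2024-33-index',
]

match_cc_crawls(['2'], endpoints)                # the 2 most recent: 2024-30 and 2024-33
match_cc_crawls(['CC-MAIN-2024-26'], endpoints)  # substring match: the 2024-26 endpoint only
match_cc_crawls(['CC-MAIN-2023-50'], endpoints)  # no match: raises ValueError
```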


def make_cc_maps(raw_index_list):
@@ -146,6 +181,8 @@ def make_cc_maps(raw_index_list):

def check_cc_from_to(params):
# given caller's time specification, select from and to times; enforce limit on combinations
# closest: both from and to must be present
# otherwise: expect from to exist (due to the cc default 1 year)
if 'closest' in params:
if 'from_ts' not in params or params['from_ts'] is None:
raise ValueError('Cannot happen')
@@ -185,24 +222,27 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t):


def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
# YYY with --crawl, just check that the list is crawls that exist
# YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here
# YYY we do need to reorder according to cc_sort
# what is the type of raw_index_list -- it is from collinfo.json cdx-api
# "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index"

# if no --crawl
cc_map, cc_times = make_cc_maps(raw_index_list)

from_ts_t, to_t = check_cc_from_to(params)

index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t)
crawl_present = False
if 'crawl' in params:
crawl_present = True
crawls = params['crawl']
del params['crawl']
index_list = match_cc_crawls(crawls, raw_index_list)

# write the fully-adjusted from and to into params XXX necessary?
# XXX wut? should we only do this when we've changed or added these ?!
params['from_ts'] = time_to_timestamp(from_ts_t)
if to_t is not None:
params['to'] = time_to_timestamp(to_t)
else:
# date-based selection. if --crawl was specified, raw_index_list has already been narrowed
# YYY this does not yet use collinfo.json from, to
# YYY shouldn't this be skipped if crawl_present?
cc_map, cc_times = make_cc_maps(raw_index_list)
from_ts_t, to_t = check_cc_from_to(params)
index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t)

# write the fully-adjusted from and to into params XXX necessary?
# XXX wut? should we only do this when we've changed or added these ?!
# to_t might have been padded. does from_ts ever get padded?
params['from_ts'] = time_to_timestamp(from_ts_t)
if to_t is not None:
params['to'] = time_to_timestamp(to_t)

# adjust index_list order based on cc_sort order
if 'closest' in params:
@@ -219,7 +259,10 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
raise ValueError('unknown cc_sort arg of '+cc_sort)

if index_list:
LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1])
if crawl_present:
LOGGER.info('using cc crawls '+','.join(index_list))
else:
LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1])
else:
LOGGER.warning('empty cc index range found')
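
To tie the two paths together, a sketch of calling `filter_cc_endpoints`
directly (normally done via `customize_index_list`); the parameter shapes are
assumptions based on the hunks above:

```
from cdx_toolkit.commoncrawl import get_cc_endpoints, filter_cc_endpoints

endpoints = get_cc_endpoints('https://index.commoncrawl.org/')

# --crawl path: the crawl selection wins and date bounds are not consulted
filter_cc_endpoints(endpoints, 'mixed', params={'crawl': ['2']})

# date path: from_ts/to (normally filled in by apply_cc_defaults) drive bisect_cc
filter_cc_endpoints(endpoints, 'mixed',
                    params={'from_ts': '20240101000000', 'to': '20240701000000'})
```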

