From a7b7b402b45fe51f3f1d2ea55c380846365acdb2 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 8 Sep 2024 07:58:26 +0000 Subject: [PATCH] feat: cdxt --crawl, plus partial docs --- README.md | 67 +++++++++++++------ cdx_toolkit/__init__.py | 15 +++-- cdx_toolkit/cli.py | 6 +- cdx_toolkit/commoncrawl.py | 131 ++++++++++++++++++++++++------------- tests/test_cli.py | 19 +++++- tests/unit/test_cc.py | 33 ++++++++-- 6 files changed, 190 insertions(+), 81 deletions(-) diff --git a/README.md b/README.md index 3ce1d26..9032d0f 100644 --- a/README.md +++ b/README.md @@ -3,37 +3,36 @@ [![build](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml/badge.svg)](https://github.com/cocrawler/cdx_toolkit/actions/workflows/ci.yaml) [![coverage](https://codecov.io/gh/cocrawler/cdx_toolkit/graph/badge.svg?token=M1YJB998LE)](https://codecov.io/gh/cocrawler/cdx_toolkit) [![Apache License 2.0](https://img.shields.io/github/license/cocrawler/cdx_toolkit.svg)](LICENSE) cdx_toolkit is a set of tools for working with CDX indices of web -crawls and archives, including those at CommonCrawl and the Internet -Archive's Wayback Machine. +crawls and archives, including those at the Common Crawl Foundation +(CCF) and those at the Internet Archive's Wayback Machine. -CommonCrawl uses Ilya Kreymer's pywb to serve the CDX API, which is -somewhat different from the Internet Archive's CDX API server. cdx_toolkit -hides these differences as best it can. cdx_toolkit also knits -together the monthly Common Crawl CDX indices into a single, virtual -index. +Common Crawl uses Ilya Kreymer's pywb to serve the CDX API, which is +somewhat different from the Internet Archive's CDX API server. +cdx_toolkit hides these differences as best it can. cdx_toolkit also +knits together the monthly Common Crawl CDX indices into a single, +virtual index. Finally, cdx_toolkit allows extracting archived pages from CC and IA -into WARC files. If you're looking to create subsets of CC or IA data -and then process them into WET or WAT files, this is a feature you'll -find useful. +into WARC files. If you're looking to create subsets of CC or IA data +and then further process them, this is a feature you'll find useful. ## Installing -cdx toolkit requires Python 3. - ``` $ pip install cdx_toolkit ``` -or clone this repo and use `python ./setup.py install`. +or clone this repo and use `pip install .` ## Command-line tools ``` $ cdxt --cc size 'commoncrawl.org/*' -$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' +$ cdxt --cc --limit 10 iter 'commoncrawl.org/*' # returns the most recent year +$ cdxt --crawl 3 --limit 10 iter 'commoncrawl.org/*' # returns the most recent 3 crawls $ cdxt --cc --limit 10 --filter '=status:200' iter 'commoncrawl.org/*' -$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' + +$ cdxt --ia --limit 10 iter 'commoncrawl.org/*' # will show the beginning of IA's crawl $ cdxt --ia --limit 10 warc 'commoncrawl.org/*' ``` @@ -41,15 +40,42 @@ cdxt takes a large number of command line switches, controlling the time period and all other CDX query options. cdxt can generate WARC, jsonl, and csv outputs. -** Note that by default, cdxt --cc will iterate over the previous -year of captures. ** +If you don't specify much about the crawls or dates or number of +records you're interested in, some default limits will kick in to +prevent overly-large queries. These default limits include a maximum +of 1000 records (`--limit 1000`) and a limit of 1 year of CC indexes. 
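+
+For example, a query like the following (the URL pattern is just an
+illustration) is capped at 1000 records drawn from roughly the most
+recent year of CC indexes:
+
+```
+$ cdxt --cc iter 'commoncrawl.org/*'
+```
+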
+To exceed these limits, use `--limit` and `--crawl` or `--from` and +`--to`. + +## Selecting particular CCF crawls + +Common Crawl's data is divided into "crawls", which were yearly at the +start, and are currently done monthly. There are over 100 of them. + +XXX -See +Unlike some web archives, CCF doesn't have a single CDX index that +covers all of these crawls. CCF does have a hive-sharded Parquet index +(called the columnar index) that covers all of our indexes. You +can find more information about this index at +[the blog post about it](https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format). + +The Internet Archive cdx index is organized as a single crawl that goes +from the very beginning until now. That's why there is no `--crawl` for +`--ia`. Note that cdx queries to `--ia` will default to one year year +and limit 1000 entries if you do not specify `--from`, `--to`, and `--limit`. + +## Selecting by time + +XXX + +## The full syntax for command-line tools ``` $ cdxt --help $ cdxt iter --help $ cdxt warc --help +$ cdxt size --help ``` for full details. Note that argument order really matters; each switch @@ -57,7 +83,10 @@ is valid only either before or after the {iter,warc,size} command. Add -v (or -vv) to see what's going on under the hood. -## Programming example +## Python programming example + +Everything that you can do on the command line, and much more, can +be done by writing a Python program. ``` import cdx_toolkit diff --git a/cdx_toolkit/__init__.py b/cdx_toolkit/__init__.py index e1ba73c..e413a71 100644 --- a/cdx_toolkit/__init__.py +++ b/cdx_toolkit/__init__.py @@ -197,12 +197,14 @@ def __next__(self): LOGGER.debug('getting more in __next__') self.get_more() if len(self.captures) <= 0: + # XXX print out a warning if this hits the default limit of 1000 raise StopIteration class CDXFetcher: - def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None): + def __init__(self, source='cc', crawl=None, wb=None, warc_download_prefix=None, cc_mirror=None, cc_sort='mixed', loglevel=None): self.source = source + self.crawl = crawl self.cc_sort = cc_sort self.source = source if wb is not None and warc_download_prefix is not None: @@ -211,12 +213,11 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No self.warc_download_prefix = warc_download_prefix if source == 'cc': - self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/' - self.raw_index_list = get_cc_endpoints(self.cc_mirror) if wb is not None: raise ValueError('cannot specify wb= for source=cc') + self.cc_mirror = cc_mirror or 'https://index.commoncrawl.org/' + self.raw_index_list = get_cc_endpoints(self.cc_mirror) self.warc_download_prefix = warc_download_prefix or 'https://data.commoncrawl.org' - #https://commoncrawl.s3.amazonaws.com elif source == 'ia': self.index_list = ('https://web.archive.org/cdx/search/cdx',) if self.warc_download_prefix is None and self.wb is None: @@ -230,8 +231,10 @@ def __init__(self, source='cc', wb=None, warc_download_prefix=None, cc_mirror=No LOGGER.setLevel(level=loglevel) def customize_index_list(self, params): - if self.source == 'cc' and ('from' in params or 'from_ts' in params or 'to' in params or 'closest' in params): + if self.source == 'cc' and (self.crawl or 'crawl' in params or 'from' in params or 'from_ts' in params or 'to' in params or 'closest' in params): LOGGER.info('making a custom cc index list') + if self.crawl and 'crawl' not in params: + params['crawl'] = 
self.crawl return filter_cc_endpoints(self.raw_index_list, self.cc_sort, params=params) else: return self.index_list @@ -278,7 +281,7 @@ def iter(self, url, **kwargs): params['filter'] = munge_filter(params['filter'], self.source) if self.source == 'cc': - apply_cc_defaults(params) + apply_cc_defaults(params, crawl_present=bool(self.crawl)) index_list = self.customize_index_list(params) return CDXFetcherIter(self, params=params, index_list=index_list) diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py index a1aa528..6ffa393 100644 --- a/cdx_toolkit/cli.py +++ b/cdx_toolkit/cli.py @@ -18,14 +18,14 @@ def main(args=None): parser.add_argument('--verbose', '-v', action='count', help='set logging level to INFO (-v) or DEBUG (-vv)') parser.add_argument('--cc', action='store_const', const='cc', help='direct the query to the Common Crawl CDX/WARCs') + parser.add_argument('--crawl', action='store', help='crawl names (comma separated) or an integer for the most recent N crawls. Implies --cc') parser.add_argument('--ia', action='store_const', const='ia', help='direct the query to the Internet Archive CDX/wayback') parser.add_argument('--source', action='store', help='direct the query to this CDX server') parser.add_argument('--wb', action='store', help='direct replays for content to this wayback') parser.add_argument('--limit', type=int, action='store') parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror') parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending') - parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc') - parser.add_argument('--from', action='store') # XXX default for cc + parser.add_argument('--from', action='store') parser.add_argument('--to', action='store') parser.add_argument('--filter', action='append', help='see CDX API documentation for usage') parser.add_argument('--get', action='store_true', help='use a single get instead of a paged iteration. 
default limit=1000') @@ -103,7 +103,7 @@ def setup(cmd): if cmd.cc_mirror: kwargs['cc_mirror'] = cmd.cc_mirror if cmd.crawl: - kwargs['crawl'] = normalize_crawl(cmd.crawl) + kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list if getattr(cmd, 'warc_download_prefix', None) is not None: kwargs['warc_download_prefix'] = cmd.warc_download_prefix diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py index d3f26c3..8ef7b23 100644 --- a/cdx_toolkit/commoncrawl.py +++ b/cdx_toolkit/commoncrawl.py @@ -10,7 +10,7 @@ import logging from .myrequests import myrequests_get -from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special +from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special LOGGER = logging.getLogger(__name__) @@ -22,7 +22,7 @@ def normalize_crawl(crawl): crawls.extend(c.split(',')) else: crawls.append(c) - if len(crawls) > 1 and any(x.isdigit() for x in crawls): + if len(crawls) > 1 and (any(x.isdigit() for x in crawls)): raise ValueError('If you specify an integer, only one crawl is allowed') return crawls @@ -79,44 +79,79 @@ def get_cc_endpoints(cc_mirror): raise ValueError('Surprisingly few endpoints for common crawl index') # pragma: no cover LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints)) - # endpoints arrive sorted oldest to newest, but let's force that anyawy + # endpoints arrive descending, make them ascending endpoints = sorted(endpoints) return endpoints -def apply_cc_defaults(params, now=None): - if 'crawl' in params: - return +def apply_cc_defaults(params, crawl_present=False, now=None): + # closest has needs + # if crawl, do nothing (expect the user to have picked the correct crawls) + # XXX ? check sort order, which happens later? + # if no from or to, set them -/+ 3 months from the closest timestamp + # crawl? nothing + # no crawl? 1 year if not specified - three_months = 3 * 30 * 86400 - year = 365*86400 - if params.get('from_ts') is None: - if params.get('closest') is not None: - closest_t = timestamp_to_time(params['closest']) + if params.get('closest') is not None: + closest_t = timestamp_to_time(params['closest']) + three_months = 3 * 30 * 86400 + if params.get('from_ts') is None: params['from_ts'] = time_to_timestamp(closest_t - three_months) LOGGER.info('no from but closest, setting from=%s', params['from_ts']) + if params.get('to') is None: + params['to'] = time_to_timestamp(closest_t + three_months) + LOGGER.info('no to but closest, setting to=%s', params['to']) + # XXX set sort order to funky? 
which does not exist yet + elif not crawl_present: + # can't check params for 'crawl' because crawl is not ever set in params + year = 365*86400 + if params.get('from_ts') is not None: if params.get('to') is None: - params['to'] = time_to_timestamp(closest_t + three_months) - LOGGER.info('no to but closest, setting to=%s', params['to']) + #from_ts = pad_timestamp(params['from_ts']) + #params['to'] = time_to_timestamp(timestamp_to_time(from_ts) + year) + #LOGGER.info('no to, setting to=%s', params['to']) + LOGGER.info('from but no to, not doing anything') elif params.get('to') is not None: - to = pad_timestamp_up(params['to']) - params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year) - LOGGER.info('no from but to, setting from=%s', params['from_ts']) + if params.get('from_ts') is None: + to = pad_timestamp_up(params['to']) + params['from_ts'] = time_to_timestamp(timestamp_to_time(to) - year) + LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts']) else: if not now: + # now is passed in by tests. if not set, use actual now. + # XXX could be changed to mock now = time.time() params['from_ts'] = time_to_timestamp(now - year) - LOGGER.info('no from, setting from=%s', params['from_ts']) - if params.get('to') is None: - if params.get('closest') is not None: - closest_t = timestamp_to_time(params['closest']) - # 3 months later - params['to'] = time_to_timestamp(closest_t + three_months) - LOGGER.info('no to but closest, setting from=%s', params['to']) - else: - # no to or closest; from was set above, we will not set to - pass + LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts']) + else: + # crawl -- assume the user picked the right things + pass + + +def match_cc_crawls(crawls, raw_index_list): + # match crawls requested on the command line to actual crawls + # note that from/to are not considered here + # crawls should be normalized so it's supposed to be a list of str + if len(crawls) == 1 and crawls[0].isdigit(): + num = int(crawls[0]) + raw_index_list = raw_index_list[-num:] + else: + selected = set() + used = set() + for asked in crawls: + for available in raw_index_list: + if asked in available: + used.add(asked) + selected.add(available) + if not used: + raise ValueError('No matches for crawls '+','.join(crawls)) + missed = set(crawls).difference(used) + if missed: + LOGGER.warning('No matches for these crawl args: '+','.join(missed)) + raw_index_list = sorted(selected) + LOGGER.info('matched crawls are: '+','.join(raw_index_list)) + return raw_index_list def make_cc_maps(raw_index_list): @@ -146,6 +181,8 @@ def make_cc_maps(raw_index_list): def check_cc_from_to(params): # given caller's time specification, select from and to times; enforce limit on combinations + # closest: both from and to must be present + # otherwise: expect from to exist (due to the cc default 1 year) if 'closest' in params: if 'from_ts' not in params or params['from_ts'] is None: raise ValueError('Cannot happen') @@ -185,24 +222,27 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t): def filter_cc_endpoints(raw_index_list, cc_sort, params={}): - # YYY with --crawl, just check that the list is crawls that exist - # YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here - # YYY we do need to reorder according to cc_sort - # what is the type of raw_index_list -- it is from collinfo.json cdx-api - # "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index" - - # if no --crawl - cc_map, cc_times = make_cc_maps(raw_index_list) 
- - from_ts_t, to_t = check_cc_from_to(params) - - index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t) + crawl_present = False + if 'crawl' in params: + crawl_present = True + crawls = params['crawl'] + del params['crawl'] + index_list = match_cc_crawls(crawls, raw_index_list) - # write the fully-adjusted from and to into params XXX necessasry? - # XXX wut? should we only do this when we've changed or added these ?! - params['from_ts'] = time_to_timestamp(from_ts_t) - if to_t is not None: - params['to'] = time_to_timestamp(to_t) + else: + # date-based selection. if --crawl was specified, raw_index_list has already been narrowed + # YYY this does not yet use collinfo.json from, to + # YYY shouldn't this be skipped if crawl_present? + cc_map, cc_times = make_cc_maps(raw_index_list) + from_ts_t, to_t = check_cc_from_to(params) + index_list = bisect_cc(cc_map, cc_times, from_ts_t, to_t) + + # write the fully-adjusted from and to into params XXX necessasry? + # XXX wut? should we only do this when we've changed or added these ?! + # to_t might have been padded. does from_ts ever get padded? + params['from_ts'] = time_to_timestamp(from_ts_t) + if to_t is not None: + params['to'] = time_to_timestamp(to_t) # adjust index_list order based on cc_sort order if 'closest' in params: @@ -219,7 +259,10 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}): raise ValueError('unknown cc_sort arg of '+cc_sort) if index_list: - LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1]) + if crawl_present: + LOGGER.info('using cc crawls '+','.join(index_list)) + else: + LOGGER.info('using cc index range from %s to %s', index_list[0], index_list[-1]) else: LOGGER.warning('empty cc index range found') diff --git a/tests/test_cli.py b/tests/test_cli.py index 2284e56..8a3be51 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -41,6 +41,16 @@ def test_basics(capsys): # this might be commoncrawl.org./ or commoncrawl.org/ assert 'commoncrawl.org' in line + args = '--crawl 2 --limit 10 iter commoncrawl.org/*'.split() + main(args=args) + out, err = capsys.readouterr() + + split = out.splitlines() + assert len(split) == 10 + for line in out.splitlines(): + # this might be commoncrawl.org./ or commoncrawl.org/ + assert 'commoncrawl.org' in line + def multi_helper(t, capsys, caplog): inputs = t[0] @@ -83,8 +93,8 @@ def test_multi_cc1(capsys, caplog): {'count': 10, 'linefgrep': 'commoncrawl.org'}], [{'service': '--cc', 'mods': '--limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'count': 11, 'linefgrep': 'commoncrawl.org'}], -# [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'}, -# {'count': 0}], # should limit to 1 index because it runs slowly! 
+ [{'service': '--crawl 1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/thisurlneverdidexist'}, + {'count': 0}], # runs slowly if we don't limit crawl to 1 [{'service': '--cc', 'mods': '--cc-mirror https://index.commoncrawl.org/ --limit 11', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'count': 11, 'linefgrep': 'commoncrawl.org'}], [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/* --all-fields'}, @@ -156,6 +166,11 @@ def test_multi_misc_not_ia(capsys, caplog): [{'service': '-v -v --source https://example.com/404', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, {'exception': ValueError}], + [{'service': '--crawl 1,1', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'exception': ValueError}], + [{'service': '--crawl 1,CC-MAIN-2024', 'mods': '--limit 10', 'cmd': 'iter', 'rest': 'commoncrawl.org/*'}, + {'exception': ValueError}], + [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': 'commoncrawl.org/*'}, {'count': 1, 'is_int': True}], [{'service': '--cc', 'mods': '--limit 10', 'cmd': 'size', 'rest': '--details commoncrawl.org/*'}, diff --git a/tests/unit/test_cc.py b/tests/unit/test_cc.py index 17aa2e6..65bd29f 100644 --- a/tests/unit/test_cc.py +++ b/tests/unit/test_cc.py @@ -40,33 +40,52 @@ def test_apply_cc_defaults(): now = 1524962339.157388 # 20180429003859 tests = [ + [{'crawl': 'foo'}, {}], [{'closest': '20180101'}, {'from_ts': '20171003000000', 'to': '20180401000000'}], [{'closest': '20180101', 'to': '20181201'}, {'from_ts': '20171003000000'}], [{'to': '20180101'}, {'from_ts': '20170131235959'}], - [{}, {'from_ts': '20170429003859'}], # hits both elses, uses now + [{}, {'from_ts': '20170429003859'}], # uses now [{'from_ts': '20100101', 'closest': '20150301'}, {'to': '20150530000000'}], - [{'from_ts': '20100101'}, {}], # hits the second else only + [{'from_ts': '20100101'}, {}], ] for test_in, test_out in tests: + crawl_present = bool(test_in.pop('crawl', None)) test_out.update(test_in) - cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, now=now) + cdx_toolkit.commoncrawl.apply_cc_defaults(test_in, crawl_present=crawl_present, now=now) assert test_in == test_out my_cc_endpoints = [ + # expected to be ascending + 'https://index.commoncrawl.org/CC-MAIN-2008-2009-index', + 'https://index.commoncrawl.org/CC-MAIN-2009-2010-index', + 'https://index.commoncrawl.org/CC-MAIN-2012-index', 'https://index.commoncrawl.org/CC-MAIN-2013-20-index', 'https://index.commoncrawl.org/CC-MAIN-2017-51-index', 'https://index.commoncrawl.org/CC-MAIN-2018-05-index', 'https://index.commoncrawl.org/CC-MAIN-2018-09-index', 'https://index.commoncrawl.org/CC-MAIN-2018-13-index', - # and the specials - 'https://index.commoncrawl.org/CC-MAIN-2012-index', - 'https://index.commoncrawl.org/CC-MAIN-2009-2010-index', - 'https://index.commoncrawl.org/CC-MAIN-2008-2009-index', ] +def test_match_cc_crawls(): + tests = [ + [['CC-MAIN-2013-20'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']], + [['CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2017-51-index']], + [['CC-MAIN-2018'], ['https://index.commoncrawl.org/CC-MAIN-2018-05-index', + 'https://index.commoncrawl.org/CC-MAIN-2018-09-index', + 'https://index.commoncrawl.org/CC-MAIN-2018-13-index']], + [['CC-MAIN-2013', 'CC-MAIN-2017'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index', + 'https://index.commoncrawl.org/CC-MAIN-2017-51-index']], + [['CC-MAIN-2013-20', 'no match'], ['https://index.commoncrawl.org/CC-MAIN-2013-20-index']], # 
.warning + ] + for t in tests: + assert cdx_toolkit.commoncrawl.match_cc_crawls(t[0], my_cc_endpoints) == t[1] + with pytest.raises(ValueError): + cdx_toolkit.commoncrawl.match_cc_crawls(['no match'], my_cc_endpoints) + + def test_make_cc_maps(): cc_map, cc_times = cdx_toolkit.commoncrawl.make_cc_maps(my_cc_endpoints) t = cc_times[0]