diff --git a/cdx_toolkit/cli.py b/cdx_toolkit/cli.py
index 22a2e1e..a1aa528 100644
--- a/cdx_toolkit/cli.py
+++ b/cdx_toolkit/cli.py
@@ -6,6 +6,7 @@ import os
 
 import cdx_toolkit
+from cdx_toolkit.commoncrawl import normalize_crawl
 
 LOGGER = logging.getLogger(__name__)
 
 
@@ -23,6 +24,7 @@ def main(args=None):
     parser.add_argument('--limit', type=int, action='store')
     parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror')
     parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending')
+    parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc')
     parser.add_argument('--from', action='store')  # XXX default for cc
     parser.add_argument('--to', action='store')
     parser.add_argument('--filter', action='append', help='see CDX API documentation for usage')
@@ -93,13 +95,15 @@ def get_version():
 
 def setup(cmd):
     kwargs = {}
-    kwargs['source'] = cmd.cc or cmd.ia or cmd.source or None
+    kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
     if kwargs['source'] is None:
         raise ValueError('must specify --cc, --ia, or a --source')
     if cmd.wb:
         kwargs['wb'] = cmd.wb
     if cmd.cc_mirror:
         kwargs['cc_mirror'] = cmd.cc_mirror
+    if cmd.crawl:
+        kwargs['crawl'] = normalize_crawl(cmd.crawl)
     if getattr(cmd, 'warc_download_prefix', None) is not None:
         kwargs['warc_download_prefix'] = cmd.warc_download_prefix
 
diff --git a/cdx_toolkit/commoncrawl.py b/cdx_toolkit/commoncrawl.py
index a8608c7..545f97f 100644
--- a/cdx_toolkit/commoncrawl.py
+++ b/cdx_toolkit/commoncrawl.py
@@ -15,6 +15,18 @@
 LOGGER = logging.getLogger(__name__)
 
 
+def normalize_crawl(crawl):
+    crawls = []
+    for c in crawl:
+        if ',' in c:
+            crawls.extend(c.split(','))
+        else:
+            crawls.append(c)
+    if len(crawls) > 1 and any(x.isdigit() for x in crawls):
+        raise ValueError('If you specify an integer, only one crawl is allowed')
+    return crawls
+
+
 def get_cache_names(cc_mirror):
     cache = os.path.expanduser('~/.cache/cdx_toolkit/')
     filename = re.sub(r'[^\w]', '_', cc_mirror.replace('https://', ''))
@@ -75,6 +87,9 @@ def get_cc_endpoints(cc_mirror):
 
 
 def apply_cc_defaults(params, now=None):
+    if 'crawl' in params:
+        return
+
     three_months = 3 * 30 * 86400
     year = 365*86400
     if params.get('from_ts') is None:
@@ -171,6 +186,13 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t):
 
 
 def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
+    # YYY with --crawl, just check that the list is crawls that exist
+    # YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here
+    # YYY we do need to reorder according to cc_sort
+    # what is the type of raw_index_list -- it is from collinfo.json cdx-api
+    # "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index"
+
+    # if no --crawl
     cc_map, cc_times = make_cc_maps(raw_index_list)
 
     from_ts_t, to_t = check_cc_from_to(params)
@@ -186,7 +208,8 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
     # adjust index_list order based on cc_sort order
     if 'closest' in params:
         # XXX funky ordering not implemented, inform the caller
-        # cli already prints a warning for iter + closer, telling user to use get instead
+        # cli already prints a warning for iter + closest, telling user to use get instead
+        # no need to warn if it's a single crawl
         # this routine is called for both get and iter
         pass
     if cc_sort == 'ascending':
diff --git a/tests/unit/test_cc.py b/tests/unit/test_cc.py
index e23f70f..17aa2e6 100644
--- a/tests/unit/test_cc.py
+++ b/tests/unit/test_cc.py
@@ -9,6 +9,22 @@
 logging.basicConfig(level='INFO')
 
 
+def test_normalize_crawl():
+    tests = [
+        [['1'], ['1']],
+        [['a'], ['a']],
+        [['a', 'b'], ['a', 'b']],
+        [['a,b', 'c'], ['a', 'b', 'c']],
+        [['a,b,c,d'], ['a', 'b', 'c', 'd']],
+        [['a', 'b,c'], ['a', 'b', 'c']],
+    ]
+
+    for t in tests:
+        assert cdx_toolkit.commoncrawl.normalize_crawl(t[0]) == t[1]
+    with pytest.raises(ValueError):
+        cdx_toolkit.commoncrawl.normalize_crawl(['1', '2'])
+
+
 def test_apply_cc_defaults():
     # no from
     # closest -- sets from, to
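
A minimal sketch of the normalize_crawl behavior this patch adds, assuming only what the diff itself shows; the crawl names below are made-up examples, not a statement about which crawls exist:

    from cdx_toolkit.commoncrawl import normalize_crawl

    # repeated --crawl args and comma-separated values flatten into one list
    normalize_crawl(['CC-MAIN-2024-18,CC-MAIN-2024-22', 'CC-MAIN-2024-26'])
    # -> ['CC-MAIN-2024-18', 'CC-MAIN-2024-22', 'CC-MAIN-2024-26']

    # a lone integer passes through unchanged; per the --crawl help text,
    # it means "the most recent N crawls"
    normalize_crawl(['3'])  # -> ['3']

    # an integer mixed with anything else is ambiguous and rejected
    normalize_crawl(['3', 'CC-MAIN-2024-18'])  # raises ValueError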