Skip to content

Commit

Permalink
work in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
Greg Lindahl committed Sep 4, 2024
1 parent 666fcaf commit f026065
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 2 deletions.
6 changes: 5 additions & 1 deletion cdx_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os

import cdx_toolkit
from cdx_toolkit.commoncrawl import normalize_crawl

LOGGER = logging.getLogger(__name__)

Expand All @@ -23,6 +24,7 @@ def main(args=None):
parser.add_argument('--limit', type=int, action='store')
parser.add_argument('--cc-mirror', action='store', help='use this Common Crawl index mirror')
parser.add_argument('--cc-sort', action='store', help='default mixed, alternatively: ascending')
parser.add_argument('--crawl', nargs='*', action='store', help='crawl names or an integer for the most recent N crawls. Implies --cc')
parser.add_argument('--from', action='store') # XXX default for cc
parser.add_argument('--to', action='store')
parser.add_argument('--filter', action='append', help='see CDX API documentation for usage')
Expand Down Expand Up @@ -93,13 +95,15 @@ def get_version():

def setup(cmd):
kwargs = {}
kwargs['source'] = cmd.cc or cmd.ia or cmd.source or None
kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
if kwargs['source'] is None:
raise ValueError('must specify --cc, --ia, or a --source')
if cmd.wb:
kwargs['wb'] = cmd.wb
if cmd.cc_mirror:
kwargs['cc_mirror'] = cmd.cc_mirror
if cmd.crawl:
kwargs['crawl'] = normalize_crawl(cmd.crawl)
if getattr(cmd, 'warc_download_prefix', None) is not None:
kwargs['warc_download_prefix'] = cmd.warc_download_prefix

Expand Down
25 changes: 24 additions & 1 deletion cdx_toolkit/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@
LOGGER = logging.getLogger(__name__)


def normalize_crawl(crawl):
    """Flatten a --crawl argument list into a list of crawl names.

    Each element of *crawl* may itself be a comma-separated list
    (e.g. ['CC-MAIN-2024-18,CC-MAIN-2024-22']); commas are expanded so the
    result is one flat list of names. Empty entries caused by stray commas
    ('a,' or 'a,,b') are dropped, since an empty crawl name can never match.

    A purely numeric entry means "the most recent N crawls", so it only
    makes sense on its own.

    Raises:
        ValueError: if a numeric entry is combined with any other entry.
    """
    # str.split(',') returns [c] when there is no comma, so no special case
    # is needed for plain names; filter out '' from trailing/doubled commas.
    crawls = [name for c in crawl for name in c.split(',') if name]
    if len(crawls) > 1 and any(x.isdigit() for x in crawls):
        raise ValueError('If you specify an integer, only one crawl is allowed')
    return crawls


def get_cache_names(cc_mirror):
cache = os.path.expanduser('~/.cache/cdx_toolkit/')
filename = re.sub(r'[^\w]', '_', cc_mirror.replace('https://', ''))
Expand Down Expand Up @@ -75,6 +87,9 @@ def get_cc_endpoints(cc_mirror):


def apply_cc_defaults(params, now=None):
if 'crawl' in params:
return

three_months = 3 * 30 * 86400
year = 365*86400
if params.get('from_ts') is None:
Expand Down Expand Up @@ -171,6 +186,13 @@ def bisect_cc(cc_map, cc_times, from_ts_t, to_t):


def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
# YYY with --crawl, just check that the list is crawls that exist
# YYY if we want to expand CC-MAIN-2024 to be all 2024 crawls, that can be done here
# YYY we do need to reorder according to cc_sort
# what is the type of raw_index_list -- it is from collinfo.json cdx-api
# "cdx-api": "https://index.commoncrawl.org/CC-MAIN-2024-18-index"

# if no --crawl
cc_map, cc_times = make_cc_maps(raw_index_list)

from_ts_t, to_t = check_cc_from_to(params)
Expand All @@ -186,7 +208,8 @@ def filter_cc_endpoints(raw_index_list, cc_sort, params={}):
# adjust index_list order based on cc_sort order
if 'closest' in params:
# XXX funky ordering not implemented, inform the caller
# cli already prints a warning for iter + closer, telling user to use get instead
# cli already prints a warning for iter + closest, telling user to use get instead
# no need to warn if it's a single crawl
# this routine is called for both get and iter
pass
if cc_sort == 'ascending':
Expand Down
16 changes: 16 additions & 0 deletions tests/unit/test_cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,22 @@
logging.basicConfig(level='INFO')


def test_normalize_crawl():
    # (argv-style input, expected flattened crawl list) pairs:
    # comma-separated values and repeated arguments must flatten identically
    cases = [
        (['1'], ['1']),
        (['a'], ['a']),
        (['a', 'b'], ['a', 'b']),
        (['a,b', 'c'], ['a', 'b', 'c']),
        (['a,b,c,d'], ['a', 'b', 'c', 'd']),
        (['a', 'b,c'], ['a', 'b', 'c']),
    ]

    for args, expected in cases:
        assert cdx_toolkit.commoncrawl.normalize_crawl(args) == expected

    # a numeric "most recent N crawls" entry cannot be mixed with other crawls
    with pytest.raises(ValueError):
        cdx_toolkit.commoncrawl.normalize_crawl(['1', '2'])


def test_apply_cc_defaults():
# no from
# closest -- sets from, to
Expand Down

0 comments on commit f026065

Please sign in to comment.