From 719dc684cf85cace7ab21d8744b26c1d8aaff545 Mon Sep 17 00:00:00 2001 From: Greg Lindahl Date: Sun, 1 Sep 2024 20:50:38 +0000 Subject: [PATCH] Change retries to a ratelimit-based system (#35) This is needed because both IA and CCF are now rate-limiting their CDX API endpoints. --- cdx_toolkit/myrequests.py | 74 +++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/cdx_toolkit/myrequests.py b/cdx_toolkit/myrequests.py index 9127fbc..94fa726 100644 --- a/cdx_toolkit/myrequests.py +++ b/cdx_toolkit/myrequests.py @@ -7,7 +7,6 @@ LOGGER = logging.getLogger(__name__) - previously_seen_hostnames = { 'commoncrawl.s3.amazonaws.com', 'data.commoncrawl.org', @@ -15,14 +14,60 @@ } -def dns_fatal(url): +def dns_fatal(hostname): '''We have a dns error, should we fail immediately or not?''' - hostname = urlparse(url).hostname if hostname not in previously_seen_hostnames: return True +retry_info = { + 'default': { + 'next_fetch': 0, + 'minimum_interval': 3.0, + }, + 'index.commoncrawl.org': { + 'next_fetch': 0, + 'minimum_interval': 3.0, + }, + 'data.commoncrawl.org': { + 'next_fetch': 0, + 'minimum_interval': 3.0, + }, + 'web.archive.org': { + 'next_fetch': 0, + 'minimum_interval': 6.0, + }, +} + + +def get_retries(hostname): + if hostname not in retry_info: + retry_info[hostname] = retry_info['default'].copy() + LOGGER.debug('initializing retry info for new host '+hostname) + entry = retry_info[hostname] + if not entry['next_fetch']: + entry['next_fetch'] = time.time() + return entry['next_fetch'], entry['minimum_interval'] + + +def update_next_fetch(hostname, next_fetch): + retry_info[hostname]['next_fetch'] = next_fetch + + def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False): + t = time.time() + + hostname = urlparse(url).hostname + next_fetch, minimum_interval = get_retries(hostname) + + if t < next_fetch: + dt = next_fetch - t + if dt > 3.1: + LOGGER.debug('sleeping for {:.3f}s before next fetch'.format(dt)) + time.sleep(dt) + # next_fetch is also updated at the bottom + update_next_fetch(hostname, next_fetch + minimum_interval) + if params: if 'from_ts' in params: params['from'] = params['from_ts'] @@ -38,8 +83,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False): headers['User-Agent'] = 'pypi_cdx_toolkit/'+__version__ retry = True - retry_sec = 1 - retry_max_sec = 30 + retry_sec = 2 * minimum_interval + retry_max_sec = 60 retries = 0 connect_errors = 0 while retry: @@ -62,14 +107,10 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False): # I have never seen IA or CC send 429 or 509, but just in case... # 429 is also a slow down, IA started sending them mid-2023 retries += 1 - if retries > 5: - LOGGER.warning('retrying after 1s for %d', resp.status_code) - if resp.text: - LOGGER.warning('response body is %s', resp.text) - else: - LOGGER.info('retrying after 1s for %d', resp.status_code) - if resp.text: - LOGGER.info('response body is %s', resp.text) + level = 30 if retries > 5 else 20 # 30=warning 20=info + LOGGER.log(level, 'retrying after %.2fs for %d', retry_sec, resp.status_code) + if resp.text: + LOGGER.log(level, 'response body is %s', resp.text) time.sleep(retry_sec) retry_sec = min(retry_sec*2, retry_max_sec) continue @@ -93,8 +134,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False): raise ValueError(string) if connect_errors > 10: LOGGER.warning(string) - LOGGER.info('retrying after 1s for '+str(e)) - time.sleep(retry_sec) + LOGGER.info('retrying after {:.2f}s for '.format(retry_max_sec)+str(e)) + time.sleep(retry_max_sec) # notice the extra-long sleep retry_sec = min(retry_sec*2, retry_max_sec) except requests.exceptions.RequestException as e: # pragma: no cover LOGGER.warning('something unexpected happened, giving up after %s', str(e)) @@ -104,4 +145,7 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False): if hostname not in previously_seen_hostnames: previously_seen_hostnames.add(hostname) + # in case we had a lot of retries, etc + update_next_fetch(hostname, time.time() + minimum_interval) + return resp