Skip to content

Commit

Permalink
Change retries to a ratelimit-based system (#35)
Browse files Browse the repository at this point in the history
This is needed because both IA and CCF are now rate-limiting their CDX API endpoints.
  • Loading branch information
wumpus authored Sep 1, 2024
1 parent eab58e2 commit 719dc68
Showing 1 changed file with 59 additions and 15 deletions.
74 changes: 59 additions & 15 deletions cdx_toolkit/myrequests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,67 @@

LOGGER = logging.getLogger(__name__)


previously_seen_hostnames = {
'commoncrawl.s3.amazonaws.com',
'data.commoncrawl.org',
'web.archive.org',
}


def dns_fatal(url):
def dns_fatal(hostname):
'''We have a dns error, should we fail immediately or not?'''
hostname = urlparse(url).hostname
if hostname not in previously_seen_hostnames:
return True


retry_info = {
'default': {
'next_fetch': 0,
'minimum_interval': 3.0,
},
'index.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
},
'data.commoncrawl.org': {
'next_fetch': 0,
'minimum_interval': 3.0,
},
'web.archive.org': {
'next_fetch': 0,
'minimum_interval': 6.0,
},
}


def get_retries(hostname):
if hostname not in retry_info:
retry_info[hostname] = retry_info['default'].copy()
LOGGER.debug('initializing retry info for new host '+hostname)
entry = retry_info[hostname]
if not entry['next_fetch']:
entry['next_fetch'] = time.time()
return entry['next_fetch'], entry['minimum_interval']


def update_next_fetch(hostname, next_fetch):
retry_info[hostname]['next_fetch'] = next_fetch


def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
t = time.time()

hostname = urlparse(url).hostname
next_fetch, minimum_interval = get_retries(hostname)

if t < next_fetch:
dt = next_fetch - t
if dt > 3.1:
LOGGER.debug('sleeping for {:.3f}s before next fetch'.format(dt))
time.sleep(dt)
# next_fetch is also updated at the bottom
update_next_fetch(hostname, next_fetch + minimum_interval)

if params:
if 'from_ts' in params:
params['from'] = params['from_ts']
Expand All @@ -38,8 +83,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
headers['User-Agent'] = 'pypi_cdx_toolkit/'+__version__

retry = True
retry_sec = 1
retry_max_sec = 30
retry_sec = 2 * minimum_interval
retry_max_sec = 60
retries = 0
connect_errors = 0
while retry:
Expand All @@ -62,14 +107,10 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
# I have never seen IA or CC send 429 or 509, but just in case...
# 429 is also a slow down, IA started sending them mid-2023
retries += 1
if retries > 5:
LOGGER.warning('retrying after 1s for %d', resp.status_code)
if resp.text:
LOGGER.warning('response body is %s', resp.text)
else:
LOGGER.info('retrying after 1s for %d', resp.status_code)
if resp.text:
LOGGER.info('response body is %s', resp.text)
level = 30 if retries > 5 else 20 # 30=warning 20=info
LOGGER.log(level, 'retrying after %.2fs for %d', retry_sec, resp.status_code)
if resp.text:
LOGGER.log(level, 'response body is %s', resp.text)
time.sleep(retry_sec)
retry_sec = min(retry_sec*2, retry_max_sec)
continue
Expand All @@ -93,8 +134,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
raise ValueError(string)
if connect_errors > 10:
LOGGER.warning(string)
LOGGER.info('retrying after 1s for '+str(e))
time.sleep(retry_sec)
LOGGER.info('retrying after {:.2f}s for '.format(retry_max_sec)+str(e))
time.sleep(retry_max_sec) # notice the extra-long sleep
retry_sec = min(retry_sec*2, retry_max_sec)
except requests.exceptions.RequestException as e: # pragma: no cover
LOGGER.warning('something unexpected happened, giving up after %s', str(e))
Expand All @@ -104,4 +145,7 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
if hostname not in previously_seen_hostnames:
previously_seen_hostnames.add(hostname)

# in case we had a lot of retries, etc
update_next_fetch(hostname, time.time() + minimum_interval)

return resp

0 comments on commit 719dc68

Please sign in to comment.