From 11dba0331470c063e60e92b8d060208bd4d468d7 Mon Sep 17 00:00:00 2001 From: Kai Jauslin Date: Fri, 30 Oct 2020 12:28:12 +0100 Subject: [PATCH 1/2] Add url_post template parameter for remote cdx api url_post is the original url with the POST data extracted from the urlkey and appended as query parameter. Use with outbackcdx instead of url parameter. --- pywb/warcserver/index/indexsource.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index 67d966374..d2b5eaa30 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -1,4 +1,4 @@ -from six.moves.urllib.parse import quote_plus +from six.moves.urllib.parse import quote_plus, quote, parse_qs, urlparse from warcio.timeutils import PAD_14_DOWN, http_date_to_timestamp, pad_timestamp, timestamp_now, timestamp_to_http_date from pywb.utils.binsearch import iter_range @@ -112,6 +112,8 @@ def init_from_config(cls, config): #============================================================================= class RemoteIndexSource(BaseIndexSource): CDX_MATCH_RX = re.compile('^cdxj?\+(?P<url>https?\:.*)') + POSTDATA_MATCH_RX = re.compile('.*?[?&](?P<post_key>__wb_post_data|__warc_post_data|__wb_json_data)'\ + '=(?P<post_data>[^&]+).*$') def __init__(self, api_url, replay_url, url_field='load_url', closest_limit=10): self.api_url = api_url @@ -121,7 +123,9 @@ def __init__(self, api_url, replay_url, url_field='load_url', closest_limit=10): self._init_sesh() def _get_api_url(self, params): + self.add_url_post_param(params) api_url = res_template(self.api_url, params) + if 'closest' in params and self.closest_limit: api_url += '&limit=' + str(self.closest_limit) @@ -130,6 +134,21 @@ def _get_api_url(self, params): return api_url + def add_url_post_param(self, params): + # extract POST data value from urlkey and compose url_post parameter + key_str = params['key'].decode('utf-8') + match_post = 
re.match(self.POSTDATA_MATCH_RX, key_str) + params['url_post'] = params['url'] + + if match_post and match_post.groupdict() is not None: + url_query = parse_qs(urlparse(params['url']).query) + post_key = match_post.groupdict()['post_key'] + post_data = match_post.groupdict()['post_data'] + if len(url_query.keys()) == 0: + params['url_post'] += quote('?%s=%s' % (post_key, post_data)) + else: + params['url_post'] += quote('&%s=%s' % (post_key, post_data)) + def load_index(self, params): api_url = self._get_api_url(params) try: From 853eedc246385ed8a83b294750319a6a666e50c7 Mon Sep 17 00:00:00 2001 From: Kai Jauslin Date: Thu, 31 Dec 2020 13:39:55 +0100 Subject: [PATCH 2/2] Fix url_post url encoding --- pywb/warcserver/index/indexsource.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pywb/warcserver/index/indexsource.py b/pywb/warcserver/index/indexsource.py index d2b5eaa30..98dd341cf 100644 --- a/pywb/warcserver/index/indexsource.py +++ b/pywb/warcserver/index/indexsource.py @@ -132,13 +132,14 @@ def _get_api_url(self, params): if 'matchType' in params: api_url += '&matchType=' + params.get('matchType') + self.logger.info(api_url) return api_url def add_url_post_param(self, params): # extract POST data value from urlkey and compose url_post parameter key_str = params['key'].decode('utf-8') match_post = re.match(self.POSTDATA_MATCH_RX, key_str) - params['url_post'] = params['url'] + params['url_post'] = quote(params['url']) if match_post and match_post.groupdict() is not None: url_query = parse_qs(urlparse(params['url']).query)