diff --git a/docs/settings.rst b/docs/settings.rst index 367cf2d8..5a2e7b7d 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -43,6 +43,7 @@ The default file included in the package is `settings.cfg `_ to be followed. Set to 0 to not follow any redirection. Using a custom file on the command-line diff --git a/tests/downloads_tests.py b/tests/downloads_tests.py index 867e5ccd..b494a33b 100644 --- a/tests/downloads_tests.py +++ b/tests/downloads_tests.py @@ -27,6 +27,7 @@ from trafilatura.cli_utils import (download_queue_processing, url_processing_pipeline) from trafilatura.core import extract +import trafilatura.downloads from trafilatura.downloads import (DEFAULT_HEADERS, USER_AGENT, _determine_headers, _handle_response, _parse_config, _pycurl_is_live_page, @@ -47,6 +48,14 @@ UA_CONFIG = use_config(filename=os.path.join(RESOURCES_DIR, 'newsettings.cfg')) +def _reset_downloads_global_objects(): + """ + Force global objects to be re-created + """ + trafilatura.downloads.HTTP_POOL = None + trafilatura.downloads.NO_CERT_POOL = None + trafilatura.downloads.RETRY_STRATEGY = None + def test_fetch(): '''Test URL fetching.''' # logic: empty request? 
@@ -70,8 +79,9 @@ def test_fetch(): assert _send_pycurl_request('https://expired.badssl.com/', False, DEFAULT_CONFIG) is not None # no SSL, no decoding url = 'https://httpbun.com/status/200' - response = _send_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG) - assert response.data == b'' + for no_ssl in (True, False): + response = _send_request('https://httpbun.com/status/200', no_ssl, DEFAULT_CONFIG) + assert response.data == b'' if pycurl is not None: response1 = _send_pycurl_request('https://httpbun.com/status/200', True, DEFAULT_CONFIG) assert _handle_response(url, response1, False, DEFAULT_CONFIG) == _handle_response(url, response, False, DEFAULT_CONFIG) @@ -94,7 +104,19 @@ def test_fetch(): assert load_html(response) is not None # nothing to see here assert extract(response, url=response.url, config=ZERO_CONFIG) is None - + # test handling redirects + res = fetch_url('http://httpbin.org/redirect/2') + assert len(res) > 100 # We followed redirects and downloaded something in the end + new_config = use_config() # get a new config instance to avoid mutating the default one + # Patch max redirects: limit to 0. 
We won't fetch any page as a result + new_config.set('DEFAULT', 'MAX_REDIRECTS', '0') + _reset_downloads_global_objects() # force Retry strategy and PoolManager to be recreated with the new config value + res = fetch_url('http://httpbin.org/redirect/1', config=new_config) + assert res is None + # Also test max redir implementation on pycurl if available + if pycurl is not None: + assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None + _reset_downloads_global_objects() # reset global objects again to avoid affecting other tests def test_config(): '''Test how configuration options are read and stored.''' diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index 10201aac..bdb1c88c 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -43,7 +43,6 @@ PKG_VERSION = version("trafilatura") NUM_CONNECTIONS = 50 -MAX_REDIRECTS = 2 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) HTTP_POOL = None @@ -90,8 +89,8 @@ def _send_request(url, no_ssl, config): global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY if not RETRY_STRATEGY: RETRY_STRATEGY = urllib3.util.Retry( - total=0, - redirect=MAX_REDIRECTS, # raise_on_redirect=False, + total=config.getint("DEFAULT", "MAX_REDIRECTS"), + redirect=config.getint("DEFAULT", "MAX_REDIRECTS"), # raise_on_redirect=False, connect=0, backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2, status_forcelist=[ @@ -107,13 +106,13 @@ def _send_request(url, no_ssl, config): if not HTTP_POOL: HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS) # cert_reqs='CERT_REQUIRED' # execute request - response = HTTP_POOL.request('GET', url, headers=_determine_headers(config)) + response = HTTP_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY) else: # define pool if not NO_CERT_POOL: NO_CERT_POOL = 
urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS) # execute request - response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config)) + response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY) except urllib3.exceptions.SSLError: LOGGER.warning('retrying after SSLError: %s', url) return _send_request(url, True, config) @@ -275,7 +274,7 @@ def _send_pycurl_request(url, no_ssl, config): curl.setopt(pycurl.HTTPHEADER, headerlist) # curl.setopt(pycurl.USERAGENT, '') curl.setopt(pycurl.FOLLOWLOCATION, 1) - curl.setopt(pycurl.MAXREDIRS, MAX_REDIRECTS) + curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS')) curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')) curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')) curl.setopt(pycurl.NOSIGNAL, 1) diff --git a/trafilatura/settings.cfg b/trafilatura/settings.cfg index dfd753b9..160f5909 100644 --- a/trafilatura/settings.cfg +++ b/trafilatura/settings.cfg @@ -12,6 +12,8 @@ SLEEP_TIME = 5 USER_AGENTS = # cookie for HTTP requests COOKIE = +# Maximum number of redirects that we will follow +MAX_REDIRECTS = 2 # Extraction MIN_EXTRACTED_SIZE = 250