introduce MAX_REDIRECTS config setting and fix urllib3 redirect handling #461

Merged: 9 commits, Jan 4, 2024
1 change: 1 addition & 0 deletions docs/settings.rst
@@ -43,6 +43,7 @@ The default file included in the package is `settings.cfg <https://github.com/ad
* ``EXTENSIVE_DATE_SEARCH = on`` set to ``off`` to deactivate ``htmldate``'s opportunistic search (lower recall, higher precision)
- Navigation
* ``EXTERNAL_URLS = off`` do not take URLs from other websites in feeds and sitemaps (CLI mode)
* ``MAX_REDIRECTS = 2``: maximum number of `URL redirections <https://en.wikipedia.org/wiki/URL_redirection>`_ to be followed. Set to 0 to not follow any redirection.


Using a custom file on the command-line
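For reference, overriding the new setting from Python might look like this; a minimal sketch using `use_config` and `fetch_url` (the redirecting URL is illustrative):

```python
from trafilatura import fetch_url
from trafilatura.settings import use_config

# get a fresh config instance to avoid mutating the package default
config = use_config()
config.set("DEFAULT", "MAX_REDIRECTS", "0")  # do not follow any redirection

# a redirecting URL now yields None instead of the final page
assert fetch_url("https://httpbun.com/redirect/2", config=config) is None
```

Note that in a process which has already issued requests, the cached pool objects have to be rebuilt before the new value takes effect; that is what the test helper `_reset_global_objects` below is for.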
14 changes: 12 additions & 2 deletions tests/downloads_tests.py
@@ -33,7 +33,7 @@
_send_pycurl_request, _send_request,
_urllib3_is_live_page,
add_to_compressed_dict, fetch_url,
is_live_page, load_download_buffer)
is_live_page, load_download_buffer, _reset_global_objects)
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html

@@ -94,7 +94,17 @@ def test_fetch():
assert load_html(response) is not None
# nothing to see here
assert extract(response, url=response.url, config=ZERO_CONFIG) is None

# test handling redirects
res = fetch_url('https://httpbun.com/redirect/2')
assert len(res) > 100 # We followed redirects and downloaded something in the end
new_config = use_config() # get a new config instance to avoid mutating the default one
# Patch MAX_REDIRECTS: limit it to 0; no page will be fetched as a result
new_config.set('DEFAULT', 'MAX_REDIRECTS', '0')
_reset_global_objects() # force Retry strategy and PoolManager to be recreated with the new config value
res = fetch_url('http://httpbin.org/redirect/1', config=new_config)
assert res is None
# Also test the max redirects implementation on pycurl
assert _send_pycurl_request('http://httpbin.org/redirect/1', True, new_config) is None

def test_config():
'''Test how configuration options are read and stored.'''
20 changes: 14 additions & 6 deletions trafilatura/downloads.py
@@ -43,7 +43,6 @@
PKG_VERSION = version("trafilatura")

NUM_CONNECTIONS = 50
MAX_REDIRECTS = 2

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HTTP_POOL = None
@@ -90,8 +89,8 @@ def _send_request(url, no_ssl, config):
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
if not RETRY_STRATEGY:
RETRY_STRATEGY = urllib3.util.Retry(
total=0,
redirect=MAX_REDIRECTS, # raise_on_redirect=False,
total=config.getint("DEFAULT", "MAX_REDIRECTS"),
redirect=config.getint("DEFAULT", "MAX_REDIRECTS"), # raise_on_redirect=False,
connect=0,
backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2,
status_forcelist=[
@@ -107,13 +106,13 @@
if not HTTP_POOL:
HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS) # cert_reqs='CERT_REQUIRED'
# execute request
response = HTTP_POOL.request('GET', url, headers=_determine_headers(config))
response = HTTP_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
else:
# define pool
if not NO_CERT_POOL:
NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS)
# execute request
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config))
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
except urllib3.exceptions.SSLError:
LOGGER.warning('retrying after SSLError: %s', url)
return _send_request(url, True, config)
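The essence of the urllib3 change: the redirect cap now comes from the config, and the `Retry` object is also passed per request, so exceeding the cap raises `MaxRetryError` instead of silently returning an intermediate response. A minimal standalone sketch of the mechanism (cap and URL illustrative):

```python
import urllib3

max_redirects = 2  # stands in for config.getint("DEFAULT", "MAX_REDIRECTS")
retry_strategy = urllib3.util.Retry(
    total=max_redirects,
    redirect=max_redirects,  # raises MaxRetryError past this many redirects
    connect=0,
)
pool = urllib3.PoolManager(retries=retry_strategy)

try:
    # passing retries= on the request overrides any pool-level default
    response = pool.request("GET", "https://httpbun.com/redirect/5",
                            retries=retry_strategy)
except urllib3.exceptions.MaxRetryError:
    response = None  # too many redirects: treated as a failed fetch
```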
@@ -275,7 +274,7 @@ def _send_pycurl_request(url, no_ssl, config):
curl.setopt(pycurl.HTTPHEADER, headerlist)
# curl.setopt(pycurl.USERAGENT, '')
curl.setopt(pycurl.FOLLOWLOCATION, 1)
curl.setopt(pycurl.MAXREDIRS, MAX_REDIRECTS)
curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS'))
curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.NOSIGNAL, 1)
@@ -327,3 +326,12 @@ def _send_pycurl_request(url, no_ssl, config):
# tidy up
curl.close()
return RawResponse(bufferbytes, respcode, effective_url)
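On the pycurl side the cap maps onto `MAXREDIRS`: once it is exceeded, `perform()` raises `pycurl.error` (libcurl's `CURLE_TOO_MANY_REDIRECTS`), which the surrounding code can turn into a `None` result. A minimal sketch (URL and cap illustrative):

```python
from io import BytesIO

import pycurl

buffer = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, "http://httpbin.org/redirect/1")
curl.setopt(pycurl.WRITEDATA, buffer)
curl.setopt(pycurl.FOLLOWLOCATION, 1)
curl.setopt(pycurl.MAXREDIRS, 0)  # mirror MAX_REDIRECTS = 0: follow nothing

try:
    curl.perform()
    body = buffer.getvalue()
except pycurl.error:
    body = None  # redirect cap exceeded (or another transport error)
finally:
    curl.close()
```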


def _reset_global_objects():
"""
Force global objects to be re-created
Currently only used during tests
"""
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY = None, None, None
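Since `HTTP_POOL`, `NO_CERT_POOL` and `RETRY_STRATEGY` are lazily built module-level singletons, a config change made after the first request would otherwise go unnoticed. Clearing them forces the next request to rebuild everything from the current config; the intended test pattern, sketched:

```python
from trafilatura.downloads import _reset_global_objects, fetch_url
from trafilatura.settings import use_config

config = use_config()
config.set("DEFAULT", "MAX_REDIRECTS", "0")
_reset_global_objects()  # drop cached pools/Retry so the new value is picked up
assert fetch_url("http://httpbin.org/redirect/1", config=config) is None
```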
2 changes: 2 additions & 0 deletions trafilatura/settings.cfg
@@ -12,6 +12,8 @@ SLEEP_TIME = 5
USER_AGENTS =
# cookie for HTTP requests
COOKIE =
# maximum number of redirects that we will follow
MAX_REDIRECTS = 2

# Extraction
MIN_EXTRACTED_SIZE = 250