diff --git a/docs/api/downloader.md b/docs/api/downloader.md index 54ffac4..a09b6a5 100644 --- a/docs/api/downloader.md +++ b/docs/api/downloader.md @@ -11,3 +11,5 @@ members: false ::: wpextract.download.AuthorizationType + +::: wpextract.download.requestsession.DEFAULT_UA \ No newline at end of file diff --git a/docs/usage/download.md b/docs/usage/download.md index 4c720f8..86fca18 100644 --- a/docs/usage/download.md +++ b/docs/usage/download.md @@ -59,6 +59,9 @@ $ wpextract download TARGET OUT_JSON `--max-redirects MAX_REDIRECTS` : Maximum number of redirects before giving up (default: 20) +`--user-agent USER_AGENT` +: User agent to use for requests. Default is a recent version of Chrome on Linux (see [`requestsession.DEFAULT_UA`][wpextract.download.requestsession.DEFAULT_UA]) + **logging** `--log FILE`, `-l FILE` @@ -109,6 +112,11 @@ We would also suggest enabling the following options, with consideration for how - `--wait` to space out requests - `--random-wait` to vary the time between requests to avoid patterns +You may also wish to consider: + +- The reputation of the IP used to make requests. IPs in ranges belonging to common VPS providers, e.g. DigitalOcean or AWS, may be more likely to be rate limited. +- `--user-agent` to set a custom user agent. The default is a recent version of Chrome on Linux, but this may become outdated. If using authentication, this may need to match the user agent of the browser used to log in. + ### Error Handling If an HTTP error occurs, the command will retry the request up to `--max-retries` times, with the backoff set by `--backoff-factor`. If the maximum number of retries is reached, the command will output the error, stop collecting the given data type, and start collecting the following data type. This is because it's presumed that if a given page is non-functional, the following one will be too. 
diff --git a/src/wpextract/cli/_download.py b/src/wpextract/cli/_download.py index 1f47a50..ec55e41 100644 --- a/src/wpextract/cli/_download.py +++ b/src/wpextract/cli/_download.py @@ -114,6 +114,11 @@ def validate_wait(ctx, param, value): help="Maximum number of redirects before giving up", show_default=True, ) +@optgroup.option( + "--user-agent", + type=str, + help="User-Agent string to use for requests. Set to a recent version of Chrome on Linux by default.", +) @logging_options def download( target: str, @@ -130,6 +135,7 @@ def download( max_retries: int, backoff_factor: float, max_redirects: int, + user_agent: Optional[str], log: Optional[Path], verbose: bool, ): @@ -166,6 +172,7 @@ def download( max_retries=max_retries, backoff_factor=backoff_factor, max_redirects=max_redirects, + user_agent=user_agent, ) with setup_tqdm_redirect(log is None): diff --git a/src/wpextract/download/requestsession.py b/src/wpextract/download/requestsession.py index 1d86068..9206383 100644 --- a/src/wpextract/download/requestsession.py +++ b/src/wpextract/download/requestsession.py @@ -9,7 +9,7 @@ from requests.auth import HTTPBasicAuth, HTTPDigestAuth from urllib3 import Retry -DEFAULT_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" +DEFAULT_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" class ConnectionCouldNotResolve(Exception): @@ -204,6 +204,7 @@ def __init__( max_retries: int = 10, backoff_factor: float = 0.1, max_redirects: int = 20, + user_agent: str = DEFAULT_UA, ): """Create a new request session. @@ -217,6 +218,7 @@ def __init__( max_retries: the maximum number of retries before failing backoff_factor: Factor to wait between successive retries max_redirects: maximum number of redirects to follow + user_agent: User agent to use for requests. Falls back to [`DEFAULT_UA`][wpextract.download.requestsession.DEFAULT_UA] when omitted or `None`.
""" self.s = requests.Session() if proxy is not None: @@ -234,6 +236,7 @@ def __init__( self.timeout = timeout self._mount_retry(backoff_factor, max_redirects, max_retries) self.waiter = RequestWait(wait, random_wait) + self.user_agent = user_agent if user_agent is not None else DEFAULT_UA def _mount_retry(self, backoff_factor, max_redirects, max_retries): retry = Retry( @@ -258,7 +261,7 @@ def post(self, url, data=None): def do_request(self, method, url, data=None, stream=False): """Helper class to regroup requests and handle exceptions at the same location.""" - headers = {"User-Agent": DEFAULT_UA} + headers = {"User-Agent": self.user_agent} response = None try: if method == "post": diff --git a/src/wpextract/downloader.py b/src/wpextract/downloader.py index ad2f5d3..41fd646 100644 --- a/src/wpextract/downloader.py +++ b/src/wpextract/downloader.py @@ -18,6 +18,7 @@ def __init__( data_types: list[str], session: Optional[RequestSession] = None, json_prefix: Optional[str] = None, + user_agent: Optional[str] = None, ): """Initializes the WPDownloader object. @@ -27,6 +28,7 @@ def __init__( data_types: set of data types to download session : request session. Will be created from default constructor if not provided. json_prefix: prefix to prepend to JSON file names + user_agent: User agent to use for requests. See [`RequestSession`][wpextract.download.requestsession.RequestSession].
""" self.target = target self.out_path = out_path diff --git a/tests/cli/test_download.py b/tests/cli/test_download.py index 4c516b0..40b607c 100644 --- a/tests/cli/test_download.py +++ b/tests/cli/test_download.py @@ -14,6 +14,14 @@ def mock_cls_invoke(mocker, runner, datadir, args=None): return dl_mock, result +def mock_cls_invoke_req_sess(mocker, runner, datadir, args=None): + rq_mock = mocker.patch("wpextract.download.RequestSession") + + dl_mock, result = mock_cls_invoke(mocker, runner, datadir, args) + + return rq_mock, dl_mock, result + + def test_default_args(mocker, runner, datadir): dl_mock, result = mock_cls_invoke(mocker, runner, datadir) assert result.exit_code == 0 @@ -40,3 +48,13 @@ def test_wait_random_validation(mocker, runner, datadir): mocker, runner, datadir, ["--random-wait", "--wait", "1"] ) assert result.exit_code == 0 + + +def test_custom_ua(mocker, runner, datadir): + req_mock, dl_mock, result = mock_cls_invoke_req_sess( + mocker, runner, datadir, ["--user-agent", "test"] + ) + assert result.exit_code == 0 + + req_mock.assert_called_once() + assert req_mock.call_args.kwargs["user_agent"] == "test"