Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 1.0.4 #44

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/api/downloader.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@
members: false

::: wpextract.download.AuthorizationType

::: wpextract.download.requestsession.DEFAULT_UA
8 changes: 8 additions & 0 deletions docs/usage/download.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ $ wpextract download TARGET OUT_JSON
`--max-redirects MAX_REDIRECTS`
: Maximum number of redirects before giving up (default: 20)

`--user-agent USER_AGENT`
: User agent to use for requests. The default is a recent version of Chrome on Linux (see [`requestsession.DEFAULT_UA`][wpextract.download.requestsession.DEFAULT_UA]).

**logging**

`--log FILE`, `-l FILE`
Expand Down Expand Up @@ -109,6 +112,11 @@ We would also suggest enabling the following options, with consideration for how
- `--wait` to space out requests
- `--random-wait` to vary the time between requests to avoid patterns

You may also wish to consider:

- The reputation of the IP used to make requests. IPs in ranges belonging to common VPS providers, e.g. DigitalOcean or AWS, may be more likely to be rate limited.
- `--user-agent` to set a custom user agent. The default is a recent version of Chrome on Linux, but this may become outdated. If using authentication, this may need to match the user agent of the browser used to log in.

### Error Handling

If an HTTP error occurs, the command will retry the request up to `--max-retries` times, with the backoff set by `--backoff-factor`. If the maximum number of retries is reached, the command will output the error, stop collecting the given data type, and start collecting the following data type. This is because it's presumed that if a given page is non-functional, the following one will be too.
Expand Down
7 changes: 7 additions & 0 deletions src/wpextract/cli/_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ def validate_wait(ctx, param, value):
help="Maximum number of redirects before giving up",
show_default=True,
)
@optgroup.option(
"--user-agent",
type=str,
help="User-Agent string to use for requests. Set to a recent version of Chrome on Linux by default.",
)
@logging_options
def download(
target: str,
Expand All @@ -130,6 +135,7 @@ def download(
max_retries: int,
backoff_factor: float,
max_redirects: int,
user_agent: Optional[str],
log: Optional[Path],
verbose: bool,
):
Expand Down Expand Up @@ -166,6 +172,7 @@ def download(
max_retries=max_retries,
backoff_factor=backoff_factor,
max_redirects=max_redirects,
user_agent=user_agent,
)

with setup_tqdm_redirect(log is None):
Expand Down
7 changes: 5 additions & 2 deletions src/wpextract/download/requestsession.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
from urllib3 import Retry

DEFAULT_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
DEFAULT_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"


class ConnectionCouldNotResolve(Exception):
Expand Down Expand Up @@ -204,6 +204,7 @@ def __init__(
max_retries: int = 10,
backoff_factor: float = 0.1,
max_redirects: int = 20,
user_agent: str = DEFAULT_UA,
):
"""Create a new request session.

Expand All @@ -217,6 +218,7 @@ def __init__(
max_retries: the maximum number of retries before failing
backoff_factor: Factor to wait between successive retries
max_redirects: maximum number of redirects to follow
user_agent: User agent to use for requests. Set to [`DEFAULT_UA`][wpextract.download.requestsession.DEFAULT_UA] by default.
"""
self.s = requests.Session()
if proxy is not None:
Expand All @@ -234,6 +236,7 @@ def __init__(
self.timeout = timeout
self._mount_retry(backoff_factor, max_redirects, max_retries)
self.waiter = RequestWait(wait, random_wait)
self.user_agent = user_agent

def _mount_retry(self, backoff_factor, max_redirects, max_retries):
retry = Retry(
Expand All @@ -258,7 +261,7 @@ def post(self, url, data=None):

def do_request(self, method, url, data=None, stream=False):
"""Helper class to regroup requests and handle exceptions at the same location."""
headers = {"User-Agent": DEFAULT_UA}
headers = {"User-Agent": self.user_agent}
response = None
try:
if method == "post":
Expand Down
2 changes: 2 additions & 0 deletions src/wpextract/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def __init__(
data_types: list[str],
session: Optional[RequestSession] = None,
json_prefix: Optional[str] = None,
user_agent: Optional[str] = None,
):
"""Initializes the WPDownloader object.

Expand All @@ -27,6 +28,7 @@ def __init__(
data_types: set of data types to download
session : request session. Will be created from default constructor if not provided.
json_prefix: prefix to prepend to JSON file names
user_agent: User agent to use for requests. See [`RequestSession`][wpextract.download.requestsession.RequestSession].
"""
self.target = target
self.out_path = out_path
Expand Down
18 changes: 18 additions & 0 deletions tests/cli/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ def mock_cls_invoke(mocker, runner, datadir, args=None):
return dl_mock, result


def mock_cls_invoke_req_sess(mocker, runner, datadir, args=None):
    """Invoke the download command with ``RequestSession`` patched out.

    The ``wpextract.download.RequestSession`` name is replaced with a mock
    *before* the command is invoked via ``mock_cls_invoke``, so that the
    arguments the CLI hands to the session constructor can be inspected.

    Args:
        mocker: the mocker fixture used to apply the patch.
        runner: forwarded unchanged to ``mock_cls_invoke``.
        datadir: forwarded unchanged to ``mock_cls_invoke``.
        args: optional extra command-line arguments, forwarded unchanged.

    Returns:
        A 3-tuple of the ``RequestSession`` mock followed by the two values
        returned by ``mock_cls_invoke`` (downloader mock, invocation result).
    """
    # The patch has to be in place before the command runs.
    session_cls_mock = mocker.patch("wpextract.download.RequestSession")
    downloader_mock, cli_result = mock_cls_invoke(mocker, runner, datadir, args)
    return session_cls_mock, downloader_mock, cli_result


def test_default_args(mocker, runner, datadir):
dl_mock, result = mock_cls_invoke(mocker, runner, datadir)
assert result.exit_code == 0
Expand All @@ -40,3 +48,13 @@ def test_wait_random_validation(mocker, runner, datadir):
mocker, runner, datadir, ["--random-wait", "--wait", "1"]
)
assert result.exit_code == 0


def test_custom_ua(mocker, runner, datadir):
    """``--user-agent`` is passed to ``RequestSession`` as ``user_agent``."""
    cli_args = ["--user-agent", "test"]
    session_cls_mock, _downloader_mock, outcome = mock_cls_invoke_req_sess(
        mocker, runner, datadir, cli_args
    )

    assert outcome.exit_code == 0

    # Exactly one session is constructed, and it receives the custom value.
    session_cls_mock.assert_called_once()
    passed_kwargs = session_cls_mock.call_args.kwargs
    assert passed_kwargs["user_agent"] == "test"