From 86ee3bd75250fe3d83c62d6f213ff7efc1d7cb16 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 3 Apr 2024 17:11:58 -0400 Subject: [PATCH] Allow ACLJs to use *, SURT wildcard to match all URLs (#882) Also adds tests and documentation --- docs/manual/access-control.rst | 10 ++++++++++ pywb/warcserver/access_checker.py | 4 ++++ sample_archive/access/allow_all.aclj | 1 + tests/config_test_access.yaml | 7 +++++++ tests/test_acl.py | 4 ++++ 5 files changed, 26 insertions(+) create mode 100644 sample_archive/access/allow_all.aclj diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index 037272496..a80198eea 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -105,6 +105,12 @@ Given these rules, a user would: * but would receive an 'access blocked' error message when viewing ``http://httpbin.org/`` (block) * would receive a 404 not found error when viewing ``http://httpbin.org/anything`` (exclude) +To match any possible URL in an .aclj file, set ``*,`` as the leading SURT, for example:: + + *, - {"access": "allow"} + +Lines starting with ``*,`` should generally be at the end of the file, respecting the reverse alphabetical order. + Access Types: allow, block, exclude, allow_ignore_embargo ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -149,6 +155,10 @@ To make this work, pywb must be running behind an Apache or Nginx system that is For example, this header may be set based on IP range, or based on password authentication. +To allow a user access to all URLs, overriding more specific rules and the ``default_access`` configuration setting, use the ``*,`` SURT:: + + *, - {"access": "allow", "user": "staff"} + Further examples of how to set this header will be provided in the deployments section. **Note: Do not use the user-based rules without configuring proper authentication on an Apache or Nginx frontend to set or remove this header, otherwise the 'X-Pywb-ACL-User' can easily be faked.** diff --git a/pywb/warcserver/access_checker.py b/pywb/warcserver/access_checker.py index 46cd7acd9..a5fc83356 100644 --- a/pywb/warcserver/access_checker.py +++ b/pywb/warcserver/access_checker.py @@ -260,6 +260,10 @@ def find_access_rule(self, url, ts=None, urlkey=None, collection=None, acl_user= if key.startswith(acl_key): acl_obj = CDXObject(acl) + # Check for "*," in ACL, which matches any URL + if acl_key == b"*,": + acl_obj = CDXObject(acl) + if acl_obj: user = acl_obj.get('user') if user == acl_user: diff --git a/sample_archive/access/allow_all.aclj b/sample_archive/access/allow_all.aclj new file mode 100644 index 000000000..1e93abe4a --- /dev/null +++ b/sample_archive/access/allow_all.aclj @@ -0,0 +1 @@ +*, - {"access": "allow", "user": "staff"} diff --git a/tests/config_test_access.yaml b/tests/config_test_access.yaml index 8fb352f7c..332d5a742 100644 --- a/tests/config_test_access.yaml +++ b/tests/config_test_access.yaml @@ -62,6 +62,13 @@ collections: acl_paths: - ./sample_archive/access/pywb.aclj + pywb-wildcard-surt: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + default_access: block + acl_paths: + - ./sample_archive/access/allow_all.aclj + diff --git a/tests/test_acl.py b/tests/test_acl.py index ea7655aa8..4e94962c5 100644 --- a/tests/test_acl.py +++ b/tests/test_acl.py @@ -96,5 +96,9 @@ def test_allowed_different_coll_acl_dir(self): assert '"http://httpbin.org/anything/resource.json"' in resp.text + def test_allow_all_acl_user_specific(self): + resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', status=451) + assert 'Access Blocked' in resp.text + resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', headers={"X-Pywb-Acl-User": "staff"}, status=200)