diff --git a/README.md b/README.md index 96bfa858..8cbc6dc5 100644 --- a/README.md +++ b/README.md @@ -105,16 +105,14 @@ tldextract http://forums.bbc.co.uk ### Note About Caching Beware when first running the module, it updates its TLD list with a live HTTP -request. This updated TLD set is cached indefinitely in -`/path/to/tldextract/.tld_set`. +request. This updated TLD set is usually cached indefinitely in `$HOME/.cache/python-tldextract`. +To control the cache's location, set the TLDEXTRACT_CACHE environment variable or set the +cache_dir path in TLDExtract initialization. (Arguably runtime bootstrapping like that shouldn't be the default behavior, like for production systems. But I want you to have the latest TLDs, especially when I haven't kept this code up to date.) -To avoid this fetch or control the cache's location, use your own extract -callable by setting TLDEXTRACT_CACHE environment variable or by setting the -cache_dir path in TLDExtract initialization. ```python # extract callable that falls back to the included TLD snapshot, no live HTTP fetching diff --git a/tests/test_cache.py b/tests/test_cache.py index 6c53ed3c..b148ae15 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,6 +1,12 @@ """Test the caching functionality""" +import os.path +import sys +import types + import pytest -from tldextract.cache import DiskCache + +import tldextract.cache +from tldextract.cache import DiskCache, get_pkg_unique_identifier, get_cache_dir def test_disk_cache(tmpdir): @@ -15,3 +21,45 @@ def test_disk_cache(tmpdir): cache.set("testing", "foo", "baz") assert cache.get("testing", "foo") == "baz" + + +def test_get_pkg_unique_identifier(monkeypatch): + monkeypatch.setattr(sys, "version_info", (3, 8, 1, "final", 0)) + monkeypatch.setattr(sys, "prefix", "/home/john/.pyenv/versions/myvirtualenv") + + mock_version_module = types.ModuleType('tldextract._version', 'mocked module') + mock_version_module.version = "1.2.3" + monkeypatch.setitem(sys.modules, 
def test_get_cache_dir(monkeypatch):
    """Walk through every cache-dir source, from last resort to strongest override."""
    fake_pkg_id = "3.8.1.final__myvirtualenv__f01a7b__tldextract-1.2.3"
    monkeypatch.setattr(
        tldextract.cache, "get_pkg_unique_identifier", lambda: fake_pkg_id
    )

    def apply_env(**variables):
        # A value of None means "make sure this variable is absent".
        for name, value in variables.items():
            if value is None:
                monkeypatch.delenv(name, raising=False)
            else:
                monkeypatch.setenv(name, value)

    # with no HOME set, fallback to attempting to use package directory itself
    apply_env(HOME=None, XDG_CACHE_HOME=None, TLDEXTRACT_CACHE=None)
    resolved = get_cache_dir()
    assert resolved.endswith("tldextract/.suffix_cache/")

    # with home set, but not anything else specified, use XDG_CACHE_HOME default
    apply_env(HOME="/home/john", XDG_CACHE_HOME=None, TLDEXTRACT_CACHE=None)
    expected_from_home = os.path.join(
        "/home/john", ".cache/python-tldextract", fake_pkg_id
    )
    assert get_cache_dir() == expected_from_home

    # if XDG_CACHE_HOME is set, use it
    apply_env(HOME="/home/john", XDG_CACHE_HOME="/my/alt/cache", TLDEXTRACT_CACHE=None)
    expected_from_xdg = os.path.join("/my/alt/cache/python-tldextract", fake_pkg_id)
    assert get_cache_dir() == expected_from_xdg

    # if TLDEXTRACT_CACHE is set, use it
    apply_env(
        HOME="/home/john",
        XDG_CACHE_HOME="/my/alt/cache",
        TLDEXTRACT_CACHE="/alt-tld-cache",
    )
    assert get_cache_dir() == "/alt-tld-cache"
def get_cache_dir():
    """
    Return the path of the default suffix-list cache directory

    The first source that yields a value wins:

    1. ``TLDEXTRACT_CACHE`` -- explicit override, returned (after ``~``
       expansion) whenever the variable is present, even if it is empty.
    2. ``$XDG_CACHE_HOME/python-tldextract/<pkg id>``
    3. ``$HOME/.cache/python-tldextract/<pkg id>`` (the XDG default)
    4. ``.suffix_cache/`` next to this module, when none of the above is usable.

    Only the path is computed here: the directory is neither created nor
    checked for write permission.
    """
    # Look at the override first. It used to be read only after the XDG path
    # had been built, so a missing HOME made the function fall back to the
    # package directory and silently ignore TLDEXTRACT_CACHE.
    explicit_cache_dir = os.environ.get("TLDEXTRACT_CACHE")
    if explicit_cache_dir is not None:
        # An empty value is passed through unchanged on purpose: DiskCache
        # computes ``enabled = bool(cache_dir)``, so "" turns caching off.
        return os.path.expanduser(explicit_cache_dir)

    # http://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
    # The spec treats an empty XDG_CACHE_HOME the same as an unset one.
    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
    if not xdg_cache_home:
        user_home = os.environ.get("HOME")
        if user_home:
            xdg_cache_home = os.path.join(user_home, ".cache")

    if xdg_cache_home:
        pkg_id = get_pkg_unique_identifier()
        cache_dir_default = os.path.join(xdg_cache_home, "python-tldextract", pkg_id)
        return os.path.expanduser(cache_dir_default)

    # fallback to trying to use package directory itself
    return os.path.join(os.path.dirname(__file__), ".suffix_cache/")
DiskCache +from .cache import DiskCache, get_cache_dir from .remote import IP_RE, SCHEME_RE, looks_like_ip from .suffix_list import get_suffix_lists LOG = logging.getLogger("tldextract") -CACHE_DIR_DEFAULT = os.path.join(os.path.dirname(__file__), ".suffix_cache/") -CACHE_DIR = os.path.expanduser(os.environ.get("TLDEXTRACT_CACHE", CACHE_DIR_DEFAULT)) + CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT") PUBLIC_SUFFIX_LIST_URLS = ( @@ -131,7 +130,7 @@ class TLDExtract: # TODO: Agreed with Pylint: too-many-arguments def __init__( # pylint: disable=too-many-arguments self, - cache_dir=CACHE_DIR, + cache_dir=get_cache_dir(), suffix_list_urls=PUBLIC_SUFFIX_LIST_URLS, fallback_to_snapshot=True, include_psl_private_domains=False,