Skip to content

Commit

Permalink
feature: use cache directory in user folder if possible
Browse files Browse the repository at this point in the history
Follows the XDG Base Directory Specification
https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html

Falls back to attempting to use the package directory if it can't determine an XDG directory location.
  • Loading branch information
brycedrennan committed Oct 25, 2020
1 parent 1b1e07e commit 3f589a4
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 10 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,16 +105,14 @@ tldextract http://forums.bbc.co.uk
### Note About Caching

Beware when first running the module, it updates its TLD list with a live HTTP
request. This updated TLD set is cached indefinitely in
`/path/to/tldextract/.tld_set`.
request. This updated TLD set is usually cached indefinitely in `$HOME/.cache/python-tldextract`.
To control the cache's location, set the `TLDEXTRACT_CACHE` environment variable or set the
`cache_dir` path in the `TLDExtract` initialization.

(Arguably runtime bootstrapping like that shouldn't be the default behavior,
like for production systems. But I want you to have the latest TLDs, especially
when I haven't kept this code up to date.)

To avoid this fetch or control the cache's location, use your own extract
callable by setting TLDEXTRACT_CACHE environment variable or by setting the
cache_dir path in TLDExtract initialization.

```python
# extract callable that falls back to the included TLD snapshot, no live HTTP fetching
Expand Down
50 changes: 49 additions & 1 deletion tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
"""Test the caching functionality"""
import os.path
import sys
import types

import pytest
from tldextract.cache import DiskCache

import tldextract.cache
from tldextract.cache import DiskCache, get_pkg_unique_identifier, get_cache_dir


def test_disk_cache(tmpdir):
Expand All @@ -15,3 +21,45 @@ def test_disk_cache(tmpdir):

cache.set("testing", "foo", "baz")
assert cache.get("testing", "foo") == "baz"


def test_get_pkg_unique_identifier(monkeypatch):
    """The identifier joins Python version, env name, prefix hash and package version."""
    # Stand-in for the version module so the package version is fixed.
    fake_version = types.ModuleType('tldextract._version', 'mocked module')
    fake_version.version = "1.2.3"
    monkeypatch.setitem(sys.modules, "tldextract._version", fake_version)

    # Pin the interpreter details that feed into the identifier.
    monkeypatch.setattr(sys, "prefix", "/home/john/.pyenv/versions/myvirtualenv")
    monkeypatch.setattr(sys, "version_info", (3, 8, 1, "final", 0))

    expected = "3.8.1.final__myvirtualenv__f01a7b__tldextract-1.2.3"
    assert get_pkg_unique_identifier() == expected


def test_get_cache_dir(monkeypatch):
    """Walk through each source of the cache directory, lowest precedence first."""
    identifier = "3.8.1.final__myvirtualenv__f01a7b__tldextract-1.2.3"
    monkeypatch.setattr(tldextract.cache, "get_pkg_unique_identifier", lambda: identifier)

    def apply_env(home, xdg_cache_home, tldextract_cache):
        """Set each variable to the given value, or unset it when the value is None."""
        settings = (
            ("HOME", home),
            ("XDG_CACHE_HOME", xdg_cache_home),
            ("TLDEXTRACT_CACHE", tldextract_cache),
        )
        for name, value in settings:
            if value is None:
                monkeypatch.delenv(name, raising=False)
            else:
                monkeypatch.setenv(name, value)

    # with no HOME set, fallback to attempting to use package directory itself
    apply_env(None, None, None)
    assert get_cache_dir().endswith("tldextract/.suffix_cache/")

    # with home set, but not anything else specified, use XDG_CACHE_HOME default
    apply_env("/home/john", None, None)
    home_default = os.path.join("/home/john", ".cache/python-tldextract", identifier)
    assert get_cache_dir() == home_default

    # if XDG_CACHE_HOME is set, use it
    apply_env("/home/john", "/my/alt/cache", None)
    xdg_default = os.path.join("/my/alt/cache/python-tldextract", identifier)
    assert get_cache_dir() == xdg_default

    # if TLDEXTRACT_CACHE is set, use it
    apply_env("/home/john", "/my/alt/cache", "/alt-tld-cache")
    assert get_cache_dir() == "/alt-tld-cache"
53 changes: 53 additions & 0 deletions tldextract/cache.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Helpers """
import errno
import hashlib
import json
import logging
import os
import os.path
import sys
from hashlib import md5

from filelock import FileLock
Expand All @@ -13,8 +15,59 @@
_DID_LOG_UNABLE_TO_CACHE = False


def get_pkg_unique_identifier():
    """
    Generate an identifier unique to the python version, tldextract version, and python instance

    This will prevent interference between virtualenvs and issues that might arise when installing
    a new version of tldextract.

    The result is the following parts joined with ``__``:

    * every field of ``sys.version_info`` except the last, dot-separated
    * the base name of ``sys.prefix``
    * the first six hex digits of the MD5 digest of ``sys.prefix``
    * ``tldextract-<version>``, where the version is ``dev`` if
      ``tldextract._version`` cannot be imported
    """
    try:
        # pylint: disable=import-outside-toplevel
        from tldextract._version import version
    except ImportError:
        version = "dev"

    prefix_bytes = sys.prefix.encode("utf-8")
    try:
        # The digest only tells apart identically named environments; it is
        # not a security measure. Saying so keeps MD5 usable on Python builds
        # that restrict it for security purposes (e.g. FIPS mode).
        # pylint: disable=unexpected-keyword-arg
        prefix_digest = hashlib.md5(prefix_bytes, usedforsecurity=False)
    except TypeError:
        # Python < 3.9 does not accept the ``usedforsecurity`` keyword.
        prefix_digest = hashlib.md5(prefix_bytes)

    python_version = ".".join(str(part) for part in sys.version_info[:-1])
    python_env_name = os.path.basename(sys.prefix)
    # just to handle the edge case of two identically named python environments
    python_binary_path_short_hash = prefix_digest.hexdigest()[:6]
    tldextract_version = "tldextract-" + version

    return "__".join(
        [
            python_version,
            python_env_name,
            python_binary_path_short_hash,
            tldextract_version,
        ]
    )


def get_cache_dir():
    """
    Get the directory the suffix list cache should live in

    Try to follow the XDG standard, but if that doesn't work fallback to the package directory.
    Candidates, highest priority first:

    1. ``TLDEXTRACT_CACHE``, whenever it is set (even if no home directory
       can be determined). A leading ``~`` is expanded. An empty value is
       returned as an empty string.
    2. ``$XDG_CACHE_HOME/python-tldextract/<pkg id>``
    3. ``$HOME/.cache/python-tldextract/<pkg id>``
    4. ``.suffix_cache/`` next to this module

    Per the XDG specification, an empty ``XDG_CACHE_HOME`` is treated the same
    as an unset one; an empty ``HOME`` is likewise treated as unset so the
    result is never silently relative to the current working directory.

    This function only computes the path; it does not create the directory
    or check that it is writable.
    """
    explicit_cache_dir = os.environ.get("TLDEXTRACT_CACHE")
    if explicit_cache_dir is not None:
        # Checked first so the override works even without HOME / XDG_CACHE_HOME.
        return os.path.expanduser(explicit_cache_dir)

    # http://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
    if not xdg_cache_home:
        user_home = os.environ.get("HOME")
        if not user_home:
            # fallback to trying to use package directory itself
            return os.path.join(os.path.dirname(__file__), ".suffix_cache/")
        xdg_cache_home = os.path.join(user_home, ".cache")

    pkg_id = get_pkg_unique_identifier()
    cache_dir_default = os.path.join(xdg_cache_home, "python-tldextract", pkg_id)
    return os.path.expanduser(cache_dir_default)


class DiskCache:
"""Disk _cache that only works for jsonable values"""
NOT_SET = object()

def __init__(self, cache_dir, lock_timeout=20):
self.enabled = bool(cache_dir)
Expand Down
7 changes: 3 additions & 4 deletions tldextract/tldextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,13 @@

import idna

from .cache import DiskCache
from .cache import DiskCache, get_cache_dir
from .remote import IP_RE, SCHEME_RE, looks_like_ip
from .suffix_list import get_suffix_lists

LOG = logging.getLogger("tldextract")

CACHE_DIR_DEFAULT = os.path.join(os.path.dirname(__file__), ".suffix_cache/")
CACHE_DIR = os.path.expanduser(os.environ.get("TLDEXTRACT_CACHE", CACHE_DIR_DEFAULT))

CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT")

PUBLIC_SUFFIX_LIST_URLS = (
Expand Down Expand Up @@ -131,7 +130,7 @@ class TLDExtract:
# TODO: Agreed with Pylint: too-many-arguments
def __init__( # pylint: disable=too-many-arguments
self,
cache_dir=CACHE_DIR,
cache_dir=get_cache_dir(),
suffix_list_urls=PUBLIC_SUFFIX_LIST_URLS,
fallback_to_snapshot=True,
include_psl_private_domains=False,
Expand Down

0 comments on commit 3f589a4

Please sign in to comment.