Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: use cache directory in user folder if possible #213

Merged
merged 1 commit into from
Nov 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,16 +105,14 @@ tldextract http://forums.bbc.co.uk
### Note About Caching

Beware when first running the module, it updates its TLD list with a live HTTP
request. This updated TLD set is cached indefinitely in
`/path/to/tldextract/.tld_set`.
request. This updated TLD set is usually cached indefinitely in `$HOME/.cache/python-tldextract`.
To control the cache's location, set the `TLDEXTRACT_CACHE` environment variable or set the
`cache_dir` path in `TLDExtract` initialization.

(Arguably runtime bootstrapping like that shouldn't be the default behavior,
like for production systems. But I want you to have the latest TLDs, especially
when I haven't kept this code up to date.)

To avoid this fetch or control the cache's location, use your own extract
callable by setting TLDEXTRACT_CACHE environment variable or by setting the
cache_dir path in TLDExtract initialization.

```python
# extract callable that falls back to the included TLD snapshot, no live HTTP fetching
Expand Down
50 changes: 49 additions & 1 deletion tests/test_cache.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
"""Test the caching functionality"""
import os.path
import sys
import types

import pytest
from tldextract.cache import DiskCache

import tldextract.cache
from tldextract.cache import DiskCache, get_pkg_unique_identifier, get_cache_dir


def test_disk_cache(tmpdir):
Expand All @@ -15,3 +21,45 @@ def test_disk_cache(tmpdir):

cache.set("testing", "foo", "baz")
assert cache.get("testing", "foo") == "baz"


def test_get_pkg_unique_identifier(monkeypatch):
    """The identifier combines python version, env name, prefix hash and package version"""
    # Pin everything the identifier is derived from so the result is deterministic
    monkeypatch.setattr(sys, "version_info", (3, 8, 1, "final", 0))
    monkeypatch.setattr(sys, "prefix", "/home/john/.pyenv/versions/myvirtualenv")

    fake_version = types.ModuleType('tldextract._version', 'mocked module')
    fake_version.version = "1.2.3"
    monkeypatch.setitem(sys.modules, "tldextract._version", fake_version)

    expected = "__".join(
        [
            "3.8.1.final",
            "myvirtualenv",
            # first six hex digits of the md5 digest of the patched sys.prefix
            "f01a7b",
            "tldextract-1.2.3",
        ]
    )
    assert get_pkg_unique_identifier() == expected
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Magic string f01a7b?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's a md5 hash of sys.prefix. not sure a better way?



def test_get_cache_dir(monkeypatch):
    """Check each source of the cache directory, from lowest to highest precedence"""
    pkg_identifier = "3.8.1.final__myvirtualenv__f01a7b__tldextract-1.2.3"
    monkeypatch.setattr(tldextract.cache, "get_pkg_unique_identifier", lambda: pkg_identifier)

    def use_env(home, xdg_cache_home, tldextract_cache):
        # None means "make sure the variable is absent from the environment"
        for name, value in (
            ("HOME", home),
            ("XDG_CACHE_HOME", xdg_cache_home),
            ("TLDEXTRACT_CACHE", tldextract_cache),
        ):
            if value is None:
                monkeypatch.delenv(name, raising=False)
            else:
                monkeypatch.setenv(name, value)

    # with no HOME set, fallback to attempting to use package directory itself
    use_env(None, None, None)
    assert get_cache_dir().endswith("tldextract/.suffix_cache/")

    # with home set, but not anything else specified, use XDG_CACHE_HOME default
    use_env("/home/john", None, None)
    home_default = os.path.join("/home/john", ".cache/python-tldextract", pkg_identifier)
    assert get_cache_dir() == home_default

    # if XDG_CACHE_HOME is set, use it
    use_env("/home/john", "/my/alt/cache", None)
    xdg_location = os.path.join("/my/alt/cache/python-tldextract", pkg_identifier)
    assert get_cache_dir() == xdg_location

    # if TLDEXTRACT_CACHE is set, use it
    use_env("/home/john", "/my/alt/cache", "/alt-tld-cache")
    assert get_cache_dir() == "/alt-tld-cache"
55 changes: 55 additions & 0 deletions tldextract/cache.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""Helpers """
import errno
import hashlib
import json
import logging
import os
import os.path
import sys
from hashlib import md5

from filelock import FileLock
Expand All @@ -13,6 +15,59 @@
_DID_LOG_UNABLE_TO_CACHE = False


def get_pkg_unique_identifier():
    """
    Generate an identifier unique to the python version, tldextract version, and python instance

    This will prevent interference between virtualenvs and issues that might arise when installing
    a new version of tldextract

    The result is four parts joined by "__": the python version (sys.version_info without its
    last field), the basename of sys.prefix, the first six hex digits of the md5 digest of
    sys.prefix, and "tldextract-<version>" ("tldextract-dev" when the version module is missing).
    """
    try:
        # pylint: disable=import-outside-toplevel
        from tldextract._version import version
    except ImportError:
        version = "dev"

    tldextract_version = "tldextract-" + version
    python_env_name = os.path.basename(sys.prefix)

    # just to handle the edge case of two identically named python environments
    prefix_bytes = sys.prefix.encode("utf-8")
    try:
        # The digest only disambiguates directory names, so mark it as non-security use;
        # FIPS-restricted builds refuse a plain md5 call with ValueError.
        prefix_digest = hashlib.md5(prefix_bytes, usedforsecurity=False)
    except TypeError:
        # the usedforsecurity keyword does not exist before Python 3.9
        prefix_digest = hashlib.md5(prefix_bytes)
    python_binary_path_short_hash = prefix_digest.hexdigest()[:6]

    python_version = ".".join(str(v) for v in sys.version_info[:-1])
    identifier_parts = [
        python_version,
        python_env_name,
        python_binary_path_short_hash,
        tldextract_version,
    ]
    return "__".join(identifier_parts)


def get_cache_dir():
    """
    Get the directory the suffix list cache should live in

    Try to follow the XDG standard, but if that doesn't work fallback to the package directory
    http://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html

    Lookup order:

    1. TLDEXTRACT_CACHE, used as given apart from expanding a leading "~"
    2. $XDG_CACHE_HOME/python-tldextract/<package identifier>
    3. $HOME/.cache/python-tldextract/<package identifier>
    4. the ".suffix_cache/" directory next to this module

    The directory is only named here; it is neither created nor checked for write access.
    """
    cache_dir = os.environ.get("TLDEXTRACT_CACHE")
    if cache_dir is not None:
        # A value set outside a shell (service file, container env) keeps its literal "~"
        return os.path.expanduser(cache_dir)

    # The XDG spec treats an empty XDG_CACHE_HOME like an unset one; using "" as-is
    # would produce a cache path relative to the current working directory
    xdg_cache_home = os.environ.get("XDG_CACHE_HOME") or None
    if xdg_cache_home is None:
        user_home = os.environ.get("HOME")
        if user_home:
            xdg_cache_home = os.path.join(user_home, ".cache")

    if xdg_cache_home is not None:
        return os.path.join(xdg_cache_home, "python-tldextract", get_pkg_unique_identifier())

    # fallback to trying to use package directory itself
    return os.path.join(os.path.dirname(__file__), ".suffix_cache/")


class DiskCache:
"""Disk _cache that only works for jsonable values"""

Expand Down
7 changes: 3 additions & 4 deletions tldextract/tldextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,13 @@

import idna

from .cache import DiskCache
from .cache import DiskCache, get_cache_dir
from .remote import IP_RE, SCHEME_RE, looks_like_ip
from .suffix_list import get_suffix_lists

LOG = logging.getLogger("tldextract")

CACHE_DIR_DEFAULT = os.path.join(os.path.dirname(__file__), ".suffix_cache/")
CACHE_DIR = os.path.expanduser(os.environ.get("TLDEXTRACT_CACHE", CACHE_DIR_DEFAULT))

CACHE_TIMEOUT = os.environ.get("TLDEXTRACT_CACHE_TIMEOUT")

PUBLIC_SUFFIX_LIST_URLS = (
Expand Down Expand Up @@ -131,7 +130,7 @@ class TLDExtract:
# TODO: Agreed with Pylint: too-many-arguments
def __init__( # pylint: disable=too-many-arguments
self,
cache_dir=CACHE_DIR,
cache_dir=get_cache_dir(),
suffix_list_urls=PUBLIC_SUFFIX_LIST_URLS,
fallback_to_snapshot=True,
include_psl_private_domains=False,
Expand Down