Skip to content

Commit

Permalink
Prototype SQIDs in line with ADR 0018
Browse files Browse the repository at this point in the history
  • Loading branch information
dragon-dxw committed Apr 12, 2024
1 parent 5b4272d commit e8a5b94
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 0 deletions.
22 changes: 22 additions & 0 deletions ds-caselaw-ingester/content_sqid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from sqids import Sqids

# Number of leading hex characters of a digest that are converted to an integer.
# HASH_SUBSTRING_LENGTH must be strictly less than 16: with 16 characters, any
# digest whose first hex digit is 8 or higher yields a number too large to be
# turned into a sqid. (Under the hood, sqids are numbers less than the hex value
# 8000 0000 0000 0000 and do not exist for numbers higher than that.)
# 12 hex characters keep the value below 16**12, i.e. within 48 bits.
HASH_SUBSTRING_LENGTH = 12

# SQID_ALPHABET contains no vowels, including y (20 lowercase consonants).
SQID_ALPHABET = "bcdfghjklmnpqrstvwxz"
# Passed to Sqids as min_length, the minimum length of a generated sqid.
SQID_MIN_LENGTH = 8

# Shared encoder instance, configured once at import time and used by
# hex_digest_to_sqid.
sqids = Sqids(alphabet=SQID_ALPHABET, min_length=SQID_MIN_LENGTH)


def _hex_digest_to_int(digest_string: str) -> int:
    """Convert the leading part of a hex digest into an integer.

    The digest is UTF-8 encoded and only its first HASH_SUBSTRING_LENGTH
    bytes are parsed as a base-16 number; anything after that is ignored.

    Raises:
        ValueError: if the retained prefix is not a valid base-16 literal
            (for example, an empty string).
    """
    encoded_digest = digest_string.encode("utf-8")
    # The limit is read at call time so that changes to the module-level
    # constant take effect immediately.
    leading_bytes = encoded_digest[:HASH_SUBSTRING_LENGTH]
    return int(leading_bytes, 16)


def hex_digest_to_sqid(digest_string: str) -> str:
    """Return the sqid for a hex digest.

    The digest is first reduced to an integer by _hex_digest_to_int, then
    encoded as a single-number sqid by the module-level ``sqids`` encoder.
    """
    return sqids.encode([_hex_digest_to_int(digest_string)])
45 changes: 45 additions & 0 deletions ds-caselaw-ingester/test_sqid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import content_sqid
import pytest
from content_sqid import _hex_digest_to_int, hex_digest_to_sqid


@pytest.fixture()
def no_hash_limit():
    """Temporarily lift the limit on how much of the content hash is consumed."""
    saved_length = content_sqid.HASH_SUBSTRING_LENGTH
    # 999 is far longer than any digest used in these tests, so nothing is cut.
    content_sqid.HASH_SUBSTRING_LENGTH = 999
    yield
    # Teardown: put the original limit back for subsequent tests.
    content_sqid.HASH_SUBSTRING_LENGTH = saved_length


def test_hex_to_int():
    """
    Pin the integer and sqid produced for a known digest.

    These values shouldn't change -- if they do, it means our hashes aren't stable.
    Changing the alphabet will change them.
    """
    digest = "deadbeef"
    as_int = _hex_digest_to_int(digest)
    as_sqid = hex_digest_to_sqid(digest)
    assert as_int == 3735928559
    assert as_sqid == "hdgcqtcnm"


def test_min_length():
    """Low-value hashes still produce a sqid of acceptable length."""
    smallest_sqid = hex_digest_to_sqid("0")
    assert smallest_sqid == "xcsrdnmp"


def test_max_value():
    """An all-f digest should give the largest value we can ever get."""
    all_f_digest = "ffffffffffffffffffffffffffffff"
    largest_sqid = hex_digest_to_sqid(all_f_digest)
    assert largest_sqid == "tspwbpshvpklr"


def test_hex_truncation():
    """A full-length digest converts to the same integer as its truncated prefix."""
    full_digest = "2597c39e63c20d69dc0cb189a88a8ab127c335cdcbf1d9ee43de3f711002de52"
    truncated_digest = "2597c39e63c2"
    from_full = _hex_digest_to_int(full_digest)
    from_truncated = _hex_digest_to_int(truncated_digest)
    assert from_full == from_truncated


def test_demo_limit_of_truncation(no_hash_limit):
    """Demonstrate that without a limit to the length of a hash, a 16-character hash can fail.

    The ``no_hash_limit`` fixture disables truncation, so all 16 hex
    characters are converted to an integer before encoding.
    """
    largest_encodable = "7fffffffffffffff"
    smallest_unencodable = "8000000000000000"

    # One below the boundary still encodes to a non-empty sqid.
    assert hex_digest_to_sqid(largest_encodable)

    # At the boundary the call is expected to raise; no assert is needed
    # around it because pytest.raises already fails the test if nothing raises.
    with pytest.raises(ValueError):
        hex_digest_to_sqid(smallest_unencodable)
1 change: 1 addition & 0 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ notifications-python-client~=9.0
mypy-boto3-s3
mypy-boto3-sns
python-dotenv
sqids

0 comments on commit e8a5b94

Please sign in to comment.