From 3fcbcfe0c249bcc99af8e4c3e9878531c5ebedfb Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Mon, 11 Mar 2024 11:29:57 +0100 Subject: [PATCH] add sorting in hashing for distinct and factor it away (#253) --- astrapy/cursors.py | 17 +++++++++++------ tests/idiomatic/integration/test_dml_sync.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/astrapy/cursors.py b/astrapy/cursors.py index 82ee31e0..ccdb6b54 100644 --- a/astrapy/cursors.py +++ b/astrapy/cursors.py @@ -126,6 +126,15 @@ def _reduce_distinct_key_to_safe(distinct_key: str) -> str: return distinct_key.split(".")[0] +def _hash_document(document: Dict[str, Any]) -> str: + _normalized_item = _normalize_payload_value(path=[], value=document) + _normalized_json = json.dumps( + _normalized_item, sort_keys=True, separators=(",", ":") + ) + _item_hash = hashlib.md5(_normalized_json.encode()).hexdigest() + return _item_hash + + class BaseCursor: """ Represents a generic Cursor over query results, regardless of whether @@ -476,9 +485,7 @@ def distinct(self, key: str) -> List[Any]: d_cursor = self._copy(projection={_key: True}, started=False) for document in d_cursor: for item in _extractor(document): - _normalized_item = _normalize_payload_value(path=[], value=item) - _normalized_json = json.dumps(_normalized_item, separators=(",", ":")) - _item_hash = hashlib.md5(_normalized_json.encode()).hexdigest() + _item_hash = _hash_document(item) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) @@ -645,9 +652,7 @@ async def distinct(self, key: str) -> List[Any]: d_cursor = self._copy(projection={_key: True}, started=False) async for document in d_cursor: for item in _extractor(document): - _normalized_item = _normalize_payload_value(path=[], value=item) - _normalized_json = json.dumps(_normalized_item, separators=(",", ":")) - _item_hash = hashlib.md5(_normalized_json.encode()).hexdigest() + _item_hash = _hash_document(item) if _item_hash not in _item_hashes: _item_hashes.add(_item_hash) distinct_items.append(item) diff --git a/tests/idiomatic/integration/test_dml_sync.py b/tests/idiomatic/integration/test_dml_sync.py index 8715eabe..1741403f 100644 --- a/tests/idiomatic/integration/test_dml_sync.py +++ b/tests/idiomatic/integration/test_dml_sync.py @@ -405,7 +405,7 @@ def test_collection_distinct_nonhashable_sync( {"f": datetime.datetime(2000, 1, 1, 12, 00, 00)}, {"f": None}, ] - col.insert_many(documents) + col.insert_many(documents * 2) d_items = col.distinct("f") expected = [