Skip to content

Commit

Permalink
update test params to account for new minhash algo (#442)
Browse files Browse the repository at this point in the history
Signed-off-by: Ayush Dattagupta <[email protected]>
  • Loading branch information
ayushdg authored Dec 20, 2024
1 parent f73c1b8 commit c929203
Showing 1 changed file with 2 additions and 12 deletions.
14 changes: 2 additions & 12 deletions tests/test_fuzzy_dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ def gpu_client(self, request):
[
(5, 0.5, [[4, -1]]),
(10, 0.39, [[4, -1], [1, 2]]),
- (3, 0.3, [[4, -1], [1, 2, 300]]),
+ (15, 0.3, [[4, -1], [1, 2, 300]]),
],
)
def test_fuzzy_dedup(
Expand All @@ -329,11 +329,6 @@ def test_fuzzy_dedup(
duplicate_docs,
tmpdir,
):
- if not use_64_bit_hash and jaccard_threshold == 0.3:
- pytest.xfail(
- "TODO: RAPIDS 24.12 fails with parameters 3-0.3-duplicate_docs2-False"
- )
-
print(self.client)
# Dedup might fail when indices per partition do not start from 0
fuzzy_dedup_data.df = fuzzy_dedup_data.df.reset_index(drop=True)
Expand Down Expand Up @@ -477,17 +472,12 @@ def test_num_anchors(self, large_fuzzy_dedup_data, num_anchors, tmpdir):
# Duplicated docs estimated from true_jaccard values
[
(10, [[4, -1], [1, 2, 300]]),
- (3, [[4, -1], [1, 2, 300]]),
+ (5, [[4, -1], [1, 2, 300]]),
],
)
def test_no_fp_check(
self, fuzzy_dedup_data, use_64_bit_hash, num_buckets, duplicate_docs, tmpdir
):
- if not use_64_bit_hash and num_buckets == 3:
- pytest.xfail(
- "TODO: RAPIDS 24.12 fails with parameters 3-duplicate_docs1-False"
- )
-
config = FuzzyDuplicatesConfig(
cache_dir=tmpdir,
id_field="id",
Expand Down

0 comments on commit c929203

Please sign in to comment.