Skip to content

Commit

Permalink
improved SimHash collision probability computation in unit test
Browse files Browse the repository at this point in the history
  • Loading branch information
oertl committed Nov 27, 2023
1 parent 176ce13 commit dc34ee2
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,11 @@
*/
package com.dynatrace.hash4j.similarity;

abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest {
import org.hipparchus.distribution.discrete.BinomialDistribution;

/**
 * Converts a cosine similarity into a per-component collision probability.
 *
 * <p>The value is {@code acos(-cosineSimilarity) / PI}, restricted to the interval {@code [0.5,
 * 1]}. A {@code NaN} input (or one outside {@code [-1, 1]}, for which {@code acos} yields {@code
 * NaN}) propagates as {@code NaN}.
 *
 * @param cosineSimilarity the cosine similarity of the two compared inputs
 * @return the collision probability, never below 0.5 and never above 1 for valid input
 */
protected static double calculateComponentCollisionProbability(double cosineSimilarity) {
  final double angleFraction = Math.acos(-cosineSimilarity) / Math.PI;
  final double boundedFromBelow = Math.max(0.5, angleFraction);
  return Math.min(1., boundedFromBelow);
}
abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest {

@Override
protected double calculateExpectedMatchProbability(
protected static double calculateComponentCollisionProbabilityApproximately(
long intersectionSize, long difference1Size, long difference2Size) {

double expectedCosineSimilarity =
Expand All @@ -31,7 +28,59 @@ protected double calculateExpectedMatchProbability(
(intersectionSize + difference1Size)
* (double) (intersectionSize + difference2Size));

return calculateComponentCollisionProbability(expectedCosineSimilarity);
return Math.min(1., Math.max(0.5, Math.acos(-expectedCosineSimilarity) / Math.PI));
}

/**
 * Computes the collision probability of a single component by exhaustive enumeration.
 *
 * <p>Three independent counts are modelled, one for the intersection and one for each of the two
 * set differences, each following a binomial distribution with success probability 0.5 and a
 * number of trials equal to the respective size. For every combination of counts, the joint
 * probability is added to the result when both sets fall on the same side of their half-size
 * threshold, and added with weight 0.5 when at least one set sits exactly on its threshold.
 *
 * <p>The cost grows with the product of the three sizes, so this is only practical for small
 * inputs.
 *
 * @param intersectionSize number of elements common to both sets
 * @param difference1Size number of elements only in the first set
 * @param difference2Size number of elements only in the second set
 * @return the summed probability, capped at 1
 * @throws ArithmeticException if any size does not fit into an {@code int}
 */
private static double calculateComponentCollisionProbabilityExactly(
    long intersectionSize, long difference1Size, long difference2Size) {
  final BinomialDistribution sharedDistribution =
      new BinomialDistribution(Math.toIntExact(intersectionSize), 0.5);
  final BinomialDistribution only1Distribution =
      new BinomialDistribution(Math.toIntExact(difference1Size), 0.5);
  final BinomialDistribution only2Distribution =
      new BinomialDistribution(Math.toIntExact(difference2Size), 0.5);

  // Total sizes of both sets; they do not depend on the enumerated counts.
  final long total1 = difference1Size + intersectionSize;
  final long total2 = difference2Size + intersectionSize;

  double accumulated = 0;
  for (long shared = 0; shared <= intersectionSize; ++shared) {
    final double pShared = sharedDistribution.probability(Math.toIntExact(shared));

    for (long only1 = 0; only1 <= difference1Size; ++only1) {
      final double pSharedAndOnly1 =
          pShared * only1Distribution.probability(Math.toIntExact(only1));
      // Twice the count for set 1, compared against the set size to avoid fractions.
      final long doubled1 = (only1 + shared) * 2;

      for (long only2 = 0; only2 <= difference2Size; ++only2) {
        final double weight =
            pSharedAndOnly1 * only2Distribution.probability(Math.toIntExact(only2));
        final long doubled2 = (only2 + shared) * 2;

        final boolean tie = doubled1 == total1 || doubled2 == total2;
        if (tie) {
          // At least one set is exactly at its threshold: counted as a coin flip.
          accumulated += 0.5 * weight;
        } else if ((doubled1 > total1) == (doubled2 > total2)) {
          // Both sets are strictly on the same side of their thresholds.
          accumulated += weight;
        }
      }
    }
  }
  // Guard against the sum exceeding 1 through floating-point rounding.
  return Math.min(1., accumulated);
}

/**
 * Returns the expected probability that a single component matches for two sets described by
 * the sizes of their intersection and differences.
 *
 * <p>When all three sizes are at most 20 the exact enumeration is used; as soon as any size
 * exceeds 20 the approximate computation is used instead.
 *
 * @param intersectionSize number of elements common to both sets
 * @param difference1Size number of elements only in the first set
 * @param difference2Size number of elements only in the second set
 * @return the expected match probability
 */
@Override
protected double calculateExpectedMatchProbability(
    long intersectionSize, long difference1Size, long difference2Size) {

  final boolean tooLargeForExactSum =
      intersectionSize > 20 || difference1Size > 20 || difference2Size > 20;

  if (tooLargeForExactSum) {
    return calculateComponentCollisionProbabilityApproximately(
        intersectionSize, difference1Size, difference2Size);
  }
  return calculateComponentCollisionProbabilityExactly(
      intersectionSize, difference1Size, difference2Size);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2022 Dynatrace LLC
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -62,9 +62,9 @@ private void testCase(Collection<String> data1, Collection<String> data2) {
long difference2Size = (long) set2.size() - intersectionSize;

long[] elementHashesSet1 =
set1.stream().mapToLong(Hashing.wyhashFinal3()::hashCharsToLong).toArray();
set1.stream().mapToLong(Hashing.komihash5_0()::hashCharsToLong).toArray();
long[] elementHashesSet2 =
set2.stream().mapToLong(Hashing.wyhashFinal3()::hashCharsToLong).toArray();
set2.stream().mapToLong(Hashing.komihash5_0()::hashCharsToLong).toArray();

SimilarityHasher hasher = policy.createHasher();

Expand Down

0 comments on commit dc34ee2

Please sign in to comment.