diff --git a/README.md b/README.md index 7a1f06e9..10506ede 100644 --- a/README.md +++ b/README.md @@ -59,14 +59,14 @@ class TestClass { TestClass obj = new TestClass(); // create an instance of some test class -Hasher64 hasher = Hashing.wyhashFinal4(); // create a hasher instance +Hasher64 hasher = Hashing.komihash5_0(); // create a hasher instance // variant 1: hash object by passing data into a hash stream -long hash1 = hasher.hashStream().putInt(obj.a).putLong(obj.b).putString(obj.c).getAsLong(); // gives 0x89a90f343c3d4862L +long hash1 = hasher.hashStream().putInt(obj.a).putLong(obj.b).putString(obj.c).getAsLong(); // gives 0x90553fd9c675dfb2L // variant 2: hash object by defining a funnel HashFunnel funnel = (o, sink) -> sink.putInt(o.a).putLong(o.b).putString(o.c); -long hash2 = hasher.hashToLong(obj, funnel); // gives 0x89a90f343c3d4862L +long hash2 = hasher.hashToLong(obj, funnel); // gives 0x90553fd9c675dfb2L ``` More examples can be found in [HashingDemo.java](src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java). @@ -152,7 +152,7 @@ Both algorithms share the following properties: ### Usage ```java -Hasher64 hasher = Hashing.wyhashFinal4(); // create a hasher instance +Hasher64 hasher = Hashing.komihash5_0(); // create a hasher instance UltraLogLog sketch = UltraLogLog.create(12); // corresponds to a standard error of 1.2% and requires 4kB diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index 943f0cbf..7f93135c 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index c30b486a..3fa8f862 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.3-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.4-bin.zip networkTimeout=10000 +validateDistributionUrl=true 
zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index 65dcd68d..0adc8e1a 100755 --- a/gradlew +++ b/gradlew @@ -83,10 +83,8 @@ done # This is normally unused # shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,10 +131,13 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. @@ -144,7 +145,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac @@ -152,7 +153,7 @@ if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then '' | soft) :;; #( *) # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. - # shellcheck disable=SC3045 + # shellcheck disable=SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -197,6 +198,10 @@ if "$cygwin" || "$msys" ; then done fi + +# Add default JVM options here. 
You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + # Collect all arguments for the java command; # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of # shell script including quotes and variable substitutions, so put them in diff --git a/src/test/java/com/dynatrace/hash4j/distinctcount/ConversionDemo.java b/src/test/java/com/dynatrace/hash4j/distinctcount/ConversionDemo.java index 010f7c66..3a078e62 100644 --- a/src/test/java/com/dynatrace/hash4j/distinctcount/ConversionDemo.java +++ b/src/test/java/com/dynatrace/hash4j/distinctcount/ConversionDemo.java @@ -26,7 +26,7 @@ class ConversionDemo { @Test void demoUltraLogLogToHyperLogLogConversion() { - Hasher64 hasher = Hashing.wyhashFinal3(); + Hasher64 hasher = Hashing.komihash5_0(); HyperLogLog hllSketch = HyperLogLog.create(12); UltraLogLog ullSketch = UltraLogLog.create(12); diff --git a/src/test/java/com/dynatrace/hash4j/distinctcount/HyperLogLogDemo.java b/src/test/java/com/dynatrace/hash4j/distinctcount/HyperLogLogDemo.java index a9143027..89b8ef8f 100644 --- a/src/test/java/com/dynatrace/hash4j/distinctcount/HyperLogLogDemo.java +++ b/src/test/java/com/dynatrace/hash4j/distinctcount/HyperLogLogDemo.java @@ -27,7 +27,7 @@ class HyperLogLogDemo { @Test void demoBasicUsage() { - Hasher64 hasher = Hashing.wyhashFinal3(); + Hasher64 hasher = Hashing.komihash5_0(); HyperLogLog sketch = HyperLogLog.create(12); @@ -43,7 +43,7 @@ void demoBasicUsage() { @Test void demoMerging() { - Hasher64 hasher = Hashing.wyhashFinal3(); + Hasher64 hasher = Hashing.komihash5_0(); HyperLogLog sketch1 = HyperLogLog.create(12) @@ -61,7 +61,7 @@ void demoMerging() { @Test void demoMartingaleEstimation() { - Hasher64 hasher = Hashing.wyhashFinal3(); + Hasher64 hasher = Hashing.komihash5_0(); HyperLogLog sketch = HyperLogLog.create(12); MartingaleEstimator martingaleEstimator = new MartingaleEstimator(); diff --git 
a/src/test/java/com/dynatrace/hash4j/distinctcount/UltraLogLogDemo.java b/src/test/java/com/dynatrace/hash4j/distinctcount/UltraLogLogDemo.java index 07cb2e26..20e56fb0 100644 --- a/src/test/java/com/dynatrace/hash4j/distinctcount/UltraLogLogDemo.java +++ b/src/test/java/com/dynatrace/hash4j/distinctcount/UltraLogLogDemo.java @@ -27,7 +27,7 @@ class UltraLogLogDemo { @Test void demoBasicUsage() { - Hasher64 hasher = Hashing.wyhashFinal3(); + Hasher64 hasher = Hashing.komihash5_0(); UltraLogLog sketch = UltraLogLog.create(12); @@ -43,7 +43,7 @@ void demoBasicUsage() { @Test void demoMerging() { - Hasher64 hasher = Hashing.wyhashFinal4(); + Hasher64 hasher = Hashing.komihash5_0(); UltraLogLog sketch1 = UltraLogLog.create(12) @@ -61,7 +61,7 @@ void demoMerging() { @Test void demoMartingaleEstimation() { - Hasher64 hasher = Hashing.wyhashFinal4(); + Hasher64 hasher = Hashing.komihash5_0(); UltraLogLog sketch = UltraLogLog.create(12); MartingaleEstimator martingaleEstimator = new MartingaleEstimator(); diff --git a/src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java b/src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java index 2259d642..7ce33c1d 100644 --- a/src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java +++ b/src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java @@ -34,7 +34,7 @@ class TestClass { TestClass obj = new TestClass(); // create a hasher instance - Hasher64 hasher = Hashing.wyhashFinal4(); + Hasher64 hasher = Hashing.komihash5_0(); // variant 1: hash object by passing data into a hash stream long hash1 = hasher.hashStream().putInt(obj.a).putLong(obj.b).putString(obj.c).getAsLong(); @@ -44,7 +44,7 @@ class TestClass { long hash2 = hasher.hashToLong(obj, funnel); // both variants lead to same hash value - assertThat(hash1).isEqualTo(hash2).isEqualTo(0x89a90f343c3d4862L); + assertThat(hash1).isEqualTo(hash2).isEqualTo(0x90553fd9c675dfb2L); } // Some class with two string fields. 
@@ -221,7 +221,7 @@ void demoHashDataBase() { void demoHashStringsWithPotentialCollision() { // create a hasher instance - Hasher64 hasher = Komihash4_3.create(); + Hasher64 hasher = Komihash5_0.create(); // hash multiple variable length fields together long hash1 = hasher.hashStream().putString("ANDRE").putString("WRIGHT").getAsLong(); @@ -236,7 +236,7 @@ void demoHashStringsWithPotentialCollision() { void demoHashListOfStrings() { // create a hasher instance - Hasher64 hasher = Komihash4_3.create(); + Hasher64 hasher = Komihash5_0.create(); // three ways to compute a hash value of the character sequence "A", "B", "C", // by grouping them differently in strings and lists @@ -267,7 +267,7 @@ void demoHashListOfStrings() { void demoHashMultiSetOfStrings() { // create a hasher instance - Hasher64 hasher = Komihash4_3.create(); + Hasher64 hasher = Komihash5_0.create(); long hash1 = hasher diff --git a/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimHashPolicyTest.java b/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimHashPolicyTest.java index e48be92b..10fc2c40 100644 --- a/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimHashPolicyTest.java +++ b/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimHashPolicyTest.java @@ -15,14 +15,11 @@ */ package com.dynatrace.hash4j.similarity; -abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest { +import org.hipparchus.distribution.discrete.BinomialDistribution; - protected static double calculateComponentCollisionProbability(double cosineSimilarity) { - return Math.min(1., Math.max(0.5, Math.acos(-cosineSimilarity) / Math.PI)); - } +abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest { - @Override - protected double calculateExpectedMatchProbability( + protected static double calculateComponentCollisionProbabilityApproximately( long intersectionSize, long difference1Size, long difference2Size) { double expectedCosineSimilarity = @@ 
-31,7 +28,59 @@ protected double calculateExpectedMatchProbability( (intersectionSize + difference1Size) * (double) (intersectionSize + difference2Size)); - return calculateComponentCollisionProbability(expectedCosineSimilarity); + return Math.min(1., Math.max(0.5, Math.acos(-expectedCosineSimilarity) / Math.PI)); + } + + private static double calculateComponentCollisionProbabilityExactly( + long intersectionSize, long difference1Size, long difference2Size) { + BinomialDistribution intersectionDistribution = + new BinomialDistribution(Math.toIntExact(intersectionSize), 0.5); + BinomialDistribution difference1Distribution = + new BinomialDistribution(Math.toIntExact(difference1Size), 0.5); + BinomialDistribution difference2Distribution = + new BinomialDistribution(Math.toIntExact(difference2Size), 0.5); + + double sum = 0; + for (long countIntersection = 0; countIntersection <= intersectionSize; ++countIntersection) { + double probabilityIntersection = + intersectionDistribution.probability(Math.toIntExact(countIntersection)); + for (long countDifference1 = 0; countDifference1 <= difference1Size; ++countDifference1) { + double probabilityDifference1 = + difference1Distribution.probability(Math.toIntExact(countDifference1)); + for (long countDifference2 = 0; countDifference2 <= difference2Size; ++countDifference2) { + double probabilityDifference2 = + difference2Distribution.probability(Math.toIntExact(countDifference2)); + + double probability = + probabilityIntersection * probabilityDifference1 * probabilityDifference2; + + long setSize1 = difference1Size + intersectionSize; + long setSize2 = difference2Size + intersectionSize; + long count1 = countDifference1 + countIntersection; + long count2 = countDifference2 + countIntersection; + + if (count1 * 2 == setSize1 || count2 * 2 == setSize2) { + sum += 0.5 * probability; + } else if ((count1 * 2 > setSize1) == (count2 * 2 > setSize2)) { + sum += probability; + } + } + } + } + return Math.min(1., sum); + } + + 
@Override + protected double calculateExpectedMatchProbability( + long intersectionSize, long difference1Size, long difference2Size) { + + if (intersectionSize <= 20 && difference1Size <= 20 && difference2Size <= 20) { + return calculateComponentCollisionProbabilityExactly( + intersectionSize, difference1Size, difference2Size); + } else { + return calculateComponentCollisionProbabilityApproximately( + intersectionSize, difference1Size, difference2Size); + } } @Override diff --git a/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimilarityHasherPolicyTest.java b/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimilarityHasherPolicyTest.java index b24906e6..0419c7a3 100644 --- a/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimilarityHasherPolicyTest.java +++ b/src/test/java/com/dynatrace/hash4j/similarity/AbstractSimilarityHasherPolicyTest.java @@ -1,5 +1,5 @@ /* - * Copyright 2022 Dynatrace LLC + * Copyright 2022-2023 Dynatrace LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,9 +62,9 @@ private void testCase(Collection data1, Collection data2) { long difference2Size = (long) set2.size() - intersectionSize; long[] elementHashesSet1 = - set1.stream().mapToLong(Hashing.wyhashFinal3()::hashCharsToLong).toArray(); + set1.stream().mapToLong(Hashing.komihash5_0()::hashCharsToLong).toArray(); long[] elementHashesSet2 = - set2.stream().mapToLong(Hashing.wyhashFinal3()::hashCharsToLong).toArray(); + set2.stream().mapToLong(Hashing.komihash5_0()::hashCharsToLong).toArray(); SimilarityHasher hasher = policy.createHasher();