Skip to content

Commit

Permalink
Merge pull request #182 from dynatrace-oss/refactorings
Browse files Browse the repository at this point in the history
Refactorings
  • Loading branch information
oertl authored Nov 27, 2023
2 parents b3e8d1b + dc34ee2 commit 383a64f
Show file tree
Hide file tree
Showing 10 changed files with 88 additions and 33 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ class TestClass {

TestClass obj = new TestClass(); // create an instance of some test class

Hasher64 hasher = Hashing.wyhashFinal4(); // create a hasher instance
Hasher64 hasher = Hashing.komihash5_0(); // create a hasher instance

// variant 1: hash object by passing data into a hash stream
long hash1 = hasher.hashStream().putInt(obj.a).putLong(obj.b).putString(obj.c).getAsLong(); // gives 0x90553fd9c675dfb2L

// variant 2: hash object by defining a funnel
HashFunnel<TestClass> funnel = (o, sink) -> sink.putInt(o.a).putLong(o.b).putString(o.c);
long hash2 = hasher.hashToLong(obj, funnel); // gives 0x89a90f343c3d4862L
long hash2 = hasher.hashToLong(obj, funnel); // gives 0x90553fd9c675dfb2L
```
More examples can be found in [HashingDemo.java](src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java).

Expand Down Expand Up @@ -152,7 +152,7 @@ Both algorithms share the following properties:

### Usage
```java
Hasher64 hasher = Hashing.wyhashFinal4(); // create a hasher instance
Hasher64 hasher = Hashing.komihash5_0(); // create a hasher instance

UltraLogLog sketch = UltraLogLog.create(12); // corresponds to a standard error of 1.2% and requires 4kB

Expand Down
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
3 changes: 2 additions & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.3-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.4-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
19 changes: 12 additions & 7 deletions gradlew
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,8 @@ done
# This is normally unused
# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
Expand Down Expand Up @@ -133,26 +131,29 @@ location of your Java installation."
fi
else
JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
if ! command -v java >/dev/null 2>&1
then
die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
fi

# Increase the maximum file descriptors if we can.
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
# In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
# shellcheck disable=SC3045
# shellcheck disable=SC3045
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
# In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
# shellcheck disable=SC3045
# shellcheck disable=SC3045
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
Expand Down Expand Up @@ -197,6 +198,10 @@ if "$cygwin" || "$msys" ; then
done
fi


# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'

# Collect all arguments for the java command;
# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
# shell script including quotes and variable substitutions, so put them in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ConversionDemo {
@Test
void demoUltraLogLogToHyperLogLogConversion() {

Hasher64 hasher = Hashing.wyhashFinal3();
Hasher64 hasher = Hashing.komihash5_0();

HyperLogLog hllSketch = HyperLogLog.create(12);
UltraLogLog ullSketch = UltraLogLog.create(12);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class HyperLogLogDemo {
@Test
void demoBasicUsage() {

Hasher64 hasher = Hashing.wyhashFinal3();
Hasher64 hasher = Hashing.komihash5_0();

HyperLogLog sketch = HyperLogLog.create(12);

Expand All @@ -43,7 +43,7 @@ void demoBasicUsage() {
@Test
void demoMerging() {

Hasher64 hasher = Hashing.wyhashFinal3();
Hasher64 hasher = Hashing.komihash5_0();

HyperLogLog sketch1 =
HyperLogLog.create(12)
Expand All @@ -61,7 +61,7 @@ void demoMerging() {
@Test
void demoMartingaleEstimation() {

Hasher64 hasher = Hashing.wyhashFinal3();
Hasher64 hasher = Hashing.komihash5_0();

HyperLogLog sketch = HyperLogLog.create(12);
MartingaleEstimator martingaleEstimator = new MartingaleEstimator();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class UltraLogLogDemo {
@Test
void demoBasicUsage() {

Hasher64 hasher = Hashing.wyhashFinal3();
Hasher64 hasher = Hashing.komihash5_0();

UltraLogLog sketch = UltraLogLog.create(12);

Expand All @@ -43,7 +43,7 @@ void demoBasicUsage() {
@Test
void demoMerging() {

Hasher64 hasher = Hashing.wyhashFinal4();
Hasher64 hasher = Hashing.komihash5_0();

UltraLogLog sketch1 =
UltraLogLog.create(12)
Expand All @@ -61,7 +61,7 @@ void demoMerging() {
@Test
void demoMartingaleEstimation() {

Hasher64 hasher = Hashing.wyhashFinal4();
Hasher64 hasher = Hashing.komihash5_0();

UltraLogLog sketch = UltraLogLog.create(12);
MartingaleEstimator martingaleEstimator = new MartingaleEstimator();
Expand Down
10 changes: 5 additions & 5 deletions src/test/java/com/dynatrace/hash4j/hashing/HashingDemo.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class TestClass {
TestClass obj = new TestClass();

// create a hasher instance
Hasher64 hasher = Hashing.wyhashFinal4();
Hasher64 hasher = Hashing.komihash5_0();

// variant 1: hash object by passing data into a hash stream
long hash1 = hasher.hashStream().putInt(obj.a).putLong(obj.b).putString(obj.c).getAsLong();
Expand All @@ -44,7 +44,7 @@ class TestClass {
long hash2 = hasher.hashToLong(obj, funnel);

// both variants lead to same hash value
assertThat(hash1).isEqualTo(hash2).isEqualTo(0x89a90f343c3d4862L);
assertThat(hash1).isEqualTo(hash2).isEqualTo(0x90553fd9c675dfb2L);
}

// Some class with two string fields.
Expand Down Expand Up @@ -221,7 +221,7 @@ void demoHashDataBase() {
void demoHashStringsWithPotentialCollision() {

// create a hasher instance
Hasher64 hasher = Komihash4_3.create();
Hasher64 hasher = Komihash5_0.create();

// hash multiple variable length fields together
long hash1 = hasher.hashStream().putString("ANDRE").putString("WRIGHT").getAsLong();
Expand All @@ -236,7 +236,7 @@ void demoHashStringsWithPotentialCollision() {
void demoHashListOfStrings() {

// create a hasher instance
Hasher64 hasher = Komihash4_3.create();
Hasher64 hasher = Komihash5_0.create();

// three ways to compute a hash value of the character sequence "A", "B", "C",
// by grouping them differently in strings and lists
Expand Down Expand Up @@ -267,7 +267,7 @@ void demoHashListOfStrings() {
void demoHashMultiSetOfStrings() {

// create a hasher instance
Hasher64 hasher = Komihash4_3.create();
Hasher64 hasher = Komihash5_0.create();

long hash1 =
hasher
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,11 @@
*/
package com.dynatrace.hash4j.similarity;

abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest {
import org.hipparchus.distribution.discrete.BinomialDistribution;

protected static double calculateComponentCollisionProbability(double cosineSimilarity) {
return Math.min(1., Math.max(0.5, Math.acos(-cosineSimilarity) / Math.PI));
}
abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest {

@Override
protected double calculateExpectedMatchProbability(
protected static double calculateComponentCollisionProbabilityApproximately(
long intersectionSize, long difference1Size, long difference2Size) {

double expectedCosineSimilarity =
Expand All @@ -31,7 +28,59 @@ protected double calculateExpectedMatchProbability(
(intersectionSize + difference1Size)
* (double) (intersectionSize + difference2Size));

return calculateComponentCollisionProbability(expectedCosineSimilarity);
return Math.min(1., Math.max(0.5, Math.acos(-expectedCosineSimilarity) / Math.PI));
}

/**
 * Computes the component collision probability by exhaustive enumeration.
 *
 * <p>Three independent counts are enumerated, each following a binomial distribution with
 * success probability 0.5 and with {@code intersectionSize}, {@code difference1Size} and {@code
 * difference2Size} trials, respectively. For every combination, the count of set 1 is the sum of
 * the intersection count and the first difference count, and likewise for set 2. The joint
 * probability of a combination contributes
 *
 * <ul>
 *   <li>half of its weight if the doubled count of at least one set equals the size of that set
 *       (a tie),
 *   <li>its full weight if both sets lie on the same side of half their size,
 *   <li>nothing otherwise.
 * </ul>
 *
 * <p>NOTE(review): presumably a count models how many elements of a set vote for a given SimHash
 * component bit; confirm against the hasher implementation.
 *
 * @param intersectionSize number of elements contained in both sets
 * @param difference1Size number of elements contained only in the first set
 * @param difference2Size number of elements contained only in the second set
 * @return the accumulated probability, capped at 1
 * @throws ArithmeticException if one of the sizes does not fit into an {@code int}
 */
private static double calculateComponentCollisionProbabilityExactly(
    long intersectionSize, long difference1Size, long difference2Size) {

  // Conversion and construction stay interleaved so that failures surface in the same order
  // as the arguments are declared.
  final BinomialDistribution sharedDistribution =
      new BinomialDistribution(Math.toIntExact(intersectionSize), 0.5);
  final BinomialDistribution onlyFirstDistribution =
      new BinomialDistribution(Math.toIntExact(difference1Size), 0.5);
  final BinomialDistribution onlySecondDistribution =
      new BinomialDistribution(Math.toIntExact(difference2Size), 0.5);

  // The set sizes do not depend on the enumerated counts.
  final long firstSetSize = difference1Size + intersectionSize;
  final long secondSetSize = difference2Size + intersectionSize;

  double accumulated = 0;
  for (long shared = 0; shared <= intersectionSize; shared++) {
    // Loop counters never exceed the sizes validated above, so the narrowing casts are lossless.
    final double sharedProbability = sharedDistribution.probability((int) shared);

    for (long onlyFirst = 0; onlyFirst <= difference1Size; onlyFirst++) {
      final double sharedAndFirstProbability =
          sharedProbability * onlyFirstDistribution.probability((int) onlyFirst);

      // The state of set 1 is fixed for the whole innermost loop.
      final long doubledFirstCount = (onlyFirst + shared) * 2;
      final boolean firstTied = doubledFirstCount == firstSetSize;
      final boolean firstAboveHalf = doubledFirstCount > firstSetSize;

      for (long onlySecond = 0; onlySecond <= difference2Size; onlySecond++) {
        final double jointProbability =
            sharedAndFirstProbability * onlySecondDistribution.probability((int) onlySecond);
        final long doubledSecondCount = (onlySecond + shared) * 2;

        if (firstTied || doubledSecondCount == secondSetSize) {
          // A tie in either set: only half of the weight is counted.
          accumulated += 0.5 * jointProbability;
          continue;
        }
        if (firstAboveHalf == (doubledSecondCount > secondSetSize)) {
          // Both sets are on the same side of half their size.
          accumulated += jointProbability;
        }
      }
    }
  }
  // Cap the sum at 1.
  return Math.min(accumulated, 1.);
}

/**
 * Returns the expected match probability for two sets described by the size of their
 * intersection and the sizes of their two differences.
 *
 * <p>If none of the three sizes exceeds 20, the value is obtained from {@link
 * #calculateComponentCollisionProbabilityExactly}; otherwise {@link
 * #calculateComponentCollisionProbabilityApproximately} is used.
 *
 * @param intersectionSize number of elements contained in both sets
 * @param difference1Size number of elements contained only in the first set
 * @param difference2Size number of elements contained only in the second set
 * @return the expected match probability
 */
@Override
protected double calculateExpectedMatchProbability(
    long intersectionSize, long difference1Size, long difference2Size) {

  // Largest size for which the exact enumeration is still used.
  final long exactLimit = 20;

  final boolean anySizeAboveLimit =
      intersectionSize > exactLimit
          || difference1Size > exactLimit
          || difference2Size > exactLimit;

  if (anySizeAboveLimit) {
    return calculateComponentCollisionProbabilityApproximately(
        intersectionSize, difference1Size, difference2Size);
  }
  return calculateComponentCollisionProbabilityExactly(
      intersectionSize, difference1Size, difference2Size);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2022 Dynatrace LLC
* Copyright 2022-2023 Dynatrace LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -62,9 +62,9 @@ private void testCase(Collection<String> data1, Collection<String> data2) {
long difference2Size = (long) set2.size() - intersectionSize;

long[] elementHashesSet1 =
set1.stream().mapToLong(Hashing.wyhashFinal3()::hashCharsToLong).toArray();
set1.stream().mapToLong(Hashing.komihash5_0()::hashCharsToLong).toArray();
long[] elementHashesSet2 =
set2.stream().mapToLong(Hashing.wyhashFinal3()::hashCharsToLong).toArray();
set2.stream().mapToLong(Hashing.komihash5_0()::hashCharsToLong).toArray();

SimilarityHasher hasher = policy.createHasher();

Expand Down

0 comments on commit 383a64f

Please sign in to comment.