Skip to content

Commit

Permalink
[GH-15687] Extract string comparators from Duke library (#15692)
Browse files Browse the repository at this point in the history
  • Loading branch information
mn-mikke authored Aug 16, 2023
1 parent 9b237b0 commit 48abf3a
Show file tree
Hide file tree
Showing 14 changed files with 577 additions and 25 deletions.
8 changes: 0 additions & 8 deletions h2o-core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,6 @@ dependencies {

api "com.google.code.gson:gson:${gsonVersion}"
api 'commons-lang:commons-lang:2.6'

// Duke library: collection of String comparators
api('no.priv.garshol.duke:duke:1.2') {
exclude group: 'org.apache.lucene', module: 'lucene-core'
exclude group: 'org.apache.lucene', module: 'lucene-analyzers-common'
exclude group: 'org.apache.lucene', module: 'lucene-spatial'
exclude group: 'org.mapdb', module: 'mapdb'
}

testImplementation project(':h2o-test-support')
testRuntimeOnly project(":${defaultWebserverModule}")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package water.rapids.ast.prims.string;

import no.priv.garshol.duke.Comparator;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
Expand All @@ -11,6 +10,7 @@
import water.rapids.ast.AstPrimitive;
import water.rapids.ast.AstRoot;
import water.rapids.vals.ValFrame;
import water.util.comparison.string.StringComparator;
import water.util.comparison.string.StringComparatorFactory;

/**
Expand Down Expand Up @@ -73,7 +73,7 @@ private StringDistanceComparator(String measure, boolean compareEmpty) {
@Override
public void map(Chunk[] cs, NewChunk[] nc) {
BufferedString tmpStr = new BufferedString();
Comparator cmp = StringComparatorFactory.makeComparator(_measure);
StringComparator cmp = StringComparatorFactory.makeComparator(_measure);
int N = nc.length;
assert N * 2 == cs.length;
for (int i = 0; i < N; i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import ai.h2o.mojos.runtime.frame.MojoFrameMeta;
import ai.h2o.mojos.runtime.transforms.MojoTransform;
import ai.h2o.mojos.runtime.transforms.MojoTransformBuilderFactory;
import no.priv.garshol.duke.Comparator;
import water.util.comparison.string.StringComparatorFactory;
import water.util.comparison.string.StringComparator;

import java.util.HashMap;
import java.util.Map;
Expand Down Expand Up @@ -67,7 +67,7 @@ public static class Factory implements MojoTransformBuilderFactory {

private static final HashMap<String,StringPropertiesBinaryFunction> _supportedFunctions = new HashMap<String,StringPropertiesBinaryFunction>() {{
put("strDistance", new StringPropertiesBinaryFunction() {
Comparator _comparator = null;
StringComparator _comparator = null;

boolean _compareEmpty = false;

Expand Down
1 change: 0 additions & 1 deletion h2o-genmodel/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ dependencies {
api "ai.h2o:h2o-tree-api:0.3.17"
// dependencies that are shared with h2o-core - always use the same version as h2o-core
api "com.google.code.gson:gson:${gsonVersion}"
api "no.priv.garshol.duke:duke:1.2"
api "commons-lang:commons-lang:2.6"
api "joda-time:joda-time:2.10.13"
// test only
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package water.util.comparison.string;

/*
Copyright 2023 Lars Marius Garshol
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Original code: https://github.com/larsga/Duke/blob/duke-1.2/src/main/java/no/priv/garshol/duke/comparators/ExactComparator.java
/**
 * All-or-nothing string comparator: two values are either considered fully
 * similar (they are equal according to {@link String#equals(Object)}) or not
 * similar at all.
 */
public class ExactComparator implements StringComparator {

  /**
   * Compares the two values for exact equality.
   *
   * @param v1 first value; {@code equals} is invoked on it, so it must not be null
   * @param v2 second value
   * @return {@code 1.0} when the values are equal, {@code 0.0} otherwise
   */
  public double compare(String v1, String v2) {
    if (v1.equals(v2)) {
      return 1.0;
    }
    return 0.0;
  }

  /**
   * @return always {@code false} for this comparator
   */
  public boolean isTokenized() {
    return false;
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
*/
package water.util.comparison.string;

import no.priv.garshol.duke.Comparator;

import java.util.ArrayList;
import java.util.List;

Expand All @@ -20,7 +18,7 @@
* E. Yancey, RESEARCH REPORT SERIES (Statistics #2005-05), US Bureau
* of the Census. http://www.census.gov/srd/papers/pdf/rrs2005-05.pdf
*/
public class H2OJaroWinklerComparator implements Comparator {
public class H2OJaroWinklerComparator implements StringComparator {

public double compare(String s1, String s2) {
return similarity(s1, s2);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package water.util.comparison.string;

/*
Copyright 2023 Lars Marius Garshol
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Original code: https://github.com/larsga/Duke/blob/duke-1.2/src/main/java/no/priv/garshol/duke/comparators/JaccardIndexComparator.java
/**
 * Token-based similarity measure in the style of the Jaccard index.
 *
 * <p>Both inputs are tokenized with {@code StringUtils.split}. Every token of
 * the shorter token list is matched against its best-scoring counterpart in
 * the longer list using a sub-comparator ({@link ExactComparator} unless
 * replaced via {@link #setComparator(StringComparator)}). The sum of those
 * best scores plays the role of the intersection size; the union size is the
 * total token count minus that sum.
 */
public class JaccardIndexComparator implements StringComparator {
  /** Comparator used to score individual token pairs. */
  private StringComparator subcomp;

  /** Creates a comparator that matches tokens by exact equality. */
  public JaccardIndexComparator() {
    this.subcomp = new ExactComparator();
  }

  /**
   * Replaces the comparator used to score individual token pairs.
   *
   * @param comp the token-level comparator to use from now on
   */
  public void setComparator(StringComparator comp) {
    this.subcomp = comp;
  }

  /**
   * @return always {@code true} for this comparator
   */
  public boolean isTokenized() {
    return true;
  }

  /**
   * Computes the token-level similarity of the two strings.
   *
   * @param s1 first string; {@code equals} is invoked on it, so it must not be null
   * @param s2 second string
   * @return {@code 1.0} when the strings are equal or when neither of them
   *         yields any token; otherwise the accumulated best-match score
   *         divided by the adjusted union size
   */
  public double compare(String s1, String s2) {
    if (s1.equals(s2))
      return 1.0;

    // tokenize
    String[] t1 = StringUtils.split(s1);
    String[] t2 = StringUtils.split(s2);

    // FIXME: we assume t1 and t2 do not have internal duplicates

    // ensure that t1 is shorter than or same length as t2
    if (t1.length > t2.length) {
      String[] tmp = t2;
      t2 = t1;
      t1 = tmp;
    }

    // t2 is now the longer list, so an empty t2 means neither input produced
    // a token. Without this guard the division below would be 0.0 / 0.0 and
    // the method would return NaN. Two empty token sets are identical, so we
    // report full similarity, in line with the equal-strings shortcut above.
    if (t2.length == 0)
      return 1.0;

    // find best matches for each token in t1
    double intersection = 0;
    double union = t1.length + t2.length;
    for (int ix1 = 0; ix1 < t1.length; ix1++) {
      double highest = 0;
      for (int ix2 = 0; ix2 < t2.length; ix2++)
        highest = Math.max(highest, subcomp.compare(t1[ix1], t2[ix2]));

      // INV: the best match for t1[ix1] in t2 has similarity highest
      intersection += highest;
      union -= highest; // we reduce the union by this similarity
    }

    return intersection / union;
  }
}

Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package water.util.comparison.string;

import no.priv.garshol.duke.Comparator;

import static org.apache.commons.lang.math.IEEE754rUtils.min;

/**
Expand All @@ -24,7 +22,7 @@
* limitations under the License.
* #L%
**/
public class LevenshteinDistanceComparator implements Comparator {
public class LevenshteinDistanceComparator implements StringComparator {

@Override
public boolean isTokenized() {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package water.util.comparison.string;

/*
Copyright 2023 Lars Marius Garshol
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Original code: https://github.com/larsga/Duke/blob/duke-1.2/src/main/java/no/priv/garshol/duke/comparators/LongestCommonSubstring.java
/**
 * Similarity measure based on repeatedly removing the longest common
 * substring of two strings.
 *
 * <p>The total length of all removed substrings (each at least
 * {@link #getMinimumLength()} characters long) is turned into a score by the
 * configured {@link Formula}. Because the outcome of the removal procedure
 * depends on argument order, {@link #compare(String, String)} averages the
 * scores of both orders.
 */
public class LongestCommonSubstring implements StringComparator {
  /** Shortest common substring that still counts as a match. */
  private int minlen = 2;
  /** Formula that converts the removed length into a similarity score. */
  private Formula formula = Formula.OVERLAP;

  /**
   * Computes the similarity of the two strings.
   *
   * @param s1 first string; {@code equals} is invoked on it, so it must not be null
   * @param s2 second string
   * @return {@code 1.0} for equal strings, {@code 0.0} when either string is
   *         empty, otherwise the mean of the scores for both argument orders
   */
  public double compare(String s1, String s2) {
    // a couple of quick cutoffs
    if (s1.equals(s2))
      return 1.0;
    if (Math.min(s1.length(), s2.length()) == 0)
      return 0.0;

    // the result of the algorithm depends on the order of the input
    // strings, therefore we need a sub-method for this computation
    return (compare_(s1, s2) + compare_(s2, s1)) / 2.0;
  }

  /**
   * Runs the removal procedure for one argument order.
   *
   * @param s1 first string
   * @param s2 second string
   * @return the score computed by the configured formula from the total
   *         removed length and the original string lengths
   */
  // FIXME: speed this up by using a one-dimensional array
  private double compare_(String s1, String s2) {
    // before we begin, note the length of the strings
    int shortlen = Math.min(s1.length(), s2.length());
    int longlen = Math.max(s1.length(), s2.length());

    int removed = 0; // total length of common substrings
    while (true) {
      // first, we identify the longest common substring
      int longest = 0;
      int longesti = 0;
      int longestj = 0;

      // matrix[i][j] is the length of the common substring ending at
      // s1[i] and s2[j]; cells for non-matching characters stay at the
      // zero a freshly allocated int array is initialized with
      int[][] matrix = new int[s1.length()][s2.length()];
      for (int i = 0; i < s1.length(); i++) {
        for (int j = 0; j < s2.length(); j++) {
          if (s1.charAt(i) == s2.charAt(j)) {
            if (i == 0 || j == 0)
              matrix[i][j] = 1;
            else
              matrix[i][j] = matrix[i - 1][j - 1] + 1;

            if (matrix[i][j] > longest) {
              longest = matrix[i][j];
              longesti = i;
              longestj = j;
            }
          }
        }
      }

      // Stop once all remaining common substrings are too short. The
      // explicit zero check keeps the loop finite when the minimum length
      // was set to 0 or a negative value: a zero-length match removes
      // nothing, so the strings would never change (or, once one of them is
      // empty, the slicing below would index out of bounds).
      if (longest < minlen || longest == 0)
        break;

      // longesti/longestj are the indexes of the last matching characters;
      // turn them into exclusive end offsets for substring()
      longesti++;
      longestj++;

      // now we slice away the common substrings
      s1 = s1.substring(0, longesti - longest) + s1.substring(longesti);
      s2 = s2.substring(0, longestj - longest) + s2.substring(longestj);
      removed += longest;
    }

    return formula.compute(removed, shortlen, longlen);
  }

  /**
   * @return always {@code true} for this comparator
   */
  public boolean isTokenized() {
    return true;
  }

  /**
   * Sets the shortest common substring length that still counts as a match.
   *
   * @param minlen the new minimum length
   */
  public void setMinimumLength(int minlen) {
    this.minlen = minlen;
  }

  /**
   * @return the shortest common substring length that counts as a match
   */
  public int getMinimumLength() {
    return this.minlen;
  }

  /**
   * Sets the formula used to turn the removed length into a score.
   *
   * @param formula the formula to use from now on
   */
  public void setFormula(Formula formula) {
    this.formula = formula;
  }

  /**
   * @return the formula currently used to compute the score
   */
  public Formula getFormula() {
    return formula;
  }

  /**
   * Represents the different formulas we can use to compute similarity.
   */
  public enum Formula {
    /** {@code removed / shortlen} */
    OVERLAP {
      @Override
      public double compute(int removed, int shortlen, int longlen) {
        return removed / (double) shortlen;
      }
    },
    /** {@code 2 * removed / (shortlen + longlen)} */
    DICE {
      @Override
      public double compute(int removed, int shortlen, int longlen) {
        return 2 * removed / (double) (shortlen + longlen);
      }
    },
    /** {@code removed / (shortlen + longlen - removed)} */
    JACCARD {
      @Override
      public double compute(int removed, int shortlen, int longlen) {
        return removed / (double) (shortlen + longlen - removed);
      }
    };

    /**
     * Computes the similarity score.
     *
     * @param removed total length of the common substrings that were removed
     * @param shortlen original length of the shorter string
     * @param longlen original length of the longer string
     * @return the similarity score according to this formula
     */
    public abstract double compute(int removed, int shortlen, int longlen);
  }
}
Loading

0 comments on commit 48abf3a

Please sign in to comment.