-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
14 changed files
with
577 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
30 changes: 30 additions & 0 deletions
30
h2o-genmodel/src/main/java/water/util/comparison/string/ExactComparator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package water.util.comparison.string; | ||
|
||
/* | ||
Copyright 2023 Lars Marius Garshol | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
// Original code: https://github.com/larsga/Duke/blob/duke-1.2/src/main/java/no/priv/garshol/duke/comparators/ExactComparator.java | ||
public class ExactComparator implements StringComparator { | ||
|
||
public boolean isTokenized() { | ||
return false; | ||
} | ||
|
||
public double compare(String v1, String v2) { | ||
return v1.equals(v2) ? 1.0 : 0.0; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
68 changes: 68 additions & 0 deletions
68
h2o-genmodel/src/main/java/water/util/comparison/string/JaccardIndexComparator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
package water.util.comparison.string; | ||
|
||
/* | ||
Copyright 2023 Lars Marius Garshol | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
// Original code: https://github.com/larsga/Duke/blob/duke-1.2/src/main/java/no/priv/garshol/duke/comparators/JaccardIndexComparator.java | ||
public class JaccardIndexComparator implements StringComparator { | ||
private StringComparator subcomp; | ||
|
||
public JaccardIndexComparator() { | ||
this.subcomp = new ExactComparator(); | ||
} | ||
|
||
public void setComparator(StringComparator comp) { | ||
this.subcomp = comp; | ||
} | ||
|
||
public boolean isTokenized() { | ||
return true; | ||
} | ||
|
||
public double compare(String s1, String s2) { | ||
if (s1.equals(s2)) | ||
return 1.0; | ||
|
||
// tokenize | ||
String[] t1 = StringUtils.split(s1); | ||
String[] t2 = StringUtils.split(s2); | ||
|
||
// FIXME: we assume t1 and t2 do not have internal duplicates | ||
|
||
// ensure that t1 is shorter than or same length as t2 | ||
if (t1.length > t2.length) { | ||
String[] tmp = t2; | ||
t2 = t1; | ||
t1 = tmp; | ||
} | ||
|
||
// find best matches for each token in t1 | ||
double intersection = 0; | ||
double union = t1.length + t2.length; | ||
for (int ix1 = 0; ix1 < t1.length; ix1++) { | ||
double highest = 0; | ||
for (int ix2 = 0; ix2 < t2.length; ix2++) | ||
highest = Math.max(highest, subcomp.compare(t1[ix1], t2[ix2])); | ||
|
||
// INV: the best match for t1[ix1] in t2 is has similarity highest | ||
intersection += highest; | ||
union -= highest; // we reduce the union by this similarity | ||
} | ||
|
||
return intersection / union; | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
129 changes: 129 additions & 0 deletions
129
h2o-genmodel/src/main/java/water/util/comparison/string/LongestCommonSubstring.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
package water.util.comparison.string; | ||
|
||
/* | ||
Copyright 2023 Lars Marius Garshol | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
// Original code: https://github.com/larsga/Duke/blob/duke-1.2/src/main/java/no/priv/garshol/duke/comparators/LongestCommonSubstring.java | ||
public class LongestCommonSubstring implements StringComparator { | ||
private int minlen = 2; | ||
private Formula formula = Formula.OVERLAP; | ||
|
||
public double compare(String s1, String s2) { | ||
// a couple of quick cutoffs | ||
if (s1.equals(s2)) | ||
return 1.0; | ||
if (Math.min(s1.length(), s2.length()) == 0) | ||
return 0.0; | ||
|
||
// the results of the algorithm depends on the order of the input | ||
// strings. therefore need a sub-method for this computation | ||
return (compare_(s1, s2) + compare_(s2, s1)) / 2.0; | ||
} | ||
|
||
// FIXME: speed this up by using a one-dimensional array | ||
private double compare_(String s1, String s2) { | ||
// before we begin, note the length of the strings | ||
int shortlen = Math.min(s1.length(), s2.length()); | ||
int longlen = Math.max(s1.length(), s2.length()); | ||
|
||
int removed = 0; // total length of common substrings | ||
while (true) { | ||
// first, we identify the longest common substring | ||
int longest = 0; | ||
int longesti = 0; | ||
int longestj = 0; | ||
|
||
int[][] matrix = new int[s1.length()][s2.length()]; | ||
for (int i = 0; i < s1.length(); i++) { | ||
for (int j = 0; j < s2.length(); j++) { | ||
if (s1.charAt(i) == s2.charAt(j)) { | ||
if (i == 0 || j == 0) | ||
matrix[i][j] = 1; | ||
else | ||
matrix[i][j] = matrix[i - 1][j - 1] + 1; | ||
|
||
if (matrix[i][j] > longest) { | ||
longest = matrix[i][j]; | ||
longesti = i; | ||
longestj = j; | ||
} | ||
} else | ||
matrix[i][j] = 0; | ||
} | ||
} | ||
|
||
longesti++; // this solves an off-by-one problem | ||
longestj++; // this solves an off-by-one problem | ||
|
||
// at this point we know the length of the longest common | ||
// substring, and also its location, since it ends at indexes | ||
// longesti and longestj. | ||
|
||
if (longest < minlen) | ||
break; // all remaining common substrings are too short, so we stop | ||
|
||
// now we slice away the common substrings | ||
s1 = s1.substring(0, longesti - longest) + s1.substring(longesti); | ||
s2 = s2.substring(0, longestj - longest) + s2.substring(longestj); | ||
removed += longest; | ||
} | ||
|
||
return formula.compute(removed, shortlen, longlen); | ||
} | ||
|
||
public boolean isTokenized() { | ||
return true; | ||
} | ||
|
||
public void setMinimumLength(int minlen) { | ||
this.minlen = minlen; | ||
} | ||
|
||
public int getMinimumLength() { | ||
return this.minlen; | ||
} | ||
|
||
public void setFormula(Formula formula) { | ||
this.formula = formula; | ||
} | ||
|
||
public Formula getFormula() { | ||
return formula; | ||
} | ||
|
||
/** | ||
* Represents the different formulas we can use to compute similarity. | ||
*/ | ||
public enum Formula { | ||
OVERLAP { | ||
public double compute(int removed, int shortlen, int longlen) { | ||
return removed / (double) shortlen; | ||
} | ||
}, DICE { | ||
public double compute(int removed, int shortlen, int longlen) { | ||
return 2*removed / (double) (shortlen + longlen); | ||
} | ||
}, JACCARD { | ||
public double compute(int removed, int shortlen, int longlen) { | ||
return removed / (double) (shortlen + longlen - removed); | ||
} | ||
}; | ||
|
||
public double compute(int removed, int shortlen, int longlen) { | ||
throw new IllegalStateException("Unknown formula: " + this); | ||
} | ||
} | ||
} |
Oops, something went wrong.