-
-
Notifications
You must be signed in to change notification settings - Fork 2.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fuzzy matching for journal abbreviations #12477
base: main
Are you sure you want to change the base?
Changes from all commits
9ce1f36
8c34a3a
0435f38
e241bf9
14aa1ba
acfd019
c6a0d0b
50a12c2
df16223
bd5b648
b63c073
8e62cdc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
This file was deleted.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,19 @@ | ||
|
||
package org.jabref.logic.journals; | ||
|
||
import java.io.Serial; | ||
import java.io.Serializable; | ||
import java.util.Objects; | ||
|
||
import org.apache.commons.text.similarity.LevenshteinDistance; | ||
|
||
public class Abbreviation implements Comparable<Abbreviation>, Serializable { | ||
|
||
@Serial | ||
private static final long serialVersionUID = 1; | ||
|
||
private static final LevenshteinDistance LEVENSHTEIN = LevenshteinDistance.getDefaultInstance(); | ||
|
||
private transient String name; | ||
private final String abbreviation; | ||
private transient String dotlessAbbreviation; | ||
|
@@ -56,13 +63,33 @@ public String getDotlessAbbreviation() { | |
return this.dotlessAbbreviation; | ||
} | ||
|
||
public boolean isSimilar(String otherName) { | ||
String normalizedThis = normalize(this.name); | ||
String normalizedOther = normalize(otherName); | ||
|
||
int distance = LEVENSHTEIN.apply(normalizedThis, normalizedOther); | ||
return distance <= 2; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a reasoning why "2" works. |
||
} | ||
|
||
private static String normalize(String input) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add JavaDoc There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added |
||
return input.toLowerCase().replaceAll("[^a-z0-9 ]", "").trim(); | ||
} | ||
|
||
@Override | ||
public int compareTo(Abbreviation toCompare) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What you described in the PR description should appear as JavaDoc here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added DocStrings to the code. |
||
if (isSimilar(toCompare.getName())) { | ||
return 0; | ||
} | ||
|
||
int nameComparison = getName().compareTo(toCompare.getName()); | ||
if (nameComparison != 0) { | ||
return nameComparison; | ||
} | ||
|
||
if (isSimilar(toCompare.getAbbreviation())) { | ||
return 0; | ||
} | ||
|
||
int abbreviationComparison = getAbbreviation().compareTo(toCompare.getAbbreviation()); | ||
if (abbreviationComparison != 0) { | ||
return abbreviationComparison; | ||
|
@@ -103,7 +130,9 @@ public boolean equals(Object o) { | |
return false; | ||
} | ||
Abbreviation that = (Abbreviation) o; | ||
return getName().equals(that.getName()) && getAbbreviation().equals(that.getAbbreviation()) && getShortestUniqueAbbreviation().equals(that.getShortestUniqueAbbreviation()); | ||
return Objects.equals(normalize(name), normalize(that.name)) && | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add some JavaDoc There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think, there should be no normalization here! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I reverted this change |
||
Objects.equals(normalize(abbreviation), normalize(that.abbreviation)) && | ||
Objects.equals(normalize(shortestUniqueAbbreviation), normalize(that.shortestUniqueAbbreviation)); | ||
} | ||
|
||
@Override | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -152,7 +152,7 @@ public void addCustomAbbreviations(Collection<Abbreviation> abbreviationsToAdd) | |
} | ||
|
||
public Optional<String> getNextAbbreviation(String text) { | ||
return get(text).map(abbreviation -> abbreviation.getNext(text)); | ||
return get(text).map(abbreviation -> abbreviation.getNext(text.trim())); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why a trim at the end but no trim at the beginning? Shoulnd't be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Thank you very much for your insightful feedback! I’ve reviewed all your comments and did my best to implement them." |
||
} | ||
|
||
public Optional<String> getDefaultAbbreviation(String text) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.junit.jupiter.api.Assertions.assertNotEquals; | ||
import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
||
class AbbreviationTest { | ||
|
||
|
@@ -111,10 +112,46 @@ void equals() { | |
assertNotEquals("String", abbreviation); | ||
} | ||
|
||
@Test | ||
void testSlightDifferences() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("Longn Name")); | ||
} | ||
|
||
@Test | ||
void testMissingLetter() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("Long ame")); | ||
} | ||
|
||
@Test | ||
void testPunctuationDifferences() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("Long, Name")); | ||
} | ||
|
||
@Test | ||
void testCaseDifferences() { | ||
assertTrue(new Abbreviation("Long Name", "L. N.").isSimilar("LONG NAME")); | ||
} | ||
|
||
@Test | ||
void equalAbbrevationsWithFourComponentsAreAlsoCompareZero() { | ||
Abbreviation abbreviation1 = new Abbreviation("Long Name", "L. N.", "LN"); | ||
Abbreviation abbreviation2 = new Abbreviation("Long Name", "L. N.", "LN"); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
} | ||
|
||
@Test | ||
void testCompareToWithFuzzyMatching() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please convert to parameterized Test There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a parameterized for compareTo |
||
Abbreviation abbreviation1 = new Abbreviation("Long Name", "L. N."); | ||
Abbreviation abbreviation2 = new Abbreviation("Long Name", "L. N."); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
|
||
abbreviation2 = new Abbreviation("Long Name", "L. N. "); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
|
||
abbreviation2 = new Abbreviation("Long Name", "L. N"); | ||
assertEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
|
||
abbreviation2 = new Abbreviation("Short Name", "S. N."); | ||
assertNotEquals(0, abbreviation1.compareTo(abbreviation2)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No additional empty line at the beginning
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed