From 18b8546c38f9c7e2d4ee28e408c63f46fe2643b5 Mon Sep 17 00:00:00 2001 From: Ruslan Sennov Date: Thu, 16 Nov 2017 07:43:11 +0300 Subject: [PATCH 1/9] levenstein skeleton --- code/.gitignore | 3 + code/pom.xml | 5 + .../levenstein/LevensteinDistanceIndex.java | 162 ++++++++++++++++++ .../cqengine/query/QueryFactory.java | 4 + .../query/simple/LevensteinDistance.java | 62 +++++++ .../index/levenstein/LevensteinTest.java | 31 ++++ 6 files changed, 267 insertions(+) create mode 100644 code/.gitignore create mode 100644 code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java create mode 100644 code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java create mode 100644 code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java diff --git a/code/.gitignore b/code/.gitignore new file mode 100644 index 000000000..34e1547a3 --- /dev/null +++ b/code/.gitignore @@ -0,0 +1,3 @@ +target +.idea +*.iml \ No newline at end of file diff --git a/code/pom.xml b/code/pom.xml index 003f8f9c0..e696e69e8 100644 --- a/code/pom.xml +++ b/code/pom.xml @@ -248,6 +248,11 @@ sqlite-jdbc 3.20.1 + + com.github.universal-automata + liblevenshtein + 3.0.0 + com.esotericsoftware kryo diff --git a/code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java b/code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java new file mode 100644 index 000000000..12c16ac7f --- /dev/null +++ b/code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java @@ -0,0 +1,162 @@ +package com.googlecode.cqengine.index.levenstein; + +import com.github.liblevenshtein.collection.dictionary.SortedDawg; +import com.github.liblevenshtein.transducer.Algorithm; +import com.github.liblevenshtein.transducer.Candidate; +import com.github.liblevenshtein.transducer.ITransducer; +import com.github.liblevenshtein.transducer.factory.TransducerBuilder; +import com.googlecode.cqengine.attribute.Attribute; +import com.googlecode.cqengine.index.Index; +import com.googlecode.cqengine.index.support.AbstractAttributeIndex; +import com.googlecode.cqengine.index.support.CloseableIterator; +import com.googlecode.cqengine.persistence.support.ObjectSet; +import com.googlecode.cqengine.persistence.support.ObjectStore; +import com.googlecode.cqengine.query.simple.LevensteinDistance; +import com.googlecode.cqengine.query.Query; +import com.googlecode.cqengine.query.option.QueryOptions; +import com.googlecode.cqengine.resultset.ResultSet; + +import java.util.*; +import java.util.function.Function; + +/** + * @author Ruslan Sennov + */ +public class LevensteinDistanceIndex extends AbstractAttributeIndex { + + private ITransducer transducer; + private Map> terms; + + /** + * Private constructor, used by static factory methods. + * + * @param attribute The attribute on which the index will be built + */ + private LevensteinDistanceIndex(Attribute attribute) { + super(attribute, new HashSet>() {{ + add(LevensteinDistance.class); + }}); + } + + @Override + public boolean isMutable() { + return false; + } + + @Override + public boolean isQuantized() { + return false; + } + + @Override + public ResultSet retrieve(Query query, QueryOptions queryOptions) { + if (query instanceof LevensteinDistance) { + LevensteinDistance lev = (LevensteinDistance) query; + final Iterable it = transducer.transduce(lev.getValue(), lev.getMaxDistance()); + Set set = new HashSet<>(); + it.forEach(candidate -> { + set.addAll(terms.get(candidate.term())); + }); + return new ResultSet() { + @Override + public Iterator iterator() { + return set.iterator(); + } + + @Override + public boolean contains(O object) { + return set.contains(object); + } + + @Override + public boolean matches(O object) { + return set.contains(object); + } + + @Override + public Query getQuery() { + return query; + } + + @Override + public QueryOptions getQueryOptions() { + return queryOptions; + } + + @Override + public int getRetrievalCost() { + return 10; + } + + @Override + public int getMergeCost() { + return 10; + } + + @Override + public int size() { + return set.size(); + } + + @Override + public void close() { + set.clear(); + } + }; + } + throw new RuntimeException(); + } + + @Override + public Index getEffectiveIndex() { + return this; + } + + @Override + public boolean addAll(ObjectSet objectSet, QueryOptions queryOptions) { + // this index is immutable, will never be here + throw new IllegalStateException(); + } + + @Override + public boolean removeAll(ObjectSet objectSet, QueryOptions queryOptions) { + // this index is immutable, will never be here + throw new IllegalStateException(); + } + + @Override + public void clear(QueryOptions queryOptions) { + } + + @Override + public void init(ObjectStore objectStore, QueryOptions queryOptions) { + CloseableIterator it = objectStore.iterator(queryOptions); + terms = new HashMap<>(); + it.forEachRemaining(o -> { + attribute.getValues(o, queryOptions).forEach(term -> { + Set objects = terms.computeIfAbsent(term, new Function>() { + @Override + public Set apply(String s) { + return new HashSet<>(); + } + }); + objects.add(o); + }); + }); + it.close(); + SortedDawg dict = new SortedDawg(); + List list = new ArrayList<>(terms.keySet()); + Collections.sort(list); + dict.addAll(list); + transducer = new TransducerBuilder() + .dictionary(dict) + .algorithm(Algorithm.TRANSPOSITION) + .includeDistance(true) + .build(); + } + + public static LevensteinDistanceIndex onAttribute(Attribute attribute) { + return new LevensteinDistanceIndex<>(attribute); + } + +} diff --git a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java index b5eaea0a7..0c0d2ef9f 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java +++ b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java @@ -1287,6 +1287,10 @@ public static OrderByOption orderBy(AttributeOrder attributeOrder1, At return new OrderByOption(attributeOrders); } + public static LevensteinDistance levensteinDistance(SimpleAttribute attribute, String value, int maxDistance) { + return new LevensteinDistance<>(attribute, value, maxDistance); + } + // *************************************************************************************************************** /** diff --git a/code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java b/code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java new file mode 100644 index 000000000..01aa5462e --- /dev/null +++ b/code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java @@ -0,0 +1,62 @@ +package com.googlecode.cqengine.query.simple; + +import com.googlecode.cqengine.attribute.Attribute; +import com.googlecode.cqengine.attribute.SimpleAttribute; +import com.googlecode.cqengine.query.option.QueryOptions; + +import java.util.Objects; + +/** + * @author Ruslan Sennov + */ +public class LevensteinDistance extends SimpleQuery { + + private final String value; + private final int maxDistance; + + /** + * Creates a new {@link SimpleQuery} initialized to make assertions on values of the specified attribute + * + * @param attribute The attribute on which the assertion is to be made + */ + public LevensteinDistance(SimpleAttribute attribute, String value, int maxDistance) { + super(attribute); + this.value = value; + this.maxDistance = maxDistance; + } + + public String getValue() { + return value; + } + + public int getMaxDistance() { + return maxDistance; + } + + @Override + protected boolean matchesSimpleAttribute(SimpleAttribute attribute, O object, QueryOptions queryOptions) { + throw new RuntimeException("Missing Levenstein index on attribute " + attribute.toString()); + } + + @Override + protected boolean matchesNonSimpleAttribute(Attribute attribute, O object, QueryOptions queryOptions) { + throw new RuntimeException("Missing Levenstein index on attribute " + attribute.toString()); + } + + @Override + protected int calcHashCode() { + return Objects.hashCode(value) + 31 * maxDistance; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + LevensteinDistance that = (LevensteinDistance) o; + + if (!attribute.equals(that.attribute)) return false; + if (maxDistance != that.maxDistance) return false; + return value != null ? value.equals(that.value) : that.value == null; + } +} diff --git a/code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java b/code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java new file mode 100644 index 000000000..711cbe39b --- /dev/null +++ b/code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java @@ -0,0 +1,31 @@ +package com.googlecode.cqengine.index.levenstein; + +import com.googlecode.cqengine.ConcurrentIndexedCollection; +import com.googlecode.cqengine.IndexedCollection; +import com.googlecode.cqengine.testutil.Car; +import com.googlecode.cqengine.testutil.CarFactory; +import org.junit.Test; + +import static com.googlecode.cqengine.query.QueryFactory.levensteinDistance; +import static org.junit.Assert.assertEquals; + +/** + * @author Ruslan Sennov + */ +public class LevensteinTest { + + @Test(expected = IllegalStateException.class) + public void testImmutable() { + IndexedCollection collection = new ConcurrentIndexedCollection(); + collection.addIndex(LevensteinDistanceIndex.onAttribute(Car.MODEL)); + collection.addAll(CarFactory.createCollectionOfCars(10)); + } + + @Test + public void testQuery() { + IndexedCollection collection = new ConcurrentIndexedCollection(); + collection.addAll(CarFactory.createCollectionOfCars(10)); + collection.addIndex(LevensteinDistanceIndex.onAttribute(Car.MANUFACTURER)); + assertEquals(3, collection.retrieve(levensteinDistance(Car.MANUFACTURER, "Frd", 1)).size()); + } +} From d09e0ee4d0a4e3e24a1f2a2111eaca60b5855b0b Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Mon, 2 Apr 2018 13:07:03 +0300 Subject: [PATCH 2/9] Levenshtein index --- code/.gitignore | 3 - .../LevenshteinDistanceIndex.java} | 91 ++++++++++++------- .../cqengine/query/QueryFactory.java | 4 +- ...Distance.java => LevenshteinDistance.java} | 10 +- .../LevenshteinTest.java} | 12 +-- 5 files changed, 72 insertions(+), 48 deletions(-) delete mode 100644 code/.gitignore rename code/src/main/java/com/googlecode/cqengine/index/{levenstein/LevensteinDistanceIndex.java => levenshtein/LevenshteinDistanceIndex.java} (57%) rename code/src/main/java/com/googlecode/cqengine/query/simple/{LevensteinDistance.java => LevenshteinDistance.java} (78%) rename code/src/test/java/com/googlecode/cqengine/index/{levenstein/LevensteinTest.java => levenshtein/LevenshteinTest.java} (65%) diff --git a/code/.gitignore b/code/.gitignore deleted file mode 100644 index 34e1547a3..000000000 --- a/code/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -target -.idea -*.iml \ No newline at end of file diff --git a/code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java similarity index 57% rename from code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java rename to code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java index 12c16ac7f..a77da08c0 100644 --- a/code/src/main/java/com/googlecode/cqengine/index/levenstein/LevensteinDistanceIndex.java +++ b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java @@ -1,4 +1,4 @@ -package com.googlecode.cqengine.index.levenstein; +package com.googlecode.cqengine.index.levenshtein; import com.github.liblevenshtein.collection.dictionary.SortedDawg; import com.github.liblevenshtein.transducer.Algorithm; @@ -11,19 +11,27 @@ import com.googlecode.cqengine.index.support.CloseableIterator; import com.googlecode.cqengine.persistence.support.ObjectSet; import com.googlecode.cqengine.persistence.support.ObjectStore; -import com.googlecode.cqengine.query.simple.LevensteinDistance; import com.googlecode.cqengine.query.Query; import com.googlecode.cqengine.query.option.QueryOptions; +import com.googlecode.cqengine.query.simple.LevenshteinDistance; import com.googlecode.cqengine.resultset.ResultSet; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.function.Function; /** * @author Ruslan Sennov */ -public class LevensteinDistanceIndex extends AbstractAttributeIndex { +public class LevenshteinDistanceIndex extends AbstractAttributeIndex { + private final TransducerFactory transducerFactory; private ITransducer transducer; private Map> terms; @@ -32,10 +40,9 @@ public class LevensteinDistanceIndex extends AbstractAttributeIndex attribute) { - super(attribute, new HashSet>() {{ - add(LevensteinDistance.class); - }}); + private LevenshteinDistanceIndex(Attribute attribute, Algorithm transducerAlgorithm) { + super(attribute, Collections.singleton(LevenshteinDistance.class)); + this.transducerFactory = new TransducerFactory(transducerAlgorithm); } @Override @@ -50,11 +57,11 @@ public boolean isQuantized() { @Override public ResultSet retrieve(Query query, QueryOptions queryOptions) { - if (query instanceof LevensteinDistance) { - LevensteinDistance lev = (LevensteinDistance) query; - final Iterable it = transducer.transduce(lev.getValue(), lev.getMaxDistance()); + Class queryClass = query.getClass(); + if (LevenshteinDistance.class.equals(queryClass)) { + LevenshteinDistance lev = (LevenshteinDistance) query; Set set = new HashSet<>(); - it.forEach(candidate -> { + transducer.transduce(lev.getValue(), lev.getMaxDistance()).forEach(candidate -> { set.addAll(terms.get(candidate.term())); }); return new ResultSet() { @@ -103,8 +110,9 @@ public void close() { set.clear(); } }; + } else { + throw new IllegalArgumentException("Unsupported query: " + query); } - throw new RuntimeException(); } @Override @@ -130,33 +138,52 @@ public void clear(QueryOptions queryOptions) { @Override public void init(ObjectStore objectStore, QueryOptions queryOptions) { - CloseableIterator it = objectStore.iterator(queryOptions); - terms = new HashMap<>(); - it.forEachRemaining(o -> { - attribute.getValues(o, queryOptions).forEach(term -> { - Set objects = terms.computeIfAbsent(term, new Function>() { - @Override - public Set apply(String s) { - return new HashSet<>(); - } + try (CloseableIterator it = objectStore.iterator(queryOptions)) { + terms = new HashMap<>(); + it.forEachRemaining(o -> { + attribute.getValues(o, queryOptions).forEach(term -> { + Set objects = terms.computeIfAbsent(term, new Function>() { + @Override + public Set apply(String s) { + return new HashSet<>(); + } + }); + objects.add(o); }); - objects.add(o); }); - }); - it.close(); + } SortedDawg dict = new SortedDawg(); List list = new ArrayList<>(terms.keySet()); Collections.sort(list); dict.addAll(list); - transducer = new TransducerBuilder() - .dictionary(dict) - .algorithm(Algorithm.TRANSPOSITION) - .includeDistance(true) - .build(); + transducer = transducerFactory.buildTransducer(dict); } - public static LevensteinDistanceIndex onAttribute(Attribute attribute) { - return new LevensteinDistanceIndex<>(attribute); + public static LevenshteinDistanceIndex onAttribute(Attribute attribute) { + return new LevenshteinDistanceIndex<>(attribute, Algorithm.STANDARD); } + public static LevenshteinDistanceIndex withSpellingCorrectionOnAttribute(Attribute attribute) { + return new LevenshteinDistanceIndex<>(attribute, Algorithm.TRANSPOSITION); + } + + public static LevenshteinDistanceIndex withOCRCorrectionOnAttribute(Attribute attribute) { + return new LevenshteinDistanceIndex<>(attribute, Algorithm.MERGE_AND_SPLIT); + } } + +class TransducerFactory { + private final Algorithm transducerAlgorithm; + + TransducerFactory(Algorithm transducerAlgorithm) { + this.transducerAlgorithm = transducerAlgorithm; + } + + ITransducer buildTransducer(SortedDawg dictionary) { + return new TransducerBuilder() + .dictionary(dictionary) + .algorithm(transducerAlgorithm) + .includeDistance(true) + .build(); + } +} \ No newline at end of file diff --git a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java index 0c0d2ef9f..af3539423 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java +++ b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java @@ -1287,8 +1287,8 @@ public static OrderByOption orderBy(AttributeOrder attributeOrder1, At return new OrderByOption(attributeOrders); } - public static LevensteinDistance levensteinDistance(SimpleAttribute attribute, String value, int maxDistance) { - return new LevensteinDistance<>(attribute, value, maxDistance); + public static LevenshteinDistance levenshteinDistance(SimpleAttribute attribute, String value, int maxDistance) { + return new LevenshteinDistance<>(attribute, value, maxDistance); } // *************************************************************************************************************** diff --git a/code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java similarity index 78% rename from code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java rename to code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java index 01aa5462e..70cfc0d34 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/simple/LevensteinDistance.java +++ b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java @@ -9,7 +9,7 @@ /** * @author Ruslan Sennov */ -public class LevensteinDistance extends SimpleQuery { +public class LevenshteinDistance extends SimpleQuery { private final String value; private final int maxDistance; @@ -19,7 +19,7 @@ public class LevensteinDistance extends SimpleQuery { * * @param attribute The attribute on which the assertion is to be made */ - public LevensteinDistance(SimpleAttribute attribute, String value, int maxDistance) { + public LevenshteinDistance(SimpleAttribute attribute, String value, int maxDistance) { super(attribute); this.value = value; this.maxDistance = maxDistance; @@ -35,12 +35,12 @@ public int getMaxDistance() { @Override protected boolean matchesSimpleAttribute(SimpleAttribute attribute, O object, QueryOptions queryOptions) { - throw new RuntimeException("Missing Levenstein index on attribute " + attribute.toString()); + throw new RuntimeException("Missing Levenshtein index on attribute " + attribute.toString()); } @Override protected boolean matchesNonSimpleAttribute(Attribute attribute, O object, QueryOptions queryOptions) { - throw new RuntimeException("Missing Levenstein index on attribute " + attribute.toString()); + throw new RuntimeException("Missing Levenshtein index on attribute " + attribute.toString()); } @Override @@ -53,7 +53,7 @@ public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - LevensteinDistance that = (LevensteinDistance) o; + LevenshteinDistance that = (LevenshteinDistance) o; if (!attribute.equals(that.attribute)) return false; if (maxDistance != that.maxDistance) return false; diff --git a/code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java b/code/src/test/java/com/googlecode/cqengine/index/levenshtein/LevenshteinTest.java similarity index 65% rename from code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java rename to code/src/test/java/com/googlecode/cqengine/index/levenshtein/LevenshteinTest.java index 711cbe39b..81651fb29 100644 --- a/code/src/test/java/com/googlecode/cqengine/index/levenstein/LevensteinTest.java +++ b/code/src/test/java/com/googlecode/cqengine/index/levenshtein/LevenshteinTest.java @@ -1,4 +1,4 @@ -package com.googlecode.cqengine.index.levenstein; +package com.googlecode.cqengine.index.levenshtein; import com.googlecode.cqengine.ConcurrentIndexedCollection; import com.googlecode.cqengine.IndexedCollection; @@ -6,18 +6,18 @@ import com.googlecode.cqengine.testutil.CarFactory; import org.junit.Test; -import static com.googlecode.cqengine.query.QueryFactory.levensteinDistance; +import static com.googlecode.cqengine.query.QueryFactory.levenshteinDistance; import static org.junit.Assert.assertEquals; /** * @author Ruslan Sennov */ -public class LevensteinTest { +public class LevenshteinTest { @Test(expected = IllegalStateException.class) public void testImmutable() { IndexedCollection collection = new ConcurrentIndexedCollection(); - collection.addIndex(LevensteinDistanceIndex.onAttribute(Car.MODEL)); + collection.addIndex(LevenshteinDistanceIndex.onAttribute(Car.MODEL)); collection.addAll(CarFactory.createCollectionOfCars(10)); } @@ -25,7 +25,7 @@ public void testImmutable() { public void testQuery() { IndexedCollection collection = new ConcurrentIndexedCollection(); collection.addAll(CarFactory.createCollectionOfCars(10)); - collection.addIndex(LevensteinDistanceIndex.onAttribute(Car.MANUFACTURER)); - assertEquals(3, collection.retrieve(levensteinDistance(Car.MANUFACTURER, "Frd", 1)).size()); + collection.addIndex(LevenshteinDistanceIndex.onAttribute(Car.MANUFACTURER)); + assertEquals(3, collection.retrieve(levenshteinDistance(Car.MANUFACTURER, "Frd", 1)).size()); } } From 69c463d6fb0e7eeb38387a6f101e31b00cadec71 Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Wed, 4 Apr 2018 12:26:30 +0300 Subject: [PATCH 3/9] Levenshtein index: use less restrictive attribute type for creating LevenshteinDistance query --- .../main/java/com/googlecode/cqengine/query/QueryFactory.java | 2 +- .../googlecode/cqengine/query/simple/LevenshteinDistance.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java index af3539423..cfa2b9809 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java +++ b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java @@ -1287,7 +1287,7 @@ public static OrderByOption orderBy(AttributeOrder attributeOrder1, At return new OrderByOption(attributeOrders); } - public static LevenshteinDistance levenshteinDistance(SimpleAttribute attribute, String value, int maxDistance) { + public static LevenshteinDistance levenshteinDistance(Attribute attribute, String value, int maxDistance) { return new LevenshteinDistance<>(attribute, value, maxDistance); } diff --git a/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java index 70cfc0d34..f4b4d5fc4 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java +++ b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java @@ -19,7 +19,7 @@ public class LevenshteinDistance extends SimpleQuery { * * @param attribute The attribute on which the assertion is to be made */ - public LevenshteinDistance(SimpleAttribute attribute, String value, int maxDistance) { + public LevenshteinDistance(Attribute attribute, String value, int maxDistance) { super(attribute); this.value = value; this.maxDistance = maxDistance; From 213bc9aed4396f290668f3c63b0557bff63a7516 Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Wed, 4 Apr 2018 14:09:47 +0300 Subject: [PATCH 4/9] Levenshtein index: preserve the order of liblevenshtein transducer results --- .../cqengine/index/levenshtein/LevenshteinDistanceIndex.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java index a77da08c0..f7da367ff 100644 --- a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java +++ b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java @@ -21,6 +21,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; @@ -60,7 +61,7 @@ public ResultSet retrieve(Query query, QueryOptions queryOptions) { Class queryClass = query.getClass(); if (LevenshteinDistance.class.equals(queryClass)) { LevenshteinDistance lev = (LevenshteinDistance) query; - Set set = new HashSet<>(); + Set set = new LinkedHashSet<>(); transducer.transduce(lev.getValue(), lev.getMaxDistance()).forEach(candidate -> { set.addAll(terms.get(candidate.term())); }); From 2aeb69d220cfd5dd670a6b1195279b5440a6e7e6 Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Mon, 9 Apr 2018 11:12:51 +0300 Subject: [PATCH 5/9] Levenshtein index: finalize dictionary creation --- .../cqengine/index/levenshtein/LevenshteinDistanceIndex.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java index f7da367ff..2c75de519 100644 --- a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java +++ b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java @@ -157,6 +157,7 @@ public Set apply(String s) { List list = new ArrayList<>(terms.keySet()); Collections.sort(list); dict.addAll(list); + dict.finish(); transducer = transducerFactory.buildTransducer(dict); } From abf1addc8a967cf9081f182660bc20a2c722efe8 Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Mon, 9 Apr 2018 17:57:27 +0300 Subject: [PATCH 6/9] Levenshtein index: add convenient toString implementation for LevenshteinDistance query --- .../cqengine/query/simple/LevenshteinDistance.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java index f4b4d5fc4..fe3b8b460 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java +++ b/code/src/main/java/com/googlecode/cqengine/query/simple/LevenshteinDistance.java @@ -59,4 +59,11 @@ public boolean equals(Object o) { if (maxDistance != that.maxDistance) return false; return value != null ? value.equals(that.value) : that.value == null; } + + @Override + public String toString() { + return "distance("+ asLiteral(super.getAttributeName()) + + ", " + asLiteral(value) + + ")<=" + maxDistance; + } } From 9a3dd11239f59e3923342bca7c3849e043663f8e Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Mon, 16 Apr 2018 10:53:38 +0300 Subject: [PATCH 7/9] Levenshtein index: use Finam liblevenshtein-lite dependency in POM --- code/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/pom.xml b/code/pom.xml index e696e69e8..c038215f1 100644 --- a/code/pom.xml +++ b/code/pom.xml @@ -249,8 +249,8 @@ 3.20.1 - com.github.universal-automata - liblevenshtein + ru.finam + liblevenshtein-lite 3.0.0 From 6fdfaf24f8ed9ec0521dc647e815a81610959747 Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Tue, 17 Apr 2018 10:30:20 +0300 Subject: [PATCH 8/9] Levenshtein index: use Finam liblevenshtein-lite dependency in POM (bump version to 3.0.1) --- code/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/pom.xml b/code/pom.xml index c038215f1..bcceab803 100644 --- a/code/pom.xml +++ b/code/pom.xml @@ -251,7 +251,7 @@ ru.finam liblevenshtein-lite - 3.0.0 + 3.0.1 com.esotericsoftware From 285fcdd38f3623856bcd116cf60c38af4dcb0b94 Mon Sep 17 00:00:00 2001 From: Andrei Tomashpolskiy Date: Tue, 17 Apr 2018 12:59:05 +0300 Subject: [PATCH 9/9] Levenshtein index: migrate code to Java 1.6 --- .../levenshtein/LevenshteinDistanceIndex.java | 51 ++++++++++--------- .../cqengine/query/QueryFactory.java | 2 +- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java index 2c75de519..3564d573f 100644 --- a/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java +++ b/code/src/main/java/com/googlecode/cqengine/index/levenshtein/LevenshteinDistanceIndex.java @@ -25,7 +25,6 @@ import java.util.List; import java.util.Map; import java.util.Set; -import java.util.function.Function; /** * @author Ruslan Sennov @@ -42,7 +41,7 @@ public class LevenshteinDistanceIndex extends AbstractAttributeIndex attribute, Algorithm transducerAlgorithm) { - super(attribute, Collections.singleton(LevenshteinDistance.class)); + super(attribute, Collections.>singleton(LevenshteinDistance.class)); this.transducerFactory = new TransducerFactory(transducerAlgorithm); } @@ -57,14 +56,14 @@ public boolean isQuantized() { } @Override - public ResultSet retrieve(Query query, QueryOptions queryOptions) { + public ResultSet retrieve(final Query query, final QueryOptions queryOptions) { Class queryClass = query.getClass(); if (LevenshteinDistance.class.equals(queryClass)) { LevenshteinDistance lev = (LevenshteinDistance) query; - Set set = new LinkedHashSet<>(); - transducer.transduce(lev.getValue(), lev.getMaxDistance()).forEach(candidate -> { + final Set set = new LinkedHashSet(); + for (Candidate candidate : transducer.transduce(lev.getValue(), lev.getMaxDistance())) { set.addAll(terms.get(candidate.term())); - }); + } return new ResultSet() { @Override public Iterator iterator() { @@ -139,22 +138,28 @@ public void clear(QueryOptions queryOptions) { @Override public void init(ObjectStore objectStore, QueryOptions queryOptions) { - try (CloseableIterator it = objectStore.iterator(queryOptions)) { - terms = new HashMap<>(); - it.forEachRemaining(o -> { - attribute.getValues(o, queryOptions).forEach(term -> { - Set objects = terms.computeIfAbsent(term, new Function>() { - @Override - public Set apply(String s) { - return new HashSet<>(); - } - }); - objects.add(o); - }); - }); + CloseableIterator it = null; + try { + it = objectStore.iterator(queryOptions); + terms = new HashMap>(); + O o; + while (it.hasNext()) { + o = it.next(); + for (String term : attribute.getValues(o, queryOptions)) { + if (!terms.containsKey(term)) { + terms.put(term, new HashSet()); + } + terms.get(term).add(o); + } + } + } finally { + if (it != null) { + it.close(); + } } + SortedDawg dict = new SortedDawg(); - List list = new ArrayList<>(terms.keySet()); + List list = new ArrayList(terms.keySet()); Collections.sort(list); dict.addAll(list); dict.finish(); @@ -162,15 +167,15 @@ public Set apply(String s) { } public static LevenshteinDistanceIndex onAttribute(Attribute attribute) { - return new LevenshteinDistanceIndex<>(attribute, Algorithm.STANDARD); + return new LevenshteinDistanceIndex(attribute, Algorithm.STANDARD); } public static LevenshteinDistanceIndex withSpellingCorrectionOnAttribute(Attribute attribute) { - return new LevenshteinDistanceIndex<>(attribute, Algorithm.TRANSPOSITION); + return new LevenshteinDistanceIndex(attribute, Algorithm.TRANSPOSITION); } public static LevenshteinDistanceIndex withOCRCorrectionOnAttribute(Attribute attribute) { - return new LevenshteinDistanceIndex<>(attribute, Algorithm.MERGE_AND_SPLIT); + return new LevenshteinDistanceIndex(attribute, Algorithm.MERGE_AND_SPLIT); } } diff --git a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java index cfa2b9809..86f359a0f 100644 --- a/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java +++ b/code/src/main/java/com/googlecode/cqengine/query/QueryFactory.java @@ -1288,7 +1288,7 @@ public static OrderByOption orderBy(AttributeOrder attributeOrder1, At } public static LevenshteinDistance levenshteinDistance(Attribute attribute, String value, int maxDistance) { - return new LevenshteinDistance<>(attribute, value, maxDistance); + return new LevenshteinDistance(attribute, value, maxDistance); } // ***************************************************************************************************************