Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Levenshtein index #198

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
5 changes: 5 additions & 0 deletions code/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,11 @@
<artifactId>sqlite-jdbc</artifactId>
<version>3.20.1</version>
</dependency>
<dependency>
<groupId>ru.finam</groupId>
<artifactId>liblevenshtein-lite</artifactId>
<version>3.0.0</version>

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Version 3.0.1 is Java 6 compatible and is ready for use.

</dependency>
<dependency>
<groupId>com.esotericsoftware</groupId>
<artifactId>kryo</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
package com.googlecode.cqengine.index.levenshtein;

import com.github.liblevenshtein.collection.dictionary.SortedDawg;
import com.github.liblevenshtein.transducer.Algorithm;
import com.github.liblevenshtein.transducer.Candidate;
import com.github.liblevenshtein.transducer.ITransducer;
import com.github.liblevenshtein.transducer.factory.TransducerBuilder;
import com.googlecode.cqengine.attribute.Attribute;
import com.googlecode.cqengine.index.Index;
import com.googlecode.cqengine.index.support.AbstractAttributeIndex;
import com.googlecode.cqengine.index.support.CloseableIterator;
import com.googlecode.cqengine.persistence.support.ObjectSet;
import com.googlecode.cqengine.persistence.support.ObjectStore;
import com.googlecode.cqengine.query.Query;
import com.googlecode.cqengine.query.option.QueryOptions;
import com.googlecode.cqengine.query.simple.LevenshteinDistance;
import com.googlecode.cqengine.resultset.ResultSet;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;

/**
 * An immutable index which answers {@link LevenshteinDistance} queries on a {@code String} attribute.
 * <p>
 * When {@link #init(ObjectStore, QueryOptions)} is called, every attribute value found in the object store
 * is recorded in a map from term to the objects having that term, and the distinct terms are loaded into a
 * sorted dictionary from which a Levenshtein transducer is built. A query is answered by asking the
 * transducer for all dictionary terms within the requested distance of the query value, and collecting the
 * objects mapped to those terms.
 * <p>
 * The index reports itself as not mutable: {@link #addAll(ObjectSet, QueryOptions)} and
 * {@link #removeAll(ObjectSet, QueryOptions)} throw {@link IllegalStateException}.
 *
 * @author <a href="mailto:[email protected]">Ruslan Sennov</a>
 */
public class LevenshteinDistanceIndex<O> extends AbstractAttributeIndex<String, O> {

    /** Retrieval cost reported by result sets produced by this index. */
    private static final int RETRIEVAL_COST = 10;

    /** Merge cost reported by result sets produced by this index. */
    private static final int MERGE_COST = 10;

    private final TransducerFactory transducerFactory;

    /** Transducer over the distinct terms; {@code null} until {@link #init} has completed. */
    private ITransducer<Candidate> transducer;

    /** Maps each distinct term to the objects having that term; {@code null} until {@link #init} has completed. */
    private Map<String, Set<O>> terms;

    /**
     * Private constructor, used by static factory methods.
     *
     * @param attribute The attribute on which the index will be built
     * @param transducerAlgorithm The liblevenshtein algorithm the transducer will use to measure distance
     */
    private LevenshteinDistanceIndex(Attribute<O, String> attribute, Algorithm transducerAlgorithm) {
        super(attribute, Collections.singleton(LevenshteinDistance.class));
        this.transducerFactory = new TransducerFactory(transducerAlgorithm);
    }

    /**
     * @return {@code false}, this index does not accept modifications after it has been initialized
     */
    @Override
    public boolean isMutable() {
        return false;
    }

    /**
     * @return {@code false}, this index is not quantized
     */
    @Override
    public boolean isQuantized() {
        return false;
    }

    /**
     * Retrieves the objects whose indexed terms the transducer reports as candidates for the query value
     * within the query's maximum distance.
     * <p>
     * The result is materialized eagerly into an insertion-ordered set, so objects matching more than one
     * candidate term are returned once.
     *
     * @param query The query to answer, must be exactly of type {@link LevenshteinDistance}
     * @param queryOptions Query options, exposed unchanged through the returned result set
     * @return A result set wrapping the matching objects
     * @throws IllegalArgumentException If the query is of any other type
     * @throws IllegalStateException If the index has not been initialized yet
     */
    @Override
    public ResultSet<O> retrieve(final Query<O> query, final QueryOptions queryOptions) {
        if (!LevenshteinDistance.class.equals(query.getClass())) {
            throw new IllegalArgumentException("Unsupported query: " + query);
        }
        // Read the fields once, so the lambda below works on a consistent pair.
        final ITransducer<Candidate> currentTransducer = transducer;
        final Map<String, Set<O>> currentTerms = terms;
        if (currentTransducer == null || currentTerms == null) {
            // Fail with a descriptive error instead of a bare NullPointerException.
            throw new IllegalStateException("Levenshtein index has not been initialized");
        }

        final LevenshteinDistance<O> lev = (LevenshteinDistance<O>) query;
        final Set<O> results = new LinkedHashSet<>();
        currentTransducer.transduce(lev.getValue(), lev.getMaxDistance()).forEach(candidate -> {
            Set<O> objectsForTerm = currentTerms.get(candidate.term());
            if (objectsForTerm != null) {
                results.addAll(objectsForTerm);
            }
        });

        return new ResultSet<O>() {
            @Override
            public Iterator<O> iterator() {
                return results.iterator();
            }

            @Override
            public boolean contains(O object) {
                return results.contains(object);
            }

            @Override
            public boolean matches(O object) {
                // The query itself cannot be evaluated without the index, so membership is used instead.
                return results.contains(object);
            }

            @Override
            public Query<O> getQuery() {
                return query;
            }

            @Override
            public QueryOptions getQueryOptions() {
                return queryOptions;
            }

            @Override
            public int getRetrievalCost() {
                return RETRIEVAL_COST;
            }

            @Override
            public int getMergeCost() {
                return MERGE_COST;
            }

            @Override
            public int size() {
                return results.size();
            }

            @Override
            public void close() {
                results.clear();
            }
        };
    }

    /**
     * @return This index itself
     */
    @Override
    public Index<O> getEffectiveIndex() {
        return this;
    }

    /**
     * Not supported.
     *
     * @throws IllegalStateException Always
     */
    @Override
    public boolean addAll(ObjectSet<O> objectSet, QueryOptions queryOptions) {
        // this index is immutable, will never be here
        throw new IllegalStateException();
    }

    /**
     * Not supported.
     *
     * @throws IllegalStateException Always
     */
    @Override
    public boolean removeAll(ObjectSet<O> objectSet, QueryOptions queryOptions) {
        // this index is immutable, will never be here
        throw new IllegalStateException();
    }

    /**
     * Does nothing; the contents of this index are left as they are.
     */
    @Override
    public void clear(QueryOptions queryOptions) {
    }

    /**
     * Builds the term map and the transducer from the objects currently in the given object store.
     * <p>
     * All state is built in local variables and only assigned to the fields once complete, so a failure
     * part-way through does not leave the index with a term map that disagrees with its transducer.
     *
     * @param objectStore The store whose objects are to be indexed
     * @param queryOptions Options passed to the object store iterator and to the attribute
     */
    @Override
    public void init(ObjectStore<O> objectStore, QueryOptions queryOptions) {
        final Map<String, Set<O>> termIndex = new HashMap<>();
        try (CloseableIterator<O> it = objectStore.iterator(queryOptions)) {
            it.forEachRemaining(object -> {
                attribute.getValues(object, queryOptions).forEach(term -> {
                    termIndex.computeIfAbsent(term, key -> new HashSet<>()).add(object);
                });
            });
        }

        // The dictionary is fed the distinct terms in sorted order.
        List<String> sortedTerms = new ArrayList<>(termIndex.keySet());
        Collections.sort(sortedTerms);
        SortedDawg dictionary = new SortedDawg();
        dictionary.addAll(sortedTerms);
        dictionary.finish();

        this.transducer = transducerFactory.buildTransducer(dictionary);
        this.terms = termIndex;
    }

    /**
     * Creates an index on the given attribute which uses {@link Algorithm#STANDARD}.
     *
     * @param attribute The attribute on which the index will be built
     * @param <O> The type of the objects in the collection
     * @return A new, uninitialized index
     */
    public static <O> LevenshteinDistanceIndex<O> onAttribute(Attribute<O, String> attribute) {
        return new LevenshteinDistanceIndex<>(attribute, Algorithm.STANDARD);
    }

    /**
     * Creates an index on the given attribute which uses {@link Algorithm#TRANSPOSITION}.
     *
     * @param attribute The attribute on which the index will be built
     * @param <O> The type of the objects in the collection
     * @return A new, uninitialized index
     */
    public static <O> LevenshteinDistanceIndex<O> withSpellingCorrectionOnAttribute(Attribute<O, String> attribute) {
        return new LevenshteinDistanceIndex<>(attribute, Algorithm.TRANSPOSITION);
    }

    /**
     * Creates an index on the given attribute which uses {@link Algorithm#MERGE_AND_SPLIT}.
     *
     * @param attribute The attribute on which the index will be built
     * @param <O> The type of the objects in the collection
     * @return A new, uninitialized index
     */
    public static <O> LevenshteinDistanceIndex<O> withOCRCorrectionOnAttribute(Attribute<O, String> attribute) {
        return new LevenshteinDistanceIndex<>(attribute, Algorithm.MERGE_AND_SPLIT);
    }
}

/**
 * Package-private helper which remembers the algorithm chosen for an index and builds
 * transducers over a supplied dictionary using that algorithm.
 */
class TransducerFactory {

    /** Algorithm handed to every transducer built by this factory. */
    private final Algorithm algorithm;

    /**
     * @param transducerAlgorithm The algorithm to configure on the transducers this factory builds
     */
    TransducerFactory(Algorithm transducerAlgorithm) {
        algorithm = transducerAlgorithm;
    }

    /**
     * Builds a transducer over the given dictionary, configured with this factory's algorithm
     * and with distance inclusion switched on.
     *
     * @param dictionary The dictionary of terms the transducer will search
     * @return The transducer produced by the builder
     */
    ITransducer<Candidate> buildTransducer(SortedDawg dictionary) {
        return new TransducerBuilder()
                .dictionary(dictionary)
                .algorithm(algorithm)
                .includeDistance(true)
                .build();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -1287,6 +1287,10 @@ public static <O> OrderByOption<O> orderBy(AttributeOrder<O> attributeOrder1, At
return new OrderByOption<O>(attributeOrders);
}

/**
 * Creates a {@link LevenshteinDistance} query for the given attribute, value and maximum distance.
 *
 * @param attribute The attribute to which the query refers
 * @param value The value against which attribute values are to be compared
 * @param maxDistance The maximum distance carried by the query
 * @param <O> The type of the object containing the attribute
 * @return A {@link LevenshteinDistance} query
 */
public static <O> LevenshteinDistance<O> levenshteinDistance(Attribute<O, String> attribute, String value, int maxDistance) {
    return new LevenshteinDistance<O>(attribute, value, maxDistance);
}

// ***************************************************************************************************************

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package com.googlecode.cqengine.query.simple;

import com.googlecode.cqengine.attribute.Attribute;
import com.googlecode.cqengine.attribute.SimpleAttribute;
import com.googlecode.cqengine.query.option.QueryOptions;

import java.util.Objects;

/**
 * A query holding an attribute, a value and a maximum distance, printed as
 * {@code distance(attribute, value)<=maxDistance}.
 * <p>
 * This query cannot evaluate objects by itself: both {@code matches...} methods throw, reporting that
 * a Levenshtein index is missing on the attribute.
 *
 * @author <a href="mailto:[email protected]">Ruslan Sennov</a>
 */
public class LevenshteinDistance<O> extends SimpleQuery<O, String> {

    private final String value;
    private final int maxDistance;

    /**
     * Creates a new {@link SimpleQuery} initialized to make assertions on values of the specified attribute
     *
     * @param attribute The attribute on which the assertion is to be made
     * @param value The value against which attribute values are to be compared
     * @param maxDistance The maximum distance carried by this query
     */
    public LevenshteinDistance(Attribute<O, String> attribute, String value, int maxDistance) {
        super(attribute);
        this.value = value;
        this.maxDistance = maxDistance;
    }

    /**
     * @return The value supplied to the constructor
     */
    public String getValue() {
        return value;
    }

    /**
     * @return The maximum distance supplied to the constructor
     */
    public int getMaxDistance() {
        return maxDistance;
    }

    /**
     * Not supported, this query cannot be evaluated object-by-object.
     *
     * @throws UnsupportedOperationException Always
     */
    @Override
    protected boolean matchesSimpleAttribute(SimpleAttribute<O, String> attribute, O object, QueryOptions queryOptions) {
        throw new UnsupportedOperationException("Missing Levenshtein index on attribute " + attribute.toString());
    }

    /**
     * Not supported, this query cannot be evaluated object-by-object.
     *
     * @throws UnsupportedOperationException Always
     */
    @Override
    protected boolean matchesNonSimpleAttribute(Attribute<O, String> attribute, O object, QueryOptions queryOptions) {
        throw new UnsupportedOperationException("Missing Levenshtein index on attribute " + attribute.toString());
    }

    /**
     * Combines the same three components which {@link #equals(Object)} compares: attribute, value and
     * maximum distance.
     */
    @Override
    protected int calcHashCode() {
        int result = attribute.hashCode();
        result = 31 * result + Objects.hashCode(value);
        result = 31 * result + maxDistance;
        return result;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        LevenshteinDistance<?> that = (LevenshteinDistance<?>) o;

        return attribute.equals(that.attribute)
                && maxDistance == that.maxDistance
                && Objects.equals(value, that.value);
    }

    @Override
    public String toString() {
        return "distance("+ asLiteral(super.getAttributeName())
                + ", " + asLiteral(value)
                + ")<=" + maxDistance;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.googlecode.cqengine.index.levenshtein;

import com.googlecode.cqengine.ConcurrentIndexedCollection;
import com.googlecode.cqengine.IndexedCollection;
import com.googlecode.cqengine.testutil.Car;
import com.googlecode.cqengine.testutil.CarFactory;
import org.junit.Test;

import static com.googlecode.cqengine.query.QueryFactory.levenshteinDistance;
import static org.junit.Assert.assertEquals;

/**
 * Tests for {@link LevenshteinDistanceIndex}.
 *
 * @author <a href="mailto:[email protected]">Ruslan Sennov</a>
 */
public class LevenshteinTest {

    /**
     * Adding objects after the index has been added is expected to fail with IllegalStateException.
     */
    @Test(expected = IllegalStateException.class)
    public void testImmutable() {
        IndexedCollection<Car> cars = new ConcurrentIndexedCollection<Car>();
        // Index first, data second: the second step is the one expected to throw.
        cars.addIndex(LevenshteinDistanceIndex.onAttribute(Car.MODEL));
        cars.addAll(CarFactory.createCollectionOfCars(10));
    }

    /**
     * Populates the collection first, then indexes it and queries for manufacturers within distance 1 of "Frd".
     */
    @Test
    public void testQuery() {
        IndexedCollection<Car> cars = new ConcurrentIndexedCollection<Car>();
        cars.addAll(CarFactory.createCollectionOfCars(10));
        cars.addIndex(LevenshteinDistanceIndex.onAttribute(Car.MANUFACTURER));

        int matchCount = cars.retrieve(levenshteinDistance(Car.MANUFACTURER, "Frd", 1)).size();

        assertEquals(3, matchCount);
    }
}