Skip to content

Commit

Permalink
Added tests to HomoglyphTokenFilter
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark Khazin committed Dec 22, 2019
1 parent 29499f6 commit 57d7171
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 4 deletions.
15 changes: 11 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,23 @@
<packaging>jar</packaging>
<properties>
<elasticsearch.version>7.3.1</elasticsearch.version>
<lucene.version>8.1.0</lucene.version>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
<scope>compile</scope>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${elasticsearch.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<version>${lucene.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package com.intsights.elasticsearch.index.analysis;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.MockTokenizer;

import java.io.IOException;
import java.io.StringReader;

/**
 * Whitespace-splitting {@link MockTokenizer} whose {@link #readCodePoint()} returns
 * whatever {@link #readChar()} yields, one value per call, without any further
 * processing in this class.
 *
 * <p>NOTE(review): judging by the class name, the intent is to bypass surrogate-pair
 * decoding so that lone surrogates reach the filter under test unchanged; confirm
 * against the {@code MockTokenizer} implementation in use.
 */
class NoUtf16ParsingTokenizer extends MockTokenizer {

    /** Creates the tokenizer with the {@code WHITESPACE} automaton and the boolean flag set to {@code false}. */
    public NoUtf16ParsingTokenizer() {
        super(MockTokenizer.WHITESPACE, false);
    }

    /**
     * Returns the next value produced by {@link #readChar()} as-is.
     *
     * @return the value returned by {@code readChar()}
     * @throws IOException if reading from the underlying input fails
     */
    @Override
    protected int readCodePoint() throws IOException {
        return readChar();
    }
}

/**
 * Token-stream tests for {@link HomoglyphTokenFilter}.
 *
 * <p>Each case feeds text through a tokenizer, wraps the result in the filter and
 * compares the emitted terms with the expected list.
 */
public class TestHomoglyphTokenFilter extends BaseTokenStreamTestCase {

    /**
     * Tokenizes {@code input} with the whitespace mock tokenizer, applies the
     * filter and asserts that exactly the {@code expected} terms are produced.
     *
     * @param input    raw text handed to the tokenizer
     * @param expected terms the filter is expected to emit, in order
     * @throws Exception if tokenizing or asserting fails
     */
    private void assertFiltered(String input, String... expected) throws Exception {
        TokenStream tokens = whitespaceMockTokenizer(input);
        assertTokenStreamContents(new HomoglyphTokenFilter(tokens), expected);
    }

    /** Empty input yields no terms. */
    public void testEmptyInput() throws Exception {
        assertFiltered("");
    }

    /** A single look-alike character is replaced by its plain counterpart. */
    public void testSingleHomoglyph() throws Exception {
        assertFiltered("tℯst", "test");
    }

    /** A lowercase token without look-alike characters is expected to yield no terms. */
    public void testNoHomoglyphs() throws Exception {
        assertFiltered("test");
    }

    /** Characters with several candidate replacements expand into every combination. */
    public void testMultipleMappingHomoglyphs() throws Exception {
        assertFiltered(
            "hellО",
            "hello", "hell0", "hel1o", "hel10", "he1lo", "he1l0", "he11o", "he110");
    }

    /** Characters outside the Basic Multilingual Plane are handled as well. */
    public void test4byteUtf16Homoglyphs() throws Exception {
        assertFiltered("𝒕𝒆𝒔𝒕", "test");
    }

    /** One input character may map to a multi-letter replacement. */
    public void testMultipleLetterHomoglyphs() throws Exception {
        assertFiltered("㏔-㎰", "mb-ps");
    }

    /** An upper-case token is expected to come out lower-cased. */
    public void testLowercasing() throws Exception {
        assertFiltered("TEST", "test");
    }

    /** Several whitespace-separated tokens in one input; "two" is not expected in the output. */
    public void testMultipleTokenInput() throws Exception {
        assertFiltered(
            "onℯ two 𝒕hree fОur FIVE sⅸ",
            "one", "three", "four", "f0ur", "five", "six");
    }

    /**
     * Lone surrogate code units in the middle or at the end of a token are
     * expected to pass through unchanged while the look-alike character in the
     * same token is still replaced.
     */
    public void testUnicodeEdgeCases() throws Exception {
        // Uses the custom tokenizer rather than the whitespace mock helper, so the
        // escaped surrogate units below are read through its readCodePoint override.
        NoUtf16ParsingTokenizer rawTokenizer = new NoUtf16ParsingTokenizer();
        rawTokenizer.setReader(new StringReader("aa\ud802aaℯ bb\udc02bbℯ ℯcc\ud803"));

        String[] expected = {"aa\ud802aae", "bb\udc02bbe", "ecc\ud803"};
        assertTokenStreamContents(new HomoglyphTokenFilter(rawTokenizer), expected);
    }
}

0 comments on commit 57d7171

Please sign in to comment.