Added tests to HomoglyphTokenFilter

Intsights · Dec 22, 2019 · 57d7171 · 57d7171
1 parent 29499f6
commit 57d7171
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 4 deletions.
diff --git a/pom.xml b/pom.xml
@@ -7,16 +7,23 @@
 	<packaging>jar</packaging>
 	<properties>
 	    <elasticsearch.version>7.3.1</elasticsearch.version>
+		<lucene.version>8.1.0</lucene.version>
 		<maven.compiler.source>8</maven.compiler.source>
 		<maven.compiler.target>1.8</maven.compiler.target>
 	</properties>
 	<dependencies>
 	    <dependency>
-		<groupId>org.elasticsearch</groupId>
-		<artifactId>elasticsearch</artifactId>
-		<version>${elasticsearch.version}</version>
-		<scope>compile</scope>
+			<groupId>org.elasticsearch</groupId>
+			<artifactId>elasticsearch</artifactId>
+			<version>${elasticsearch.version}</version>
+			<scope>compile</scope>
 	    </dependency>
+		<dependency>
+			<groupId>org.apache.lucene</groupId>
+			<artifactId>lucene-test-framework</artifactId>
+			<version>${lucene.version}</version>
+			<scope>test</scope>
+		</dependency>
 	</dependencies>
 	<build>
 	    <plugins>

diff --git a/src/test/java/com/intsights/elasticsearch/index/analysis/TestHomoglyphTokenFilter.java b/src/test/java/com/intsights/elasticsearch/index/analysis/TestHomoglyphTokenFilter.java
@@ -0,0 +1,84 @@
+package com.intsights.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.MockTokenizer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+class NoUtf16ParsingTokenizer extends MockTokenizer { // Test-only MockTokenizer whose readCodePoint() hands back readChar()'s value as-is.
+    public NoUtf16ParsingTokenizer(){ // No-arg constructor; the tokenizer configuration is fixed in the super call.
+        super(MockTokenizer.WHITESPACE, false); // WHITESPACE pattern; `false` is presumably MockTokenizer's lowerCase flag -- confirm against the Lucene API.
+    }
+
+    @Override
+    protected int readCodePoint() throws IOException { // Returns exactly one readChar() result per call.
+        int ch = readChar(); // NOTE(review): presumably bypasses the parent's surrogate-pair handling so unpaired surrogates reach the filter -- confirm against MockTokenizer.
+        return ch;
+    }
+}
+
+public class TestHomoglyphTokenFilter extends BaseTokenStreamTestCase { // Token-stream tests for HomoglyphTokenFilter: each test feeds one input string and asserts the exact emitted terms.
+    public void testEmptyInput() throws Exception { // Empty input is expected to produce no tokens.
+        TokenStream stream = whitespaceMockTokenizer("");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[0]);
+    }
+
+    public void testSingleHomoglyph() throws Exception { // One look-alike character inside an otherwise ASCII word is expected to be replaced by its ASCII letter.
+        TokenStream stream = whitespaceMockTokenizer("tℯst");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"test"});
+    }
+
+    public void testNoHomoglyphs() throws Exception { // Plain lowercase ASCII input is expected to produce no tokens at all.
+        TokenStream stream = whitespaceMockTokenizer("test");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[0]); // NOTE(review): suggests the filter drops tokens it would leave unchanged -- confirm against HomoglyphTokenFilter.
+    }
+
+    public void testMultipleMappingHomoglyphs() throws Exception { // Three look-alike characters with two expected replacements each (l/1, l/1, o/0).
+        TokenStream stream = whitespaceMockTokenizer("heｌｌО");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] { // All 2*2*2 = 8 combinations are expected, in exactly this order.
+            "hello", "hell0", "hel1o", "hel10", "he1lo", "he1l0", "he11o", "he110"
+        });
+    }
+
+    public void test4byteUtf16Homoglyphs() throws Exception { // Input characters lie outside the BMP (two UTF-16 code units each); expected output is plain "test".
+        TokenStream stream = whitespaceMockTokenizer("𝒕𝒆𝒔𝒕");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"test"});
+    }
+
+    public void testMultipleLetterHomoglyphs() throws Exception { // Each input symbol is a single character expected to expand to two letters ("mb", "ps").
+        TokenStream stream = whitespaceMockTokenizer("㏔-㎰");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"mb-ps"}); // The '-' between the two symbols is expected to be kept.
+    }
+
+    public void testLowercasing() throws Exception { // All-uppercase ASCII input is expected to come out lowercased.
+        TokenStream stream = whitespaceMockTokenizer("TEST");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"test"});
+    }
+
+    public void testMultipleTokenInput() throws Exception { // Six whitespace-separated tokens combining the cases exercised by the tests above.
+        TokenStream stream = whitespaceMockTokenizer("onℯ two 𝒕hree fОur FIVE sⅸ");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] { // "two" has no expected output; the fourth token is expected twice (o/0 variants).
+            "one", "three", "four", "f0ur", "five", "six"
+        });
+    }
+
+    public void testUnicodeEdgeCases() throws Exception { // Unpaired surrogate code units mid-token and at end of input.
+        NoUtf16ParsingTokenizer stream = new NoUtf16ParsingTokenizer(); // Uses the custom tokenizer instead of whitespaceMockTokenizer(...).
+        stream.setReader(new StringReader("aa\ud802aaℯ bb\udc02bbℯ ℯcc\ud803")); // Lone high surrogates (U+D802, U+D803) and a lone low surrogate (U+DC02).
+
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] { // Surrogates are expected to pass through unchanged while the look-alike 'e' is still replaced.
+            "aa\ud802aae", "bb\udc02bbe", "ecc\ud803"
+        });
+    }
+}