From 57d7171a2b37646c87715f9f4db0efb5c1780ecc Mon Sep 17 00:00:00 2001
From: Mark Khazin
Date: Sun, 22 Dec 2019 12:36:06 +0200
Subject: [PATCH] Added tests to HomoglyphTokenFilter

---
 pom.xml                                     | 15 +++-
 .../analysis/TestHomoglyphTokenFilter.java  | 84 +++++++++++++++++++
 2 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 src/test/java/com/intsights/elasticsearch/index/analysis/TestHomoglyphTokenFilter.java

diff --git a/pom.xml b/pom.xml
index 23d0708..8189a88 100644
--- a/pom.xml
+++ b/pom.xml
@@ -7,16 +7,23 @@
     <packaging>jar</packaging>
 
     <properties>
         <elasticsearch.version>7.3.1</elasticsearch.version>
+        <lucene.version>8.1.0</lucene.version>
         <maven.compiler.source>8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
     </properties>
 
     <dependencies>
         <dependency>
-            <groupId>org.elasticsearch</groupId>
-            <artifactId>elasticsearch</artifactId>
-            <version>${elasticsearch.version}</version>
-            <scope>compile</scope>
+            <groupId>org.elasticsearch</groupId>
+            <artifactId>elasticsearch</artifactId>
+            <version>${elasticsearch.version}</version>
+            <scope>compile</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-test-framework</artifactId>
+            <version>${lucene.version}</version>
+            <scope>test</scope>
         </dependency>
     </dependencies>

diff --git a/src/test/java/com/intsights/elasticsearch/index/analysis/TestHomoglyphTokenFilter.java b/src/test/java/com/intsights/elasticsearch/index/analysis/TestHomoglyphTokenFilter.java
new file mode 100644
index 0000000..72bc505
--- /dev/null
+++ b/src/test/java/com/intsights/elasticsearch/index/analysis/TestHomoglyphTokenFilter.java
@@ -0,0 +1,84 @@
+package com.intsights.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.MockTokenizer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+class NoUtf16ParsingTokenizer extends MockTokenizer {
+    public NoUtf16ParsingTokenizer(){
+        super(MockTokenizer.WHITESPACE, false);
+    }
+
+    @Override
+    protected int readCodePoint() throws IOException {
+        int ch = readChar();
+        return ch;
+    }
+}
+
+public class TestHomoglyphTokenFilter extends BaseTokenStreamTestCase {
+    public void testEmptyInput() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[0]);
+    }
+
+    public void testSingleHomoglyph() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("tℯst");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"test"});
+    }
+
+    public void testNoHomoglyphs() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("test");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[0]);
+    }
+
+    public void testMultipleMappingHomoglyphs() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("hellО");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {
+            "hello", "hell0", "hel1o", "hel10", "he1lo", "he1l0", "he11o", "he110"
+        });
+    }
+
+    public void test4byteUtf16Homoglyphs() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("𝒕𝒆𝒔𝒕");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"test"});
+    }
+
+    public void testMultipleLetterHomoglyphs() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("㏔-㎰");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"mb-ps"});
+    }
+
+    public void testLowercasing() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("TEST");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {"test"});
+    }
+
+    public void testMultipleTokenInput() throws Exception {
+        TokenStream stream = whitespaceMockTokenizer("onℯ two 𝒕hree fОur FIVE sⅸ");
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {
+            "one", "three", "four", "f0ur", "five", "six"
+        });
+    }
+
+    public void testUnicodeEdgeCases() throws Exception {
+        NoUtf16ParsingTokenizer stream = new NoUtf16ParsingTokenizer();
+        stream.setReader(new StringReader("aa\ud802aaℯ bb\udc02bbℯ ℯcc\ud803"));
+
+        HomoglyphTokenFilter filter = new HomoglyphTokenFilter(stream);
+        assertTokenStreamContents(filter, new String[] {
+            "aa\ud802aae", "bb\udc02bbe", "ecc\ud803"
+        });
+    }
+}