From d1fc26542997e48de84fdf5d39f312192d49af0e Mon Sep 17 00:00:00 2001 From: jlinn Date: Tue, 2 Jun 2015 18:52:39 -0700 Subject: [PATCH] Update to ES v1.5.2. Add url_decode option. --- README.md | 8 ++++-- pom.xml | 22 ++++++++++++--- .../index/analysis/URLTokenFilterFactory.java | 4 ++- .../index/analysis/url/URLTokenFilter.java | 27 ++++++++++++++----- src/main/resources/es-plugin.properties | 3 ++- .../url/URLTokenFilterIntegrationTest.java | 7 ++++- .../analysis/url/URLTokenFilterTest.java | 11 +++++++- src/test/resources/test-settings.json | 11 ++++++++ 8 files changed, 77 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 5ff0e45..03e7ebb 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ This plugin enables URL token filtering by URL part. | Elasticsearch Version | Plugin Version | |-----------------------|----------------| +| 1.5.2 | 1.1.0 | | 1.4.2 | 1.0.0 | ## Installation @@ -17,8 +18,10 @@ bin/plugin --install analysis-url --url https://github.com/jlinn/elasticsearch-a ``` ## Usage -This filter only has one option: `part`. This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are: +Options: +* `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are: `protocol`, `host`, `port`, `path`, `query`, and `ref`. +* `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be url decoded. 
Set up your index like so: ```json @@ -28,7 +31,8 @@ Set up your index like so: "filter": { "url_host": { "type": "url", - "part": "host" + "part": "host", + "url_decode": true } }, "analyzer": { diff --git a/pom.xml b/pom.xml index 34a0e8f..9cce5d2 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.elasticsearch elasticsearch-analysis-url - 1.0-SNAPSHOT + 1.1.0 jar Elasticsearch URL token filter plugin @@ -18,8 +18,8 @@ UTF-8 - 1.4.2 - 4.10.2 + 1.5.2 + 4.10.4 1.3 onerror true @@ -91,6 +91,22 @@ + + + src/main/resources + true + + *.properties + + + + src/main/resources + false + + *.properties + + + ${basedir}/src/test/java diff --git a/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java index 14aa722..d0fe96d 100644 --- a/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java @@ -15,16 +15,18 @@ @AnalysisSettingsRequired public class URLTokenFilterFactory extends AbstractTokenFilterFactory { private final URLPart part; + private final boolean urlDecode; @Inject public URLTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); this.part = URLPart.fromString(settings.get("part", "whole")); + this.urlDecode = settings.getAsBoolean("url_decode", false); } @Override public TokenStream create(TokenStream tokenStream) { - return new URLTokenFilter(tokenStream, part); + return new URLTokenFilter(tokenStream, part, urlDecode); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java index a5874b7..47c6a1f 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java +++ 
b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.net.URL; +import java.net.URLDecoder; /** * Joe Linn @@ -18,11 +19,18 @@ public final class URLTokenFilter extends TokenFilter { private final URLPart part; + private final boolean urlDecode; + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); public URLTokenFilter(TokenStream input, URLPart part) { + this(input, part, false); + } + + public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) { super(input); this.part = part; + this.urlDecode = urlDecode; } @Override @@ -34,29 +42,34 @@ public boolean incrementToken() throws IOException { return false; } URL url = new URL(urlString); + String partString; switch (part) { case PROTOCOL: - termAttribute.append(url.getProtocol()); + partString = url.getProtocol(); break; case HOST: - termAttribute.append(url.getHost()); + partString = url.getHost(); break; case PORT: - termAttribute.append(String.valueOf(url.getPort())); + partString = String.valueOf(url.getPort()); break; case PATH: - termAttribute.append(url.getPath()); + partString = url.getPath(); break; case REF: - termAttribute.append(url.getRef()); + partString = url.getRef(); break; case QUERY: - termAttribute.append(url.getQuery()); + partString = url.getQuery(); break; case WHOLE: default: - termAttribute.append(url.toString()); + partString = url.toString(); + } + if (urlDecode && partString != null) { /* getRef()/getQuery() return null when the URL lacks that part; decoding null would throw */ + partString = URLDecoder.decode(partString, "UTF-8"); } + termAttribute.append(partString); return true; } return false; diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties index fd98921..56b80b3 100644 --- a/src/main/resources/es-plugin.properties +++ b/src/main/resources/es-plugin.properties @@ -1,2 +1,3 @@ plugin=org.elasticsearch.plugin.analysis.AnalysisURLPlugin -#lucene=${lucene.version} \ No newline at end of file +#lucene=${lucene.version}
+version=${project.version} \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java index 861e57d..2c86664 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java @@ -55,6 +55,11 @@ public void testEmptyString() { assertThat("no tokens", tokens, hasSize(0)); } + @Test + public void testUrlDecode() { + assertURLAnalyzesTo("https://foo.bar.com?email=foo%40bar.com", "url_query", "email=foo@bar.com"); + } + private void refresh() { client().admin().indices().prepareRefresh().get(); } @@ -62,7 +67,7 @@ private void refresh() { private void assertURLAnalyzesTo(String url, String analyzer, String expected) { List tokens = analyzeURL(url, analyzer); assertThat("a URL part was parsed", tokens, hasSize(1)); - assertEquals("term value", tokens.get(0).getTerm(), expected); + assertEquals("term value", expected, tokens.get(0).getTerm()); } private List analyzeURL(String url, String analyzer) { diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java index 4452a69..a30955b 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterTest.java @@ -60,12 +60,21 @@ public void testNullURL() throws IOException { filter.incrementToken(); } + @Test + public void testUrlDecode() throws IOException { + assertTokenStreamContents(createFilter("https://www.foo.com?email=foo%40bar.com", URLPart.QUERY, true), "email=foo@bar.com"); + } + private URLTokenFilter createFilter(final String url, final URLPart part) { + return createFilter(url, part, false); + } + + private URLTokenFilter createFilter(final 
String url, final URLPart part, final boolean urlDecode) { int length = 0; if (url != null) { length = url.length(); } - return new URLTokenFilter(new SingleTokenTokenStream(new Token(url, 0, length)), part); + return new URLTokenFilter(new SingleTokenTokenStream(new Token(url, 0, length)), part, urlDecode); } private static void assertTokenStreamContents(TokenStream in, String output) throws IOException { diff --git a/src/test/resources/test-settings.json b/src/test/resources/test-settings.json index d311aa4..17136f9 100644 --- a/src/test/resources/test-settings.json +++ b/src/test/resources/test-settings.json @@ -12,6 +12,11 @@ "url_port": { "type": "url", "part": "port" + }, + "url_query": { + "type": "url", + "part": "query", + "url_decode": true } }, "analyzer": { @@ -32,6 +37,12 @@ "url_port" ], "tokenizer": "whitespace" + }, + "url_query": { + "filter": [ + "url_query" + ], + "tokenizer": "whitespace" } } }