diff --git a/README.md b/README.md
index c6eb575..63c6293 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.
| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
-| 2.3.3 | 2.3.3.2 |
+| 2.3.3 | 2.3.3.3 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
| 2.3.0 | 2.3.0.1 |
@@ -26,13 +26,13 @@ This plugin enables URL tokenization and token filtering by URL part.
## Installation
```bash
-bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.3.2/elasticsearch-analysis-url-2.3.3.2.zip
+bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.3.3/elasticsearch-analysis-url-2.3.3.3.zip
```
## Usage
### URL Tokenizer
#### Options:
-* `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
+* `part`: Defaults to `null`. If left `null`, all URL parts will be tokenized, and some additional tokens (`host:port` and `protocol://host`) will be included. May be given either as a single string (one URL part) or as an array of URL parts; see the example following this list. Options are `whole`, `protocol`, `host`, `port`, `path`, `query`, and `ref`.
* `url_decode`: Defaults to `false`. If `true`, URL tokens will be URL decoded.
* `allow_malformed`: Defaults to `false`. If `true`, malformed URLs will not be rejected, but will be passed through without being tokenized.
* `tokenize_malformed`: Defaults to `false`. Has no effect if `allow_malformed` is `false`. If both are `true`, an attempt will be made to tokenize malformed URLs using regular expressions.
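For example, mirroring the new `url_protocol_and_host` entry in `test-settings.json`, a tokenizer that emits only protocol and host tokens could be configured as follows (a minimal settings sketch; the tokenizer and analyzer names are placeholders):
```json
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "url_protocol_and_host": {
          "type": "url",
          "part": ["protocol", "host"]
        }
      },
      "analyzer": {
        "url_protocol_and_host": {
          "tokenizer": "url_protocol_and_host"
        }
      }
    }
  }
}
```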
@@ -92,9 +92,9 @@ curl 'http://localhost:9200/index_name/_analyze?analyzer=url_host&pretty' -d 'ht
### URL Token Filter
#### Options:
* `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
-`protocol`, `host`, `port`, `path`, `query`, and `ref`.
+`protocol`, `host`, `port`, `path`, `query`, and `ref`. May be given either as a single URL part (string) or as an array of URL parts; see the example following this list.
* `url_decode`: Defaults to `false`. If `true`, the desired portion of the URL will be URL decoded.
-* `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
+* `allow_malformed`: Defaults to `false`. If `true`, documents containing malformed URLs will not be rejected, and an attempt will be made to parse the desired URL part from the malformed URL string.
If the desired part cannot be found, no value will be indexed for that field.
* `passthrough`: Defaults to `false`. If `true`, `allow_malformed` is implied, and any non-url tokens will be passed through the filter. Valid URLs will be tokenized according to the filter's other settings.
* `tokenize_host`: Defaults to `true`. If `true`, the host will be further tokenized using a [reverse path hierarchy tokenizer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pathhierarchy-tokenizer.html) with the delimiter set to `.`.
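As with the tokenizer, a filter accepting multiple parts might be declared like this (a hypothetical settings sketch; the filter name is a placeholder):
```json
{
  "settings": {
    "analysis": {
      "filter": {
        "url_protocol_and_host": {
          "type": "url",
          "part": ["protocol", "host"]
        }
      }
    }
  }
}
```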
diff --git a/pom.xml b/pom.xml
index 8fdebfc..e9f0989 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
- <version>2.3.3.3-SNAPSHOT</version>
+ <version>2.3.3.3</version>
<packaging>jar</packaging>
<name>Elasticsearch URL token filter plugin</name>
diff --git a/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java
index f7da8cb..55546f1 100644
--- a/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/URLTokenFilterFactory.java
@@ -1,5 +1,7 @@
package org.elasticsearch.index.analysis;
+import com.google.common.base.Function;
+import com.google.common.collect.FluentIterable;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -8,13 +10,15 @@
import org.elasticsearch.index.analysis.url.URLTokenFilter;
import org.elasticsearch.index.settings.IndexSettingsService;
+import java.util.List;
+
/**
* Joe Linn
* 1/17/2015
*/
@AnalysisSettingsRequired
public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
- private final URLPart part;
+ private final List<URLPart> parts;
private final boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
@@ -27,7 +31,14 @@ public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);
- this.part = URLPart.fromString(settings.get("part", "whole"));
+ this.parts = FluentIterable.of(settings.getAsArray("part", new String[]{"whole"}))
+ .transform(new Function<String, URLPart>() {
+ @Override
+ public URLPart apply(String input) {
+ return URLPart.fromString(input);
+ }
+ }).toList();
+
this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
this.tokenizePath = settings.getAsBoolean("tokenize_path", true);
@@ -39,7 +50,8 @@ public URLTokenFilterFactory(Index index, IndexSettingsService indexSettings, @A
@Override
public TokenStream create(TokenStream tokenStream) {
- return new URLTokenFilter(tokenStream, part, urlDecode, allowMalformed, passthrough)
+ return new URLTokenFilter(tokenStream, null, urlDecode, allowMalformed, passthrough)
+ .setParts(parts)
.setTokenizeMalformed(tokenizeMalformed)
.setTokenizeHost(tokenizeHost)
.setTokenizePath(tokenizePath)
diff --git a/src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java
index fd5125a..97a176f 100644
--- a/src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java
@@ -1,6 +1,7 @@
package org.elasticsearch.index.analysis;
-import com.google.common.base.Strings;
+import com.google.common.base.Function;
+import com.google.common.collect.FluentIterable;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
@@ -9,13 +10,15 @@
import org.elasticsearch.index.analysis.url.URLTokenizer;
import org.elasticsearch.index.settings.IndexSettingsService;
+import java.util.List;
+
/**
* Joe Linn
* 8/1/2015
*/
@AnalysisSettingsRequired
public class URLTokenizerFactory extends AbstractTokenizerFactory {
- private URLPart part;
+ private List<URLPart> parts;
private boolean urlDecode;
private boolean tokenizeHost;
private boolean tokenizePath;
@@ -28,9 +31,14 @@ public class URLTokenizerFactory extends AbstractTokenizerFactory {
public URLTokenizerFactory(Index index, IndexSettingsService indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings.indexSettings(), name, settings);
- String partString = settings.get("part");
- if (!Strings.isNullOrEmpty(partString)) {
- this.part = URLPart.fromString(partString);
+ String[] parts = settings.getAsArray("part");
+ if (parts != null && parts.length > 0) {
+ this.parts = FluentIterable.of(parts).transform(new Function<String, URLPart>() {
+ @Override
+ public URLPart apply(String input) {
+ return URLPart.fromString(input);
+ }
+ }).toList();
}
this.urlDecode = settings.getAsBoolean("url_decode", false);
this.tokenizeHost = settings.getAsBoolean("tokenize_host", true);
@@ -44,7 +52,7 @@ public URLTokenizerFactory(Index index, IndexSettingsService indexSettings, @Ass
@Override
public Tokenizer create() {
URLTokenizer tokenizer = new URLTokenizer();
- tokenizer.setPart(part);
+ tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDecode);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
index da8fc7a..863058a 100644
--- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
+++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -25,7 +25,7 @@
public final class URLTokenFilter extends TokenFilter {
public static final String NAME = "url";
- private final URLPart part;
+ private List<URLPart> parts;
private final boolean urlDeocde;
@@ -69,13 +69,22 @@ public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolea
public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode, boolean allowMalformed, boolean passthrough) {
super(input);
- this.part = part;
+ if (part != null) {
+ this.parts = ImmutableList.of(part);
+ } else {
+ parts = null;
+ }
this.urlDeocde = urlDecode;
this.allowMalformed = allowMalformed;
this.passthrough = passthrough;
}
+ public URLTokenFilter setParts(List<URLPart> parts) {
+ this.parts = parts;
+ return this;
+ }
+
public URLTokenFilter setTokenizeHost(boolean tokenizeHost) {
this.tokenizeHost = tokenizeHost;
return this;
@@ -99,7 +108,7 @@ public URLTokenFilter setTokenizeMalformed(boolean tokenizeMalformed) {
@Override
public boolean incrementToken() throws IOException {
- if(iterator == null || !iterator.hasNext()){
+ if (iterator == null || !iterator.hasNext()) {
if ((iterator != null && !iterator.hasNext() && !passthrough) || !advance()) {
return false;
}
@@ -157,7 +166,8 @@ private boolean advance() throws IOException {
*/
private List<String> tokenize(String input) throws IOException {
List<String> tokens = new ArrayList<>();
- URLTokenizer tokenizer = new URLTokenizer(part);
+ URLTokenizer tokenizer = new URLTokenizer();
+ tokenizer.setParts(parts);
tokenizer.setUrlDecode(urlDeocde);
tokenizer.setTokenizeHost(tokenizeHost);
tokenizer.setTokenizePath(tokenizePath);
@@ -190,18 +200,31 @@ public void reset() throws IOException {
* @return the url part if it can be parsed, null otherwise
*/
private String parseMalformed(String urlString) {
- switch (part) {
- case PROTOCOL:
- return applyPattern(REGEX_PROTOCOL, urlString);
- case PORT:
- return applyPattern(REGEX_PORT, urlString);
- case QUERY:
- return applyPattern(REGEX_QUERY, urlString);
- case WHOLE:
- return urlString;
- default:
- return urlString;
+ if (parts != null && !parts.isEmpty()) {
+ String ret;
+ for (URLPart part : parts) {
+ switch (part) {
+ case PROTOCOL:
+ ret = applyPattern(REGEX_PROTOCOL, urlString);
+ break;
+ case PORT:
+ ret = applyPattern(REGEX_PORT, urlString);
+ break;
+ case QUERY:
+ ret = applyPattern(REGEX_QUERY, urlString);
+ break;
+ case WHOLE:
+ ret = urlString;
+ break;
+ default:
+ ret = urlString;
+ }
+ if (!Strings.isNullOrEmpty(ret)) {
+ return ret;
+ }
+ }
}
+ return urlString;
}
/**
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
index 6fabe1d..58aca4c 100644
--- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -37,7 +37,7 @@ public final class URLTokenizer extends Tokenizer {
/**
* If set, only the given part of the url will be tokenized.
*/
- private URLPart part;
+ private List<URLPart> parts;
/**
* If true, url parts will be url decoded prior to tokenization.
@@ -84,7 +84,7 @@ public URLTokenizer() {
}
public URLTokenizer(URLPart part) {
- this.part = part;
+ setPart(part);
}
@@ -92,8 +92,13 @@ public URLTokenizer(AttributeFactory factory) {
super(factory);
}
+ public void setParts(List<URLPart> parts) { this.parts = parts; }
- public void setPart(URLPart part) { this.part = part; }
+ public void setPart(URLPart part) {
+ if (part != null) {
+ this.parts = ImmutableList.of(part);
+ }
+ }
public void setUrlDecode(boolean urlDecode) { this.urlDecode = urlDecode; }
@@ -164,9 +169,12 @@ private String readerToString(Reader reader) throws IOException {
private List<Token> tokenize(String urlString) throws IOException {
try {
URL url = new URL(urlString);
- if (part != null) {
- // single URL part
- return tokenize(url, part);
+ if (parts != null && !parts.isEmpty()) {
+ List<Token> tokens = new ArrayList<>();
+ for (URLPart part : parts) {
+ tokens.addAll(tokenize(url, part));
+ }
+ return tokens;
}
// No part is specified. Tokenize all parts.
Set<Token> tokens = new HashSet<>();
@@ -177,7 +185,14 @@ private List<Token> tokenize(String urlString) throws IOException {
return Lists.newArrayList(tokens);
} catch (MalformedURLException e) {
if (allowMalformed) {
- return tokenizeMalformed(urlString, tokenizeMalformed ? part : URLPart.WHOLE);
+ if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
+ List<Token> tokens = new ArrayList<>();
+ for (URLPart part : parts) {
+ tokens.addAll(tokenizeMalformed(urlString, part));
+ }
+ return tokens;
+ }
+ return tokenizeMalformed(urlString, tokenizeMalformed ? null : URLPart.WHOLE);
}
throw new IOException("Malformed URL: " + urlString, e);
}
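For reference, the new multi-part API can also be exercised programmatically, as the updated unit tests do. A minimal sketch (the class and method names here are illustrative, not part of the plugin):
```java
import com.google.common.collect.ImmutableList;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenizer;

import java.io.IOException;
import java.io.StringReader;

public class MultiPartTokenizerExample {
    // Builds a tokenizer that emits only the protocol and host tokens of the given URL.
    public static URLTokenizer protocolAndHost(String url) throws IOException {
        URLTokenizer tokenizer = new URLTokenizer();
        tokenizer.setParts(ImmutableList.of(URLPart.PROTOCOL, URLPart.HOST));
        tokenizer.setReader(new StringReader(url));
        return tokenizer;
    }
}
```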
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java
index 3dc6cb1..7669d35 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerIntegrationTest.java
@@ -27,6 +27,8 @@ public void testAnalyze() {
assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_all", "www.foo.bar.com:9200", "http://www.foo.bar.com");
+ assertTokensContain(URLTokenizerTest.TEST_HTTP_URL, "tokenizer_url_protocol_and_host", "http", "www.foo.bar.com", "foo.bar.com", "bar.com", "com");
+
assertTokensContain("foo.bar.com/baz.html/query?a=1", "tokenizer_url_all_malformed", "foo.bar.com", "/baz.html/query");
}
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
index f2b784d..67bd717 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -1,5 +1,6 @@
package org.elasticsearch.index.analysis.url;
+import com.google.common.collect.ImmutableList;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -213,8 +214,18 @@ public void testMalformedWhole() throws Exception {
}
- private URLTokenizer createTokenizer(String input, URLPart part) throws IOException {
- URLTokenizer tokenizer = new URLTokenizer(part);
+ @Test
+ public void testProtocolAndPort() throws Exception {
+ URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PROTOCOL, URLPart.PORT);
+ assertTokenStreamContents(tokenizer, stringArray("http", "9200"));
+ }
+
+
+ private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
+ URLTokenizer tokenizer = new URLTokenizer();
+ if (parts != null) {
+ tokenizer.setParts(ImmutableList.copyOf(parts));
+ }
tokenizer.setReader(new StringReader(input));
return tokenizer;
}
diff --git a/src/test/resources/test-settings.json b/src/test/resources/test-settings.json
index b0b6607..f10b40a 100644
--- a/src/test/resources/test-settings.json
+++ b/src/test/resources/test-settings.json
@@ -14,6 +14,10 @@
"part": "host",
"tokenize_host": false
},
+ "url_protocol_and_host": {
+ "type": "url",
+ "part": ["protocol", "host"]
+ },
"url_all": {
"type": "url"
},
@@ -100,6 +104,9 @@
"tokenizer_url_host_single": {
"tokenizer": "url_host_single"
},
+ "tokenizer_url_protocol_and_host": {
+ "tokenizer": "url_protocol_and_host"
+ },
"tokenizer_url_all": {
"tokenizer": "url_all"
},