diff --git a/README.md b/README.md
index ec86bad..ad724d1 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.
| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
-| 2.3.4 | 2.3.4.1 |
+| 2.3.4 | 2.3.4.2 |
| 2.3.3 | 2.3.3.3 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
@@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part.
## Installation
```bash
-bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.1/elasticsearch-analysis-url-2.3.4.1.zip
+bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip
```
## Usage
diff --git a/pom.xml b/pom.xml
index d7f75ef..45fc797 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
org.elasticsearch
elasticsearch-analysis-url
- 2.3.4.1
+ 2.3.4.2
jar
Elasticsearch URL token filter plugin
diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
index 22eccc3..64a0f94 100644
--- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
+++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -190,20 +190,7 @@ private List tokenize(String urlString) throws IOException {
} catch (MalformedURLException e) {
if (allowMalformed) {
if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
- List tokens = new ArrayList<>();
- Set tokenStrings = new HashSet<>();
- for (URLPart part : parts) {
- for (Token token : tokenizeMalformed(urlString, part)) {
- if (part != URLPart.WHOLE) {
- tokens.add(token);
- tokenStrings.add(token.getToken());
- } else if (!tokenStrings.contains(token.getToken())) {
- // ensure that we are not adding a duplicate token when tokenize the whole malformed URL
- tokens.add(token);
- }
- }
- }
- return tokens;
+ return tokenizePartsMalformed(urlString, parts);
}
return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE);
}
@@ -212,6 +199,32 @@ private List tokenize(String urlString) throws IOException {
}
+ /**
+ * Tokenize all given parts of the given URL while ensuring that duplicate tokens are not created when the whole
+ * malformed URL is identical to a single part token.
+ * @param urlString the malformed URL to be tokenized
+ * @param parts the desired {@link URLPart}s
+ * @return a list of {@link Token}s
+ * @throws IOException if thrown by {@link #tokenizeMalformed(String, URLPart)} for any of the parts
+ */
+ private List tokenizePartsMalformed(String urlString, List parts) throws IOException {
+ List tokens = new ArrayList<>();
+ Set tokenStrings = new HashSet<>();
+ for (URLPart part : parts) {
+ for (Token token : tokenizeMalformed(urlString, part)) {
+ if (part != URLPart.WHOLE) {
+ tokens.add(token);
+ tokenStrings.add(token.getToken());
+ } else if (!tokenStrings.contains(token.getToken())) {
+ // ensure that we are not adding a duplicate token when tokenizing the whole malformed URL
+ tokens.add(token);
+ }
+ }
+ }
+ return tokens;
+ }
+
+
/**
* Attempt to tokenize the given malformed URL.
* @param url the URL to be tokenized
@@ -222,11 +235,7 @@ private List tokenize(String urlString) throws IOException {
private List tokenizeMalformed(String url, URLPart part) throws IOException {
if (part == null) {
// No part is specified. Tokenize all parts.
- List tokens = new ArrayList<>();
- for (URLPart urlPart : URLPart.values()) {
- tokens.addAll(tokenizeMalformed(url, urlPart));
- }
- return tokens;
+ return tokenizePartsMalformed(url, ImmutableList.copyOf(URLPart.values()));
}
Optional partOptional = getPart(url, part);
if (!partOptional.isPresent() || partOptional.get().equals("")) {
diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
index 6ce6a02..e663c04 100644
--- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
+++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -231,6 +231,25 @@ public void testMalformedHostAndWhole() throws Exception {
}
+ @Test
+ public void testTokenizeMalformedNoPartSpecified() throws Exception {
+ URLTokenizer tokenizer = createTokenizer("example.com");
+ tokenizer.setAllowMalformed(true);
+ tokenizer.setTokenizeMalformed(true);
+ tokenizer.setTokenizeHost(false);
+ assertTokenStreamContents(tokenizer, stringArray("example.com"));
+ }
+
+
+ @Test
+ public void testAllowMalformedNoPartsSpecified() throws Exception {
+ URLTokenizer tokenizer = createTokenizer("example.com");
+ tokenizer.setAllowMalformed(true);
+ tokenizer.setTokenizeHost(false);
+ assertTokenStreamContents(tokenizer, stringArray("example.com"));
+ }
+
+
private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
URLTokenizer tokenizer = new URLTokenizer();
if (parts != null) {