diff --git a/README.md b/README.md index ec86bad..ad724d1 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part. | Elasticsearch Version | Plugin Version | |-----------------------|----------------| -| 2.3.4 | 2.3.4.1 | +| 2.3.4 | 2.3.4.2 | | 2.3.3 | 2.3.3.3 | | 2.3.2 | 2.3.2.1 | | 2.3.1 | 2.3.1.1 | @@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part. ## Installation ```bash -bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.1/elasticsearch-analysis-url-2.3.4.1.zip +bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip ``` ## Usage diff --git a/pom.xml b/pom.xml index d7f75ef..45fc797 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.elasticsearch elasticsearch-analysis-url - 2.3.4.1 + 2.3.4.2 jar Elasticsearch URL token filter plugin diff --git a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java index 22eccc3..64a0f94 100644 --- a/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java @@ -190,20 +190,7 @@ private List tokenize(String urlString) throws IOException { } catch (MalformedURLException e) { if (allowMalformed) { if (tokenizeMalformed && parts != null && !parts.isEmpty()) { - List tokens = new ArrayList<>(); - Set tokenStrings = new HashSet<>(); - for (URLPart part : parts) { - for (Token token : tokenizeMalformed(urlString, part)) { - if (part != URLPart.WHOLE) { - tokens.add(token); - tokenStrings.add(token.getToken()); - } else if (!tokenStrings.contains(token.getToken())) { - // ensure that we are not adding a duplicate token when tokenize the whole malformed URL - tokens.add(token); - } - } - } - return tokens; + return 
tokenizePartsMalformed(urlString, parts); } return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE); } @@ -212,6 +199,32 @@ private List tokenize(String urlString) throws IOException { } + /** + * Tokenize all given parts of the given URL while ensuring that duplicate tokens are not created when the whole + * malformed URL is identical to a single part token. + * @param urlString the malformed URL to be tokenized + * @param parts the desired {@link URLPart}s + * @return a list of {@link Token}s + * @throws IOException + */ + private List tokenizePartsMalformed(String urlString, List parts) throws IOException { + List tokens = new ArrayList<>(); + Set tokenStrings = new HashSet<>(); + for (URLPart part : parts) { + for (Token token : tokenizeMalformed(urlString, part)) { + if (part != URLPart.WHOLE) { + tokens.add(token); + tokenStrings.add(token.getToken()); + } else if (!tokenStrings.contains(token.getToken())) { + // ensure that we are not adding a duplicate token when tokenizing the whole malformed URL + tokens.add(token); + } + } + } + return tokens; + } + + /** * Attempt to tokenize the given malformed URL. * @param url the URL to be tokenized @@ -222,11 +235,7 @@ private List tokenize(String urlString) throws IOException { private List tokenizeMalformed(String url, URLPart part) throws IOException { if (part == null) { // No part is specified. Tokenize all parts. 
- List tokens = new ArrayList<>(); - for (URLPart urlPart : URLPart.values()) { - tokens.addAll(tokenizeMalformed(url, urlPart)); - } - return tokens; + return tokenizePartsMalformed(url, ImmutableList.copyOf(URLPart.values())); } Optional partOptional = getPart(url, part); if (!partOptional.isPresent() || partOptional.get().equals("")) { diff --git a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java index 6ce6a02..e663c04 100644 --- a/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java +++ b/src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java @@ -231,6 +231,25 @@ public void testMalformedHostAndWhole() throws Exception { } + @Test + public void testTokenizeMalformedNoPartSpecified() throws Exception { + URLTokenizer tokenizer = createTokenizer("example.com"); + tokenizer.setAllowMalformed(true); + tokenizer.setTokenizeMalformed(true); + tokenizer.setTokenizeHost(false); + assertTokenStreamContents(tokenizer, stringArray("example.com")); + } + + + @Test + public void testAllowMalformedNoPartsSpecified() throws Exception { + URLTokenizer tokenizer = createTokenizer("example.com"); + tokenizer.setAllowMalformed(true); + tokenizer.setTokenizeHost(false); + assertTokenStreamContents(tokenizer, stringArray("example.com")); + } + + private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException { URLTokenizer tokenizer = new URLTokenizer(); if (parts != null) {