Skip to content

Commit

Permalink
De-duplicate malformed URL tokens when no URL parts are specified in …
Browse files Browse the repository at this point in the history
…tokenizer configuration
  • Loading branch information
jlinn committed Jul 18, 2016
1 parent 65f041e commit 705fa81
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 22 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ This plugin enables URL tokenization and token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 2.3.4 | 2.3.4.1 |
| 2.3.4 | 2.3.4.2 |
| 2.3.3 | 2.3.3.3 |
| 2.3.2 | 2.3.2.1 |
| 2.3.1 | 2.3.1.1 |
Expand All @@ -27,7 +27,7 @@ This plugin enables URL tokenization and token filtering by URL part.

## Installation
```bash
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.1/elasticsearch-analysis-url-2.3.4.1.zip
bin/plugin install https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.3.4.2/elasticsearch-analysis-url-2.3.4.2.zip
```

## Usage
Expand Down
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>2.3.4.1</version>
<version>2.3.4.2</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,20 +190,7 @@ private List<Token> tokenize(String urlString) throws IOException {
} catch (MalformedURLException e) {
if (allowMalformed) {
if (tokenizeMalformed && parts != null && !parts.isEmpty()) {
List<Token> tokens = new ArrayList<>();
Set<String> tokenStrings = new HashSet<>();
for (URLPart part : parts) {
for (Token token : tokenizeMalformed(urlString, part)) {
if (part != URLPart.WHOLE) {
tokens.add(token);
tokenStrings.add(token.getToken());
} else if (!tokenStrings.contains(token.getToken())) {
// ensure that we are not adding a duplicate token when tokenize the whole malformed URL
tokens.add(token);
}
}
}
return tokens;
return tokenizePartsMalformed(urlString, parts);
}
return tokenizeMalformed(urlString, (parts == null || parts.isEmpty()) ? null : URLPart.WHOLE);
}
Expand All @@ -212,6 +199,32 @@ private List<Token> tokenize(String urlString) throws IOException {
}


/**
 * Tokenizes every requested part of a malformed URL, dropping a {@link URLPart#WHOLE} token when its text
 * is identical to a token already produced for a non-whole part.
 * <p>
 * Only non-whole parts that appear before {@link URLPart#WHOLE} in {@code parts} are considered when
 * checking for duplicates; tokens of non-whole parts are always kept.
 * @param urlString the malformed URL to be tokenized
 * @param parts the desired {@link URLPart}s, processed in iteration order
 * @return a list of {@link Token}s
 * @throws IOException propagated from {@link #tokenizeMalformed(String, URLPart)}
 */
private List<Token> tokenizePartsMalformed(String urlString, List<URLPart> parts) throws IOException {
    List<Token> collected = new ArrayList<>();
    // Token strings emitted so far for parts other than WHOLE.
    Set<String> seenPartTokens = new HashSet<>();
    for (URLPart part : parts) {
        boolean isWhole = part == URLPart.WHOLE;
        for (Token candidate : tokenizeMalformed(urlString, part)) {
            if (isWhole && seenPartTokens.contains(candidate.getToken())) {
                // The whole-URL token would repeat a single-part token; skip it.
                continue;
            }
            collected.add(candidate);
            if (!isWhole) {
                seenPartTokens.add(candidate.getToken());
            }
        }
    }
    return collected;
}


/**
* Attempt to tokenize the given malformed URL.
* @param url the URL to be tokenized
Expand All @@ -222,11 +235,7 @@ private List<Token> tokenize(String urlString) throws IOException {
private List<Token> tokenizeMalformed(String url, URLPart part) throws IOException {
if (part == null) {
// No part is specified. Tokenize all parts.
List<Token> tokens = new ArrayList<>();
for (URLPart urlPart : URLPart.values()) {
tokens.addAll(tokenizeMalformed(url, urlPart));
}
return tokens;
return tokenizePartsMalformed(url, ImmutableList.copyOf(URLPart.values()));
}
Optional<String> partOptional = getPart(url, part);
if (!partOptional.isPresent() || partOptional.get().equals("")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,25 @@ public void testMalformedHostAndWhole() throws Exception {
}


/**
 * A malformed URL tokenized with malformed-URL tokenization enabled and no URL part passed to
 * {@code createTokenizer} should yield the input string exactly once.
 */
@Test
public void testTokenizeMalformedNoPartSpecified() throws Exception {
    final String malformedUrl = "example.com";
    URLTokenizer urlTokenizer = createTokenizer(malformedUrl);
    urlTokenizer.setAllowMalformed(true);
    urlTokenizer.setTokenizeMalformed(true);
    urlTokenizer.setTokenizeHost(false);
    String[] expectedTokens = stringArray("example.com");
    assertTokenStreamContents(urlTokenizer, expectedTokens);
}


/**
 * A malformed URL that is merely allowed (malformed-URL tokenization is not switched on here), with no
 * URL part passed to {@code createTokenizer}, should yield the input string as the only token.
 */
@Test
public void testAllowMalformedNoPartsSpecified() throws Exception {
    final String malformedUrl = "example.com";
    URLTokenizer urlTokenizer = createTokenizer(malformedUrl);
    urlTokenizer.setAllowMalformed(true);
    urlTokenizer.setTokenizeHost(false);
    String[] expectedTokens = stringArray("example.com");
    assertTokenStreamContents(urlTokenizer, expectedTokens);
}


private URLTokenizer createTokenizer(String input, URLPart... parts) throws IOException {
URLTokenizer tokenizer = new URLTokenizer();
if (parts != null) {
Expand Down

0 comments on commit 705fa81

Please sign in to comment.