Skip to content

Commit

Permalink
Update to ES v1.5.2. Add url_decode option.
Browse files Browse the repository at this point in the history
  • Loading branch information
jlinn committed Jun 3, 2015
1 parent 2c8cf63 commit d1fc265
Show file tree
Hide file tree
Showing 8 changed files with 77 additions and 16 deletions.
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ This plugin enables URL token filtering by URL part.

| Elasticsearch Version | Plugin Version |
|-----------------------|----------------|
| 1.5.2 | 1.1.0 |
| 1.4.2 | 1.0.0 |

## Installation
Expand All @@ -17,8 +18,10 @@ bin/plugin --install analysis-url --url https://github.com/jlinn/elasticsearch-a
```

## Usage
This filter only has one option: `part`. This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
Options:
* `part`: This option defaults to `whole`, which will cause the entire URL to be returned. In this case, the filter only serves to validate incoming URLs. Other possible values are:
`protocol`, `host`, `port`, `path`, `query`, and `ref`.
* `url_decode`: Defaults to `false`. If `true`, the selected part of the URL will be URL-decoded.

Set up your index like so:
```json
Expand All @@ -28,7 +31,8 @@ Set up your index like so:
"filter": {
"url_host": {
"type": "url",
"part": "host"
"part": "host",
"url_decode": true
}
},
"analyzer": {
Expand Down
22 changes: 19 additions & 3 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-analysis-url</artifactId>
<version>1.0-SNAPSHOT</version>
<version>1.1.0</version>
<packaging>jar</packaging>
<description>Elasticsearch URL token filter plugin</description>

Expand All @@ -18,8 +18,8 @@

<properties>
<!-- Source/resource file encoding; this is the standard property name read by Maven plugins. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<elasticsearch.version>1.4.2</elasticsearch.version>
<lucene.version>4.10.2</lucene.version>
<elasticsearch.version>1.5.2</elasticsearch.version>
<lucene.version>4.10.4</lucene.version>
<hamcrest.version>1.3</hamcrest.version>
<tests.output>onerror</tests.output>
<tests.shuffle>true</tests.shuffle>
Expand Down Expand Up @@ -91,6 +91,22 @@
</dependencies>

<build>
<!-- src/main/resources is declared twice so that only *.properties files are filtered:
     Maven placeholders in them (es-plugin.properties uses ${project.version}) are substituted
     at build time, while every other resource is copied without filtering. -->
<resources>
<resource>
<directory>src/main/resources</directory>
<!-- Filtering enabled: placeholders in the included *.properties files are replaced. -->
<filtering>true</filtering>
<includes>
<include>*.properties</include>
</includes>
</resource>
<resource>
<directory>src/main/resources</directory>
<!-- Filtering disabled for everything except the *.properties files handled above. -->
<filtering>false</filtering>
<excludes>
<exclude>*.properties</exclude>
</excludes>
</resource>
</resources>
<testResources>
<testResource>
<directory>${basedir}/src/test/java</directory>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,18 @@
/**
 * Token filter factory that builds {@link URLTokenFilter} instances configured from
 * the filter's {@code part} and {@code url_decode} settings.
 */
@AnalysisSettingsRequired
public class URLTokenFilterFactory extends AbstractTokenFilterFactory {
// URL part the created filters emit; parsed from the "part" setting.
private final URLPart part;
// Whether the created filters URL-decode the extracted part; from the "url_decode" setting.
private final boolean urlDecode;

/**
 * Reads the filter configuration from {@code settings}.
 *
 * @param index         passed through to the superclass constructor
 * @param indexSettings passed through to the superclass constructor
 * @param name          passed through to the superclass constructor
 * @param settings      filter settings; {@code part} defaults to {@code "whole"} and
 *                      {@code url_decode} defaults to {@code false}
 */
@Inject
public URLTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);

this.part = URLPart.fromString(settings.get("part", "whole"));
this.urlDecode = settings.getAsBoolean("url_decode", false);
}

/**
 * Wraps the given stream in a {@link URLTokenFilter} using the configured options.
 *
 * @param tokenStream the stream to filter
 * @return a new filter over {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
// NOTE(review): two return statements appear back to back. The first looks like the
// pre-change line of this diff hunk and the second its replacement; confirm that only
// the three-argument call exists in the committed file.
return new URLTokenFilter(tokenStream, part);
return new URLTokenFilter(tokenStream, part, urlDecode);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;

/**
* Joe Linn
Expand All @@ -18,11 +19,18 @@ public final class URLTokenFilter extends TokenFilter {

private final URLPart part;

private final boolean urlDeocde;

private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);

/**
 * Creates a filter for the given URL part with URL decoding disabled.
 *
 * @param input the token stream to filter
 * @param part  the part of each URL to emit
 */
public URLTokenFilter(TokenStream input, URLPart part) {
// Delegate to the three-argument constructor with urlDecode = false.
this(input, part, false);
}

/**
 * Creates a filter for the given URL part.
 *
 * @param input     the token stream to filter
 * @param part      the part of each URL to emit
 * @param urlDecode stored as the decode flag for this filter
 */
public URLTokenFilter(TokenStream input, URLPart part, boolean urlDecode) {
super(input);
this.part = part;
// NOTE(review): the backing field is spelled "urlDeocde" (sic) where it is declared and read.
this.urlDeocde = urlDecode;
}

@Override
Expand All @@ -34,29 +42,34 @@ public boolean incrementToken() throws IOException {
return false;
}
URL url = new URL(urlString);
String partString;
switch (part) {
case PROTOCOL:
termAttribute.append(url.getProtocol());
partString = url.getProtocol();
break;
case HOST:
termAttribute.append(url.getHost());
partString = url.getHost();
break;
case PORT:
termAttribute.append(String.valueOf(url.getPort()));
partString = String.valueOf(url.getPort());
break;
case PATH:
termAttribute.append(url.getPath());
partString = url.getPath();
break;
case REF:
termAttribute.append(url.getRef());
partString = url.getRef();
break;
case QUERY:
termAttribute.append(url.getQuery());
partString = url.getQuery();
break;
case WHOLE:
default:
termAttribute.append(url.toString());
partString = url.toString();
}
if (urlDeocde) {
partString = URLDecoder.decode(partString, "UTF-8");
}
termAttribute.append(partString);
return true;
}
return false;
Expand Down
3 changes: 2 additions & 1 deletion src/main/resources/es-plugin.properties
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
plugin=org.elasticsearch.plugin.analysis.AnalysisURLPlugin
#lucene=${lucene.version}
#lucene=${lucene.version}
version=${project.version}
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,19 @@ public void testEmptyString() {
assertThat("no tokens", tokens, hasSize(0));
}

/**
 * Verifies that the {@code url_query} analyzer emits the URL-decoded query string.
 */
@Test
public void testUrlDecode() {
    // %40 is the percent-encoding of '@', so the decoded query is "email=foo@bar.com".
    assertURLAnalyzesTo("https://foo.bar.com?email=foo%40bar.com", "url_query", "email=foo@bar.com");
}

/**
 * Sends an index refresh request through the test client's indices admin API
 * and calls {@code get()} on it.
 */
private void refresh() {
client().admin().indices().prepareRefresh().get();
}

/**
 * Analyzes {@code url} with the named analyzer and asserts that exactly one token
 * is produced and that its term equals {@code expected}.
 *
 * @param url      the text passed to {@code analyzeURL}
 * @param analyzer the name of the analyzer to run
 * @param expected the term the single resulting token must have
 */
private void assertURLAnalyzesTo(String url, String analyzer, String expected) {
List<AnalyzeResponse.AnalyzeToken> tokens = analyzeURL(url, analyzer);
assertThat("a URL part was parsed", tokens, hasSize(1));
// NOTE(review): the same comparison appears twice with the last two arguments swapped.
// The first line looks like the pre-change line of this diff hunk and the second
// (expected value before actual value) its replacement; confirm against the committed file.
assertEquals("term value", tokens.get(0).getTerm(), expected);
assertEquals("term value", expected, tokens.get(0).getTerm());
}

private List<AnalyzeResponse.AnalyzeToken> analyzeURL(String url, String analyzer) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,21 @@ public void testNullURL() throws IOException {
filter.incrementToken();
}

/**
 * Verifies that a filter created with URL decoding enabled emits the decoded query string.
 *
 * @throws IOException declared because {@code assertTokenStreamContents} declares it
 */
@Test
public void testUrlDecode() throws IOException {
    // %40 is the percent-encoding of '@', so the decoded query is "email=foo@bar.com".
    assertTokenStreamContents(createFilter("https://www.foo.com?email=foo%40bar.com", URLPart.QUERY, true), "email=foo@bar.com");
}

/**
 * Creates a filter over a single-token stream containing {@code url}, with URL decoding disabled.
 *
 * @param url  the token text; may be {@code null}
 * @param part the URL part the filter should emit
 * @return the configured filter
 */
private URLTokenFilter createFilter(final String url, final URLPart part) {
// Delegate to the three-argument overload with urlDecode = false.
return createFilter(url, part, false);
}

/**
 * Wraps {@code url} in a single-token stream and returns a {@code URLTokenFilter} over it.
 *
 * @param url       the token text; may be {@code null}
 * @param part      the URL part the filter should emit
 * @param urlDecode passed to the filter's constructor as its decode flag
 * @return the configured filter
 */
private URLTokenFilter createFilter(final String url, final URLPart part, final boolean urlDecode) {
// End offset of the token: the URL's length, or 0 when url is null.
int length = 0;
if (url != null) {
length = url.length();
}
// NOTE(review): two return statements appear back to back. The first looks like the
// pre-change line of this diff hunk and the second its replacement; confirm that only
// the three-argument constructor call exists in the committed file.
return new URLTokenFilter(new SingleTokenTokenStream(new Token(url, 0, length)), part);
return new URLTokenFilter(new SingleTokenTokenStream(new Token(url, 0, length)), part, urlDecode);
}

private static void assertTokenStreamContents(TokenStream in, String output) throws IOException {
Expand Down
11 changes: 11 additions & 0 deletions src/test/resources/test-settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
"url_port": {
"type": "url",
"part": "port"
},
"url_query": {
"type": "url",
"part": "query",
"url_decode": true
}
},
"analyzer": {
Expand All @@ -32,6 +37,12 @@
"url_port"
],
"tokenizer": "whitespace"
},
"url_query": {
"filter": [
"url_query"
],
"tokenizer": "whitespace"
}
}
}
Expand Down

0 comments on commit d1fc265

Please sign in to comment.