Update to ES v2.0.0
jlinn committed Nov 9, 2015
1 parent 69e6409 commit e657521
Showing 11 changed files with 66 additions and 45 deletions.
4 changes: 3 additions & 1 deletion .travis.yml
@@ -2,4 +2,6 @@ language: java

 jdk:
 - oraclejdk7
-- oraclejdk8
+- oraclejdk8
+
+sudo: false
3 changes: 2 additions & 1 deletion README.md
@@ -9,14 +9,15 @@ This plugin enables URL tokenization and token filtering by URL part.

 | Elasticsearch Version | Plugin Version |
 |-----------------------|----------------|
+| 2.0.0 | 2.1.0 |
 | 1.6.x, 1.7.x | 2.0.0 |
 | 1.6.0 | 1.2.1 |
 | 1.5.2 | 1.1.0 |
 | 1.4.2 | 1.0.0 |

 ## Installation
 ```bash
-bin/plugin --install analysis-url --url https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.0.0/elasticsearch-analysis-url-2.0.0.zip
+bin/plugin install analysis-url --url https://github.com/jlinn/elasticsearch-analysis-url/releases/download/v2.1.0/elasticsearch-analysis-url-2.1.0.zip
 ```

 ## Usage
8 changes: 4 additions & 4 deletions pom.xml
@@ -6,7 +6,7 @@

     <groupId>org.elasticsearch</groupId>
     <artifactId>elasticsearch-analysis-url</artifactId>
-    <version>2.0.0</version>
+    <version>2.1.0</version>
     <packaging>jar</packaging>
     <description>Elasticsearch URL token filter plugin</description>

@@ -18,8 +18,8 @@

     <properties>
         <project.build.sourceEncodint>UTF-8</project.build.sourceEncodint>
-        <elasticsearch.version>1.7.1</elasticsearch.version>
-        <lucene.version>4.10.4</lucene.version>
+        <elasticsearch.version>2.0.0</elasticsearch.version>
+        <lucene.version>5.2.1</lucene.version>
         <hamcrest.version>1.3</hamcrest.version>
         <tests.output>onerror</tests.output>
         <tests.shuffle>true</tests.shuffle>

@@ -69,7 +69,7 @@

         <dependency>
             <groupId>org.hamcrest</groupId>
-            <artifactId>hamcrest-all</artifactId>
+            <artifactId>hamcrest-library</artifactId>
             <version>${hamcrest.version}</version>
             <scope>test</scope>
         </dependency>
4 changes: 1 addition & 3 deletions src/main/java/org/elasticsearch/index/analysis/URLPart.java
@@ -1,7 +1,5 @@
 package org.elasticsearch.index.analysis;

-import org.elasticsearch.ElasticsearchIllegalArgumentException;
-
 /**
  * Joe Linn
  * 1/17/2015
@@ -21,6 +19,6 @@ public static URLPart fromString(String part) {
                 return urlPart;
             }
         }
-        throw new ElasticsearchIllegalArgumentException(String.format("Unrecognized URL part: %s", part));
+        throw new IllegalArgumentException(String.format("Unrecognized URL part: %s", part));
     }
 }
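Note: ES 2.0 dropped `ElasticsearchIllegalArgumentException`, so `fromString` now throws the JDK's own `IllegalArgumentException`. A minimal caller-side sketch (hypothetical code, not part of this commit):

```java
import org.elasticsearch.index.analysis.URLPart;

// Hypothetical caller, for illustration only: after this commit an
// unrecognized part name surfaces as a plain java.lang.IllegalArgumentException
// rather than the removed ElasticsearchIllegalArgumentException.
public class URLPartExample {
    public static void main(String[] args) {
        try {
            URLPart.fromString("not_a_real_part");
        } catch (IllegalArgumentException e) {
            System.err.println(e.getMessage()); // "Unrecognized URL part: not_a_real_part"
        }
    }
}
```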
src/main/java/org/elasticsearch/index/analysis/URLTokenizerFactory.java
@@ -1,16 +1,14 @@
 package org.elasticsearch.index.analysis;

+import com.google.common.base.Strings;
 import org.apache.lucene.analysis.Tokenizer;
-import org.elasticsearch.common.base.Strings;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.index.analysis.url.URLTokenizer;
 import org.elasticsearch.index.settings.IndexSettings;

-import java.io.Reader;
-
 /**
  * Joe Linn
  * 8/1/2015
@@ -40,9 +38,10 @@ public URLTokenizerFactory(Index index, @IndexSettings Settings indexSettings, @
         this.allowMalformed = settings.getAsBoolean("allow_malformed", false);
     }

+
     @Override
-    public Tokenizer create(Reader reader) {
-        URLTokenizer tokenizer = new URLTokenizer(reader);
+    public Tokenizer create() {
+        URLTokenizer tokenizer = new URLTokenizer();
         tokenizer.setPart(part);
         tokenizer.setUrlDecode(urlDecode);
         tokenizer.setTokenizeHost(tokenizeHost);
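Note: the signature change follows the Lucene 5 `Tokenizer` contract, where the `Reader` is no longer passed at construction time; consumers attach one afterwards. A sketch of the new calling convention (`factory` and `input` are hypothetical; in Lucene 5.2, `setReader` still declares `IOException`):

```java
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.index.analysis.URLTokenizerFactory;

// Illustration only: under Lucene 5 a Tokenizer is created unbound,
// then a Reader is attached in a separate step.
class CreateExample {
    static Tokenizer bind(URLTokenizerFactory factory, Reader input) throws IOException {
        Tokenizer tokenizer = factory.create(); // was factory.create(input) under Lucene 4
        tokenizer.setReader(input);             // the reader is now supplied separately
        return tokenizer;
    }
}
```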
src/main/java/org/elasticsearch/index/analysis/url/URLTokenFilter.java
@@ -1,9 +1,9 @@
 package org.elasticsearch.index.analysis.url;

+import com.google.common.base.Strings;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.elasticsearch.common.base.Strings;
 import org.elasticsearch.index.analysis.URLPart;

 import java.io.IOException;
src/main/java/org/elasticsearch/index/analysis/url/URLTokenizer.java
@@ -1,5 +1,8 @@
 package org.elasticsearch.index.analysis.url;

+import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableList;
+import com.google.common.net.InetAddresses;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
 import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
@@ -8,9 +11,6 @@
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeFactory;
-import org.elasticsearch.common.base.Strings;
-import org.elasticsearch.common.collect.ImmutableList;
-import org.elasticsearch.common.net.InetAddresses;
 import org.elasticsearch.index.analysis.URLPart;

 import java.io.IOException;
@@ -73,18 +73,17 @@ public final class URLTokenizer extends Tokenizer {
     private Iterator<Token> iterator;


-    public URLTokenizer(Reader input) {
-        super(input);
+    public URLTokenizer() {
+
     }

-    public URLTokenizer(Reader input, URLPart part) {
-        this(input);
+    public URLTokenizer(URLPart part) {
         this.part = part;
     }


-    public URLTokenizer(AttributeFactory factory, Reader input) {
-        super(factory, input);
+    public URLTokenizer(AttributeFactory factory) {
+        super(factory);
     }


@@ -206,7 +205,7 @@ private List<Token> tokenize(URL url, URLPart part) throws IOException {
                 end = getEndIndex(start, partStringRaw);
                 return ImmutableList.of(new Token(partString, part, start, end));
             }
-            return tokenize(part, new ReversePathHierarchyTokenizer(new StringReader(partString), '.', '.'), start);
+            return tokenize(part, addReader(new ReversePathHierarchyTokenizer('.', '.'), new StringReader(partString)), start);
         case PORT:
             String port = getPort(url);
             start = url.toString().indexOf(":" + port);
@@ -225,14 +224,14 @@ private List<Token> tokenize(URL url, URLPart part) throws IOException {
                 end = getEndIndex(start, partStringRaw);
                 return ImmutableList.of(new Token(partString, part, start, end));
             }
-            return tokenize(part, new PathHierarchyTokenizer(new StringReader(partString), '/', '/'), start);
+            return tokenize(part, addReader(new PathHierarchyTokenizer('/', '/'), new StringReader(partString)), start);
         case QUERY:
             start = getStartIndex(url, partStringRaw);
             if (!tokenizeQuery) {
                 end = getEndIndex(start, partStringRaw);
                 return ImmutableList.of(new Token(partString, part, start, end));
             }
-            return tokenize(part, new PatternTokenizer(new StringReader(partString), QUERY_SEPARATOR, -1), start);
+            return tokenize(part, addReader(new PatternTokenizer(QUERY_SEPARATOR, -1), new StringReader(partString)), start);
         case PROTOCOL:
         case WHOLE:
             end = partString.length();
@@ -246,6 +245,19 @@ private List<Token> tokenize(URL url, URLPart part) throws IOException {
     }


+    /**
+     * Set the given reader on the given tokenizer
+     * @param tokenizer tokenizer on which the reader is to be set
+     * @param input the reader to set
+     * @return the given tokenizer with the given reader set
+     * @throws IOException
+     */
+    private Tokenizer addReader(Tokenizer tokenizer, Reader input) throws IOException {
+        tokenizer.setReader(input);
+        return tokenizer;
+    }
+
+
     /**
      * Get the start index of the given string in the given url
      * @param url the url
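Note: a minimal consumption sketch under the new contract (assumed usage, not part of this commit). The tokenizer is constructed bare, bound to a reader, then driven through the usual reset/incrementToken/end/close cycle; in Lucene 5.2, `setReader` still declares `IOException`, which is why `addReader` and the updated tests do as well:

```java
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.index.analysis.URLPart;
import org.elasticsearch.index.analysis.url.URLTokenizer;

// Assumed usage sketch: tokenize only the host part of a URL.
public class URLTokenizerUsage {
    public static void main(String[] args) throws IOException {
        URLTokenizer tokenizer = new URLTokenizer(URLPart.HOST);
        tokenizer.setReader(new StringReader("http://www.foo.bar.com:9200/index/_search"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                       // must be called before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // host tokens, e.g. "www.foo.bar.com" and sub-hosts
        }
        tokenizer.end();
        tokenizer.close();
    }
}
```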
src/main/java/org/elasticsearch/plugin/analysis/AnalysisURLPlugin.java
@@ -2,13 +2,13 @@

 import org.elasticsearch.index.analysis.AnalysisModule;
 import org.elasticsearch.index.analysis.URLTokenAnalysisBinderProcessor;
-import org.elasticsearch.plugins.AbstractPlugin;
+import org.elasticsearch.plugins.Plugin;

 /**
  * Joe Linn
  * 1/17/2015
  */
-public class AnalysisURLPlugin extends AbstractPlugin {
+public class AnalysisURLPlugin extends Plugin {
     /**
      * The name of the plugin.
      */
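Note: the rest of this file is collapsed in the commit view. For orientation, a sketch of the shape an ES 2.0 analysis plugin typically takes; the method bodies below are assumptions based on the standard `Plugin` contract (which makes `name()` and `description()` abstract), not this commit's verbatim content:

```java
package org.elasticsearch.plugin.analysis;

import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.URLTokenAnalysisBinderProcessor;
import org.elasticsearch.plugins.Plugin;

// Assumed shape of the migrated plugin class: analysis components are
// registered through onModule(AnalysisModule) in ES 2.0.
public class AnalysisURLPlugin extends Plugin {
    @Override
    public String name() {
        return "analysis-url"; // assumed plugin name
    }

    @Override
    public String description() {
        return "URL tokenizer and token filter"; // assumed description
    }

    public void onModule(AnalysisModule module) {
        module.addProcessor(new URLTokenAnalysisBinderProcessor());
    }
}
```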
src/test/java/org/elasticsearch/index/analysis/url/URLAnalysisTestCase.java
@@ -1,8 +1,10 @@
 package org.elasticsearch.index.analysis.url;

 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
-import org.elasticsearch.common.io.Streams;
-import org.elasticsearch.test.ElasticsearchSingleNodeTest;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.plugin.analysis.AnalysisURLPlugin;
+import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.StreamsUtils;
 import org.junit.Before;

 import java.util.List;
@@ -11,27 +13,32 @@
  * Joe Linn
  * 8/1/2015
  */
-public abstract class URLAnalysisTestCase extends ElasticsearchSingleNodeTest {
+public abstract class URLAnalysisTestCase extends ESIntegTestCase {
     protected static final String INDEX = "url_token_filter";


+    @Override
+    protected Settings nodeSettings(int nodeOrdinal) {
+        return Settings.builder()
+                .put(super.nodeSettings(nodeOrdinal))
+                .put("plugin.types", AnalysisURLPlugin.class.getName())
+                .build();
+    }
+
     /**
      * For subclasses to override. Overrides must call {@code super.setUp()}.
      */
     @Before
     @Override
     public void setUp() throws Exception {
         super.setUp();
-        String settings = Streams.copyToStringFromClasspath("/test-settings.json");
-        String mapping = Streams.copyToStringFromClasspath("/test-mapping.json");
+        String settings = StreamsUtils.copyToStringFromClasspath("/test-settings.json");
+        String mapping = StreamsUtils.copyToStringFromClasspath("/test-mapping.json");
         client().admin().indices().prepareCreate(INDEX).setSettings(settings).addMapping("test", mapping).get();
         refresh();
+        Thread.sleep(75); // Ensure that the shard is available before we start making analyze requests.
     }

     protected void refresh() {
         client().admin().indices().prepareRefresh().get();
     }

     protected List<AnalyzeResponse.AnalyzeToken> analyzeURL(String url, String analyzer) {
         return client().admin().indices().prepareAnalyze(INDEX, url).setAnalyzer(analyzer).get().getTokens();
     }
src/test/java/org/elasticsearch/index/analysis/url/URLTokenFilterIntegrationTest.java
@@ -2,7 +2,6 @@

 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
-import org.elasticsearch.index.query.FilterBuilders;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.search.SearchHits;
 import org.junit.Test;
@@ -60,7 +59,7 @@ public void testMalformed() {

         SearchHits hits = client()
                 .prepareSearch(INDEX)
-                .setQuery(QueryBuilders.filteredQuery(QueryBuilders.matchAllQuery(), FilterBuilders.missingFilter("http_malformed.port")))
+                .setQuery(QueryBuilders.missingQuery("http_malformed.port"))
                 .get()
                 .getHits();
         assertEquals("found a doc missing http_malformed.port", 1, hits.getTotalHits());
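Note: ES 2.0 merged queries and filters and removed `FilterBuilders`, so the old filtered query plus missing filter collapses into `QueryBuilders.missingQuery`. A sketch under the assumed ES 2.0 query DSL, including the roughly equivalent bool/exists formulation that later ES versions recommend:

```java
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;

// Sketch, assuming the ES 2.0 query DSL: both builders match documents that
// lack a value for http_malformed.port.
class MissingQueryExample {
    static QueryBuilder missing() {
        return QueryBuilders.missingQuery("http_malformed.port");
    }

    static QueryBuilder missingViaBool() {
        return QueryBuilders.boolQuery()
                .mustNot(QueryBuilders.existsQuery("http_malformed.port"));
    }
}
```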
src/test/java/org/elasticsearch/index/analysis/url/URLTokenizerTest.java
@@ -54,7 +54,7 @@ public void testTokenizeHost() throws IOException {


     @Test
-    public void testTokenizePort() {
+    public void testTokenizePort() throws IOException {
         URLTokenizer tokenizer = createTokenizer(TEST_HTTP_URL, URLPart.PORT);
         assertThat(tokenizer, hasTokenAtOffset("9200", 23, 27));

@@ -96,15 +96,16 @@ public void testTokenizeQuery() throws IOException {


     @Test
-    public void testTokenizeRef() {
+    public void testTokenizeRef() throws IOException {
         URLTokenizer tokenizer = createTokenizer("http://foo.com#baz", URLPart.REF);
         assertThat(tokenizer, hasTokenAtOffset("baz", 15, 18));
     }


     @Test
     public void testAll() throws IOException {
-        URLTokenizer tokenizer = new URLTokenizer(new StringReader(TEST_HTTPS_URL));
+        URLTokenizer tokenizer = new URLTokenizer();
+        tokenizer.setReader(new StringReader(TEST_HTTPS_URL));
         CharTermAttribute termAttribute = tokenizer.getAttribute(CharTermAttribute.class);
         tokenizer.reset();
         tokenizer.clearAttributes();
@@ -138,8 +139,10 @@ public void testAllowMalformed() throws IOException {
     }


-    private URLTokenizer createTokenizer(String input, URLPart part) {
-        return new URLTokenizer(new StringReader(input), part);
+    private URLTokenizer createTokenizer(String input, URLPart part) throws IOException {
+        URLTokenizer tokenizer = new URLTokenizer(part);
+        tokenizer.setReader(new StringReader(input));
+        return tokenizer;
     }

