Skip to content
This repository has been archived by the owner on May 27, 2020. It is now read-only.

Commit

Permalink
Merge branch 'branch-3.9' into branch-3.8
Browse files Browse the repository at this point in the history
# Conflicts:
#	.jenkins.yml
#	CHANGELOG.md
#	builder/pom.xml
#	doc/documentation.rst
#	plugin/pom.xml
#	plugin/src/main/java/com/stratio/cassandra/lucene/common/GeoTransformation.java
#	plugin/src/main/java/com/stratio/cassandra/lucene/index/NoIDFSimilarity.java
#	plugin/src/main/java/com/stratio/cassandra/lucene/schema/Schema.java
#	plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/SnowballAnalyzerBuilder.java
#	plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/SnowballFilter.java
#	plugin/src/main/java/com/stratio/cassandra/lucene/schema/mapping/Mapper.java
#	plugin/src/main/java/com/stratio/cassandra/lucene/util/BlockingExecutor.java
#	plugin/src/main/scala/com/stratio/cassandra/lucene/Index.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexException.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexOptions.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexPagingState.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexQueryHandler.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexService.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexServiceMBean.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexServiceSkinny.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexServiceWide.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexWriter.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexWriterSkinny.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/IndexWriterWide.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/column/Column.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/column/Columns.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/column/ColumnsMapper.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/index/DocumentIterator.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/index/FSIndex.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/index/NoIDFSimilarity.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/index/RAMIndex.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/index/TokenLengthAnalyzer.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/util/SimplePartitionIterator.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/util/TaskQueue.scala
#	plugin/src/main/scala/com/stratio/cassandra/lucene/util/Tracing.scala
#	plugin/src/test/scala/com/stratio/cassandra/lucene/IndexOptionsTest.scala
#	plugin/src/test/scala/com/stratio/cassandra/lucene/column/ColumnTest.scala
#	plugin/src/test/scala/com/stratio/cassandra/lucene/column/ColumnsMapperTest.scala
#	plugin/src/test/scala/com/stratio/cassandra/lucene/index/FSIndexTest.scala
#	pom.xml
#	testsAT/pom.xml
#	testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/udt/CheckNonFrozenUDTIT.java
#	testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/udt/UDTPartialUpdateIT.java
#	testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/util/CassandraUtils.java
  • Loading branch information
adelapena committed Dec 7, 2016
2 parents c782e05 + d467ba8 commit 1c4d617
Show file tree
Hide file tree
Showing 168 changed files with 3,640 additions and 2,581 deletions.
15 changes: 13 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,17 @@

## 3.8.2 (Upcoming)

* Show error message when unsupported PER PARTITION LIMIT option is used
* Optimize columns mapping (improves indexing performance)
* Add generic support for index partitioning
* Add token-based index partitioner
* Upgrade to Scala 2.12.0
* Avoid not required string interpolations in logging
* Avoid not required string interpolations in tracing
* Add support for geospatial shapes in bounding box search
* Add support for geospatial shapes in distance search
* Improve performance of needs read before write calculation
* Improve performance of needs before write calculation
* Show error message when unsupported PER PARTITION LIMIT option is used
* Upgrade all JSON serializers to FasterXML Jackson 2.8.6

## 3.8.1 (October 17, 2016)

Expand Down Expand Up @@ -97,6 +104,10 @@
## 3.0.3.1 (March 04, 2016)

* Fix performance issues with ClusteringIndexNamesFilter
* Add indexing of WKT geographical shapes (point, linestring, polygon and their multipart)
* Add search by WKT geographical shapes (point, linestring, polygon and their multipart)
* Add API for search-time transformation of WKT geographical shapes
Expand Down
56 changes: 56 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Loads the 'libpipelines' shared library at ref 'feature/multibranch'. The trailing '_' is the
// placeholder symbol the @Library annotation is attached to when nothing else is imported.
@Library('libpipelines@feature/multibranch') _

// Build configuration handed to the 'hose' step.
// NOTE(review): 'hose' and the do*() steps below are not defined in this file; presumably they come
// from the shared library loaded above — confirm there for the exact meaning of each key.
hose {
EMAIL = 'cassandra'
MODULE = 'cassandra-lucene-index'
DEVTIMEOUT = 50
RELEASETIMEOUT = 30
FOSS = true
REPOSITORY = 'cassandra-lucene-index'
LANG = 'java'
PKGMODULES = ['plugin']
PKGMODULESNAMES = ['stratio-cassandra-lucene-index']
DEBARCH = 'all'
RPMARCH = 'noarch'
EXPOSED_PORTS = [9042, 7199, 8000]

PARALLELIZE_AT = true

// Services declared for the acceptance tests: a single 'CASSANDRA' entry describing the image,
// volumes, environment variables and a 'sleep' value.
// NOTE(review): '%%VERSION' looks like a placeholder substituted by the pipeline — confirm.
ATSERVICES = [
['CASSANDRA': [
'image': 'stratio/cassandra-lucene-index:%%VERSION',
'volumes':[
'jts:1.14.0'],
'env': [
'MAX_HEAP=256M',
'START_JOLOKIA=true',
// Jolokia is configured on port 8000, matching EXPOSED_PORTS and it.monitor_services_url below.
'JOLOKIA_OPTS="port=8000,host=$(hostname --ip)"'],
'sleep': 10]],
]

// -D properties for the acceptance tests, all pointing at the 'CASSANDRA' service declared above.
// NOTE(review): the leading '|' margins are not stripped here (no stripMargin() call), so the
// consumer of ATPARAMETERS presumably does it — confirm.
ATPARAMETERS = """
| -Dit.host=%%CASSANDRA
| -Dit.monitor_service=jolokia
| -Dit.monitor_services_url=%%CASSANDRA:8000
| -DJACOCO_SERVER=%%CASSANDRA
| -Dit-embedded=false"""

// DEV flow: doCompile, doUT and doPackage run sequentially; DOC, QC, DEPLOY and DOCKER then run as
// parallel branches; doAT runs last, after the parallel stage has finished.
DEV = { config ->

doCompile(config)
doUT(config)
doPackage(config)

// 'failFast' is the Jenkins 'parallel' option that aborts the remaining branches when one fails;
// here it is driven by config.FAILFAST.
parallel(DOC: {
doDoc(config)
}, QC: {
doStaticAnalysis(config)
}, DEPLOY: {
doDeploy(config)
}, DOCKER : {
doDocker(config)
}, failFast: config.FAILFAST)

doAT(config)
}
}
4 changes: 2 additions & 2 deletions builder/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.8.0</version>
<version>2.8.4</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.8.0</version>
<version>2.8.4</version>
</dependency>
<dependency>
<groupId>junit</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import com.stratio.cassandra.lucene.builder.common.GeoShape;
import com.stratio.cassandra.lucene.builder.common.GeoTransformation;
import com.stratio.cassandra.lucene.builder.index.Index;
import com.stratio.cassandra.lucene.builder.index.Partitioner;
import com.stratio.cassandra.lucene.builder.index.schema.Schema;
import com.stratio.cassandra.lucene.builder.index.schema.analysis.ClasspathAnalyzer;
import com.stratio.cassandra.lucene.builder.index.schema.analysis.SnowballAnalyzer;
Expand Down Expand Up @@ -755,4 +756,32 @@ public static GeoShape.Union union(List<GeoShape> shapes) {
public static GeoShape.Union union(String... shapes) {
return union(Stream.of(shapes).map(Builder::wkt).collect(Collectors.toList()));
}

/**
* Returns a new {@link Partitioner.None}, a partitioner that leaves the index unpartitioned.
*
* Index partitioning is useful to speed up some queries to the detriment of others, depending on the implementation.
* It is also useful to overcome Lucene's hard limit of 2147483519 documents per index.
*
* @return a new no-action partitioner, equivalent to not partitioning the index
*/
public static Partitioner nonePartitioner() {
return new Partitioner.None();
}

/**
* Returns a new {@link Partitioner.OnToken} that splits the index into {@code numPartitions} partitions based on
* the row token.
*
* Index partitioning is useful to speed up some queries to the detriment of others, depending on the implementation.
* It is also useful to overcome Lucene's hard limit of 2147483519 documents per index.
*
* Partitioning on token guarantees a good load balancing between partitions while speeding up partition-directed
* searches to the detriment of token range searches.
*
* The value is passed to the partitioner as-is; no validation is done by this method.
*
* @param numPartitions the number of partitions
* @return a new partitioner based on Cassandra's partitioning token
*/
public static Partitioner partitionerOnToken(int numPartitions) {
return new Partitioner.OnToken(numPartitions);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ public class Index extends JSONBuilder {
private Integer indexingThreads;
private Integer indexingQueuesSize;
private String excludedDataCenters;
private Partitioner partitioner;

/**
* Builds a new {@link Index} creation statement for the specified table and column.
Expand Down Expand Up @@ -209,6 +210,20 @@ public Index schema(Schema schema) {
return this;
}

/**
* Sets the {@link Partitioner} to be used by the index.
*
* Index partitioning is useful to speed up some queries to the detriment of others, depending on the implementation.
* It is also useful to overcome Lucene's hard limit of 2147483519 documents per index.
*
* @param partitioner the {@link Partitioner} to be used
* @return this with the specified partitioner
*/
public Index partitioner(Partitioner partitioner) {
this.partitioner = partitioner;
return this;
}

/** {@inheritDoc} */
@Override
public String build() {
Expand All @@ -226,6 +241,7 @@ public String build() {
option(sb, "indexing_threads", indexingThreads);
option(sb, "indexing_queues_size", indexingQueuesSize);
option(sb, "excluded_data_centers", excludedDataCenters);
option(sb, "partitioner", partitioner);
sb.append(String.format("'schema':'%s'}", schema));
return sb.toString();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Copyright (C) 2014 Stratio (http://stratio.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.stratio.cassandra.lucene.builder.index;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonSubTypes;
import com.fasterxml.jackson.annotation.JsonTypeInfo;
import com.stratio.cassandra.lucene.builder.JSONBuilder;
import com.stratio.cassandra.lucene.builder.index.Partitioner.*;

/**
 * Base type of the index partitioners, which split an index into multiple partitions.
 *
 * Index partitioning is useful to speed up some searches to the detriment of others, depending on the implementation.
 * It is also useful to overcome Lucene's hard limit of 2147483519 documents per index.
 *
 * The concrete implementation is identified in JSON by the {@code type} property ({@code "none"} or
 * {@code "token"}), and {@link None} is declared as the default implementation.
 *
 * @author Andres de la Pena {@literal <[email protected]>}
 */
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME,
              include = JsonTypeInfo.As.PROPERTY,
              property = "type",
              defaultImpl = Partitioner.None.class)
@JsonSubTypes({
        @JsonSubTypes.Type(value = Partitioner.None.class, name = "none"),
        @JsonSubTypes.Type(value = Partitioner.OnToken.class, name = "token")
})
public abstract class Partitioner extends JSONBuilder {

    /**
     * No-action {@link Partitioner}; using it is equivalent to not defining a partitioner at all.
     */
    public static class None extends Partitioner {
    }

    /**
     * {@link Partitioner} based on Cassandra's partitioning token.
     *
     * Partitioning on token guarantees a good load balancing between partitions while speeding up
     * partition-directed searches to the detriment of token range searches.
     */
    public static class OnToken extends Partitioner {

        /** The number of partitions, mapped to the {@code partitions} JSON property. */
        @JsonProperty("partitions")
        public final int partitions;

        /**
         * Builds a new token-based partitioner.
         *
         * @param partitions the number of partitions
         */
        public OnToken(int partitions) {
            this.partitions = partitions;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public void testIndexFull() {
.indexingThreads(4)
.indexingQueuesSize(100)
.excludedDataCenters("DC1,DC2")
.partitioner(partitionerOnToken(8))
.defaultAnalyzer("my_analyzer")
.analyzer("my_analyzer", classpathAnalyzer("my_class"))
.analyzer("snow", snowballAnalyzer("tartar").stopwords("a,b,c"))
Expand All @@ -60,9 +61,17 @@ public void testIndexFull() {
.build();
String expected = "CREATE CUSTOM INDEX idx ON keyspace.table(lucene) " +
"USING 'com.stratio.cassandra.lucene.Index' " +
"WITH OPTIONS = {'refresh_seconds':'10.0','directory_path':'path','ram_buffer_mb':'64'," +
"'max_merge_mb':'16','max_cached_mb':'32','indexing_threads':'4'," +
"'indexing_queues_size':'100','excluded_data_centers':'DC1,DC2','schema':'{" +
"WITH OPTIONS = {" +
"'refresh_seconds':'10.0'," +
"'directory_path':'path'," +
"'ram_buffer_mb':'64'," +
"'max_merge_mb':'16'," +
"'max_cached_mb':'32'," +
"'indexing_threads':'4'," +
"'indexing_queues_size':'100'," +
"'excluded_data_centers':'DC1,DC2'," +
"'partitioner':'{\"type\":\"token\",\"partitions\":8}'," +
"'schema':'{" +
"\"default_analyzer\":\"my_analyzer\",\"analyzers\":{" +
"\"my_analyzer\":{\"type\":\"classpath\",\"class\":\"my_class\"}," +
"\"snow\":{\"type\":\"snowball\",\"language\":\"tartar\",\"stopwords\":\"a,b,c\"}}," +
Expand All @@ -71,6 +80,20 @@ public void testIndexFull() {
assertEquals("index serialization is wrong", expected, actual);
}

/** Checks the JSON produced by the no-action partitioner. */
@Test
public void testNonePartitioner() {
    assertEquals("none partitioner serialization is wrong",
                 "{\"type\":\"none\"}",
                 nonePartitioner().build());
}

/** Checks the JSON produced by a token-based partitioner with six partitions. */
@Test
public void testTokenPartitioner() {
    assertEquals("token partitioner serialization is wrong",
                 "{\"type\":\"token\",\"partitions\":6}",
                 partitionerOnToken(6).build());
}

@Test
public void testBigDecimalMapperDefaults() {
String actual = bigDecimalMapper().build();
Expand Down
56 changes: 54 additions & 2 deletions doc/documentation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ Stratio's Cassandra Lucene Index
- `Example <#example>`__
- `Alternative syntaxes <#alternative-syntaxes>`__
- `Indexing <#indexing>`__
- `Partitioners <#partitioners>`__
- `None partitioner <#none-partitioner>`__
- `Token partitioner <#token-partitioner>`__
- `Analyzers <#analyzers>`__
- `Classpath analyzer <#classpath-analyzer>`__
- `Snowball analyzer <#snowball-analyzer>`__
Expand Down Expand Up @@ -244,7 +247,6 @@ and create them again with running newer version.
If you have a huge amount of data in your cluster, this could be an expensive task. We have tested it, and the following
compatibility matrix shows the versions between which the index does not need to be deleted:


+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
| From\\ To | 3.0.3.0 | 3.0.3.1 | 3.0.4.0 | 3.0.4.1 | 3.0.5.0 | 3.5.0 | 3.5.1 | 3.5.2 | 3.6.0 | 3.7.0 | 3.7.1 | 3.7.2 | 3.7.3 | 3.8.0 | 3.8.1 | 3.8.2 |
+===========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+
Expand Down Expand Up @@ -544,6 +546,7 @@ where <options> is a JSON object:
('indexing_queues_size': '<int_value>',)?
('directory_path': '<string_value>',)?
('excluded_data_centers': '<string_value>',)?
('partitioner': '<partitioner_definition>',)?
'schema': '<schema_definition>'
};
Expand All @@ -565,6 +568,9 @@ All options take a value enclosed in single quotes:
- **excluded\_data\_centers**: The comma-separated list of the data centers
to be excluded. The index will be created on these data centers but all the
write operations will be silently ignored.
- **partitioner**: The optional index `partitioner <#partitioners>`__. Index partitioning is useful
to speed up some searches to the detriment of others, depending on the implementation. It is also
useful to overcome the Lucene's hard limit of 2147483519 documents per index.
- **schema**: see below

.. code-block:: sql
Expand All @@ -589,6 +595,52 @@ Where default\_analyzer defaults to ‘org.apache.lucene.analysis.standard.Stand
type: "<mapper_type>" (, <option>: "<value>")*
}
Partitioners
============

Lucene indexes can be partitioned on a per-node basis. This means that the local index in each node
can be split in multiple smaller fragments. Index partitioning is useful to speed up some searches
to the detriment of others, depending on the implementation. It is also useful to overcome the
Lucene's hard limit of 2147483519 documents per local index.

Partitioning is disabled by default, and it can be activated specifying a partitioner implementation
in the index creation statement.

Please note that the index creation statement specifies the values of several Lucene memory-related
attributes, such as *max_merge_mb* or *ram_buffer_mb*. These attributes are applied to each local
Lucene index or partition, so the amount of memory should be multiplied by the number of partitions.

None partitioner
________________

A partitioner with no action, equivalent to not defining a partitioner. This is the default
implementation.

.. code-block:: sql
CREATE CUSTOM INDEX test_idx ON test()
USING 'com.stratio.cassandra.lucene.Index'
WITH OPTIONS = {
'schema': '{...}',
'partitioner': '{type: "none"}'
};
Token partitioner
_________________

A partitioner based on the partition key token. Partitioning on token guarantees a good load
balancing between partitions while speeding up partition-directed searches to the detriment of any
other searches. The number of partitions per node should be specified.

.. code-block:: sql
CREATE CUSTOM INDEX test_idx ON test()
USING 'com.stratio.cassandra.lucene.Index'
WITH OPTIONS = {
'schema': '{...}',
'partitioner': '{type: "token", partitions: 4}'
};
Analyzers
=========

Expand Down Expand Up @@ -1655,7 +1707,6 @@ Maps an UUID value.
}'
};
Example
=======

Expand All @@ -1675,6 +1726,7 @@ Cassandra shell:
'max_merge_mb': '5',
'max_cached_mb': '30',
'excluded_data_centers': 'dc2,dc3',
'partitioner': '{type: "token", partitions: 4}',
'schema': '{
analyzers: {
my_custom_analyzer: {
Expand Down
Loading

0 comments on commit 1c4d617

Please sign in to comment.