Merge branch 'branch-3.9' into branch-3.8

# Conflicts: # .jenkins.yml # CHANGELOG.md # builder/pom.xml # doc/documentation.rst # plugin/pom.xml # plugin/src/main/java/com/stratio/cassandra/lucene/common/GeoTransformation.java # plugin/src/main/java/com/stratio/cassandra/lucene/index/NoIDFSimilarity.java # plugin/src/main/java/com/stratio/cassandra/lucene/schema/Schema.java # plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/SnowballAnalyzerBuilder.java # plugin/src/main/java/com/stratio/cassandra/lucene/schema/analysis/SnowballFilter.java # plugin/src/main/java/com/stratio/cassandra/lucene/schema/mapping/Mapper.java # plugin/src/main/java/com/stratio/cassandra/lucene/util/BlockingExecutor.java # plugin/src/main/scala/com/stratio/cassandra/lucene/Index.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexException.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexOptions.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexPagingState.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexQueryHandler.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexService.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexServiceMBean.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexServiceSkinny.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexServiceWide.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexWriter.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexWriterSkinny.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/IndexWriterWide.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/column/Column.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/column/Columns.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/column/ColumnsMapper.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/index/DocumentIterator.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/index/FSIndex.scala # 
plugin/src/main/scala/com/stratio/cassandra/lucene/index/NoIDFSimilarity.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/index/RAMIndex.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/index/TokenLengthAnalyzer.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/util/SimplePartitionIterator.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/util/TaskQueue.scala # plugin/src/main/scala/com/stratio/cassandra/lucene/util/Tracing.scala # plugin/src/test/scala/com/stratio/cassandra/lucene/IndexOptionsTest.scala # plugin/src/test/scala/com/stratio/cassandra/lucene/column/ColumnTest.scala # plugin/src/test/scala/com/stratio/cassandra/lucene/column/ColumnsMapperTest.scala # plugin/src/test/scala/com/stratio/cassandra/lucene/index/FSIndexTest.scala # pom.xml # testsAT/pom.xml # testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/udt/CheckNonFrozenUDTIT.java # testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/udt/UDTPartialUpdateIT.java # testsAT/src/test/java/com/stratio/cassandra/lucene/testsAT/util/CassandraUtils.java
Stratio · Dec 7, 2016 · 1c4d617 · 1c4d617
2 parents c782e05 + d467ba8
commit 1c4d617
Show file tree

Hide file tree

Showing 168 changed files with 3,640 additions and 2,581 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,10 +2,17 @@
 
 ## 3.8.2 (Upcoming)
 
-* Show error message when unsupported PER PARTITION LIMIT option is used
+* Optimize columns mapping (improves indexing performance)
+* Add generic support for index partitioning
+* Add token-based index partitioner
+* Upgrade to Scala 2.12.0
+* Avoid not required string interpolations in logging
+* Avoid not required string interpolations in tracing
 * Add support for geospatial shapes in bounding box search
 * Add support for geospatial shapes in distance search
-* Improve performance of needs read before write calculation
+* Improve performance of needs read before write calculation
+* Show error message when unsupported PER PARTITION LIMIT option is used
+* Upgrade all JSON serializers to FasterXML Jackson 2.8.6
 
 ## 3.8.1 (October 17, 2016)
 
@@ -97,6 +104,7 @@
 ## 3.0.3.1 (March 04, 2016)
 
 * Fix performance issues with ClusteringIndexNamesFilter
+
 * Add indexing of WKT geographical shapes (point, linestring, polygon and their multipart)
 * Add search by WKT geographical shapes (point, linestring, polygon and their multipart)
 * Add API for search-time transformation of WKT geographical shapes

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -0,0 +1,56 @@
+@Library('libpipelines@feature/multibranch') _
+
+hose {
+    EMAIL = 'cassandra'
+    MODULE = 'cassandra-lucene-index'
+    DEVTIMEOUT = 50
+    RELEASETIMEOUT = 30
+    FOSS = true
+    REPOSITORY = 'cassandra-lucene-index'    
+    LANG = 'java'
+    PKGMODULES = ['plugin']
+    PKGMODULESNAMES = ['stratio-cassandra-lucene-index']
+    DEBARCH = 'all'
+    RPMARCH = 'noarch'
+    EXPOSED_PORTS = [9042, 7199, 8000]
+
+    PARALLELIZE_AT = true
+
+    ATSERVICES =  [
+        ['CASSANDRA': [
+           'image': 'stratio/cassandra-lucene-index:%%VERSION',
+           'volumes':[
+                 'jts:1.14.0'],
+           'env': [
+                 'MAX_HEAP=256M',
+                  'START_JOLOKIA=true',
+                  'JOLOKIA_OPTS="port=8000,host=$(hostname --ip)"'],
+           'sleep': 10]],
+        ]
+
+    ATPARAMETERS = """
+        | -Dit.host=%%CASSANDRA
+        | -Dit.monitor_service=jolokia
+        | -Dit.monitor_services_url=%%CASSANDRA:8000
+        | -DJACOCO_SERVER=%%CASSANDRA
+        | -Dit-embedded=false"""
+
+    DEV = { config ->
+
+        doCompile(config)
+        doUT(config)
+        doPackage(config)
+
+        parallel(DOC: {
+            doDoc(config)
+        }, QC: {
+            doStaticAnalysis(config)
+        }, DEPLOY: {
+            doDeploy(config)
+        }, DOCKER : {    
+            doDocker(config)
+        }, failFast: config.FAILFAST)
+
+        doAT(config)
+    }
+}
diff --git a/builder/pom.xml b/builder/pom.xml
@@ -38,12 +38,12 @@
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-core</artifactId>
-            <version>2.8.0</version>
+            <version>2.8.4</version>
         </dependency>
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>
             <artifactId>jackson-databind</artifactId>
-            <version>2.8.0</version>
+            <version>2.8.4</version>
         </dependency>
         <dependency>
             <groupId>junit</groupId>

diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/Builder.java
@@ -18,6 +18,7 @@
 import com.stratio.cassandra.lucene.builder.common.GeoShape;
 import com.stratio.cassandra.lucene.builder.common.GeoTransformation;
 import com.stratio.cassandra.lucene.builder.index.Index;
+import com.stratio.cassandra.lucene.builder.index.Partitioner;
 import com.stratio.cassandra.lucene.builder.index.schema.Schema;
 import com.stratio.cassandra.lucene.builder.index.schema.analysis.ClasspathAnalyzer;
 import com.stratio.cassandra.lucene.builder.index.schema.analysis.SnowballAnalyzer;
@@ -755,4 +756,32 @@ public static GeoShape.Union union(List<GeoShape> shapes) {
     public static GeoShape.Union union(String... shapes) {
         return union(Stream.of(shapes).map(Builder::wkt).collect(Collectors.toList()));
     }
+
+    /**
+     * Returns a new {@link Partitioner.None} that leaves the index unpartitioned.
+     *
+     * Index partitioning is useful to speed up some queries to the detriment of others, depending on the implementation.
+     * It is also useful to overcome the Lucene's hard limit of 2147483519 documents per index.
+     *
+     * @return a new no-action partitioner, equivalent to not partitioning the index
+     */
+    public static Partitioner nonePartitioner() {
+        return new Partitioner.None();
+    }
+
+    /**
+     * Returns a new {@link Partitioner.OnToken} to split the index in {@code numPartitions} based on the row token.
+     *
+     * Index partitioning is useful to speed up some queries to the detriment of others, depending on the implementation.
+     * It is also useful to overcome the Lucene's hard limit of 2147483519 documents per index.
+     *
+     * Partitioning on token guarantees a good load balancing between partitions while speeding up partition-directed
+     * searches to the detriment of token range searches.
+     *
+     * @param numPartitions the number of partitions
+     * @return a new partitioner based on Cassandra's partitioning token
+     */
+    public static Partitioner partitionerOnToken(int numPartitions) {
+        return new Partitioner.OnToken(numPartitions);
+    }
 }
diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/Index.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/Index.java
@@ -40,6 +40,7 @@ public class Index extends JSONBuilder {
     private Integer indexingThreads;
     private Integer indexingQueuesSize;
     private String excludedDataCenters;
+    private Partitioner partitioner;
 
     /**
      * Builds a new {@link Index} creation statement for the specified table and column.
@@ -209,6 +210,20 @@ public Index schema(Schema schema) {
         return this;
     }
 
+    /**
+     * Sets the {@link Partitioner}.
+     *
+     * Index partitioning is useful to speed up some queries to the detriment of others, depending on the implementation.
+     * It is also useful to overcome the Lucene's hard limit of 2147483519 documents per index.
+     *
+     * @param partitioner the {@link Partitioner}
+     * @return this with the specified partitioner
+     */
+    public Index partitioner(Partitioner partitioner) {
+        this.partitioner = partitioner;
+        return this;
+    }
+
     /** {@inheritDoc} */
     @Override
     public String build() {
@@ -226,6 +241,7 @@ public String build() {
         option(sb, "indexing_threads", indexingThreads);
         option(sb, "indexing_queues_size", indexingQueuesSize);
         option(sb, "excluded_data_centers", excludedDataCenters);
+        option(sb, "partitioner", partitioner);
         sb.append(String.format("'schema':'%s'}", schema));
         return sb.toString();
     }

diff --git a/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/Partitioner.java b/builder/src/main/java/com/stratio/cassandra/lucene/builder/index/Partitioner.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2014 Stratio (http://stratio.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.stratio.cassandra.lucene.builder.index;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonSubTypes;
+import com.fasterxml.jackson.annotation.JsonTypeInfo;
+import com.stratio.cassandra.lucene.builder.JSONBuilder;
+import com.stratio.cassandra.lucene.builder.index.Partitioner.*;
+
+/**
+ * An index partitioner to split the index in multiple partitions.
+ *
+ * Index partitioning is useful to speed up some searches to the detriment of others, depending on the implementation.
+ * It is also useful to overcome the Lucene's hard limit of 2147483519 documents per index.
+ *
+ * @author Andres de la Pena {@literal <[email protected]>}
+ */
+@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type", defaultImpl = None.class)
+@JsonSubTypes({@JsonSubTypes.Type(value = None.class, name = "none"),
+               @JsonSubTypes.Type(value = OnToken.class, name = "token")})
+public abstract class Partitioner extends JSONBuilder {
+
+    /**
+     * {@link Partitioner} with no action, equivalent to not defining a partitioner.
+     */
+    public static class None extends Partitioner {
+    }
+
+    /**
+     * {@link Partitioner} based on the Cassandra's partitioning token.
+     *
+     * Partitioning on token guarantees a good load balancing between partitions while speeding up partition-directed
+     * searches to the detriment of token range searches.
+     */
+    public static class OnToken extends Partitioner {
+
+        @JsonProperty("partitions")
+        public final int partitions;
+
+        public OnToken(int partitions) {
+            this.partitions = partitions;
+        }
+    }
+}
diff --git a/builder/src/test/java/com/stratio/cassandra/lucene/builder/BuilderTest.java b/builder/src/test/java/com/stratio/cassandra/lucene/builder/BuilderTest.java
@@ -52,6 +52,7 @@ public void testIndexFull() {
                                                    .indexingThreads(4)
                                                    .indexingQueuesSize(100)
                                                    .excludedDataCenters("DC1,DC2")
+                                                   .partitioner(partitionerOnToken(8))
                                                    .defaultAnalyzer("my_analyzer")
                                                    .analyzer("my_analyzer", classpathAnalyzer("my_class"))
                                                    .analyzer("snow", snowballAnalyzer("tartar").stopwords("a,b,c"))
@@ -60,9 +61,17 @@ public void testIndexFull() {
                                                    .build();
         String expected = "CREATE CUSTOM INDEX idx ON keyspace.table(lucene) " +
                           "USING 'com.stratio.cassandra.lucene.Index' " +
-                          "WITH OPTIONS = {'refresh_seconds':'10.0','directory_path':'path','ram_buffer_mb':'64'," +
-                          "'max_merge_mb':'16','max_cached_mb':'32','indexing_threads':'4'," +
-                          "'indexing_queues_size':'100','excluded_data_centers':'DC1,DC2','schema':'{" +
+                          "WITH OPTIONS = {" +
+                          "'refresh_seconds':'10.0'," +
+                          "'directory_path':'path'," +
+                          "'ram_buffer_mb':'64'," +
+                          "'max_merge_mb':'16'," +
+                          "'max_cached_mb':'32'," +
+                          "'indexing_threads':'4'," +
+                          "'indexing_queues_size':'100'," +
+                          "'excluded_data_centers':'DC1,DC2'," +
+                          "'partitioner':'{\"type\":\"token\",\"partitions\":8}'," +
+                          "'schema':'{" +
                           "\"default_analyzer\":\"my_analyzer\",\"analyzers\":{" +
                           "\"my_analyzer\":{\"type\":\"classpath\",\"class\":\"my_class\"}," +
                           "\"snow\":{\"type\":\"snowball\",\"language\":\"tartar\",\"stopwords\":\"a,b,c\"}}," +
@@ -71,6 +80,20 @@ public void testIndexFull() {
         assertEquals("index serialization is wrong", expected, actual);
     }
 
+    @Test
+    public void testNonePartitioner() {
+        String actual = nonePartitioner().build();
+        String expected = "{\"type\":\"none\"}";
+        assertEquals("none partitioner serialization is wrong", expected, actual);
+    }
+
+    @Test
+    public void testTokenPartitioner() {
+        String actual = partitionerOnToken(6).build();
+        String expected = "{\"type\":\"token\",\"partitions\":6}";
+        assertEquals("token partitioner serialization is wrong", expected, actual);
+    }
+
     @Test
     public void testBigDecimalMapperDefaults() {
         String actual = bigDecimalMapper().build();

diff --git a/doc/documentation.rst b/doc/documentation.rst
@@ -11,6 +11,9 @@ Stratio's Cassandra Lucene Index
     - `Example <#example>`__
     - `Alternative syntaxes <#alternative-syntaxes>`__
 - `Indexing <#indexing>`__
+    - `Partitioners <#partitioners>`__
+        - `None partitioner <#none-partitioner>`__
+        - `Token partitioner <#token-partitioner>`__
     - `Analyzers <#analyzers>`__
         - `Classpath analyzer <#classpath-analyzer>`__
         - `Snowball analyzer <#snowball-analyzer>`__
@@ -244,7 +247,6 @@ and create them again with running newer version.
 If you have huge amount of data in your cluster this could be an expensive task. We have tested it and here you have a
 compatibility matrix that states between which versions it is not needed to delete the index:
 
-
 +-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
 | From\\ To | 3.0.3.0 | 3.0.3.1 | 3.0.4.0 | 3.0.4.1 | 3.0.5.0 |  3.5.0  |  3.5.1  |  3.5.2  |  3.6.0  |  3.7.0  |  3.7.1  |  3.7.2  |  3.7.3  |  3.8.0  |  3.8.1  |  3.8.2  |
 +===========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+
@@ -544,6 +546,7 @@ where <options> is a JSON object:
        ('indexing_queues_size': '<int_value>',)?
        ('directory_path': '<string_value>',)?
        ('excluded_data_centers': '<string_value>',)?
+       ('partitioner': '<partitioner_definition>',)?
        'schema': '<schema_definition>'
     };
 
@@ -565,6 +568,9 @@ All options take a value enclosed in single quotes:
 -  **excluded\_data\_centers**: The comma-separated list of the data centers
    to be excluded. The index will be created on this data centers but all the
    write operations will be silently ignored.
+-  **partitioner**: The optional index `partitioner <#partitioners>`__. Index partitioning is useful
+   to speed up some searches to the detriment of others, depending on the implementation. It is also
+   useful to overcome Lucene's hard limit of 2147483519 documents per index.
 -  **schema**: see below
 
 .. code-block:: sql
@@ -589,6 +595,52 @@ Where default\_analyzer defaults to ‘org.apache.lucene.analysis.standard.Stand
        type: "<mapper_type>" (, <option>: "<value>")*
     }
 
+Partitioners
+============
+
+Lucene indexes can be partitioned on a per-node basis. This means that the local index in each node
+can be split into multiple smaller fragments. Index partitioning is useful to speed up some searches
+to the detriment of others, depending on the implementation. It is also useful to overcome
+Lucene's hard limit of 2147483519 documents per local index.
+
+Partitioning is disabled by default, and it can be activated by specifying a partitioner
+implementation in the index creation statement.
+
+Please note that the index creation statement specifies the values of several Lucene memory-related
+attributes, such as *max_merge_mb* or *ram_buffer_mb*. These attributes are applied to each local
+Lucene index or partition, so the amount of memory should be multiplied by the number of partitions.
+
+None partitioner
+________________
+
+A partitioner with no action, equivalent to not defining a partitioner. This is the default
+implementation.
+
+.. code-block:: sql
+
+    CREATE CUSTOM INDEX test_idx ON test()
+    USING 'com.stratio.cassandra.lucene.Index'
+    WITH OPTIONS = {
+       'schema': '{...}',
+       'partitioner': '{type: "none"}'
+    };
+
+Token partitioner
+_________________
+
+A partitioner based on the partition key token. Partitioning on token guarantees a good load
+balancing between partitions while speeding up partition-directed searches to the detriment of any
+other searches. The number of partitions per node should be specified.
+
+.. code-block:: sql
+
+    CREATE CUSTOM INDEX test_idx ON test()
+    USING 'com.stratio.cassandra.lucene.Index'
+    WITH OPTIONS = {
+       'schema': '{...}',
+       'partitioner': '{type: "token", partitions: 4}'
+    };
+
 Analyzers
 =========
 
@@ -1655,7 +1707,6 @@ Maps an UUID value.
        }'
     };
 
-
 Example
 =======
 
@@ -1675,6 +1726,7 @@ Cassandra shell:
        'max_merge_mb': '5',
        'max_cached_mb': '30',
        'excluded_data_centers': 'dc2,dc3',
+       'partitioner': '{type: "token", partitions: 4}',
        'schema': '{
           analyzers: {
              my_custom_analyzer: {