Skip to content

Commit

Permalink
detect and reject ambiguous equality predicates; testAmbiguousPredica…
Browse files Browse the repository at this point in the history
…tes passes
  • Loading branch information
jbellis committed Dec 12, 2024
1 parent 3315a12 commit b19fead
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 35 deletions.
27 changes: 20 additions & 7 deletions src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.cassandra.db.marshal.VectorType;
import org.apache.cassandra.index.IndexRegistry;
Expand Down Expand Up @@ -194,15 +195,27 @@ protected Restriction newEQRestriction(TableMetadata table, VariableSpecificatio
if (mapKey == null)
{
Term term = toTerm(toReceivers(columnDef), value, table.keyspace, boundNames);
var analyzedIndex = IndexRegistry.obtain(table).supportsAnalyzedEq(columnDef);
if (analyzedIndex == null)
// Leave the restriction as EQ if no analyzed index in backwards compatibility mode is present
var ebi = IndexRegistry.obtain(table).getEqBehavior(columnDef);
if (ebi.behavior == IndexRegistry.EqBehavior.EQ)
return new SingleColumnRestriction.EQRestriction(columnDef, term);

ClientWarn.instance.warn(String.format(AnalyzerEqOperatorSupport.EQ_RESTRICTION_ON_ANALYZED_WARNING,
columnDef.toString(),
analyzedIndex.getIndexMetadata().name),
columnDef);
return new SingleColumnRestriction.AnalyzerMatchesRestriction(columnDef, term);
// the index is configured to transform EQ into MATCH for backwards compatibility
if (ebi.behavior == IndexRegistry.EqBehavior.MATCH)
{
ClientWarn.instance.warn(String.format(AnalyzerEqOperatorSupport.EQ_RESTRICTION_ON_ANALYZED_WARNING,
columnDef.toString(),
ebi.matchIndex.getIndexMetadata().name),
columnDef);
return new SingleColumnRestriction.AnalyzerMatchesRestriction(columnDef, term);
}

// multiple indexes support EQ, this is unsupported
assert ebi.behavior == IndexRegistry.EqBehavior.AMBIGUOUS;
throw invalidRequest(AnalyzerEqOperatorSupport.EQ_AMBIGUOUS_ERROR,
columnDef.toString(),
ebi.matchIndex.getIndexMetadata().name,
ebi.eqIndex.getIndexMetadata().name);
}
List<? extends ColumnSpecification> receivers = toReceivers(columnDef);
Term entryKey = toTerm(Collections.singletonList(receivers.get(0)), mapKey, table.keyspace, boundNames);
Expand Down
90 changes: 85 additions & 5 deletions src/java/org/apache/cassandra/index/IndexRegistry.java
Original file line number Diff line number Diff line change
Expand Up @@ -343,14 +343,94 @@ public static IndexRegistry obtain(TableMetadata table)
return table.isVirtual() ? EMPTY : Keyspace.openAndGetStore(table).indexManager;
}

default Index supportsAnalyzedEq(ColumnMetadata cm)
enum EqBehavior
{
EQ,
MATCH,
AMBIGUOUS
}

class EqBehaviorIndexes
{
public EqBehavior behavior;
public final Index eqIndex;
public final Index matchIndex;

private EqBehaviorIndexes(Index eqIndex, Index matchIndex, EqBehavior behavior)
{
this.eqIndex = eqIndex;
this.matchIndex = matchIndex;
this.behavior = behavior;
}

public static EqBehaviorIndexes eq(Index eqIndex)
{
return new EqBehaviorIndexes(eqIndex, null, EqBehavior.EQ);
}

public static EqBehaviorIndexes match(Index eqAndMatchIndex)
{
return new EqBehaviorIndexes(eqAndMatchIndex, eqAndMatchIndex, EqBehavior.MATCH);
}

public static EqBehaviorIndexes ambiguous(Index firstEqIndex, Index secondEqIndex)
{
return new EqBehaviorIndexes(firstEqIndex, secondEqIndex, EqBehavior.AMBIGUOUS);
}
}

/**
* @return - EQ if a single index supports EQ
* - MATCHES if a single index supports both
* - AMBIGUOUS if multiple indexes support EQ
*/
default EqBehaviorIndexes getEqBehavior(ColumnMetadata cm)
{
// scan the indexes for MATCHES and EQ support
Index matchesIndex = null;
Index eqIndex = null;
for (Index index : listIndexes())
{
if (index.supportsExpression(cm, Operator.ANALYZER_MATCHES) &&
index.supportsExpression(cm, Operator.EQ))
return index;
if (index.supportsExpression(cm, Operator.EQ))
{
if (eqIndex == null)
{
eqIndex = index;
continue;
}

// If we find a second EQ index, return AMBIGUOUS, taking care to assign the eqIndex and matchIndex correctly
if (index.supportsExpression(cm, Operator.ANALYZER_MATCHES))
{
matchesIndex = index;
}
else
{
assert eqIndex.supportsExpression(cm, Operator.ANALYZER_MATCHES);
matchesIndex = eqIndex;
eqIndex = index;
}
return EqBehaviorIndexes.ambiguous(eqIndex, matchesIndex);
}

if (index.supportsExpression(cm, Operator.ANALYZER_MATCHES))
{
// should only ever have one
assert matchesIndex == null;
matchesIndex = index;
}
}
return null;

// If we didn't find any indexes that support EQ or MATCHES, return EQ
if (eqIndex == null && matchesIndex == null)
return EqBehaviorIndexes.eq(null);

// If the same index supports both EQ and MATCHES, promote to MATCHES
if (eqIndex == matchesIndex)
return EqBehaviorIndexes.match(matchesIndex);

// Otherwise we either have no MATCHES index, or it's distinct from the EQ index.
// In both cases we want to return EQ
return EqBehaviorIndexes.eq(eqIndex);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.google.common.annotations.VisibleForTesting;

import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.stress.settings.OptionEnumProbabilities;

/**
* Index config property for defining the behaviour of the equals operator (=) when the index is analyzed.
Expand Down Expand Up @@ -49,7 +50,7 @@ public class AnalyzerEqOperatorSupport
OPTION, Arrays.toString(Value.values()));

public static final String EQ_RESTRICTION_ON_ANALYZED_WARNING =
String.format("Columns [%%s] are restricted by '=' and have analyzed indexes [%%s] able to process those restrictions. " +
String.format("Column [%%s] is restricted by '=' and has an analyzed index [%%s] able to process those restrictions. " +
"Analyzed indexes might process '=' restrictions in a way that is inconsistent with non-indexed queries. " +
"While '=' is still supported on analyzed indexes for backwards compatibility, " +
"it is recommended to use the ':' operator instead to prevent the ambiguity. " +
Expand All @@ -58,6 +59,13 @@ public class AnalyzerEqOperatorSupport
"please use '%s':'%s' in the index options.",
OPTION, Value.UNSUPPORTED.toString().toLowerCase());

public static final String EQ_AMBIGUOUS_ERROR =
String.format("Column [%%s] equality predicate is ambiguous. It has both an analyzed index [%%s] configured with '%s':'%s', " +
"and an un-analyzed index [%%s]. " +
"To avoid ambiguity, drop the analyzed index and recreate it with option '%s':'%s'.",
OPTION, Value.MATCH.toString().toLowerCase(), OPTION, Value.UNSUPPORTED.toString().toLowerCase());


public static final String LWT_CONDITION_ON_ANALYZED_WARNING =
"Index analyzers not applied to LWT conditions on columns [%s].";

Expand Down
5 changes: 5 additions & 0 deletions test/unit/org/apache/cassandra/cql3/CQLTester.java
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,11 @@ protected String currentIndex()
return indexes.get(indexes.size() - 1);
}

protected String getIndex(int i)
{
return indexes.get(i);
}

protected Collection<String> currentTables()
{
if (tables == null || tables.isEmpty())
Expand Down
146 changes: 124 additions & 22 deletions test/unit/org/apache/cassandra/index/sai/cql/BM25Test.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,25 @@

import org.junit.Test;

import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.index.sai.SAITester;
import org.apache.cassandra.index.sai.plan.QueryController;

import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.apache.cassandra.index.sai.analyzer.AnalyzerEqOperatorSupport.EQ_AMBIGUOUS_ERROR;
import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat;

public class BM25Test extends SAITester
{
@Test
public void testMatchingAllowed() throws Throwable
{
// match operator should be allowed with BM25 on the same column
// (seems obvious but exercises a corner case in the internal RestrictionSet processing)
createSimpleTable();

execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");

beforeAndAfterFlush(() ->
{
var result = execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3");
assertRows(result, row(1));
});
}

@Test
public void testTwoIndexes() throws Throwable
public void testTwoIndexes()
{
// create un-analyzed index
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");

// BM25 should fail with only an equality index
assertThatThrownBy(() -> execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3"))
.isInstanceOf(InvalidRequestException.class)
.hasMessage("BM25 ordering on column v requires an analyzed index");
assertInvalidMessage("BM25 ordering on column v requires an analyzed index",
"SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3");

// create analyzed index
analyzeIndex();
Expand All @@ -65,6 +47,126 @@ public void testTwoIndexes() throws Throwable
assertRows(result, row(1));
}

@Test
public void testTwoIndexesAmbiguousPredicate() throws Throwable
{
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");

// Create analyzed and un-analyzed indexes
analyzeIndex();
createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");

execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");
execute("INSERT INTO %s (k, v) VALUES (2, 'apple juice')");
execute("INSERT INTO %s (k, v) VALUES (3, 'orange juice')");

// equality predicate is ambiguous (both analyzed and un-analyzed indexes could support it) so it should
// be rejected
beforeAndAfterFlush(() -> {
// Single predicate
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)),
"SELECT k FROM %s WHERE v = 'apple'");

// AND
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)),
"SELECT k FROM %s WHERE v = 'apple' AND v : 'juice'");

// OR
assertInvalidMessage(String.format(EQ_AMBIGUOUS_ERROR, "v", getIndex(0), getIndex(1)),
"SELECT k FROM %s WHERE v = 'apple' OR v : 'juice'");
});
}

@Test
public void testTwoIndexesWithEqualsUnsupported() throws Throwable
{
createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
// analyzed index with equals_behavior:unsupported option
createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
"WITH OPTIONS = { 'equals_behaviour_when_analyzed': 'unsupported', " +
"'index_analyzer':'{\"tokenizer\":{\"name\":\"standard\"},\"filters\":[{\"name\":\"porterstem\"}]}' }");

execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");
execute("INSERT INTO %s (k, v) VALUES (2, 'apple juice')");

beforeAndAfterFlush(() -> {
// combining two EQ predicates is not allowed
assertInvalid("SELECT k FROM %s WHERE v = 'apple' AND v = 'juice'");

// combining EQ and MATCH predicates is also not allowed (when we're not converting EQ to MATCH)
assertInvalid("SELECT k FROM %s WHERE v = 'apple' AND v : 'apple'");

// combining two MATCH predicates is fine
assertRows(execute("SELECT k FROM %s WHERE v : 'apple' AND v : 'juice'"),
row(2));

// = operator should use un-analyzed index since equals is unsupported in analyzed index
assertRows(execute("SELECT k FROM %s WHERE v = 'apple'"),
row(1));

// : operator should use analyzed index
assertRows(execute("SELECT k FROM %s WHERE v : 'apple'"),
row(1), row(2));
});
}

@Test
public void testComplexQueriesWithMultipleIndexes() throws Throwable
{
createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 text, v2 text, v3 int)");

// Create mix of analyzed, unanalyzed, and non-text indexes
createIndex("CREATE CUSTOM INDEX ON %s(v1) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");
createIndex("CREATE CUSTOM INDEX ON %s(v2) " +
"USING 'org.apache.cassandra.index.sai.StorageAttachedIndex' " +
"WITH OPTIONS = {" +
"'index_analyzer': '{" +
"\"tokenizer\" : {\"name\" : \"standard\"}, " +
"\"filters\" : [{\"name\" : \"porterstem\"}]" +
"}'" +
"}");
createIndex("CREATE CUSTOM INDEX ON %s(v3) USING 'org.apache.cassandra.index.sai.StorageAttachedIndex'");

execute("INSERT INTO %s (k, v1, v2, v3) VALUES (1, 'apple', 'orange juice', 5)");
execute("INSERT INTO %s (k, v1, v2, v3) VALUES (2, 'apple juice', 'apple', 10)");
execute("INSERT INTO %s (k, v1, v2, v3) VALUES (3, 'banana', 'grape juice', 5)");

beforeAndAfterFlush(() -> {
// Complex query mixing different types of indexes and operators
assertRows(execute("SELECT k FROM %s WHERE v1 = 'apple' AND v2 : 'juice' AND v3 = 5"),
row(1));

// Mix of AND and OR conditions across different index types
assertRows(execute("SELECT k FROM %s WHERE v3 = 5 AND (v1 = 'apple' OR v2 : 'apple')"),
row(1));

// Multi-term analyzed query
assertRows(execute("SELECT k FROM %s WHERE v2 : 'orange juice'"),
row(1));

// Range query with text match
assertRows(execute("SELECT k FROM %s WHERE v3 >= 5 AND v2 : 'juice'"),
row(1), row(3));
});
}

@Test
public void testMatchingAllowed() throws Throwable
{
// match operator should be allowed with BM25 on the same column
// (seems obvious but exercises a corner case in the internal RestrictionSet processing)
createSimpleTable();

execute("INSERT INTO %s (k, v) VALUES (1, 'apple')");

beforeAndAfterFlush(() ->
{
var result = execute("SELECT k FROM %s WHERE v : 'apple' ORDER BY v BM25 OF 'apple' LIMIT 3");
assertRows(result, row(1));
});
}

@Test
public void testTermFrequencyOrdering() throws Throwable
{
Expand Down

0 comments on commit b19fead

Please sign in to comment.