Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement ORDER BY BM25 #1434

Open
wants to merge 31 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ddd9b69
remove unnecessary generification of IndexColumnComparator
jbellis Nov 19, 2024
9f1b794
Simplify the ordering logic by making IndexColumnComparator only resp…
jbellis Nov 19, 2024
9986108
CNDB-11725 use +score pseudo-column to order ANN results with instead…
jbellis Nov 19, 2024
e0ea872
CNDB-11725 add SYNTHETIC ColumnMetadata.Kind to represent the score c…
jbellis Nov 19, 2024
478ca65
merge with main
jbellis Dec 2, 2024
237cec4
implement BM25
jbellis Dec 6, 2024
56a6e0f
re-disallow DESC with ORDER BY ANN
jbellis Dec 6, 2024
30b6545
cleanup and comments
jbellis Dec 9, 2024
6c9a0e6
address review notes
jbellis Dec 11, 2024
e107fcc
remove unused `limit` parameter from IndexSearcher::search
jbellis Dec 11, 2024
cfe204a
eliminate currentRowIds
jbellis Dec 11, 2024
cef71e3
handle eq in an analyzed index by transforming it into a Match restri…
jbellis Dec 11, 2024
3d17e2f
add testMatchingAllowed and make it work via shouldMerge
jbellis Dec 11, 2024
907a2ee
disambiguate the BM25 error message when the index isn't analyzed
jbellis Dec 12, 2024
3315a12
validateOptions treats analyzed and un-analyzed indexes as distinct, …
jbellis Dec 12, 2024
c0de416
detect and reject ambiguous equality predicates; testAmbiguousPredica…
jbellis Dec 12, 2024
3967e7c
don't inject +score unless coordinator requests it; this is a cleaner…
jbellis Dec 12, 2024
893b87b
fix getEqBahavior, this is most of the test failures
jbellis Dec 13, 2024
c1eaa63
LongBM25Test
jbellis Dec 13, 2024
ddbfd16
misc bugfixes related to zero matches for a term
jbellis Dec 13, 2024
7ff2374
ramIndexer deduplicates (term, row) pairs
jbellis Dec 13, 2024
cfa1157
need to use compareUnsigned once we have more than 4 KINDs
jbellis Dec 13, 2024
1620d3e
simplify
jbellis Dec 13, 2024
73f35df
make SYNTHETIC the first Column Kind instead of the last. This avoids…
jbellis Dec 13, 2024
3ad8ae2
fix tests
jbellis Dec 13, 2024
dbbc678
DRY refactor
jbellis Dec 16, 2024
1b57d55
add tests for unknown query terms, duplicate query terms, no query terms
jbellis Dec 16, 2024
0b5ce5c
// since doc frequencies can be an estimate from the index hi…
jbellis Dec 17, 2024
f3f7a15
parameterize version to test with/without histograms
jbellis Dec 17, 2024
3416389
merge with main
jbellis Dec 17, 2024
d83e18d
actually parameterize both versions
jbellis Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/antlr/Lexer.g
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ K_DROPPED: D R O P P E D;
K_COLUMN: C O L U M N;
K_RECORD: R E C O R D;
K_ANN_OF: A N N WS+ O F;
K_BM25_OF: 'BM25' WS+ 'OF';

// Case-insensitive alpha characters
fragment A: ('a'|'A');
Expand Down
15 changes: 10 additions & 5 deletions src/antlr/Parser.g
Original file line number Diff line number Diff line change
Expand Up @@ -457,14 +457,18 @@ customIndexExpression [WhereClause.Builder clause]
;

orderByClause[List<Ordering.Raw> orderings]
@init{
@init {
Ordering.Direction direction = Ordering.Direction.ASC;
Ordering.Raw.Expression expr = null;
}
: c=cident (K_ANN_OF t=term)? (K_ASC | K_DESC { direction = Ordering.Direction.DESC; })?
: c=cident
( K_ANN_OF t=term { expr = new Ordering.Raw.Ann(c, t); }
| K_BM25_OF t=term { expr = new Ordering.Raw.Bm25(c, t); }
)?
(K_ASC | K_DESC { direction = Ordering.Direction.DESC; })?
{
Ordering.Raw.Expression expr = (t == null)
? new Ordering.Raw.SingleColumn(c)
: new Ordering.Raw.Ann(c, t);
if (expr == null)
expr = new Ordering.Raw.SingleColumn(c);
orderings.add(new Ordering.Raw(expr, direction));
}
;
Expand Down Expand Up @@ -1967,6 +1971,7 @@ basic_unreserved_keyword returns [String str]
| K_COLUMN
| K_RECORD
| K_ANN_OF
| K_BM25_OF
| K_OFFSET
) { $str = $k.text; }
;
6 changes: 6 additions & 0 deletions src/java/org/apache/cassandra/cql3/GeoDistanceRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ protected Restriction newAnnRestriction(TableMetadata table, VariableSpecificati
throw invalidRequest("%s cannot be used with the GEO_DISTANCE function", operator());
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used with the GEO_DISTANCE function", operator());
michaeljmarshall marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
protected Restriction newAnalyzerMatchesRestriction(TableMetadata table, VariableSpecifications boundNames)
{
Expand Down
6 changes: 6 additions & 0 deletions src/java/org/apache/cassandra/cql3/MultiColumnRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,12 @@ protected Restriction newAnnRestriction(TableMetadata table, VariableSpecificati
throw invalidRequest("%s cannot be used for multi-column relations", operator());
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used for multi-column relations", operator());
}

@Override
protected Restriction newAnalyzerMatchesRestriction(TableMetadata table, VariableSpecifications boundNames)
{
Expand Down
17 changes: 16 additions & 1 deletion src/java/org/apache/cassandra/cql3/Operator.java
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ public String toString()
@Override
public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteBuffer rightOperand, @Nullable Index.Analyzer analyzer)
{
return true;
throw new UnsupportedOperationException();
}
},
NOT_IN(16)
Expand Down Expand Up @@ -415,6 +415,7 @@ private boolean hasToken(AbstractType<?> type, List<ByteBuffer> tokens, ByteBuff
return false;
}
},

/**
* An operator that performs a distance bounded approximate nearest neighbor search against a vector column such
* that all result vectors are within a given distance of the query vector. The notable difference between this
Expand Down Expand Up @@ -459,6 +460,20 @@ public String toString()
return "DESC";
}

@Override
public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteBuffer rightOperand, @Nullable Index.Analyzer analyzer)
{
throw new UnsupportedOperationException();
}
},
BM25(104)
{
@Override
public String toString()
{
return "BM25";
}

@Override
public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteBuffer rightOperand, @Nullable Index.Analyzer analyzer)
{
Expand Down
75 changes: 75 additions & 0 deletions src/java/org/apache/cassandra/cql3/Ordering.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction;
import org.apache.cassandra.cql3.restrictions.SingleRestriction;
import org.apache.cassandra.cql3.statements.SelectStatement;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;

Expand Down Expand Up @@ -48,6 +49,11 @@ public interface Expression
SingleRestriction toRestriction();

ColumnMetadata getColumn();

default boolean isScored()
{
return false;
}
}

/**
Expand Down Expand Up @@ -118,6 +124,54 @@ public ColumnMetadata getColumn()
{
return column;
}

@Override
public boolean isScored()
{
return SelectStatement.ANN_USE_SYNTHETIC_SCORE;
}
}

/**
* An expression used in BM25 ordering.
* <code>ORDER BY column BM25 OF value</code>
*/
public static class Bm25 implements Expression
{
final ColumnMetadata column;
final Term queryValue;
final Direction direction;

public Bm25(ColumnMetadata column, Term queryValue, Direction direction)
{
this.column = column;
this.queryValue = queryValue;
this.direction = direction;
}

@Override
public boolean hasNonClusteredOrdering()
{
return true;
}

@Override
public SingleRestriction toRestriction()
{
return new SingleColumnRestriction.Bm25Restriction(column, queryValue);
}

@Override
public ColumnMetadata getColumn()
{
return column;
}

@Override
public boolean isScored()
{
return true;
}
}

public enum Direction
Expand Down Expand Up @@ -190,6 +244,27 @@ public Ordering.Expression bind(TableMetadata table, VariableSpecifications boun
return new Ordering.Ann(column, value, direction);
}
}

public static class Bm25 implements Expression
{
final ColumnIdentifier columnId;
final Term.Raw queryValue;

Bm25(ColumnIdentifier column, Term.Raw queryValue)
{
this.columnId = column;
this.queryValue = queryValue;
}

@Override
public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames, Direction direction)
{
ColumnMetadata column = table.getExistingColumn(columnId);
Term value = queryValue.prepare(table.keyspace, column);
value.collectMarkerSpecification(boundNames);
return new Ordering.Bm25(column, value, direction);
}
}
}
}

Expand Down
7 changes: 7 additions & 0 deletions src/java/org/apache/cassandra/cql3/Relation.java
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ public final Restriction toRestriction(TableMetadata table, VariableSpecificatio
return newLikeRestriction(table, boundNames, relationType);
case ANN:
return newAnnRestriction(table, boundNames);
case BM25:
return newBm25Restriction(table, boundNames);
case ANALYZER_MATCHES:
return newAnalyzerMatchesRestriction(table, boundNames);
default: throw invalidRequest("Unsupported \"!=\" relation: %s", this);
Expand Down Expand Up @@ -296,6 +298,11 @@ protected abstract Restriction newSliceRestriction(TableMetadata table,
*/
protected abstract Restriction newAnnRestriction(TableMetadata table, VariableSpecifications boundNames);

/**
* Creates a new BM25 restriction instance.
*/
protected abstract Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames);

/**
* Creates a new Analyzer Matches restriction instance.
*/
Expand Down
34 changes: 33 additions & 1 deletion src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.cassandra.db.marshal.VectorType;
import org.apache.cassandra.index.IndexRegistry;
import org.apache.cassandra.index.sai.analyzer.AnalyzerEqOperatorSupport;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.cql3.Term.Raw;
Expand All @@ -33,6 +36,7 @@
import org.apache.cassandra.db.marshal.ListType;
import org.apache.cassandra.db.marshal.MapType;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.service.ClientWarn;

import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
Expand Down Expand Up @@ -191,7 +195,27 @@ protected Restriction newEQRestriction(TableMetadata table, VariableSpecificatio
if (mapKey == null)
{
Term term = toTerm(toReceivers(columnDef), value, table.keyspace, boundNames);
return new SingleColumnRestriction.EQRestriction(columnDef, term);
// Leave the restriction as EQ if no analyzed index in backwards compatibility mode is present
var ebi = IndexRegistry.obtain(table).getEqBehavior(columnDef);
if (ebi.behavior == IndexRegistry.EqBehavior.EQ)
return new SingleColumnRestriction.EQRestriction(columnDef, term);

// the index is configured to transform EQ into MATCH for backwards compatibility
if (ebi.behavior == IndexRegistry.EqBehavior.MATCH)
{
ClientWarn.instance.warn(String.format(AnalyzerEqOperatorSupport.EQ_RESTRICTION_ON_ANALYZED_WARNING,
columnDef.toString(),
ebi.matchIndex.getIndexMetadata().name),
columnDef);
return new SingleColumnRestriction.AnalyzerMatchesRestriction(columnDef, term);
}

// multiple indexes support EQ, this is unsupported
assert ebi.behavior == IndexRegistry.EqBehavior.AMBIGUOUS;
throw invalidRequest(AnalyzerEqOperatorSupport.EQ_AMBIGUOUS_ERROR,
columnDef.toString(),
ebi.matchIndex.getIndexMetadata().name,
ebi.eqIndex.getIndexMetadata().name);
}
List<? extends ColumnSpecification> receivers = toReceivers(columnDef);
Term entryKey = toTerm(Collections.singletonList(receivers.get(0)), mapKey, table.keyspace, boundNames);
Expand Down Expand Up @@ -333,6 +357,14 @@ protected Restriction newAnnRestriction(TableMetadata table, VariableSpecificati
return new SingleColumnRestriction.AnnRestriction(columnDef, term);
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
ColumnMetadata columnDef = table.getExistingColumn(entity);
Term term = toTerm(toReceivers(columnDef), value, table.keyspace, boundNames);
return new SingleColumnRestriction.Bm25Restriction(columnDef, term);
}

@Override
protected Restriction newAnalyzerMatchesRestriction(TableMetadata table, VariableSpecifications boundNames)
{
Expand Down
8 changes: 7 additions & 1 deletion src/java/org/apache/cassandra/cql3/TokenRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,13 @@ protected Restriction newLikeRestriction(TableMetadata table, VariableSpecificat
@Override
protected Restriction newAnnRestriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used for toekn relations", operator());
throw invalidRequest("%s cannot be used for token relations", operator());
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used for token relations", operator());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ public ClusteringColumnRestrictions.Builder addRestriction(Restriction restricti
SingleRestriction lastRestriction = restrictions.lastRestriction();
ColumnMetadata lastRestrictionStart = lastRestriction.getFirstColumn();
ColumnMetadata newRestrictionStart = newRestriction.getFirstColumn();
restrictions.addRestriction(newRestriction, isDisjunction, indexRegistry);
restrictions.addRestriction(newRestriction, isDisjunction);

checkFalse(lastRestriction.isSlice() && newRestrictionStart.position() > lastRestrictionStart.position(),
"Clustering column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
Expand All @@ -203,7 +203,7 @@ public ClusteringColumnRestrictions.Builder addRestriction(Restriction restricti
}
else
{
restrictions.addRestriction(newRestriction, isDisjunction, indexRegistry);
restrictions.addRestriction(newRestriction, isDisjunction);
}

return this;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,7 @@ protected final String getColumnsInCommons(Restriction otherRestriction)
@Override
public final boolean hasSupportingIndex(IndexRegistry indexRegistry)
{
for (Index index : indexRegistry.listIndexes())
if (isSupportingIndex(index))
return true;
return false;
return findSupportingIndex(indexRegistry) != null;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ public PartitionKeyRestrictions build(IndexRegistry indexRegistry, boolean isDis
if (restriction.isOnToken())
return buildWithTokens(restrictionSet, i, indexRegistry);

restrictionSet.addRestriction((SingleRestriction) restriction, isDisjunction, indexRegistry);
restrictionSet.addRestriction((SingleRestriction) restriction, isDisjunction);
}

return buildPartitionKeyRestrictions(restrictionSet);
Expand Down
Loading