Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement ORDER BY BM25 #1434

Open
wants to merge 31 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ddd9b69
remove unnecessary generification of IndexColumnComparator
jbellis Nov 19, 2024
9f1b794
Simplify the ordering logic by making IndexColumnComparator only resp…
jbellis Nov 19, 2024
9986108
CNDB-11725 use +score pseudo-column to order ANN results with instead…
jbellis Nov 19, 2024
e0ea872
CNDB-11725 add SYNTHETIC ColumnMetadata.Kind to represent the score c…
jbellis Nov 19, 2024
478ca65
merge with main
jbellis Dec 2, 2024
237cec4
implement BM25
jbellis Dec 6, 2024
56a6e0f
re-disallow DESC with ORDER BY ANN
jbellis Dec 6, 2024
30b6545
cleanup and comments
jbellis Dec 9, 2024
6c9a0e6
address review notes
jbellis Dec 11, 2024
e107fcc
remove unused `limit` parameter from IndexSearcher::search
jbellis Dec 11, 2024
cfe204a
eliminate currentRowIds
jbellis Dec 11, 2024
cef71e3
handle eq in an analyzed index by transforming it into a Match restri…
jbellis Dec 11, 2024
3d17e2f
add testMatchingAllowed and make it work via shouldMerge
jbellis Dec 11, 2024
907a2ee
disambiguate the BM25 error message when the index isn't analyzed
jbellis Dec 12, 2024
3315a12
validateOptions treats analyzed and un-analyzed indexes as distinct, …
jbellis Dec 12, 2024
c0de416
detect and reject ambiguous equality predicates; testAmbiguousPredica…
jbellis Dec 12, 2024
3967e7c
don't inject +score unless coordinator requests it; this is a cleaner…
jbellis Dec 12, 2024
893b87b
fix getEqBahavior, this is most of the test failures
jbellis Dec 13, 2024
c1eaa63
LongBM25Test
jbellis Dec 13, 2024
ddbfd16
misc bugfixes related to zero matches for a term
jbellis Dec 13, 2024
7ff2374
ramIndexer deduplicates (term, row) pairs
jbellis Dec 13, 2024
cfa1157
need to use compareUnsigned once we have more than 4 KINDs
jbellis Dec 13, 2024
1620d3e
simplify
jbellis Dec 13, 2024
73f35df
make SYNTHETIC the first Column Kind instead of the last. This avoids…
jbellis Dec 13, 2024
3ad8ae2
fix tests
jbellis Dec 13, 2024
dbbc678
DRY refactor
jbellis Dec 16, 2024
1b57d55
add tests for unknown query terms, duplicate query terms, no query terms
jbellis Dec 16, 2024
0b5ce5c
// since doc frequencies can be an estimate from the index hi…
jbellis Dec 17, 2024
f3f7a15
parameterize version to test with/without histograms
jbellis Dec 17, 2024
3416389
merge with main
jbellis Dec 17, 2024
d83e18d
actually parameterize both versions
jbellis Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/antlr/Lexer.g
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ K_DROPPED: D R O P P E D;
K_COLUMN: C O L U M N;
K_RECORD: R E C O R D;
K_ANN_OF: A N N WS+ O F;
K_BM25_OF: 'BM25' WS+ 'OF';

// Case-insensitive alpha characters
fragment A: ('a'|'A');
Expand Down
15 changes: 10 additions & 5 deletions src/antlr/Parser.g
Original file line number Diff line number Diff line change
Expand Up @@ -457,14 +457,18 @@ customIndexExpression [WhereClause.Builder clause]
;

orderByClause[List<Ordering.Raw> orderings]
@init{
@init {
Ordering.Direction direction = Ordering.Direction.ASC;
Ordering.Raw.Expression expr = null;
}
: c=cident (K_ANN_OF t=term)? (K_ASC | K_DESC { direction = Ordering.Direction.DESC; })?
: c=cident
( K_ANN_OF t=term { expr = new Ordering.Raw.Ann(c, t); }
| K_BM25_OF t=term { expr = new Ordering.Raw.Bm25(c, t); }
)?
(K_ASC | K_DESC { direction = Ordering.Direction.DESC; })?
{
Ordering.Raw.Expression expr = (t == null)
? new Ordering.Raw.SingleColumn(c)
: new Ordering.Raw.Ann(c, t);
if (expr == null)
expr = new Ordering.Raw.SingleColumn(c);
orderings.add(new Ordering.Raw(expr, direction));
}
;
Expand Down Expand Up @@ -1967,6 +1971,7 @@ basic_unreserved_keyword returns [String str]
| K_COLUMN
| K_RECORD
| K_ANN_OF
| K_BM25_OF
| K_OFFSET
) { $str = $k.text; }
;
6 changes: 6 additions & 0 deletions src/java/org/apache/cassandra/cql3/GeoDistanceRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ protected Restriction newAnnRestriction(TableMetadata table, VariableSpecificati
throw invalidRequest("%s cannot be used with the GEO_DISTANCE function", operator());
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used with the GEO_DISTANCE function", operator());
michaeljmarshall marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
protected Restriction newAnalyzerMatchesRestriction(TableMetadata table, VariableSpecifications boundNames)
{
Expand Down
6 changes: 6 additions & 0 deletions src/java/org/apache/cassandra/cql3/MultiColumnRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,12 @@ protected Restriction newAnnRestriction(TableMetadata table, VariableSpecificati
throw invalidRequest("%s cannot be used for multi-column relations", operator());
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used for multi-column relations", operator());
}

@Override
protected Restriction newAnalyzerMatchesRestriction(TableMetadata table, VariableSpecifications boundNames)
{
Expand Down
14 changes: 14 additions & 0 deletions src/java/org/apache/cassandra/cql3/Operator.java
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,20 @@ public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteB
return !LIKE.isSatisfiedBy(type, leftOperand, rightOperand, analyzer);
}
},
BM25(25)
jbellis marked this conversation as resolved.
Show resolved Hide resolved
{
@Override
public String toString()
{
return "BM25";
}

@Override
public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteBuffer rightOperand, @Nullable Index.Analyzer analyzer)
{
throw new UnsupportedOperationException();
}
},

/**
* An operator that only performs matching against analyzed columns.
Expand Down
75 changes: 75 additions & 0 deletions src/java/org/apache/cassandra/cql3/Ordering.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction;
import org.apache.cassandra.cql3.restrictions.SingleRestriction;
import org.apache.cassandra.cql3.statements.SelectStatement;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.TableMetadata;

Expand Down Expand Up @@ -48,6 +49,11 @@ public interface Expression
SingleRestriction toRestriction();

ColumnMetadata getColumn();

default boolean isScored()
{
return false;
}
}

/**
Expand Down Expand Up @@ -118,6 +124,54 @@ public ColumnMetadata getColumn()
{
return column;
}

@Override
public boolean isScored()
{
return SelectStatement.ANN_USE_SYNTHETIC_SCORE;
}
}

/**
* An expression used in BM25 ordering.
* <code>ORDER BY column BM25 OF value</code>
*/
public static class Bm25 implements Expression
{
final ColumnMetadata column;
final Term queryValue;
final Direction direction;

public Bm25(ColumnMetadata column, Term queryValue, Direction direction)
{
this.column = column;
this.queryValue = queryValue;
this.direction = direction;
}

@Override
public boolean hasNonClusteredOrdering()
{
return true;
}

@Override
public SingleRestriction toRestriction()
{
return new SingleColumnRestriction.Bm25Restriction(column, queryValue);
}

@Override
public ColumnMetadata getColumn()
{
return column;
}

@Override
public boolean isScored()
{
return true;
}
}

public enum Direction
Expand Down Expand Up @@ -190,6 +244,27 @@ public Ordering.Expression bind(TableMetadata table, VariableSpecifications boun
return new Ordering.Ann(column, value, direction);
}
}

public static class Bm25 implements Expression
{
final ColumnIdentifier columnId;
final Term.Raw queryValue;

Bm25(ColumnIdentifier column, Term.Raw queryValue)
{
this.columnId = column;
this.queryValue = queryValue;
}

@Override
public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames, Direction direction)
{
ColumnMetadata column = table.getExistingColumn(columnId);
Term value = queryValue.prepare(table.keyspace, column);
value.collectMarkerSpecification(boundNames);
return new Ordering.Bm25(column, value, direction);
}
}
}
}

Expand Down
7 changes: 7 additions & 0 deletions src/java/org/apache/cassandra/cql3/Relation.java
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ public final Restriction toRestriction(TableMetadata table, VariableSpecificatio
return newLikeRestriction(table, boundNames, relationType);
case ANN:
return newAnnRestriction(table, boundNames);
case BM25:
return newBm25Restriction(table, boundNames);
case ANALYZER_MATCHES:
return newAnalyzerMatchesRestriction(table, boundNames);
default: throw invalidRequest("Unsupported \"!=\" relation: %s", this);
Expand Down Expand Up @@ -296,6 +298,11 @@ protected abstract Restriction newSliceRestriction(TableMetadata table,
*/
protected abstract Restriction newAnnRestriction(TableMetadata table, VariableSpecifications boundNames);

/**
* Creates a new BM25 restriction instance.
*/
protected abstract Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames);

/**
* Creates a new Analyzer Matches restriction instance.
*/
Expand Down
8 changes: 8 additions & 0 deletions src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,14 @@ protected Restriction newAnnRestriction(TableMetadata table, VariableSpecificati
return new SingleColumnRestriction.AnnRestriction(columnDef, term);
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
ColumnMetadata columnDef = table.getExistingColumn(entity);
Term term = toTerm(toReceivers(columnDef), value, table.keyspace, boundNames);
return new SingleColumnRestriction.AnnRestriction(columnDef, term);
}

@Override
protected Restriction newAnalyzerMatchesRestriction(TableMetadata table, VariableSpecifications boundNames)
{
Expand Down
8 changes: 7 additions & 1 deletion src/java/org/apache/cassandra/cql3/TokenRelation.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,13 @@ protected Restriction newLikeRestriction(TableMetadata table, VariableSpecificat
@Override
protected Restriction newAnnRestriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used for toekn relations", operator());
throw invalidRequest("%s cannot be used for token relations", operator());
}

@Override
protected Restriction newBm25Restriction(TableMetadata table, VariableSpecifications boundNames)
{
throw invalidRequest("%s cannot be used for token relations", operator());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,74 @@ public boolean isBoundedAnn()
}
}

public static final class Bm25Restriction extends SingleColumnRestriction
{
private final Term value;

public Bm25Restriction(ColumnMetadata columnDef, Term value)
{
super(columnDef);
this.value = value;
}

public ByteBuffer value(QueryOptions options)
{
return value.bindAndGet(options);
}

@Override
public void addFunctionsTo(List<Function> functions)
{
value.addFunctionsTo(functions);
}

@Override
MultiColumnRestriction toMultiColumnRestriction()
{
throw new UnsupportedOperationException();
}

@Override
public void addToRowFilter(RowFilter.Builder filter,
IndexRegistry indexRegistry,
QueryOptions options)
{
filter.add(columnDef, Operator.BM25, value.bindAndGet(options));
}

@Override
public MultiClusteringBuilder appendTo(MultiClusteringBuilder builder, QueryOptions options)
{
throw new UnsupportedOperationException();
}

@Override
public String toString()
{
return String.format("BM25(%s)", value);
}

@Override
public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
{
if (otherRestriction.isIndexBasedOrdering())
throw invalidRequest("%s cannot be restricted by multiple BM25 restrictions", columnDef.name);
throw invalidRequest("%s cannot be restricted by both BM25 and %s", columnDef.name, otherRestriction.toString());
jbellis marked this conversation as resolved.
Show resolved Hide resolved
}

@Override
protected boolean isSupportedBy(Index index)
{
return index.supportsExpression(columnDef, Operator.BM25);
}

@Override
public boolean isIndexBasedOrdering()
{
return true;
}
}

/**
* A Bounded ANN Restriction is one that uses a similarity score as the limiting factor for ANN instead of a number
* of results.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,8 @@ else if (indexOrderings.size() == 1)
if (orderings.size() > 1)
throw new InvalidRequestException("Cannot combine clustering column ordering with non-clustering column ordering");
Ordering ordering = indexOrderings.get(0);
if (ordering.direction != Ordering.Direction.ASC && ordering.expression instanceof Ordering.Ann)
// TODO remove the instanceof with SelectStatement.ANN_USE_SYNTHETIC_SCORE.
if (ordering.direction != Ordering.Direction.ASC && (ordering.expression.isScored() || ordering.expression instanceof Ordering.Ann))
throw new InvalidRequestException("Descending ANN ordering is not supported");
michaeljmarshall marked this conversation as resolved.
Show resolved Hide resolved
if (!ENABLE_SAI_GENERAL_ORDER_BY && ordering.expression instanceof Ordering.SingleColumn)
throw new InvalidRequestException("SAI based ORDER BY on non-vector column is not supported");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,21 @@ abstract class ColumnFilterFactory
*/
abstract ColumnFilter newInstance(List<Selector> selectors);

public static ColumnFilterFactory wildcard(TableMetadata table)
public static ColumnFilterFactory wildcard(TableMetadata table, Set<ColumnMetadata> orderingColumns)
{
return new PrecomputedColumnFilter(ColumnFilter.all(table));
ColumnFilter cf;
if (orderingColumns.isEmpty())
{
cf = ColumnFilter.all(table);
}
else
{
ColumnFilter.Builder builder = ColumnFilter.selectionBuilder();
builder.addAll(table.regularAndStaticColumns());
builder.addAll(orderingColumns);
cf = builder.build();
}
return new PrecomputedColumnFilter(cf);
}

public static ColumnFilterFactory fromColumns(TableMetadata table,
Expand Down
Loading