Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: #850 add threshold logic back #888

Merged
merged 3 commits into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 56 additions & 6 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -387,16 +387,16 @@ export function calculateResultScores(
bm25Relevance: Required<BM25Params>,
resultsMap: Map<number, number>,
boostPerProperty: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined
whereFiltersIDs: Set<InternalDocumentID> | undefined,
keywordMatchesMap: Map<InternalDocumentID, Map<string, number>>
) {
const documentIDs = Array.from(ids)

// Exact fields for TF-IDF
const avgFieldLength = index.avgFieldLength[prop]
const fieldLengths = index.fieldLengths[prop]
const oramaOccurrences = index.tokenOccurrences[prop]
const oramaFrequencies = index.frequencies[prop]

// oramaOccurrences[term] can be undefined, 0, string, or { [k: string]: number }
const termOccurrences = typeof oramaOccurrences[term] === 'number' ? oramaOccurrences[term] ?? 0 : 0

Expand All @@ -408,6 +408,13 @@ export function calculateResultScores(
continue
}

// Track keyword matches per property
if (!keywordMatchesMap.has(internalId)) {
keywordMatchesMap.set(internalId, new Map())
}
const propertyMatches = keywordMatchesMap.get(internalId)!
propertyMatches.set(prop, (propertyMatches.get(prop) || 0) + 1)

const tf = oramaFrequencies?.[internalId]?.[term] ?? 0

const bm25 = BM25(
Expand Down Expand Up @@ -438,12 +445,12 @@ function searchInProperty(
boostPerProperty: number,
bm25Relevance: Required<BM25Params>,
docsCount: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined
whereFiltersIDs: Set<InternalDocumentID> | undefined,
keywordMatchesMap: Map<InternalDocumentID, Map<string, number>>
) {
const tokenLength = tokens.length;
for (let i = 0; i < tokenLength; i++) {
const term = tokens[i];

const searchResult = tree.find({ term, exact, tolerance })

const termsFound = Object.keys(searchResult)
Expand All @@ -461,6 +468,7 @@ function searchInProperty(
resultsMap,
boostPerProperty,
whereFiltersIDs,
keywordMatchesMap,
)
}
}
Expand All @@ -478,10 +486,15 @@ export function search(
relevance: Required<BM25Params>,
docsCount: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined,
threshold = 0,
): TokenScore[] {
const tokens = tokenizer.tokenize(term, language)
const keywordsCount = tokens.length || 1

// Track keyword matches per document and property
const keywordMatchesMap = new Map<InternalDocumentID, Map<string, number>>()
const resultsMap = new Map<number, number>()

for (const prop of propertiesToSearch) {
if (!(prop in index.indexes)) {
continue
Expand Down Expand Up @@ -514,10 +527,47 @@ export function search(
relevance,
docsCount,
whereFiltersIDs,
keywordMatchesMap
)
}

return Array.from(resultsMap)
// Convert to array and sort by score
const results = Array.from(resultsMap.entries())
.map(([id, score]): TokenScore => [id, score])
.sort((a, b) => b[1] - a[1])

if (results.length === 0) {
return []
}

// If threshold is 1, return all results
if (threshold === 1) {
return results
}

// Find documents that have all keywords in at least one property
const fullMatches = results.filter(([id]) => {
const propertyMatches = keywordMatchesMap.get(id)
if (!propertyMatches) return false

// Check if any property has all keywords
return Array.from(propertyMatches.values()).some(matches => matches === keywordsCount)
})

// If threshold is 0, return only full matches
if (threshold === 0) {
return fullMatches
}

// If we have full matches and threshold < 1, return full matches plus a percentage of partial matches
if (fullMatches.length > 0) {
const remainingResults = results.filter(([id]) => !fullMatches.some(([fid]) => fid === id))
const additionalResults = Math.ceil(remainingResults.length * threshold)
return [...fullMatches, ...remainingResults.slice(0, additionalResults)]
}

// If no full matches, return all results
return results
}

export function searchByWhereClause<T extends AnyOrama>(
Expand Down
7 changes: 5 additions & 2 deletions packages/orama/src/methods/search-fulltext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ import type {
import { getNanosecondsTime, removeVectorsFromHits, sortTokenScorePredicate } from '../utils.js'
import { count } from './docs.js'
import { fetchDocuments, fetchDocumentsWithDistinct } from './search.js'
import { prioritizeTokenScores } from '../components/algorithms.js'

export function innerFullTextSearch<T extends AnyOrama>(
orama: T,
params: Pick<SearchParamsFullText<T>, 'term' | 'properties' | 'where' | 'exact' | 'tolerance' | 'boost' | 'relevance'>,
params: Pick<SearchParamsFullText<T>, 'term' | 'properties' | 'where' | 'exact' | 'tolerance' | 'boost' | 'relevance' | 'threshold'>,
language: Language | undefined
) {
const { term, properties } = params
Expand Down Expand Up @@ -64,7 +65,6 @@ export function innerFullTextSearch<T extends AnyOrama>(
// - or we have properties to search
// in this case, we need to return all the documents that contains at least one of the given properties
if (term || properties) {

const docsCount = count(orama)
uniqueDocsIDs = orama.index.search(
index,
Expand All @@ -78,7 +78,10 @@ export function innerFullTextSearch<T extends AnyOrama>(
applyDefault(params.relevance),
docsCount,
whereFiltersIDs,
params.threshold !== undefined && params.threshold !== null ? params.threshold : 1
)


} else {
// Tokenizer returns empty array and the search term is empty as well.
// We return all the documents.
Expand Down
4 changes: 3 additions & 1 deletion packages/orama/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,8 @@ export interface IIndex<I extends AnyIndexStore> {
resultsMap: Map<number, number>,
boostPerProperty: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined,
)
keywordMatchesMap: Map<InternalDocumentID, Map<string, number>>
): void

search<T extends AnyOrama>(
index: AnyIndexStore,
Expand All @@ -981,6 +982,7 @@ export interface IIndex<I extends AnyIndexStore> {
relevance: Required<BM25Params>,
docsCount: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined,
threshold?: number
): TokenScore[]

searchByWhereClause<T extends AnyOrama>(
Expand Down
154 changes: 154 additions & 0 deletions packages/orama/tests/threshold.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import t from 'tap'
import { create, insert, search } from '../src/index.js'

t.test('should only return results with all the search terms (exact match)', async (t) => {
  t.plan(4)

  const db = await create({
    schema: {
      title: 'string'
    }
  })

  // Seed the index one document at a time, in a fixed order.
  const titles = [
    'Blue t-shirt slim fit',
    'Blue t-shirt oversize fit',
    'Red t-shirt v-neck cut',
    'Colored t-shirt slim fit',
    'Red t-shirt slim fit'
  ]
  for (const title of titles) {
    await insert(db, { title })
  }

  // Runs a search with `threshold: 0` and reports how many documents came back.
  const countMatches = async (term: string) => {
    const { count } = await search(db, { term, threshold: 0 })
    return count
  }

  const blueTshirt = await countMatches('blue t-shirt')
  const redTshirt = await countMatches('red t-shirt')
  const slimFit = await countMatches('slim fit')
  const redFit = await countMatches('red fit')

  // Each expected count is the number of titles containing every term of the query.
  t.same(blueTshirt, 2)
  t.same(redTshirt, 2)
  t.same(slimFit, 3)
  t.same(redFit, 1)
})

t.test('should only return results with all the search terms (exact match) on more complex schema', async (t) => {
  t.plan(4)

  const db = await create({
    schema: {
      title: 'string',
      description: 'string'
    }
  })

  // Documents carry two searchable string properties; insertion order is fixed.
  const products = [
    {
      title: 'Blue t-shirt',
      description: 'Beautiful blue t-shirt, slim fit. Wears well with jeans and sneakers.'
    },
    {
      title: 'Blue t-shirt',
      description: 'Beautiful blue t-shirt. A bit oversize.'
    },
    {
      title: 'Red t-shirt v-neck cut',
      description: 'Great t-shirt for a night out.'
    },
    {
      title: 'Colored t-shirt slim fit',
      description: 'Colorful t-shirt, slim fit.'
    },
    {
      title: 'Green t-shirt',
      description: 'Green t-shirt, oversize fit.'
    }
  ]
  for (const product of products) {
    await insert(db, product)
  }

  // Runs a search with `threshold: 0` and reports how many documents came back.
  const countMatches = async (term: string) => {
    const { count } = await search(db, { term, threshold: 0 })
    return count
  }

  const blueTshirt = await countMatches('blue t-shirt')
  const redTshirt = await countMatches('red t-shirt')
  const slimFit = await countMatches('slim fit')
  const oversizeFit = await countMatches('oversize fit')

  t.same(blueTshirt, 2)
  t.same(redTshirt, 1)
  t.same(slimFit, 2)
  t.same(oversizeFit, 1)
})

t.test('should return all the results if threshold is 1', async (t) => {
  t.plan(2)

  const db = await create({
    schema: {
      title: 'string'
    }
  })

  const titles = [
    'Blue t-shirt slim fit',
    'Blue t-shirt oversize fit',
    'Red t-shirt v-neck cut',
    'Colored t-shirt slim fit'
  ]
  for (const title of titles) {
    await insert(db, { title })
  }

  // Runs a search with `threshold: 1` and reports how many documents came back.
  const countMatches = async (term: string) => {
    const { count } = await search(db, { term, threshold: 1 })
    return count
  }

  const blueTshirt = await countMatches('blue t-shirt')
  const slimFit = await countMatches('slim fit')

  // Every document sharing at least one term with the query is expected back.
  t.same(blueTshirt, 4)
  t.same(slimFit, 3)
})

t.test('should return all the exact matches + X% of the partial matches', async (t) => {
  t.plan(2)

  const db = await create({
    schema: {
      title: 'string'
    }
  })

  const titles = [
    'Blue t-shirt slim fit',
    'Blue t-shirt oversize fit',
    'Red t-shirt v-neck cut',
    'Colored t-shirt slim fit'
  ]
  for (const title of titles) {
    await insert(db, { title })
  }

  // Runs a search with the given fractional threshold and reports the hit count.
  const countMatches = async (term: string, threshold: number) => {
    const { count } = await search(db, { term, threshold })
    return count
  }

  const blueTshirt = await countMatches('blue t-shirt', 0.6)
  const slimFit = await countMatches('slim fit', 0.7)

  // NOTE(review): these expected counts are the same as the ones asserted for
  // `threshold: 1` on this exact dataset, so the test does not distinguish a
  // fractional threshold from "return everything" - consider adding a case
  // with a lower threshold that actually drops some partial matches.
  t.same(blueTshirt, 4)
  t.same(slimFit, 3)
})