Skip to content

Commit

Permalink
fix: add the missing threshold functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
matijagaspar committed Feb 12, 2025
1 parent 1ed45ba commit a56f6fb
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 20 deletions.
62 changes: 55 additions & 7 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -387,27 +387,32 @@ export function calculateResultScores(
bm25Relevance: Required<BM25Params>,
resultsMap: Map<number, number>,
boostPerProperty: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined
whereFiltersIDs: Set<InternalDocumentID> | undefined,
keywordMatchesMap: Map<InternalDocumentID, Map<string, number>>
) {
const documentIDs = Array.from(ids)

// Exact fields for TF-IDF
const avgFieldLength = index.avgFieldLength[prop]
const fieldLengths = index.fieldLengths[prop]
const oramaOccurrences = index.tokenOccurrences[prop]
const oramaFrequencies = index.frequencies[prop]

// oramaOccurrences[term] can be undefined, 0, string, or { [k: string]: number }
const termOccurrences = typeof oramaOccurrences[term] === 'number' ? oramaOccurrences[term] ?? 0 : 0

// Calculate TF-IDF value for each term, in each document, for each index.
const documentIDsLength = documentIDs.length
for (let k = 0; k < documentIDsLength; k++) {
const internalId = documentIDs[k]
if (whereFiltersIDs && !whereFiltersIDs.has(internalId)) {
continue
}

// Track keyword matches per property
if (!keywordMatchesMap.has(internalId)) {
keywordMatchesMap.set(internalId, new Map())
}
const propertyMatches = keywordMatchesMap.get(internalId)!
propertyMatches.set(prop, (propertyMatches.get(prop) || 0) + 1)

const tf = oramaFrequencies?.[internalId]?.[term] ?? 0

const bm25 = BM25(
Expand Down Expand Up @@ -438,12 +443,12 @@ function searchInProperty(
boostPerProperty: number,
bm25Relevance: Required<BM25Params>,
docsCount: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined
whereFiltersIDs: Set<InternalDocumentID> | undefined,
keywordMatchesMap: Map<InternalDocumentID, Map<string, number>>
) {
const tokenLength = tokens.length;
for (let i = 0; i < tokenLength; i++) {
const term = tokens[i];

const searchResult = tree.find({ term, exact, tolerance })

const termsFound = Object.keys(searchResult)
Expand All @@ -461,6 +466,7 @@ function searchInProperty(
resultsMap,
boostPerProperty,
whereFiltersIDs,
keywordMatchesMap,
)
}
}
Expand All @@ -478,10 +484,15 @@ export function search(
relevance: Required<BM25Params>,
docsCount: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined,
threshold = 0,
): TokenScore[] {
const tokens = tokenizer.tokenize(term, language)
const keywordsCount = tokens.length || 1

// Track keyword matches per document and property
const keywordMatchesMap = new Map<InternalDocumentID, Map<string, number>>()
const resultsMap = new Map<number, number>()

for (const prop of propertiesToSearch) {
if (!(prop in index.indexes)) {
continue
Expand Down Expand Up @@ -514,10 +525,47 @@ export function search(
relevance,
docsCount,
whereFiltersIDs,
keywordMatchesMap
)
}

return Array.from(resultsMap)
// Convert to array and sort by score
const results = Array.from(resultsMap.entries())
.map(([id, score]): TokenScore => [id, score])
.sort((a, b) => b[1] - a[1])

if (results.length === 0) {
return []
}

// If threshold is 1, return all results
if (threshold === 1) {
return results
}

// Find documents that have all keywords in at least one property
const fullMatches = results.filter(([id]) => {
const propertyMatches = keywordMatchesMap.get(id)
if (!propertyMatches) return false

// Check if any property has all keywords
return Array.from(propertyMatches.values()).some(matches => matches === keywordsCount)
})

// If threshold is 0, return only full matches
if (threshold === 0) {
return fullMatches
}

// If we have full matches and threshold < 1, return full matches plus a percentage of partial matches
if (fullMatches.length > 0) {
const remainingResults = results.filter(([id]) => !fullMatches.some(([fid]) => fid === id))
const additionalResults = Math.ceil(remainingResults.length * threshold)
return [...fullMatches, ...remainingResults.slice(0, additionalResults)]
}

// If no full matches, return all results
return results
}

export function searchByWhereClause<T extends AnyOrama>(
Expand Down
16 changes: 4 additions & 12 deletions packages/orama/src/methods/search-fulltext.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import { prioritizeTokenScores } from '../components/algorithms.js'

export function innerFullTextSearch<T extends AnyOrama>(
orama: T,
params: Pick<SearchParamsFullText<T>, 'term' | 'properties' | 'where' | 'exact' | 'tolerance' | 'boost' | 'relevance'>,
params: Pick<SearchParamsFullText<T>, 'term' | 'properties' | 'where' | 'exact' | 'tolerance' | 'boost' | 'relevance' | 'threshold'>,
language: Language | undefined
) {
const { term, properties } = params
Expand Down Expand Up @@ -66,7 +66,7 @@ export function innerFullTextSearch<T extends AnyOrama>(
// in this case, we need to return all the documents that contain at least one of the given properties
if (term || properties) {
const docsCount = count(orama)
const searchResults = orama.index.search(
uniqueDocsIDs = orama.index.search(
index,
term || '',
orama.tokenizer,
Expand All @@ -78,18 +78,10 @@ export function innerFullTextSearch<T extends AnyOrama>(
applyDefault(params.relevance),
docsCount,
whereFiltersIDs,
params.threshold !== undefined && params.threshold !== null ? params.threshold : 1
)

// Get the number of keywords from the tokenized search term
const keywordsCount = term ? orama.tokenizer.tokenize(term, language).length : 1

// Apply prioritization to search results
uniqueDocsIDs = prioritizeTokenScores(
[searchResults],
1, // Using default boost of 1 since boost is already applied in the search
params.tolerance || 0,
keywordsCount
)

} else {
// Tokenizer returns empty array and the search term is empty as well.
// We return all the documents.
Expand Down
4 changes: 3 additions & 1 deletion packages/orama/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -967,7 +967,8 @@ export interface IIndex<I extends AnyIndexStore> {
resultsMap: Map<number, number>,
boostPerProperty: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined,
)
keywordMatchesMap: Map<InternalDocumentID, Map<string, number>>
): void

search<T extends AnyOrama>(
index: AnyIndexStore,
Expand All @@ -981,6 +982,7 @@ export interface IIndex<I extends AnyIndexStore> {
relevance: Required<BM25Params>,
docsCount: number,
whereFiltersIDs: Set<InternalDocumentID> | undefined,
threshold?: number
): TokenScore[]

searchByWhereClause<T extends AnyOrama>(
Expand Down

0 comments on commit a56f6fb

Please sign in to comment.