Skip to content

Commit

Permalink
fix: fix bm25s bug in building index
Browse files (browse the repository at this point in the history)
  • Loading branch information
ignorejjj committed Nov 22, 2024
1 parent 831ad83 commit 2bf75ee
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion flashrag/retriever/index_builder.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -165,14 +165,19 @@ def build_bm25_index_bm25s(self):
"""Building BM25 index based on bm25s library."""

import bm25s
import Stemmer

self.save_dir = os.path.join(self.save_dir, 'bm25')
os.makedirs(self.save_dir, exist_ok=True)

corpus = datasets.load_dataset("json", data_files=self.corpus_path, split="train")
corpus_text = corpus['contents']
stemmer = Stemmer.Stemmer('english')
tokenizer = bm25s.tokenization.Tokenizer(stemmer=stemmer)
corpus_tokens = tokenizer.tokenize(corpus_text, return_as='tuple')

retriever = bm25s.BM25(corpus=corpus, backend='numba')
- retriever.index(corpus_text)
+ retriever.index(corpus_tokens)
retriever.save(self.save_dir,corpus=corpus)

print("Finish!")
Expand Down

0 comments on commit 2bf75ee

Please sign in to comment.