From 6998939bd1b84425ac17a1fbf17d12c1d2dc924d Mon Sep 17 00:00:00 2001 From: Ashoat Tevosyan Date: Fri, 27 Oct 2023 14:02:54 -0400 Subject: [PATCH] [lib] Avoid constructing multiple duplicate tokenizers in SearchIndex Summary: This simple change improves the perf of `useChatMentionSearchIndex` by 45% in my local environment. Before this it takes an average of 1777.5ms, but after it takes an average of 978ms. Linear task: [ENG-5480](https://linear.app/comm/issue/ENG-5480/investigate-chatmentioncontextprovider-tti-regression) Test Plan: I used [this patch](https://gist.github.com/Ashoat/fc1c91a61009de0e9959527454be8236) to test performance before and after this change. I made sure I had at least three samples of each scenario. Will also link my [messy Gist of results](https://gist.github.com/Ashoat/b871afaaaee10b435b8676175d120d53), but it's not really interpretable by anyone other than me. Here's the relevant portion: ``` BEFORE LOG useChatMentionSearchIndex took 1801ms LOG useChatMentionSearchIndex took 1748ms LOG useChatMentionSearchIndex took 1730ms LOG useChatMentionSearchIndex took 1831ms AVERAGE 1777.5ms JUST DEDUP LOG useChatMentionSearchIndex took 1027ms LOG useChatMentionSearchIndex took 949ms LOG useChatMentionSearchIndex took 957ms AVERAGE 977.7ms ``` Reviewers: tomek, atul, inka, rohan Reviewed By: rohan Subscribers: wyilio Differential Revision: https://phab.comm.dev/D9625 --- lib/shared/search-index.js | 7 +++++-- lib/shared/sentence-prefix-search-index.js | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/shared/search-index.js b/lib/shared/search-index.js index 432b2cadbc..026eca1024 100644 --- a/lib/shared/search-index.js +++ b/lib/shared/search-index.js @@ -15,13 +15,16 @@ type Token = { ... 
 };
 
+type TokenizeFunc = (str: string) => Token[];
+const defaultTokenize: TokenizeFunc = new Tokenizer().words();
+
 class SearchIndex {
   tokenize: (str: string) => Token[];
   fullTextIndex: { [token: string]: Set<string> };
   partialTextIndex: { [token: string]: Set<string> };
 
-  constructor() {
-    this.tokenize = new Tokenizer().words();
+  constructor(inputTokenize?: TokenizeFunc) {
+    this.tokenize = inputTokenize ?? defaultTokenize;
     this.fullTextIndex = {};
     this.partialTextIndex = {};
   }
diff --git a/lib/shared/sentence-prefix-search-index.js b/lib/shared/sentence-prefix-search-index.js
index 4848f2f0bd..928c025fad 100644
--- a/lib/shared/sentence-prefix-search-index.js
+++ b/lib/shared/sentence-prefix-search-index.js
@@ -4,12 +4,15 @@ import Tokenizer from 'tokenize-text';
 
 import SearchIndex from './search-index.js';
 
+// defaultTokenize used in SearchIndex splits on punctuation
+// We use this alternative because we only want to split on whitespace
+const tokenize = new Tokenizer().re(/\S+/);
+
 class SentencePrefixSearchIndex extends SearchIndex {
   entries: Set<string>;
 
   constructor() {
-    super();
-    this.tokenize = new Tokenizer().re(/\S+/);
+    super(tokenize);
     this.entries = new Set();
   }
 