From c01b6c7778a6cd60b7a6be8db0a9f140560060d8 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 1 Aug 2024 16:12:43 +0200 Subject: [PATCH] remove SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT (#800) After defaulting to shard merging for all inactive repos, this in fact makes searches slightly slower. So we can remove the experiment. Test Plan: go test --- bits.go | 22 ---------------------- indexdata.go | 30 +----------------------------- 2 files changed, 1 insertion(+), 51 deletions(-) diff --git a/bits.go b/bits.go index 5fc3e288..d438cbf1 100644 --- a/bits.go +++ b/bits.go @@ -18,8 +18,6 @@ import ( "cmp" "encoding/binary" "math" - "math/rand/v2" - "slices" "sort" "unicode" "unicode/utf8" @@ -126,11 +124,6 @@ func (a runeNgramOff) Compare(b runeNgramOff) int { } func splitNGrams(str []byte) []runeNgramOff { - // len(maxNgrams) >= the number of ngrams in str => no limit - return splitNGramsLimit(str, len(str)) -} - -func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff { var runeGram [3]rune var off [3]uint32 var runeCount int @@ -160,21 +153,6 @@ func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff { }) } - // We return a random subset of size maxNgrams. This is to prevent the start - // of the string biasing ngram selection. - if maxNgrams < len(result) { - // Deterministic seed for tests. Additionally makes comparing repeated - // queries performance easier. - r := rand.New(rand.NewPCG(uint64(maxNgrams), 0)) - - // Pick random subset via a shuffle - r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] }) - result = result[:maxNgrams] - - // Caller expects ngrams in order of appearance. - slices.SortFunc(result, runeNgramOff.Compare) - } - return result } diff --git a/indexdata.go b/indexdata.go index 91314f12..7dd72894 100644 --- a/indexdata.go +++ b/indexdata.go @@ -21,9 +21,7 @@ import ( "hash/crc64" "log" "math/bits" - "os" "slices" - "strconv" "unicode/utf8" "github.com/sourcegraph/zoekt/query" @@ -403,37 +401,11 @@ func (r *ngramIterationResults) candidates() []*candidateMatch { return cs } -// experimentIterateNgramLookupLimit when non-zero will only lookup this many -// ngrams from a query string. Note: that if case-insensitive, this only -// limits the input. So we will still lookup the case folding. -// -// This experiment is targetting looking up large snippets. If it is -// successful, we will likely hardcode the value we use in production. -// -// Future note: if we find cases where this works badly, we can consider only -// searching a random subset of the query string to avoid bad strings. -var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT") - -func getEnvInt(k string) int { - v, _ := strconv.Atoi(os.Getenv(k)) - if v != 0 { - log.Printf("%s = %d\n", k, v) - } - return v -} - func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) { str := query.Pattern // Find the 2 least common ngrams from the string. - var ngramOffs []runeNgramOff - if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 { - // Note: we can't just do str = str[:ngramLimit] due to utf-8 and str - // length is asked later on for other optimizations. - ngramOffs = splitNGramsLimit([]byte(str), ngramLimit) - } else { - ngramOffs = splitNGrams([]byte(str)) - } + ngramOffs := splitNGrams([]byte(str)) // protect against accidental searching of empty strings if len(ngramOffs) == 0 {