Skip to content

Commit

Permalink
remove SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT (#800)
Browse files Browse the repository at this point in the history
After defaulting to shard merging for all inactive repos, this in fact
makes searches slightly slower. So we can remove the experiment.

Test Plan: go test
  • Loading branch information
keegancsmith authored Aug 1, 2024
1 parent ebb3ca2 commit c01b6c7
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 51 deletions.
22 changes: 0 additions & 22 deletions bits.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ import (
"cmp"
"encoding/binary"
"math"
"math/rand/v2"
"slices"
"sort"
"unicode"
"unicode/utf8"
Expand Down Expand Up @@ -126,11 +124,6 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
}

func splitNGrams(str []byte) []runeNgramOff {
// len(maxNgrams) >= the number of ngrams in str => no limit
return splitNGramsLimit(str, len(str))
}

func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
var runeGram [3]rune
var off [3]uint32
var runeCount int
Expand Down Expand Up @@ -160,21 +153,6 @@ func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
})
}

// We return a random subset of size maxNgrams. This is to prevent the start
// of the string biasing ngram selection.
if maxNgrams < len(result) {
// Deterministic seed for tests. Additionally makes comparing repeated
// queries performance easier.
r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))

// Pick random subset via a shuffle
r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
result = result[:maxNgrams]

// Caller expects ngrams in order of appearance.
slices.SortFunc(result, runeNgramOff.Compare)
}

return result
}

Expand Down
30 changes: 1 addition & 29 deletions indexdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ import (
"hash/crc64"
"log"
"math/bits"
"os"
"slices"
"strconv"
"unicode/utf8"

"github.com/sourcegraph/zoekt/query"
Expand Down Expand Up @@ -403,37 +401,11 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
return cs
}

// experimentIterateNgramLookupLimit when non-zero will only lookup this many
// ngrams from a query string. Note: that if case-insensitive, this only
// limits the input. So we will still lookup the case folding.
//
// This experiment is targetting looking up large snippets. If it is
// successful, we will likely hardcode the value we use in production.
//
// Future note: if we find cases where this works badly, we can consider only
// searching a random subset of the query string to avoid bad strings.
var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")

func getEnvInt(k string) int {
v, _ := strconv.Atoi(os.Getenv(k))
if v != 0 {
log.Printf("%s = %d\n", k, v)
}
return v
}

func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
str := query.Pattern

// Find the 2 least common ngrams from the string.
var ngramOffs []runeNgramOff
if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
// Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
// length is asked later on for other optimizations.
ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
} else {
ngramOffs = splitNGrams([]byte(str))
}
ngramOffs := splitNGrams([]byte(str))

// protect against accidental searching of empty strings
if len(ngramOffs) == 0 {
Expand Down

0 comments on commit c01b6c7

Please sign in to comment.