From c01b6c7778a6cd60b7a6be8db0a9f140560060d8 Mon Sep 17 00:00:00 2001
From: Keegan Carruthers-Smith <keegan.csmith@gmail.com>
Date: Thu, 1 Aug 2024 16:12:43 +0200
Subject: [PATCH] remove SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT (#800)

After defaulting to shard merging for all inactive repos, this in fact
makes searches slightly slower. So we can remove the experiment.

Test Plan: go test
---
 bits.go      | 22 ----------------------
 indexdata.go | 30 +-----------------------------
 2 files changed, 1 insertion(+), 51 deletions(-)

diff --git a/bits.go b/bits.go
index 5fc3e288..d438cbf1 100644
--- a/bits.go
+++ b/bits.go
@@ -18,8 +18,6 @@ import (
 	"cmp"
 	"encoding/binary"
 	"math"
-	"math/rand/v2"
-	"slices"
 	"sort"
 	"unicode"
 	"unicode/utf8"
@@ -126,11 +124,6 @@ func (a runeNgramOff) Compare(b runeNgramOff) int {
 }
 
 func splitNGrams(str []byte) []runeNgramOff {
-	// len(maxNgrams) >= the number of ngrams in str => no limit
-	return splitNGramsLimit(str, len(str))
-}
-
-func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
 	var runeGram [3]rune
 	var off [3]uint32
 	var runeCount int
@@ -160,21 +153,6 @@ func splitNGramsLimit(str []byte, maxNgrams int) []runeNgramOff {
 		})
 	}
 
-	// We return a random subset of size maxNgrams. This is to prevent the start
-	// of the string biasing ngram selection.
-	if maxNgrams < len(result) {
-		//  Deterministic seed for tests. Additionally makes comparing repeated
-		//  queries performance easier.
-		r := rand.New(rand.NewPCG(uint64(maxNgrams), 0))
-
-		// Pick random subset via a shuffle
-		r.Shuffle(maxNgrams, func(i, j int) { result[i], result[j] = result[j], result[i] })
-		result = result[:maxNgrams]
-
-		// Caller expects ngrams in order of appearance.
-		slices.SortFunc(result, runeNgramOff.Compare)
-	}
-
 	return result
 }
 
diff --git a/indexdata.go b/indexdata.go
index 91314f12..7dd72894 100644
--- a/indexdata.go
+++ b/indexdata.go
@@ -21,9 +21,7 @@ import (
 	"hash/crc64"
 	"log"
 	"math/bits"
-	"os"
 	"slices"
-	"strconv"
 	"unicode/utf8"
 
 	"github.com/sourcegraph/zoekt/query"
@@ -403,37 +401,11 @@ func (r *ngramIterationResults) candidates() []*candidateMatch {
 	return cs
 }
 
-// experimentIterateNgramLookupLimit when non-zero will only lookup this many
-// ngrams from a query string. Note: that if case-insensitive, this only
-// limits the input. So we will still lookup the case folding.
-//
-// This experiment is targetting looking up large snippets. If it is
-// successful, we will likely hardcode the value we use in production.
-//
-// Future note: if we find cases where this works badly, we can consider only
-// searching a random subset of the query string to avoid bad strings.
-var experimentIterateNgramLookupLimit = getEnvInt("SRC_EXPERIMENT_ITERATE_NGRAM_LOOKUP_LIMIT")
-
-func getEnvInt(k string) int {
-	v, _ := strconv.Atoi(os.Getenv(k))
-	if v != 0 {
-		log.Printf("%s = %d\n", k, v)
-	}
-	return v
-}
-
 func (d *indexData) iterateNgrams(query *query.Substring) (*ngramIterationResults, error) {
 	str := query.Pattern
 
 	// Find the 2 least common ngrams from the string.
-	var ngramOffs []runeNgramOff
-	if ngramLimit := experimentIterateNgramLookupLimit; ngramLimit > 0 {
-		// Note: we can't just do str = str[:ngramLimit] due to utf-8 and str
-		// length is asked later on for other optimizations.
-		ngramOffs = splitNGramsLimit([]byte(str), ngramLimit)
-	} else {
-		ngramOffs = splitNGrams([]byte(str))
-	}
+	ngramOffs := splitNGrams([]byte(str))
 
 	// protect against accidental searching of empty strings
 	if len(ngramOffs) == 0 {