Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into jtibs/score-kind
Browse files Browse the repository at this point in the history
  • Loading branch information
jtibshirani committed Oct 26, 2023
2 parents 7052188 + b5a5fdc commit afd8dca
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 11 deletions.
4 changes: 2 additions & 2 deletions build/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1051,8 +1051,8 @@ func Get() {
&query.Symbol{Expr: &query.Substring{Pattern: "http", Content: true}},
&query.Symbol{Expr: &query.Substring{Pattern: "Get", Content: true}}}},
wantLanguage: "Go",
// 7000 (full base match) + 800 (Go func) + 500 (word) + 200 (atom) + 10 (file order)
wantScore: 8510,
// 7000 (full base match) + 800 (Go func) + 50 (Exported Go) + 500 (word) + 200 (atom) + 10 (file order)
wantScore: 8560,
},
//
// C++
Expand Down
103 changes: 94 additions & 9 deletions contentprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@ import (
"bytes"
"fmt"
"log"
"os"
"path"
"sort"
"strings"
"unicode"
"unicode/utf8"

"github.com/sourcegraph/zoekt/ctags"
"golang.org/x/exp/slices"
)

var _ = log.Println
Expand Down Expand Up @@ -508,6 +512,9 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch,
score.score += s
}

data := p.data(m.FileName)
filename := p.data(true)

for i, r := range m.Ranges {
// calculate the start and end offset relative to the start of the content
relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset)
Expand Down Expand Up @@ -559,7 +566,8 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch,
}
if si != nil {
symbolKind := ctags.ParseSymbolKind(si.Kind)
addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, symbolKind))
sym := sectionSlice(data, sec)
addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
}
}

Expand Down Expand Up @@ -592,6 +600,9 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu
score.score += s
}

data := p.data(m.FileName)
filename := p.data(true)

for _, f := range m.LineFragments {
startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset]))

Expand Down Expand Up @@ -639,7 +650,8 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu
if si != nil {
// the LineFragment may not be on a symbol, then si will be nil.
symbolKind := ctags.ParseSymbolKind(si.Kind)
addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, symbolKind))
sym := sectionSlice(data, sec)
addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, symbolKind))
}
}

Expand All @@ -656,9 +668,24 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu
return maxScore.score, maxScore.what
}

// scoreKind boosts a match based on the combination of language and kind. The
// language string comes from go-enry, the kind string from ctags.
func scoreKind(language string, kind ctags.SymbolKind) float64 {
// sectionSlice will return data[sec.Start:sec.End] but will clip Start and
// End such that it won't be out of range.
func sectionSlice(data []byte, sec DocumentSection) []byte {
l := uint32(len(data))
if sec.Start >= l {
return nil
}
if sec.End > l {
sec.End = l
}
return data[sec.Start:sec.End]
}


// scoreSymbolKind boosts a match based on the combination of language, symbol
// and kind. The language string comes from go-enry, the symbol and kind from
// ctags.
func scoreSymbolKind(language string, filename []byte, sym []byte, kind ctags.SymbolKind) float64 {
var factor float64

// Generic ranking which will be overriden by language specific ranking
Expand Down Expand Up @@ -725,7 +752,7 @@ func scoreKind(language string, kind ctags.SymbolKind) float64 {
// scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659
// for each case a description of the fields in ctags in the comment
case ctags.Type: // interface struct talias
factor = 10
factor = 9
case ctags.Interface: // interfaces
factor = 10
case ctags.Struct: // structs
Expand All @@ -743,6 +770,16 @@ func scoreKind(language string, kind ctags.SymbolKind) float64 {
case ctags.Variable: // variables
factor = 5
}

// Boost exported go symbols. Same implementation as token.IsExported
if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) {
factor += 0.5
}

if bytes.HasSuffix(filename, []byte("_test.go")) {
factor *= 0.8
}

// Could also rank on:
//
// - anonMember struct anonymous members
Expand Down Expand Up @@ -899,9 +936,57 @@ func sortChunkMatchesByScore(ms []ChunkMatch) {
sort.Sort(chunkMatchScoreSlice(ms))
}

// SortFiles sorts files matches. The order depends on the match score, which includes both
// query-dependent signals like word overlap, and file-only signals like the file ranks (if
// file ranks are enabled).
var doNovelty = os.Getenv("ZOEKT_NOVELTY_DISABLE") == ""

// SortFiles sorts files matches in the order we want to present results to
// users. The order depends on the match score, which includes both
// query-dependent signals like word overlap, and file-only signals like the
// file ranks (if file ranks are enabled).
//
// We don't only use the scores, we will also boost some results to present
// files with novel extensions.
func SortFiles(ms []FileMatch) {
sort.Sort(fileMatchesByScore(ms))

if doNovelty {
// Experimentally boost something into the third filematch
boostNovelExtension(ms, 2, 0.9)
}
}

func boostNovelExtension(ms []FileMatch, boostOffset int, minScoreRatio float64) {
if len(ms) <= boostOffset+1 {
return
}

top := ms[:boostOffset]
candidates := ms[boostOffset:]

// Don't bother boosting something which is significantly different to the
// result it replaces.
minScoreForNovelty := candidates[0].Score * minScoreRatio

// We want to look for an ext that isn't in the top exts
exts := make([]string, len(top))
for i := range top {
exts[i] = path.Ext(top[i].FileName)
}

for i := range candidates {
// Do not assume sorted due to boostNovelExtension being called on subsets
if candidates[i].Score < minScoreForNovelty {
continue
}

if slices.Contains(exts, path.Ext(candidates[i].FileName)) {
continue
}

// Found what we are looking for, now boost to front of candidates (which
// is ms[boostOffset])
for ; i > 0; i-- {
candidates[i], candidates[i-1] = candidates[i-1], candidates[i]
}
return
}
}

0 comments on commit afd8dca

Please sign in to comment.