From b5a5fdc86e830d80b2be3154bb9f896fd20c3e3e Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Thu, 26 Oct 2023 13:20:24 +0200 Subject: [PATCH] score: boost exported go ident and downrank _test.go (#675) Right now our symbol analyser doesn't tell us if a symbol is exported. We add a go specific tweak here to boost those results. Ideally this could be something that is encoded in the symbol information. Additionally we do downrank _test.go files via the doc-order. But in the case of symbol matches the boosting overweighs doc order significantly. I found the extra downranking quite useful when experimenting. Test Plan: lots of manual testing on the keyword branch --- build/e2e_test.go | 4 ++-- contentprovider.go | 47 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/build/e2e_test.go b/build/e2e_test.go index 94dbf6b9..bf631108 100644 --- a/build/e2e_test.go +++ b/build/e2e_test.go @@ -1041,8 +1041,8 @@ func Get() { &query.Symbol{Expr: &query.Substring{Pattern: "http", Content: true}}, &query.Symbol{Expr: &query.Substring{Pattern: "Get", Content: true}}}}, wantLanguage: "Go", - // 7000 (full base match) + 800 (Go func) + 500 (word) + 200 (atom) + 10 (file order) - wantScore: 8510, + // 7000 (full base match) + 800 (Go func) + 50 (Exported Go) + 500 (word) + 200 (atom) + 10 (file order) + wantScore: 8560, }, // // C++ diff --git a/contentprovider.go b/contentprovider.go index 1ebda79d..1886458f 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -22,6 +22,7 @@ import ( "path" "sort" "strings" + "unicode" "unicode/utf8" "golang.org/x/exp/slices" @@ -510,6 +511,9 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, score.score += s } + data := p.data(m.FileName) + filename := p.data(true) + for i, r := range m.Ranges { // calculate the start and end offset relative to the start of the content relStartOffset := int(r.Start.ByteOffset - m.ContentStart.ByteOffset) 
@@ -560,7 +564,8 @@ func (p *contentProvider) chunkMatchScore(secs []DocumentSection, m *ChunkMatch, si = p.id.symbols.data(start + uint32(secIdx)) } if si != nil { - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind)) + sym := sectionSlice(data, sec) + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind)) } } @@ -593,6 +598,9 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu score.score += s } + data := p.data(m.FileName) + filename := p.data(true) + for _, f := range m.LineFragments { startBoundary := f.LineOffset < len(m.Line) && (f.LineOffset == 0 || byteClass(m.Line[f.LineOffset-1]) != byteClass(m.Line[f.LineOffset])) @@ -639,7 +647,8 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu } if si != nil { // the LineFragment may not be on a symbol, then si will be nil. - addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreKind(language, si.Kind)) + sym := sectionSlice(data, sec) + addScore(fmt.Sprintf("kind:%s:%s", language, si.Kind), scoreSymbolKind(language, filename, sym, si.Kind)) } } @@ -656,9 +665,23 @@ func (p *contentProvider) matchScore(secs []DocumentSection, m *LineMatch, langu return maxScore.score, maxScore.what } -// scoreKind boosts a match based on the combination of language and kind. The -// language string comes from go-enry, the kind string from ctags. -func scoreKind(language string, kind string) float64 { +// sectionSlice will return data[sec.Start:sec.End] but will clip Start and +// End such that it won't be out of range. +func sectionSlice(data []byte, sec DocumentSection) []byte { + l := uint32(len(data)) + if sec.Start >= l { + return nil + } + if sec.End > l { + sec.End = l + } + return data[sec.Start:sec.End] +} + +// scoreSymbolKind boosts a match based on the combination of language, symbol +// and kind. 
The language string comes from go-enry, the symbol and kind from +// ctags. +func scoreSymbolKind(language string, filename []byte, sym []byte, kind string) float64 { var factor float64 // Generic ranking which will be overriden by language specific ranking @@ -731,8 +754,8 @@ func scoreKind(language string, kind string) float64 { // scip-ctags regression workaround https://github.com/sourcegraph/sourcegraph/issues/57659 // for each case a description of the fields in ctags in the comment case "type": // interface struct talias - factor = 10 - case "method", "function": // methodSpec + factor = 9 + case "method", "function": // methodSpec func factor = 8 case "variable": // var member factor = 7 @@ -756,6 +779,16 @@ func scoreKind(language string, kind string) float64 { case "var": // variables factor = 5 } + + // Boost exported go symbols. Same implementation as token.IsExported + if ch, _ := utf8.DecodeRune(sym); unicode.IsUpper(ch) { + factor += 0.5 + } + + if bytes.HasSuffix(filename, []byte("_test.go")) { + factor *= 0.8 + } + // Could also rank on: // // - anonMember struct anonymous members