From 6d36bba0d8d59ece9bd558426daad4147c6f95bf Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 22 Feb 2024 17:10:01 +0100 Subject: [PATCH 1/6] ranking: allow partial overlap with symbol --- build/scoring_test.go | 32 ++++++++++++++ contentprovider.go | 35 +++++++++++++-- contentprovider_test.go | 43 +++++++++++++++++++ internal/e2e/e2e_rank_test.go | 1 + internal/e2e/testdata/bytes_buffer.txt | 10 ++--- .../e2e/testdata/coverage_data_writer.txt | 2 +- internal/e2e/testdata/rank_stats.txt | 8 ++-- internal/e2e/testdata/time_compare.txt | 38 ++++++++++++++++ internal/e2e/testdata/zoekt_searcher.txt | 4 +- read_test.go | 4 +- .../TestReadSearch/ctagsrepo_v16.00000.golden | 4 +- .../TestReadSearch/ctagsrepo_v17.00000.golden | 4 +- .../TestReadSearch/repo2_v16.00000.golden | 4 +- 13 files changed, 166 insertions(+), 23 deletions(-) create mode 100644 internal/e2e/testdata/time_compare.txt diff --git a/build/scoring_test.go b/build/scoring_test.go index e5378fe88..8a93d08b4 100644 --- a/build/scoring_test.go +++ b/build/scoring_test.go @@ -162,6 +162,38 @@ func TestJava(t *testing.T) { // 7000 (symbol) + 900 (Java enum) + 500 (word) + 300 (atom) + 10 (file order) wantScore: 8710, }, + { + fileName: "example.java", + content: exampleJava, + query: &query.Substring{Content: true, Pattern: "unInnerInterface("}, + language: "Java", + // 4000 (overlap Symbol) + 700 (Java method) + 50 (partial word) + 10 (file order) + wantScore: 4760, + }, + { + fileName: "example.java", + content: exampleJava, + query: &query.Substring{Content: true, Pattern: "InnerEnum"}, + language: "Java", + // 7000 (Symbol) + 900 (Java enum) + 500 (word) + 10 (file order) + wantScore: 8410, + }, + { + fileName: "example.java", + content: exampleJava, + query: &query.Substring{Content: true, Pattern: "enum InnerEnum"}, + language: "Java", + // 5500 (edge Symbol) + 900 (Java enum) + 500 (word) + 10 (file order) + wantScore: 6910, + }, + { + fileName: "example.java", + content: exampleJava, + query: &query.Substring{Content: true, Pattern: "public enum InnerEnum {"}, + language: "Java", + // 4000 (overlap Symbol) + 900 (Java enum) + 500 (word) + 10 (file order) + wantScore: 5410, + }, } for _, c := range cases { diff --git a/contentprovider.go b/contentprovider.go index 5efe191a4..34a2d793c 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -552,6 +552,35 @@ func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { return 0, false } +// findMaxOverlappingSection returns the index of the section in secs that +// overlaps the most with the area defined by off and sz. If no section +// overlaps, it returns 0, false. +// +// The implementation assumes that secs is sorted by DocumentSection.End. +func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { + // Find the first section that overlaps. + j := sort.Search(len(secs), func(i int) bool { return secs[i].End > off }) + + if j == len(secs) { + return 0, false + } + + var maxSz uint32 + var maxIdx uint32 + for i := j; i < len(secs); i++ { + if secs[i].End <= off || secs[i].Start >= off+sz { + // No overlap + break + } + overlap := min(secs[i].End, off+sz) - max(secs[i].Start, off) + if overlap > maxSz { + maxSz = overlap + maxIdx = uint32(i) + } + } + return maxIdx, maxSz > 0 +} + func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { if cm.fileName { return DocumentSection{}, nil, false @@ -561,8 +590,8 @@ func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symb secIdx, ok := cm.symbolIdx, cm.symbol if !ok { - // Not from a symbol matchtree. Lets see if it intersects with a symbol. - secIdx, ok = findSection(secs, cm.byteOffset, cm.byteMatchSz) + // Not from a symbol matchTree. Let's see if it overlaps with a symbol. + secIdx, ok = findMaxOverlappingSection(secs, cm.byteOffset, cm.byteMatchSz) } if !ok { return DocumentSection{}, nil, false @@ -637,7 +666,7 @@ func (p *contentProvider) candidateMatchScore(ms []*candidateMatch, language str } else if startMatch || endMatch { addScore("EdgeSymbol", (scoreSymbol+scorePartialSymbol)/2) } else { - addScore("InnerSymbol", scorePartialSymbol) + addScore("OverlapSymbol", scorePartialSymbol) } // Score based on symbol data diff --git a/contentprovider_test.go b/contentprovider_test.go index d811f7886..645699d8c 100644 --- a/contentprovider_test.go +++ b/contentprovider_test.go @@ -407,3 +407,46 @@ func TestColumnHelper(t *testing.T) { t.Fatal("unexpected value for backwards call for first column on second line", got) } } + +func TestFindMaxOverlappingSection(t *testing.T) { + secs := []DocumentSection{ + {Start: 0, End: 10}, + {Start: 5, End: 15}, + {Start: 17, End: 21}, + } + // 0123456789012345678901 + // [.........[ + // [..........[ + // [...[ + + testcases := []struct { + name string + off uint32 + sz uint32 + wantSecIx uint32 + wantOverlap bool + }{ + {off: 1, sz: 5, wantSecIx: 0, wantOverlap: true}, + {off: 0, sz: 10, wantSecIx: 0, wantOverlap: true}, + {off: 3, sz: 10, wantSecIx: 1, wantOverlap: true}, + {off: 8, sz: 4, wantSecIx: 1, wantOverlap: true}, + {off: 12, sz: 15, wantSecIx: 2, wantOverlap: true}, + {off: 16, sz: 10, wantSecIx: 2, wantOverlap: true}, + + // No overlap + {off: 15, sz: 2, wantOverlap: false}, + {off: 21, sz: 1, wantOverlap: false}, + } + + for _, tt := range testcases { + t.Run(fmt.Sprintf("off=%d size=%d", tt.off, tt.sz), func(t *testing.T) { + got, haveOverlap := findMaxOverlappingSection(secs, tt.off, tt.sz) + if haveOverlap != tt.wantOverlap { + t.Fatalf("expected overlap %v, got %v", tt.wantOverlap, haveOverlap) + } + if got != tt.wantSecIx { + t.Fatalf("expected section %d, got %d", tt.wantSecIx, got) + } + }) + } +} diff --git a/internal/e2e/e2e_rank_test.go b/internal/e2e/e2e_rank_test.go index 77e37e44e..547399bab 100644 --- a/internal/e2e/e2e_rank_test.go +++ b/internal/e2e/e2e_rank_test.go @@ -54,6 +54,7 @@ func TestRanking(t *testing.T) { q("test server", "github.com/golang/go/src/net/http/httptest/server.go"), q("bytes buffer", "github.com/golang/go/src/bytes/buffer.go"), q("bufio buffer", "github.com/golang/go/src/bufio/scan.go"), + q("time compare\\(", "github.com/golang/go/src/time/time.go"), // sourcegraph/sourcegraph q("graphql type User", "github.com/sourcegraph/sourcegraph/cmd/frontend/graphqlbackend/schema.graphql"), diff --git a/internal/e2e/testdata/bytes_buffer.txt b/internal/e2e/testdata/bytes_buffer.txt index dba9506ab..21ee3ca16 100644 --- a/internal/e2e/testdata/bytes_buffer.txt +++ b/internal/e2e/testdata/bytes_buffer.txt @@ -14,11 +14,11 @@ github.com/golang/go/src/cmd/internal/edit/edit.go 41:func NewBuffer(data []byte) *Buffer { hidden 13 more line matches -github.com/golang/go/src/hash/crc32/crc32_ppc64le.s -122: SLD $2,R8 // convert index-> bytes -59: MOVWZ 0(R5),R8 // 0-3 bytes of p ?Endian? -60: MOVWZ 4(R5),R9 // 4-7 bytes of p -hidden 35 more line matches +github.com/golang/go/misc/wasm/wasm_exec.js +497: const bytes = encoder.encode(str + "\0"); +48: read(fd, buffer, offset, length, position, callback) { callback(enosys()); }, +192: return new Uint8Array(this._inst.exports.mem.buffer, array, len); +hidden 10 more line matches github.com/golang/go/src/fmt/print.go 101:type buffer []byte diff --git a/internal/e2e/testdata/coverage_data_writer.txt b/internal/e2e/testdata/coverage_data_writer.txt index a4a094f7c..06abe41e5 100644 --- a/internal/e2e/testdata/coverage_data_writer.txt +++ b/internal/e2e/testdata/coverage_data_writer.txt @@ -5,7 +5,7 @@ targetRank: 13 github.com/golang/go/src/internal/coverage/stringtab/stringtab.go 19:type Writer struct { 27:func (stw *Writer) InitWriter() { -9: "internal/coverage/slicereader" +70:func (stw *Writer) Write(w io.Writer) error { hidden 16 more line matches github.com/golang/go/src/cmd/cover/func.go diff --git a/internal/e2e/testdata/rank_stats.txt b/internal/e2e/testdata/rank_stats.txt index 81969e3f6..b937357cc 100644 --- a/internal/e2e/testdata/rank_stats.txt +++ b/internal/e2e/testdata/rank_stats.txt @@ -1,4 +1,4 @@ -queries: 14 -recall@1: 9 (64%) -recall@5: 11 (79%) -mrr: 0.710733 +queries: 15 +recall@1: 10 (67%) +recall@5: 12 (80%) +mrr: 0.730017 diff --git a/internal/e2e/testdata/time_compare.txt b/internal/e2e/testdata/time_compare.txt new file mode 100644 index 000000000..56944c3ce --- /dev/null +++ b/internal/e2e/testdata/time_compare.txt @@ -0,0 +1,38 @@ +queryString: time compare\( +query: (and substr:"time" substr:"compare(") +targetRank: 1 + +**github.com/golang/go/src/time/time.go** +129:type Time struct { +79:package time +271:func (t Time) Compare(u Time) int { +hidden 250 more line matches + +github.com/sourcegraph/sourcegraph/internal/api/api.go +127:func (r ExternalRepoSpec) Compare(s ExternalRepoSpec) int { +7: "time" +170: CreatedAt time.Time // the date when this settings value was created + +github.com/sourcegraph/sourcegraph/client/shared/src/codeintel/scip.ts +117: public compare(other: Range): number { +53: return this.compare(other) < 0 +56: return this.compare(other) <= 0 +hidden 10 more line matches + +github.com/golang/go/src/strings/compare.go +13:func Compare(a, b string) int { +14: // NOTE(rsc): This function does NOT call the runtime cmpstring function, + +github.com/golang/go/src/go/constant/value.go +1337:func Compare(x_ Value, op token.Token, y_ Value) bool { +1102:// Division by zero leads to a run-time panic. +1381: re := Compare(x.re, token.EQL, y.re) +hidden 1 more line matches + +github.com/golang/go/src/syscall/zsyscall_windows.go +878:func GetSystemTimeAsFileTime(time *Filetime) { +1088:func SetFileTime(handle Handle, ctime *Filetime, atime *Filetime, wtime *Filetime) (err error) { +132: procGetSystemTimeAsFileTime = modkernel32.NewProc("GetSystemTimeAsFileTime") +hidden 19 more line matches + +hidden 139 more file matches diff --git a/internal/e2e/testdata/zoekt_searcher.txt b/internal/e2e/testdata/zoekt_searcher.txt index 0059b49eb..981da4a21 100644 --- a/internal/e2e/testdata/zoekt_searcher.txt +++ b/internal/e2e/testdata/zoekt_searcher.txt @@ -11,7 +11,7 @@ hidden 13 more line matches github.com/sourcegraph/zoekt/rpc/internal/srv/srv.go 33:type Searcher struct { 34: Searcher zoekt.Searcher -7: "github.com/sourcegraph/zoekt" +37:func (s *Searcher) Search(ctx context.Context, args *SearchArgs, reply *SearchReply) error { hidden 9 more line matches github.com/sourcegraph/sourcegraph/doc/admin/observability/dashboards.md @@ -35,7 +35,7 @@ hidden 1 more line matches github.com/sourcegraph/zoekt/json/json.go 26: Searcher zoekt.Searcher 25:type jsonSearcher struct { -9: "github.com/sourcegraph/zoekt" +48:func (s *jsonSearcher) jsonSearch(w http.ResponseWriter, req *http.Request) { hidden 16 more line matches hidden 119 more file matches diff --git a/read_test.go b/read_test.go index 1e5668d8a..9e7acd135 100644 --- a/read_test.go +++ b/read_test.go @@ -306,8 +306,8 @@ func TestReadSearch(t *testing.T) { continue } - if d := cmp.Diff(res.Files, want.FileMatches[j]); d != "" { - t.Errorf("matches for %s on %s\n%s", q, name, d) + if d := cmp.Diff(want.FileMatches[j], res.Files); d != "" { + t.Errorf("matches for %s on %s (-want +got)\n%s", q, name, d) } } } diff --git a/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden b/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden index fb556563b..2ec38c772 100644 --- a/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden +++ b/testdata/golden/TestReadSearch/ctagsrepo_v16.00000.golden @@ -16,7 +16,7 @@ "Before": null, "After": null, "FileName": false, - "Score": 501, + "Score": 6801, "DebugScore": "", "LineFragments": [ { @@ -29,7 +29,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 6810 } ], [ diff --git a/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden b/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden index a594eb2fc..de1f98e2f 100644 --- a/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden +++ b/testdata/golden/TestReadSearch/ctagsrepo_v17.00000.golden @@ -16,7 +16,7 @@ "Before": null, "After": null, "FileName": false, - "Score": 501, + "Score": 6801, "DebugScore": "", "LineFragments": [ { @@ -29,7 +29,7 @@ } ], "Checksum": "n9fUYqacPXg=", - "Score": 510 + "Score": 6810 } ], [ diff --git a/testdata/golden/TestReadSearch/repo2_v16.00000.golden b/testdata/golden/TestReadSearch/repo2_v16.00000.golden index 429a5b73d..82a6112d8 100644 --- a/testdata/golden/TestReadSearch/repo2_v16.00000.golden +++ b/testdata/golden/TestReadSearch/repo2_v16.00000.golden @@ -16,7 +16,7 @@ "Before": null, "After": null, "FileName": false, - "Score": 501, + "Score": 6801, "DebugScore": "", "LineFragments": [ { @@ -29,7 +29,7 @@ } ], "Checksum": "Ju1TnQKZ6mE=", - "Score": 510 + "Score": 6810 } ], [ From 11029defc61ce3212ad313a872395f12019c1d65 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 22 Feb 2024 18:19:07 +0100 Subject: [PATCH 2/6] state assumption about non-overlapping sections --- contentprovider.go | 2 +- contentprovider_test.go | 32 +++++++++++++++++--------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/contentprovider.go b/contentprovider.go index 34a2d793c..f0be27995 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -556,7 +556,7 @@ func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { // overlaps the most with the area defined by off and sz. If no section // overlaps, it returns 0, false. // -// The implementation assumes that secs is sorted by DocumentSection.End. +// The implementation assumes that sections do not overlap. func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { // Find the first section that overlaps. j := sort.Search(len(secs), func(i int) bool { return secs[i].End > off }) diff --git a/contentprovider_test.go b/contentprovider_test.go index 645699d8c..f795af9fa 100644 --- a/contentprovider_test.go +++ b/contentprovider_test.go @@ -410,14 +410,14 @@ func TestColumnHelper(t *testing.T) { func TestFindMaxOverlappingSection(t *testing.T) { secs := []DocumentSection{ - {Start: 0, End: 10}, - {Start: 5, End: 15}, - {Start: 17, End: 21}, + {Start: 0, End: 5}, + {Start: 8, End: 19}, + {Start: 22, End: 26}, } - // 0123456789012345678901 - // [.........[ - // [..........[ - // [...[ + // 012345678901234567890123456 + // [....[ + // [..........[ + // [...[ testcases := []struct { name string @@ -426,16 +426,18 @@ func TestFindMaxOverlappingSection(t *testing.T) { wantSecIx uint32 wantOverlap bool }{ - {off: 1, sz: 5, wantSecIx: 0, wantOverlap: true}, - {off: 0, sz: 10, wantSecIx: 0, wantOverlap: true}, - {off: 3, sz: 10, wantSecIx: 1, wantOverlap: true}, - {off: 8, sz: 4, wantSecIx: 1, wantOverlap: true}, - {off: 12, sz: 15, wantSecIx: 2, wantOverlap: true}, - {off: 16, sz: 10, wantSecIx: 2, wantOverlap: true}, + {off: 0, sz: 1, wantSecIx: 0, wantOverlap: true}, + {off: 0, sz: 5, wantSecIx: 0, wantOverlap: true}, + {off: 2, sz: 5, wantSecIx: 0, wantOverlap: true}, + {off: 2, sz: 50, wantSecIx: 1, wantOverlap: true}, + {off: 4, sz: 10, wantSecIx: 1, wantOverlap: true}, + {off: 5, sz: 15, wantSecIx: 1, wantOverlap: true}, + {off: 18, sz: 10, wantSecIx: 2, wantOverlap: true}, // No overlap - {off: 15, sz: 2, wantOverlap: false}, - {off: 21, sz: 1, wantOverlap: false}, + {off: 5, sz: 2, wantOverlap: false}, + {off: 20, sz: 1, wantOverlap: false}, + {off: 99, sz: 1, wantOverlap: false}, } for _, tt := range testcases { From d9626cd16e4cc06cb88f8623acb628e952838e03 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 22 Feb 2024 18:36:33 +0100 Subject: [PATCH 3/6] reverse change to bytes_buffer.txt --- internal/e2e/testdata/bytes_buffer.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/e2e/testdata/bytes_buffer.txt b/internal/e2e/testdata/bytes_buffer.txt index 21ee3ca16..dba9506ab 100644 --- a/internal/e2e/testdata/bytes_buffer.txt +++ b/internal/e2e/testdata/bytes_buffer.txt @@ -14,11 +14,11 @@ github.com/golang/go/src/cmd/internal/edit/edit.go 41:func NewBuffer(data []byte) *Buffer { hidden 13 more line matches -github.com/golang/go/misc/wasm/wasm_exec.js -497: const bytes = encoder.encode(str + "\0"); -48: read(fd, buffer, offset, length, position, callback) { callback(enosys()); }, -192: return new Uint8Array(this._inst.exports.mem.buffer, array, len); -hidden 10 more line matches +github.com/golang/go/src/hash/crc32/crc32_ppc64le.s +122: SLD $2,R8 // convert index-> bytes +59: MOVWZ 0(R5),R8 // 0-3 bytes of p ?Endian? +60: MOVWZ 4(R5),R9 // 4-7 bytes of p +hidden 35 more line matches github.com/golang/go/src/fmt/print.go 101:type buffer []byte From eefa29c014b9897936356dbdbcd1893e356ec78a Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Fri, 23 Feb 2024 13:29:48 +0100 Subject: [PATCH 4/6] pin go version in tests to 1.21 --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 75038b07d..c749bf884 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,8 @@ name: CI jobs: test: runs-on: ubuntu-latest - container: alpine:edge # go1.19 needs > alpine 3.15 + # Pinned to alpine 3.19 to fix go version to 1.21. Remove this once Sourcegraph is on Go 1.22. + container: alpine:3.19 steps: - name: checkout uses: actions/checkout@v3 From 0a7b689ea306feb7aae3242769640887538fd9c2 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Thu, 29 Feb 2024 11:03:09 +0100 Subject: [PATCH 5/6] use relative overlap instead of absolute overlap --- contentprovider.go | 47 ++++++++++++++++++++++++++--------------- contentprovider_test.go | 28 +++++++++++++++--------- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/contentprovider.go b/contentprovider.go index f0be27995..05b2b7a57 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -553,32 +553,45 @@ func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { } // findMaxOverlappingSection returns the index of the section in secs that -// overlaps the most with the area defined by off and sz. If no section -// overlaps, it returns 0, false. +// overlaps the most with the area defined by off and sz, relative to the size +// of the section. If no section overlaps, it returns 0, false. If multiple +// sections overlap the same amount, the first one is returned. // -// The implementation assumes that sections do not overlap. +// The implementation assumes that sections do not overlap and are sorted by +// DocumentSection.Start. func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { - // Find the first section that overlaps. + // Find the first section that might overlap j := sort.Search(len(secs), func(i int) bool { return secs[i].End > off }) - if j == len(secs) { + if j == len(secs) || secs[j].Start >= off+sz { + // No overlap. return 0, false } - var maxSz uint32 - var maxIdx uint32 - for i := j; i < len(secs); i++ { - if secs[i].End <= off || secs[i].Start >= off+sz { - // No overlap - break - } - overlap := min(secs[i].End, off+sz) - max(secs[i].Start, off) - if overlap > maxSz { - maxSz = overlap - maxIdx = uint32(i) + relOverlap := func(j int) float64 { + secSize := secs[j].End - secs[j].Start + if secSize == 0 { + return 0 } + // This cannot overflow because we make sure there is overlap before calling relOverlap + overlap := min(secs[j].End, off+sz) - max(secs[j].Start, off) + return float64(overlap) / float64(secSize) + } + + ol1 := relOverlap(j) + if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= off+sz { + return uint32(j), ol1 > 0 + } + + // We know that [off,off+sz[ overlaps with at least 2 sections. We only have to check + // if the second section overlaps more than the first one, because a third + // section can only overlap if the overlap with the second section is complete. + ol2 := relOverlap(j + 1) + if ol2 > ol1 { + return uint32(j + 1), ol2 > 0 } - return maxIdx, maxSz > 0 + + return uint32(j), ol1 > 0 } func (p *contentProvider) findSymbol(cm *candidateMatch) (DocumentSection, *Symbol, bool) { diff --git a/contentprovider_test.go b/contentprovider_test.go index f795af9fa..b8bbc253b 100644 --- a/contentprovider_test.go +++ b/contentprovider_test.go @@ -423,21 +423,29 @@ func TestFindMaxOverlappingSection(t *testing.T) { name string off uint32 sz uint32 - wantSecIx uint32 + wantSecIdx uint32 wantOverlap bool }{ - {off: 0, sz: 1, wantSecIx: 0, wantOverlap: true}, - {off: 0, sz: 5, wantSecIx: 0, wantOverlap: true}, - {off: 2, sz: 5, wantSecIx: 0, wantOverlap: true}, - {off: 2, sz: 50, wantSecIx: 1, wantOverlap: true}, - {off: 4, sz: 10, wantSecIx: 1, wantOverlap: true}, - {off: 5, sz: 15, wantSecIx: 1, wantOverlap: true}, - {off: 18, sz: 10, wantSecIx: 2, wantOverlap: true}, + {off: 0, sz: 1, wantSecIdx: 0, wantOverlap: true}, + {off: 0, sz: 5, wantSecIdx: 0, wantOverlap: true}, + {off: 2, sz: 5, wantSecIdx: 0, wantOverlap: true}, + {off: 2, sz: 50, wantSecIdx: 1, wantOverlap: true}, + {off: 4, sz: 10, wantSecIdx: 1, wantOverlap: true}, + {off: 5, sz: 15, wantSecIdx: 1, wantOverlap: true}, + {off: 18, sz: 10, wantSecIdx: 2, wantOverlap: true}, + + // Prefer full overlap, break ties by preferring the earlier section + {off: 10, sz: 20, wantSecIdx: 2, wantOverlap: true}, + {off: 0, sz: 100, wantSecIdx: 0, wantOverlap: true}, + {off: 8, sz: 100, wantSecIdx: 1, wantOverlap: true}, + {off: 0, sz: 10, wantSecIdx: 0, wantOverlap: true}, + {off: 16, sz: 10, wantSecIdx: 2, wantOverlap: true}, // No overlap {off: 5, sz: 2, wantOverlap: false}, {off: 20, sz: 1, wantOverlap: false}, {off: 99, sz: 1, wantOverlap: false}, + {off: 0, sz: 0, wantOverlap: false}, } for _, tt := range testcases { @@ -446,8 +454,8 @@ func TestFindMaxOverlappingSection(t *testing.T) { if haveOverlap != tt.wantOverlap { t.Fatalf("expected overlap %v, got %v", tt.wantOverlap, haveOverlap) } - if got != tt.wantSecIx { - t.Fatalf("expected section %d, got %d", tt.wantSecIx, got) + if got != tt.wantSecIdx { + t.Fatalf("expected section %d, got %d", tt.wantSecIdx, got) } }) } From c412114517e099161c4d3a05df826e8be3aaf759 Mon Sep 17 00:00:00 2001 From: Stefan Hengl Date: Fri, 1 Mar 2024 10:06:38 +0100 Subject: [PATCH 6/6] introduce start and end --- contentprovider.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/contentprovider.go b/contentprovider.go index 05b2b7a57..28e70669d 100644 --- a/contentprovider.go +++ b/contentprovider.go @@ -560,10 +560,13 @@ func findSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { // The implementation assumes that sections do not overlap and are sorted by // DocumentSection.Start. func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, bool) { + start := off + end := off + sz + // Find the first section that might overlap - j := sort.Search(len(secs), func(i int) bool { return secs[i].End > off }) + j := sort.Search(len(secs), func(i int) bool { return secs[i].End > start }) - if j == len(secs) || secs[j].Start >= off+sz { + if j == len(secs) || secs[j].Start >= end { // No overlap. return 0, false } @@ -574,12 +577,12 @@ func findMaxOverlappingSection(secs []DocumentSection, off, sz uint32) (uint32, return 0 } // This cannot overflow because we make sure there is overlap before calling relOverlap - overlap := min(secs[j].End, off+sz) - max(secs[j].Start, off) + overlap := min(secs[j].End, end) - max(secs[j].Start, start) return float64(overlap) / float64(secSize) } ol1 := relOverlap(j) - if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= off+sz { + if epsilonEqualsOne(ol1) || j == len(secs)-1 || secs[j+1].Start >= end { return uint32(j), ol1 > 0 }