From 09c5343b92d38ddc650c09a3e3cadef83020d8a7 Mon Sep 17 00:00:00 2001 From: Keegan Carruthers-Smith Date: Mon, 7 Oct 2024 08:53:38 +0200 Subject: [PATCH] web: escape + as %2B in file path templates When constructing a shard we specify templates for constructing a URL. Finally those URLs end up going via html/template which has pretty strict escaping rules. This commit makes two changes: URL construction via text/template. We still get the safety benefits later on when finally rendering the output, but given we are constructing URLs it makes more sense to use text/template. Special escaping of + in URLs. I couldn't convince html/template to not break URls containing + in it. So instead we use + escaped to %2B. I tested gerrit, github and sourcegraph with %2B in filenames and they all worked. To do the above I introduced a template function called URLJoinPath which is a wrapper around url.JoinPath with the additional behaviour around + escaping. Test Plan: Did lots of updates in tests. See diff. --- gitindex/index.go | 44 ++++++++++++------- gitindex/index_test.go | 95 +++++++++++++++++++++++++++++++++++++++++- indexbuilder.go | 36 +++++++++++++++- web/e2e_test.go | 13 +++--- web/server.go | 28 +++++++++++-- web/snippets.go | 6 +-- 6 files changed, 191 insertions(+), 31 deletions(-) diff --git a/gitindex/index.go b/gitindex/index.go index 172651c4f..183c3249a 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -90,22 +90,38 @@ func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { u.User = nil } + // helper to generate u.JoinPath as a template + varVersion := ".Version" + varPath := ".Path" + urlJoinPath := func(elem ...string) string { + elem = append([]string{u.String()}, elem...) + var parts []string + for _, e := range elem { + if e == varVersion || e == varPath { + parts = append(parts, e) + } else { + parts = append(parts, strconv.Quote(e)) + } + } + return fmt.Sprintf("{{URLJoinPath %s}}", strings.Join(parts, " ")) + } + repo.URL = u.String() switch typ { case "gitiles": // eg. https://gerrit.googlesource.com/gitiles/+/master/tools/run_dev.sh#20 - repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" - repo.FileURLTemplate = u.String() + "/+/{{.Version}}/{{.Path}}" + repo.CommitURLTemplate = urlJoinPath("+", varVersion) + repo.FileURLTemplate = urlJoinPath("+", varVersion, varPath) repo.LineFragmentTemplate = "#{{.LineNumber}}" case "github": // eg. https://github.com/hanwen/go-fuse/blob/notify/genversion.sh#L10 - repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" - repo.FileURLTemplate = u.String() + "/blob/{{.Version}}/{{.Path}}" + repo.CommitURLTemplate = urlJoinPath("commit", varVersion) + repo.FileURLTemplate = urlJoinPath("blob", varVersion, varPath) repo.LineFragmentTemplate = "#L{{.LineNumber}}" case "cgit": // http://git.savannah.gnu.org/cgit/lilypond.git/tree/elisp/lilypond-mode.el?h=dev/philh&id=b2ca0fefe3018477aaca23b6f672c7199ba5238e#n100 - repo.CommitURLTemplate = u.String() + "/commit/?id={{.Version}}" - repo.FileURLTemplate = u.String() + "/tree/{{.Path}}/?id={{.Version}}" + repo.CommitURLTemplate = urlJoinPath("commit") + "/?id={{.Version}}" + repo.FileURLTemplate = urlJoinPath("tree", varPath) + "/?id={{.Version}}" repo.LineFragmentTemplate = "#n{{.LineNumber}}" case "gitweb": // https://gerrit.libreoffice.org/gitweb?p=online.git;a=blob;f=Makefile.am;h=cfcfd7c36fbae10e269653dc57a9b68c92d4c10b;hb=848145503bf7b98ce4a4aa0a858a0d71dd0dbb26#l10 @@ -115,29 +131,29 @@ func setTemplates(repo *zoekt.Repository, u *url.URL, typ string) error { case "source.bazel.build": // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9 // https://source.bazel.build/bazel/+/57bc201346e61c62a921c1cbf32ad24f185c10c9:tools/cpp/BUILD.empty;l=10 - repo.CommitURLTemplate = u.String() + "/+/{{.Version}}" - repo.FileURLTemplate = u.String() + "/+/{{.Version}}:{{.Path}}" + repo.CommitURLTemplate = u.String() + "/%2B/{{.Version}}" + repo.FileURLTemplate = u.String() + "/%2B/{{.Version}}:{{.Path}}" repo.LineFragmentTemplate = ";l={{.LineNumber}}" case "bitbucket-server": // https:///projects//repos//commits/5be7ca73b898bf17a08e607918accfdeafe1e0bc // https:///projects//repos//browse/?at=5be7ca73b898bf17a08e607918accfdeafe1e0bc - repo.CommitURLTemplate = u.String() + "/commits/{{.Version}}" - repo.FileURLTemplate = u.String() + "/{{.Path}}?at={{.Version}}" + repo.CommitURLTemplate = urlJoinPath("commits", varVersion) + repo.FileURLTemplate = urlJoinPath(varPath) + "?at={{.Version}}" repo.LineFragmentTemplate = "#{{.LineNumber}}" case "gitlab": // https://gitlab.com/gitlab-org/omnibus-gitlab/-/commit/b152c864303dae0e55377a1e2c53c9592380ffed // https://gitlab.com/gitlab-org/omnibus-gitlab/-/blob/aad04155b3f6fc50ede88aedaee7fc624d481149/files/gitlab-config-template/gitlab.rb.template - repo.CommitURLTemplate = u.String() + "/-/commit/{{.Version}}" - repo.FileURLTemplate = u.String() + "/-/blob/{{.Version}}/{{.Path}}" + repo.CommitURLTemplate = urlJoinPath("-/commit", varVersion) + repo.FileURLTemplate = urlJoinPath("-/blob", varVersion, varPath) repo.LineFragmentTemplate = "#L{{.LineNumber}}" case "gitea": - repo.CommitURLTemplate = u.String() + "/commit/{{.Version}}" + repo.CommitURLTemplate = urlJoinPath("commit", varVersion) // NOTE The `display=source` query parameter is required to disable file rendering. // Since line numbers are disabled in rendered files, you wouldn't be able to jump to // a line without `display=source`. This is supported since gitea 1.17.0. // When /src/{{.Version}} is used it will redirect to /src/commit/{{.Version}}, // but the query parameters are obmitted. - repo.FileURLTemplate = u.String() + "/src/commit/{{.Version}}/{{.Path}}?display=source" + repo.FileURLTemplate = urlJoinPath("src/commit", varVersion, varPath) + "?display=source" repo.LineFragmentTemplate = "#L{{.LineNumber}}" default: return fmt.Errorf("URL scheme type %q unknown", typ) diff --git a/gitindex/index_test.go b/gitindex/index_test.go index c18d7ed34..d38b49f03 100644 --- a/gitindex/index_test.go +++ b/gitindex/index_test.go @@ -18,11 +18,13 @@ import ( "bytes" "context" "fmt" + "net/url" "os" "os/exec" "path/filepath" "runtime" "sort" + "strings" "testing" "github.com/go-git/go-git/v5" @@ -770,7 +772,7 @@ func runScript(t *testing.T, cwd string, script string) { } } -func TestSetTemplate(t *testing.T) { +func TestSetTemplates_e2e(t *testing.T) { repositoryDir := t.TempDir() // setup: initialize the repository and all of its branches @@ -781,11 +783,100 @@ func TestSetTemplate(t *testing.T) { t.Fatalf("setTemplatesFromConfig: %v", err) } - if got, want := desc.FileURLTemplate, "https://github.com/sourcegraph/zoekt/blob/{{.Version}}/{{.Path}}"; got != want { + if got, want := desc.FileURLTemplate, `{{URLJoinPath "https://github.com/sourcegraph/zoekt" "blob" .Version .Path}}`; got != want { t.Errorf("got %q, want %q", got, want) } } +func TestSetTemplates(t *testing.T) { + base := "https://example.com/repo/name" + version := "VERSION" + path := "dir/name.txt" + lineNumber := 10 + cases := []struct { + typ string + commit string + file string + line string + }{{ + typ: "gitiles", + commit: "https://example.com/repo/name/%2B/VERSION", + file: "https://example.com/repo/name/%2B/VERSION/dir/name.txt", + line: "#10", + }, { + typ: "github", + commit: "https://example.com/repo/name/commit/VERSION", + file: "https://example.com/repo/name/blob/VERSION/dir/name.txt", + line: "#L10", + }, { + typ: "cgit", + commit: "https://example.com/repo/name/commit/?id=VERSION", + file: "https://example.com/repo/name/tree/dir/name.txt/?id=VERSION", + line: "#n10", + }, { + typ: "gitweb", + commit: "https://example.com/repo/name;a=commit;h=VERSION", + file: "https://example.com/repo/name;a=blob;f=dir/name.txt;hb=VERSION", + line: "#l10", + }, { + typ: "source.bazel.build", + commit: "https://example.com/repo/name/%2B/VERSION", + file: "https://example.com/repo/name/%2B/VERSION:dir/name.txt", + line: ";l=10", + }, { + typ: "bitbucket-server", + commit: "https://example.com/repo/name/commits/VERSION", + file: "https://example.com/repo/name/dir/name.txt?at=VERSION", + line: "#10", + }, { + typ: "gitlab", + commit: "https://example.com/repo/name/-/commit/VERSION", + file: "https://example.com/repo/name/-/blob/VERSION/dir/name.txt", + line: "#L10", + }, { + typ: "gitea", + commit: "https://example.com/repo/name/commit/VERSION", + file: "https://example.com/repo/name/src/commit/VERSION/dir/name.txt?display=source", + line: "#L10", + }} + + for _, tc := range cases { + t.Run(tc.typ, func(t *testing.T) { + assertOutput := func(templateText string, want string) { + t.Helper() + + tt, err := zoekt.ParseTemplate(templateText) + if err != nil { + t.Fatal(err) + } + + var sb strings.Builder + err = tt.Execute(&sb, map[string]any{ + "Version": version, + "Path": path, + "LineNumber": lineNumber, + }) + if err != nil { + t.Fatal(err) + } + if got := sb.String(); got != want { + t.Fatalf("want: %q\ngot: %q", want, got) + } + } + + var repo zoekt.Repository + u, _ := url.Parse(base) + err := setTemplates(&repo, u, tc.typ) + if err != nil { + t.Fatal(err) + } + assertOutput(repo.CommitURLTemplate, tc.commit) + assertOutput(repo.FileURLTemplate, tc.file) + assertOutput(repo.LineFragmentTemplate, tc.line) + }) + } +} + func BenchmarkPrepareNormalBuild(b *testing.B) { // NOTE: To run the benchmark, download a large repo (like github.com/chromium/chromium/) and change this to its path. repoDir := "/path/to/your/repo" diff --git a/indexbuilder.go b/indexbuilder.go index 026fd7e8d..4b9fdc385 100644 --- a/indexbuilder.go +++ b/indexbuilder.go @@ -19,11 +19,13 @@ import ( "encoding/binary" "fmt" "hash/crc64" - "html/template" "log" + "net/url" "os" "path/filepath" "sort" + "strings" + "text/template" "time" "unicode/utf8" @@ -216,13 +218,43 @@ type IndexBuilder struct { func (d *Repository) verify() error { for _, t := range []string{d.FileURLTemplate, d.LineFragmentTemplate, d.CommitURLTemplate} { - if _, err := template.New("").Parse(t); err != nil { + if _, err := ParseTemplate(t); err != nil { return err } } return nil } +func urlJoinPath(base string, elem ...string) string { + // golangs html/template always escapes "+" appearing in an HTML attribute + // [1]. We may even want to treat more characters, differently but this + // atleast makes it possible to visit URLs like [2]. + // + // We only do this to elem since base will normally be a hardcoded string. + // + // [1]: https://sourcegraph.com/github.com/golang/go@go1.23.2/-/blob/src/html/template/html.go?L71-80 + // [2]: https://github.com/apple/swift-system/blob/main/Sources/System/Util+StringArray.swift + elem = append([]string{}, elem...) // copy to mutate + for i := range elem { + elem[i] = strings.ReplaceAll(elem[i], "+", "%2B") + } + u, err := url.JoinPath(base, elem...) + if err != nil { + return "#!error: " + err.Error() + } + return u +} + +// ParseTemplate will parse the templates for FileURLTemplate, +// LineFragmentTemplate and CommitURLTemplate. +// +// It makes available the extra function UrlJoinPath. +func ParseTemplate(text string) (*template.Template, error) { + return template.New("").Funcs(template.FuncMap{ + "URLJoinPath": urlJoinPath, + }).Parse(text) +} + // ContentSize returns the number of content bytes so far ingested. func (b *IndexBuilder) ContentSize() uint32 { // Add the name too so we don't skip building index if we have diff --git a/web/e2e_test.go b/web/e2e_test.go index 5cbb63d13..2b234837e 100644 --- a/web/e2e_test.go +++ b/web/e2e_test.go @@ -86,16 +86,17 @@ func TestBasic(t *testing.T) { b, err := zoekt.NewIndexBuilder(&zoekt.Repository{ Name: "name", URL: "repo-url", - CommitURLTemplate: "{{.Version}}", - FileURLTemplate: "file-url", - LineFragmentTemplate: "#line", + CommitURLTemplate: `{{ URLJoinPath "https://github.com/org/repo/commit/" .Version}}`, + FileURLTemplate: `{{ URLJoinPath "https://github.com/org/repo/blob/" .Version .Path}}`, + LineFragmentTemplate: "#L{{.LineNumber}}", Branches: []zoekt.RepositoryBranch{{Name: "master", Version: "1234"}}, }) if err != nil { t.Fatalf("NewIndexBuilder: %v", err) } if err := b.Add(zoekt.Document{ - Name: "f2", + // use a name which requires correct escaping. https://github.com/sourcegraph/zoekt/issues/807 + Name: "foo/bar+baz", Content: []byte("to carry water in the no later bla"), // --------------0123456789012345678901234567890123 // --------------0 1 2 3 @@ -123,7 +124,7 @@ func TestBasic(t *testing.T) { for req, needles := range map[string][]string{ "/": {"from 1 repositories"}, "/search?q=water": { - "href=\"file-url#line", + `href="https://github.com/org/repo/blob/1234/foo/bar%2Bbaz"`, "carry water", }, "/search?q=r:": { @@ -131,7 +132,7 @@ func TestBasic(t *testing.T) { "Found 1 repositories", nowStr, "repo-url\">name", - "1 files (36B)", + "1 files (45B)", }, "/search?q=magic": { `value=magic`, diff --git a/web/server.go b/web/server.go index acf4dc59a..eb713ddfc 100644 --- a/web/server.go +++ b/web/server.go @@ -28,6 +28,7 @@ import ( "strconv" "strings" "sync" + texttemplate "text/template" "time" "github.com/grafana/regexp" @@ -116,8 +117,9 @@ type Server struct { startTime time.Time - templateMu sync.Mutex - templateCache map[string]*template.Template + templateMu sync.Mutex + templateCache map[string]*template.Template + textTemplateCache map[string]*texttemplate.Template lastStatsMu sync.Mutex lastStats *zoekt.RepoStats @@ -134,13 +136,30 @@ func (s *Server) getTemplate(str string) *template.Template { t, err := template.New("cache").Parse(str) if err != nil { - log.Printf("template parse error: %v", err) + log.Printf("html template parse error: %v", err) t = template.Must(template.New("empty").Parse("")) } s.templateCache[str] = t return t } +func (s *Server) getTextTemplate(str string) *texttemplate.Template { + s.templateMu.Lock() + defer s.templateMu.Unlock() + t := s.textTemplateCache[str] + if t != nil { + return t + } + + t, err := zoekt.ParseTemplate(str) + if err != nil { + log.Printf("text template parse error: %v", err) + t = texttemplate.Must(texttemplate.New("empty").Parse("")) + } + s.textTemplateCache[str] = t + return t +} + func NewMux(s *Server) (*http.ServeMux, error) { s.print = s.Top.Lookup("print") if s.print == nil { @@ -162,6 +181,7 @@ func NewMux(s *Server) (*http.ServeMux, error) { } s.templateCache = map[string]*template.Template{} + s.textTemplateCache = map[string]*texttemplate.Template{} s.startTime = time.Now() mux := http.NewServeMux() @@ -510,7 +530,7 @@ func (s *Server) serveListReposErr(q query.Q, qStr string, r *http.Request) (*Re } for _, r := range repos.Repos { - t := s.getTemplate(r.Repository.CommitURLTemplate) + t := s.getTextTemplate(r.Repository.CommitURLTemplate) repo := Repository{ Name: r.Repository.Name, diff --git a/web/snippets.go b/web/snippets.go index 3427cee06..4d2e60936 100644 --- a/web/snippets.go +++ b/web/snippets.go @@ -16,11 +16,11 @@ package web import ( "bytes" - "html/template" "log" "net/url" "strconv" "strings" + "text/template" "github.com/sourcegraph/zoekt" ) @@ -33,12 +33,12 @@ func (s *Server) formatResults(result *zoekt.SearchResult, query string, localPr if !localPrint { for repo, str := range result.RepoURLs { if str != "" { - templateMap[repo] = s.getTemplate(str) + templateMap[repo] = s.getTextTemplate(str) } } for repo, str := range result.LineFragments { if str != "" { - fragmentMap[repo] = s.getTemplate(str) + fragmentMap[repo] = s.getTextTemplate(str) } } }