diff --git a/tools/go.mod b/tools/go.mod
index fd3434f7e..ad4a6134d 100644
--- a/tools/go.mod
+++ b/tools/go.mod
@@ -3,8 +3,8 @@ module github.com/publicsuffix/list/tools
 go 1.21
 
 require (
+	github.com/creachadair/mds v0.15.0
 	github.com/google/go-cmp v0.6.0
+	golang.org/x/net v0.26.0
 	golang.org/x/text v0.16.0
 )
-
-require github.com/creachadair/mds v0.15.0 // indirect
diff --git a/tools/go.sum b/tools/go.sum
index a00b0d317..33f7c4f53 100644
--- a/tools/go.sum
+++ b/tools/go.sum
@@ -2,5 +2,7 @@ github.com/creachadair/mds v0.15.0 h1:St6HvUcrX1UJ517Zha6GKxVibGyRDBDtInOjuaaHOr
 github.com/creachadair/mds v0.15.0/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo=
 github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
+golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
 golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
 golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
diff --git a/tools/govalidate/govalidate.go b/tools/govalidate/govalidate.go
index f4cd6ead9..c0001ffb9 100644
--- a/tools/govalidate/govalidate.go
+++ b/tools/govalidate/govalidate.go
@@ -6,12 +6,15 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"strconv"
+	"strings"
 
 	"github.com/publicsuffix/list/tools/internal/parser"
 )
 
 func main() {
-	warnings := flag.Bool("with-warnings", false, "also print errors that were downgraded to warnings")
+	debugPrintTree := flag.Bool("debug-print", false, "print the parse tree for debugging")
+
 	flag.Usage = func() {
 		fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] pslfile\n", os.Args[0])
 		flag.PrintDefaults()
@@ -30,19 +33,91 @@ func main() {
 		os.Exit(1)
 	}
 
-	psl := parser.Parse(bs)
+	psl, errs := parser.Parse(bs)
+
+	if *debugPrintTree {
+		debugPrint(psl)
+	}
 
-	for _, err := range psl.Errors {
+	for _, err := range errs {
 		fmt.Println(err)
 	}
-	if *warnings {
-		for _, err := range psl.Warnings {
-			fmt.Println(err, "(warning)")
-		}
+
+	verrs := parser.ValidateOffline(psl)
+	for _, err := range verrs {
+		fmt.Println(err)
 	}
 
-	if len(psl.Errors) > 0 {
+
+	if total := len(errs) + len(verrs); total > 0 {
+		fmt.Printf("\nFile has %d errors.\n", total)
 		os.Exit(1)
 	} else {
-		fmt.Printf("%q seems to be a valid PSL file.\n", file)
+		fmt.Println("\nFile is valid.")
+	}
+}
+
+// debugPrint prints a PSL syntax tree in a private text format that
+// is subject to change.
+func debugPrint(p *parser.List) {
+	fmt.Println("List {")
+	for _, b := range p.Blocks {
+		debugPrintRec(b, "  ")
+	}
+	fmt.Println("}")
+}
+
+func debugPrintRec(b parser.Block, indent string) {
+	nextIndent := indent + "  "
+	f := func(msg string, args ...any) {
+		fmt.Printf(indent+msg+"\n", args...)
+ } + src := b.SrcRange() + loc := fmt.Sprintf("[%d:%d]", src.FirstLine, src.LastLine) + if src.FirstLine+1 == src.LastLine { + loc = strconv.Itoa(src.FirstLine) + } + + switch v := b.(type) { + case *parser.Blank: + f("Blank(%s)", loc) + case *parser.Comment: + f("Comment(%s) {", loc) + for _, t := range v.Text { + f(" %q,", t) + } + f("}") + case *parser.Section: + f("Section(%s, %q) {", loc, v.Name) + for _, b := range v.Blocks { + debugPrintRec(b, nextIndent) + } + f("}") + case *parser.Suffixes: + items := []string{loc} + if v.Entity != "" { + items = append(items, fmt.Sprintf("name=%q", v.Entity)) + } + if v.URL != nil { + items = append(items, fmt.Sprintf("url=%q", v.URL)) + } + if v.Submitter != nil { + items = append(items, fmt.Sprintf("contact=%q", v.Submitter)) + } + + f("SuffixBlock(%s) {", strings.Join(items, fmt.Sprintf(",\n%s ", indent))) + for _, b := range v.Blocks { + debugPrintRec(b, nextIndent) + } + f("}") + case *parser.Suffix: + f("Suffix(%s, %q)", loc, strings.Join(v.Labels, ".")) + case *parser.Wildcard: + if len(v.Exceptions) > 0 { + f("Wildcard(%s, %q, except=%v)", loc, strings.Join(v.Labels, "."), v.Exceptions) + } else { + f("Wildcard(%s, %q)", loc, strings.Join(v.Labels, ".")) + } + default: + panic("unknown block type") } } diff --git a/tools/internal/parser/errors.go b/tools/internal/parser/errors.go index 544201388..05bc25b5a 100644 --- a/tools/internal/parser/errors.go +++ b/tools/internal/parser/errors.go @@ -5,158 +5,148 @@ import ( "strings" ) -// InvalidEncodingError reports that the input is encoded with +// ErrInvalidEncoding reports that the input is encoded with // something other than UTF-8. -type InvalidEncodingError struct { +type ErrInvalidEncoding struct { Encoding string } -func (e InvalidEncodingError) Error() string { - return fmt.Sprintf("file uses invalid character encoding %s", e.Encoding) +func (e ErrInvalidEncoding) Error() string { + return fmt.Sprintf("invalid character encoding %s", e.Encoding) } -// UTF8BOMError reports that the input has an unnecessary UTF-8 byte +// ErrUTF8BOM reports that the input has an unnecessary UTF-8 byte // order mark (BOM) at the start. -type UTF8BOMError struct{} +type ErrUTF8BOM struct{} -func (e UTF8BOMError) Error() string { - return "file starts with an unnecessary UTF-8 BOM (byte order mark)" -} +func (e ErrUTF8BOM) Error() string { return "file has a UTF-8 byte order mark (BOM)" } -// InvalidUTF8Error reports that a line contains bytes that are not +// ErrInvalidUTF8 reports that a line contains bytes that are not // valid UTF-8. -type InvalidUTF8Error struct { - Line Source +type ErrInvalidUTF8 struct { + SourceRange } -func (e InvalidUTF8Error) Error() string { - return fmt.Sprintf("found non UTF-8 bytes at %s", e.Line.LocationString()) +func (e ErrInvalidUTF8) Error() string { + return fmt.Sprintf("%s: invalid UTF-8 bytes", e.SourceRange.LocationString()) } -// DOSNewlineError reports that a line has a DOS style line ending. -type DOSNewlineError struct { - Line Source +// ErrDOSNewline reports that a line has a DOS style line ending. +type ErrDOSNewline struct { + SourceRange } -func (e DOSNewlineError) Error() string { - return fmt.Sprintf("%s has a DOS line ending (\\r\\n instead of just \\n)", e.Line.LocationString()) +func (e ErrDOSNewline) Error() string { + return fmt.Sprintf("%s: found DOS line ending (\\r\\n instead of just \\n)", e.SourceRange.LocationString()) } -// TrailingWhitespaceError reports that a line has trailing whitespace. 
-type TrailingWhitespaceError struct {
-	Line Source
+// ErrTrailingWhitespace reports that a line has trailing whitespace.
+type ErrTrailingWhitespace struct {
+	SourceRange
 }
 
-func (e TrailingWhitespaceError) Error() string {
-	return fmt.Sprintf("%s has trailing whitespace", e.Line.LocationString())
+func (e ErrTrailingWhitespace) Error() string {
+	return fmt.Sprintf("%s: trailing whitespace", e.SourceRange.LocationString())
 }
 
-// LeadingWhitespaceError reports that a line has leading whitespace.
-type LeadingWhitespaceError struct {
-	Line Source
+// ErrLeadingWhitespace reports that a line has leading whitespace.
+type ErrLeadingWhitespace struct {
+	SourceRange
 }
 
-func (e LeadingWhitespaceError) Error() string {
-	return fmt.Sprintf("%s has leading whitespace", e.Line.LocationString())
+func (e ErrLeadingWhitespace) Error() string {
+	return fmt.Sprintf("%s: leading whitespace", e.SourceRange.LocationString())
 }
 
-// SectionInSuffixBlock reports that a comment within a block of
-// suffixes contains a section delimiter.
-type SectionInSuffixBlock struct {
-	Line Source
+// ErrSectionInSuffixBlock reports that a comment within a suffix
+// block contains a section delimiter.
+type ErrSectionInSuffixBlock struct {
+	SourceRange
 }
 
-func (e SectionInSuffixBlock) Error() string {
-	return fmt.Sprintf("section delimiters are not allowed in suffix block comment at %s", e.Line.LocationString())
+func (e ErrSectionInSuffixBlock) Error() string {
+	return fmt.Sprintf("%s: section delimiter not allowed in suffix block comment", e.SourceRange.LocationString())
 }
 
-// UnclosedSectionError reports that a file section was not closed
+// ErrUnclosedSection reports that a file section was not closed
 // properly before EOF.
-type UnclosedSectionError struct {
-	Start *StartSection // The unpaired section start
+type ErrUnclosedSection struct {
+	Section *Section
 }
 
-func (e UnclosedSectionError) Error() string {
-	return fmt.Sprintf("section %q started at %s, but is never closed", e.Start.Name, e.Start.LocationString())
+func (e ErrUnclosedSection) Error() string {
+	return fmt.Sprintf("%s: section %q is missing its closing marker", e.Section.SourceRange.LocationString(), e.Section.Name)
 }
 
-// NestedSectionError reports that a file section is being started
-// while already within a section, which the PSL format does not
-// allow.
-type NestedSectionError struct {
-	Outer *StartSection
-	Inner *StartSection
+// ErrNestedSection reports that a file section is being started while
+// already within a section.
+type ErrNestedSection struct {
+	SourceRange
+	Name    string
+	Section *Section
 }
 
-func (e NestedSectionError) Error() string {
-	return fmt.Sprintf("new section %q started at %s while still in section %q (started at %s)", e.Inner.Name, e.Inner.LocationString(), e.Outer.Name, e.Outer.LocationString())
+func (e ErrNestedSection) Error() string {
+	return fmt.Sprintf("%s: section %q is nested inside section %q (%s)", e.SourceRange.LocationString(), e.Name, e.Section.Name, e.Section.SourceRange.LocationString())
 }
 
-// UnstartedSectionError reports that a file section end marker was
-// found without a corresponding start.
-type UnstartedSectionError struct {
-	End *EndSection
+// ErrUnstartedSection reports that a section end marker was found
+// without a corresponding start.
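+//
+// For example, a "// ===END ICANN DOMAINS===" line that appears
+// before any matching section start reports this error.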
+type ErrUnstartedSection struct {
+	SourceRange
+	Name string
 }
 
-func (e UnstartedSectionError) Error() string {
-	return fmt.Sprintf("section %q closed at %s but was not started", e.End.Name, e.End.LocationString())
+func (e ErrUnstartedSection) Error() string {
+	return fmt.Sprintf("%s: end marker for non-existent section %q", e.SourceRange.LocationString(), e.Name)
 }
 
-// MismatchedSectionError reports that a file section was started
+// ErrMismatchedSection reports that a file section was started
 // under one name but ended under another.
-type MismatchedSectionError struct {
-	Start *StartSection
-	End   *EndSection
+type ErrMismatchedSection struct {
+	SourceRange
+	EndName string
+	Section *Section
 }
 
-func (e MismatchedSectionError) Error() string {
-	return fmt.Sprintf("section %q closed at %s while in section %q (started at %s)", e.End.Name, e.End.LocationString(), e.Start.Name, e.Start.LocationString())
+func (e ErrMismatchedSection) Error() string {
+	return fmt.Sprintf("%s: section %q (%s) closed with wrong name %q", e.SourceRange.LocationString(), e.Section.Name, e.Section.SourceRange.LocationString(), e.EndName)
 }
 
-// UnknownSectionMarker reports that a line looks like a file section
+// ErrUnknownSectionMarker reports that a line looks like a file section
 // marker (e.g. "===BEGIN ICANN DOMAINS==="), but is not one of the
 // recognized kinds of marker.
-type UnknownSectionMarker struct {
-	Line Source
-}
-
-func (e UnknownSectionMarker) Error() string {
-	return fmt.Sprintf("unknown kind of section marker %q at %s", e.Line.Text(), e.Line.LocationString())
-}
-
-// UnterminatedSectionMarker reports that a section marker is missing
-// the required trailing "===", e.g. "===BEGIN ICANN DOMAINS".
-type UnterminatedSectionMarker struct {
-	Line Source
+type ErrUnknownSectionMarker struct {
+	SourceRange
 }
 
-func (e UnterminatedSectionMarker) Error() string {
-	return fmt.Sprintf(`section marker %q at %s is missing trailing "==="`, e.Line.Text(), e.Line.LocationString())
+func (e ErrUnknownSectionMarker) Error() string {
+	return fmt.Sprintf("%s: unknown kind of section marker", e.SourceRange.LocationString())
 }
 
-// MissingEntityName reports that a block of suffixes does not have a
+// ErrMissingEntityName reports that a block of suffixes does not have a
 // parseable owner name in its header comment.
-type MissingEntityName struct {
+type ErrMissingEntityName struct {
 	Suffixes *Suffixes
 }
 
-func (e MissingEntityName) Error() string {
-	return fmt.Sprintf("could not find entity name for %s at %s", e.Suffixes.shortName(), e.Suffixes.LocationString())
+func (e ErrMissingEntityName) Error() string {
+	return fmt.Sprintf("%s: suffix block has no owner name", e.Suffixes.SourceRange.LocationString())
 }
 
-// MissingEntityEmail reports that a block of suffixes does not have a
+// ErrMissingEntityEmail reports that a block of suffixes does not have a
 // parseable contact email address in its header comment.
-type MissingEntityEmail struct {
+type ErrMissingEntityEmail struct {
 	Suffixes *Suffixes
 }
 
-func (e MissingEntityEmail) Error() string {
-	return fmt.Sprintf("could not find a contact email for %s at %s", e.Suffixes.shortName(), e.Suffixes.LocationString())
+func (e ErrMissingEntityEmail) Error() string {
+	return fmt.Sprintf("%s: suffix block has no contact email", e.Suffixes.SourceRange.LocationString())
 }
 
-// SuffixBlocksInWrongPlace reports that some suffix blocks of the
+// ErrSuffixBlocksInWrongPlace reports that some suffix blocks of the
 // private section are in the wrong sort order.
-type SuffixBlocksInWrongPlace struct {
+type ErrSuffixBlocksInWrongPlace struct {
 	// EditScript is a list of suffix block movements to put the
 	// private domains section in the correct order. Note that each
 	// step assumes that the previous steps have already been done.
@@ -174,7 +164,7 @@ type MoveSuffixBlock struct {
 	InsertAfter string
 }
 
-func (e SuffixBlocksInWrongPlace) Error() string {
+func (e ErrSuffixBlocksInWrongPlace) Error() string {
 	if len(e.EditScript) == 1 {
 		after := e.EditScript[0].InsertAfter
 		if after == "" {
@@ -198,3 +188,15 @@ func (e SuffixBlocksInWrongPlace) Error() string {
 
 	return ret.String()
 }
+
+// ErrInvalidSuffix reports that a suffix is not a valid PSL entry.
+type ErrInvalidSuffix struct {
+	SourceRange
+	Suffix string
+	Err    error
+}
+
+func (e ErrInvalidSuffix) Error() string {
+	return fmt.Sprintf("%s: invalid suffix %q: %v", e.SourceRange.LocationString(), e.Suffix, e.Err)
+}
diff --git a/tools/internal/parser/exceptions.go b/tools/internal/parser/exceptions.go
index 63c3fbcef..18adf3da6 100644
--- a/tools/internal/parser/exceptions.go
+++ b/tools/internal/parser/exceptions.go
@@ -1,1003 +1,123 @@
 package parser
 
-import "strings"
+import "slices"
 
-// Exceptions are blocks of the PSL that would fail current validation
+// Exceptions are parts of the PSL that would fail current validation
 // and stylistic requirements, but are exempted due to predating those
 // rules.
 //
-// These exceptions are deliberately built to be brittle: editing a
-// block revokes its exemptions and requires the block to pass all
-// modern validations (or the exceptions below need to be
-// updated). This hopefully ratchets the PSL to always become more
-// conformant with current policy, while not requiring that all
-// existing lint be fixed immediately.
-//
 // See the bottom of this file for the exceptions themselves.
 
-// downgradeToWarning reports whether e is a legacy exception to
-// normal parsing and validation rules, and should be reported as a
-// warning rather than a validation error.
-func downgradeToWarning(e error) bool {
-	switch v := e.(type) {
-	case MissingEntityEmail:
-		return sourceIsExempted(missingEmail, v.Suffixes.Text())
-	}
-	return false
+// exemptFromContactInfo reports whether the block owned by entity is
+// exempt from the requirement to have a contact email address.
+func exemptFromContactInfo(entity string) bool {
+	return slices.Contains(missingEmail, entity)
 }
 
 // exemptFromSorting reports whether the block owned by entity is
 // exempt from the sorting requirement that normally applies in the
 // private domains section.
-func exemptFromSorting(source Source) bool {
-	return sourceIsExempted(incorrectSort, source.Text())
-}
-
-func sourceIsExempted(exceptions []string, source string) bool {
-	for _, exc := range exceptions {
-		if exc == source {
-			return true
-		}
-	}
-	return false
-}
-
-func lines(lines ...string) string {
-	return strings.Join(lines, "\n")
+func exemptFromSorting(entity string) bool {
+	return slices.Contains(incorrectSort, entity)
 }
 
-// missingEmail are source code blocks in the private domains section
-// that are allowed to lack email contact information.
+// missingEmail are entities in the private domains section that are
+// allowed to lack email contact information.
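+// Entries are compared by exact string match against the entity name
+// given to exemptFromContactInfo, e.g. "DynDNS.com".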
var missingEmail = []string{ - lines( - "// 611coin : https://611project.org/", - "611.to", - ), - lines( - "// c.la : http://www.c.la/", - "c.la", - ), - lines( - "// co.ca : http://registry.co.ca/", - "co.ca", - ), - lines( - "// DynDNS.com : http://www.dyndns.com/services/dns/dyndns/", - "dyndns.biz", - "for-better.biz", - "for-more.biz", - "for-some.biz", - "for-the.biz", - "selfip.biz", - "webhop.biz", - "ftpaccess.cc", - "game-server.cc", - "myphotos.cc", - "scrapping.cc", - "blogdns.com", - "cechire.com", - "dnsalias.com", - "dnsdojo.com", - "doesntexist.com", - "dontexist.com", - "doomdns.com", - "dyn-o-saur.com", - "dynalias.com", - "dyndns-at-home.com", - "dyndns-at-work.com", - "dyndns-blog.com", - "dyndns-free.com", - "dyndns-home.com", - "dyndns-ip.com", - "dyndns-mail.com", - "dyndns-office.com", - "dyndns-pics.com", - "dyndns-remote.com", - "dyndns-server.com", - "dyndns-web.com", - "dyndns-wiki.com", - "dyndns-work.com", - "est-a-la-maison.com", - "est-a-la-masion.com", - "est-le-patron.com", - "est-mon-blogueur.com", - "from-ak.com", - "from-al.com", - "from-ar.com", - "from-ca.com", - "from-ct.com", - "from-dc.com", - "from-de.com", - "from-fl.com", - "from-ga.com", - "from-hi.com", - "from-ia.com", - "from-id.com", - "from-il.com", - "from-in.com", - "from-ks.com", - "from-ky.com", - "from-ma.com", - "from-md.com", - "from-mi.com", - "from-mn.com", - "from-mo.com", - "from-ms.com", - "from-mt.com", - "from-nc.com", - "from-nd.com", - "from-ne.com", - "from-nh.com", - "from-nj.com", - "from-nm.com", - "from-nv.com", - "from-oh.com", - "from-ok.com", - "from-or.com", - "from-pa.com", - "from-pr.com", - "from-ri.com", - "from-sc.com", - "from-sd.com", - "from-tn.com", - "from-tx.com", - "from-ut.com", - "from-va.com", - "from-vt.com", - "from-wa.com", - "from-wi.com", - "from-wv.com", - "from-wy.com", - "getmyip.com", - "gotdns.com", - "hobby-site.com", - "homelinux.com", - "homeunix.com", - "iamallama.com", - "is-a-anarchist.com", - "is-a-blogger.com", - "is-a-bookkeeper.com", - "is-a-bulls-fan.com", - "is-a-caterer.com", - "is-a-chef.com", - "is-a-conservative.com", - "is-a-cpa.com", - "is-a-cubicle-slave.com", - "is-a-democrat.com", - "is-a-designer.com", - "is-a-doctor.com", - "is-a-financialadvisor.com", - "is-a-geek.com", - "is-a-green.com", - "is-a-guru.com", - "is-a-hard-worker.com", - "is-a-hunter.com", - "is-a-landscaper.com", - "is-a-lawyer.com", - "is-a-liberal.com", - "is-a-libertarian.com", - "is-a-llama.com", - "is-a-musician.com", - "is-a-nascarfan.com", - "is-a-nurse.com", - "is-a-painter.com", - "is-a-personaltrainer.com", - "is-a-photographer.com", - "is-a-player.com", - "is-a-republican.com", - "is-a-rockstar.com", - "is-a-socialist.com", - "is-a-student.com", - "is-a-teacher.com", - "is-a-techie.com", - "is-a-therapist.com", - "is-an-accountant.com", - "is-an-actor.com", - "is-an-actress.com", - "is-an-anarchist.com", - "is-an-artist.com", - "is-an-engineer.com", - "is-an-entertainer.com", - "is-certified.com", - "is-gone.com", - "is-into-anime.com", - "is-into-cars.com", - "is-into-cartoons.com", - "is-into-games.com", - "is-leet.com", - "is-not-certified.com", - "is-slick.com", - "is-uberleet.com", - "is-with-theband.com", - "isa-geek.com", - "isa-hockeynut.com", - "issmarterthanyou.com", - "likes-pie.com", - "likescandy.com", - "neat-url.com", - "saves-the-whales.com", - "selfip.com", - "sells-for-less.com", - "sells-for-u.com", - "servebbs.com", - "simple-url.com", - "space-to-rent.com", - "teaches-yoga.com", - "writesthisblog.com", - "ath.cx", - 
"fuettertdasnetz.de", - "isteingeek.de", - "istmein.de", - "lebtimnetz.de", - "leitungsen.de", - "traeumtgerade.de", - "barrel-of-knowledge.info", - "barrell-of-knowledge.info", - "dyndns.info", - "for-our.info", - "groks-the.info", - "groks-this.info", - "here-for-more.info", - "knowsitall.info", - "selfip.info", - "webhop.info", - "forgot.her.name", - "forgot.his.name", - "at-band-camp.net", - "blogdns.net", - "broke-it.net", - "buyshouses.net", - "dnsalias.net", - "dnsdojo.net", - "does-it.net", - "dontexist.net", - "dynalias.net", - "dynathome.net", - "endofinternet.net", - "from-az.net", - "from-co.net", - "from-la.net", - "from-ny.net", - "gets-it.net", - "ham-radio-op.net", - "homeftp.net", - "homeip.net", - "homelinux.net", - "homeunix.net", - "in-the-band.net", - "is-a-chef.net", - "is-a-geek.net", - "isa-geek.net", - "kicks-ass.net", - "office-on-the.net", - "podzone.net", - "scrapper-site.net", - "selfip.net", - "sells-it.net", - "servebbs.net", - "serveftp.net", - "thruhere.net", - "webhop.net", - "merseine.nu", - "mine.nu", - "shacknet.nu", - "blogdns.org", - "blogsite.org", - "boldlygoingnowhere.org", - "dnsalias.org", - "dnsdojo.org", - "doesntexist.org", - "dontexist.org", - "doomdns.org", - "dvrdns.org", - "dynalias.org", - "dyndns.org", - "go.dyndns.org", - "home.dyndns.org", - "endofinternet.org", - "endoftheinternet.org", - "from-me.org", - "game-host.org", - "gotdns.org", - "hobby-site.org", - "homedns.org", - "homeftp.org", - "homelinux.org", - "homeunix.org", - "is-a-bruinsfan.org", - "is-a-candidate.org", - "is-a-celticsfan.org", - "is-a-chef.org", - "is-a-geek.org", - "is-a-knight.org", - "is-a-linux-user.org", - "is-a-patsfan.org", - "is-a-soxfan.org", - "is-found.org", - "is-lost.org", - "is-saved.org", - "is-very-bad.org", - "is-very-evil.org", - "is-very-good.org", - "is-very-nice.org", - "is-very-sweet.org", - "isa-geek.org", - "kicks-ass.org", - "misconfused.org", - "podzone.org", - "readmyblog.org", - "selfip.org", - "sellsyourhome.org", - "servebbs.org", - "serveftp.org", - "servegame.org", - "stuff-4-sale.org", - "webhop.org", - "better-than.tv", - "dyndns.tv", - "on-the-web.tv", - "worse-than.tv", - "is-by.us", - "land-4-sale.us", - "stuff-4-sale.us", - "dyndns.ws", - "mypets.ws", - ), - lines( - "// Hashbang : https://hashbang.sh", - "hashbang.sh", - ), - lines( - "// HostyHosting (https://hostyhosting.com)", - "hostyhosting.io", - ), - lines( - "// info.at : http://www.info.at/", - "biz.at", - "info.at", - ), - lines( - "// .KRD : http://nic.krd/data/krd/Registration%20Policy.pdf", - "co.krd", - "edu.krd", - ), - lines( - "// Michau Enterprises Limited : http://www.co.pl/", - "co.pl", - ), - lines( - "// Nicolaus Copernicus University in Torun - MSK TORMAN (https://www.man.torun.pl)", - "torun.pl", - ), - lines( - "// TASK geographical domains (https://www.task.gda.pl/uslugi/dns)", - "gda.pl", - "gdansk.pl", - "gdynia.pl", - "med.pl", - "sopot.pl", - ), - lines( - "// CoDNS B.V.", - "co.nl", - "co.no", - ), - lines( - "// .pl domains (grandfathered)", - "art.pl", - "gliwice.pl", - "krakow.pl", - "poznan.pl", - "wroc.pl", - "zakopane.pl", - ), - lines( - "// QA2", - "// Submitted by Daniel Dent (https://www.danieldent.com/)", - "qa2.com", - ), + "611coin", + "c.la", + "co.ca", + "DynDNS.com", + "Hashbang", + "HostyHosting", + "info.at", + ".KRD", + "Michau Enterprises Limited", + "Nicolaus Copernicus University in Torun - MSK TORMAN", + "TASK geographical domains", + "CoDNS B.V.", + ".pl domains (grandfathered)", + "QA2", } -// incorrectSort are source 
code blocks in the private domains section -// that are allowed to be in the wrong sort order. +// incorrectSort are entities in the private domains section that are +// allowed to be in the wrong sort order. var incorrectSort = []string{ - lines( - "// AAA workspace : https://aaa.vodka", - "// Submitted by Kirill Rezraf ", - "aaa.vodka", - ), - lines( - "// University of Banja Luka : https://unibl.org", - "// Domains for Republic of Srpska administrative entity.", - "// Submitted by Marko Ivanovic ", - "rs.ba", - ), - lines( - "// University of Bielsko-Biala regional domain: http://dns.bielsko.pl/", - "// Submitted by Marcin ", - "bielsko.pl", - ), - lines( - "// No longer operated by CentralNic, these entries should be adopted and/or removed by current operators", - "// Submitted by Gavin Brown ", - "ar.com", - "hu.com", - "kr.com", - "no.com", - "qc.com", - "uy.com", - ), - lines( - "// Africa.com Web Solutions Ltd : https://registry.africa.com", - "// Submitted by Gavin Brown ", - "africa.com", - ), - lines( - "// iDOT Services Limited : http://www.domain.gr.com", - "// Submitted by Gavin Brown ", - "gr.com", - ), - lines( - "// Radix FZC : http://domains.in.net", - "// Submitted by Gavin Brown ", - "web.in", - "in.net", - ), - lines( - "// US REGISTRY LLC : http://us.org", - "// Submitted by Gavin Brown ", - "us.org", - ), - lines( - "// co.com Registry, LLC : https://registry.co.com", - "// Submitted by Gavin Brown ", - "co.com", - ), - lines( - "// Roar Domains LLC : https://roar.basketball/", - "// Submitted by Gavin Brown ", - "aus.basketball", - "nz.basketball", - ), - lines( - "// BRS Media : https://brsmedia.com/", - "// Submitted by Gavin Brown ", - "radio.am", - "radio.fm", - ), - lines( - "// c.la : http://www.c.la/", - "c.la", - ), - lines( - "// Clever Cloud : https://www.clever-cloud.com/", - "// Submitted by Quentin Adam ", - "cleverapps.cc", - "*.services.clever-cloud.com", - "cleverapps.io", - "cleverapps.tech", - ), - lines( - "// co.ca : http://registry.co.ca/", - "co.ca", - ), - lines( - "// Co & Co : https://co-co.nl/", - "// Submitted by Govert Versluis ", - "*.otap.co", - ), - lines( - "// i-registry s.r.o. : http://www.i-registry.cz/", - "// Submitted by Martin Semrad ", - "co.cz", - ), - lines( - "// CDN77.com : http://www.cdn77.com", - "// Submitted by Jan Krpes ", - "cdn77-storage.com", - "rsc.contentproxy9.cz", - "r.cdn77.net", - "cdn77-ssl.net", - "c.cdn77.org", - "rsc.cdn77.org", - "ssl.origin.cdn77-secure.org", - ), - lines( - "// Cloud DNS Ltd : http://www.cloudns.net", - "// Submitted by Aleksander Hristov & Boyan Peychev ", - "cloudns.asia", - "cloudns.be", - "cloudns.biz", - "cloudns.cc", - "cloudns.ch", - "cloudns.cl", - "cloudns.club", - "dnsabr.com", - "cloudns.cx", - "cloudns.eu", - "cloudns.in", - "cloudns.info", - "dns-cloud.net", - "dns-dynamic.net", - "cloudns.nz", - "cloudns.org", - "cloudns.ph", - "cloudns.pro", - "cloudns.pw", - "cloudns.us", - ), - lines( - "// Daplie, Inc : https://daplie.com", - "// Submitted by AJ ONeal ", - "daplie.me", - "localhost.daplie.me", - ), - lines( - "// Datto, Inc. 
: https://www.datto.com/", - "// Submitted by Philipp Heckel ", - "dattolocal.com", - "dattorelay.com", - "dattoweb.com", - "mydatto.com", - "dattolocal.net", - "mydatto.net", - ), - lines( - "// Bip : https://bip.sh", - "// Submitted by Joel Kennedy ", - "bip.sh", - ), - lines( - "// bitbridge.net : Submitted by Craig Welch, abeliidev@gmail.com", - "bitbridge.net", - ), - lines( - "// ddnss.de : https://www.ddnss.de/", - "// Submitted by Robert Niedziela ", - "ddnss.de", - "dyn.ddnss.de", - "dyndns.ddnss.de", - "dyn-ip24.de", - "dyndns1.de", - "home-webserver.de", - "dyn.home-webserver.de", - "myhome-server.de", - "ddnss.org", - ), - lines( - "// Definima : http://www.definima.com/", - "// Submitted by Maxence Bitterli ", - "definima.io", - "definima.net", - ), - lines( - "// DigitalOcean App Platform : https://www.digitalocean.com/products/app-platform/", - "// Submitted by Braxton Huggins ", - "ondigitalocean.app", - ), - lines( - "// DigitalOcean Spaces : https://www.digitalocean.com/products/spaces/", - "// Submitted by Robin H. Johnson ", - "*.digitaloceanspaces.com", - ), - lines( - "// DigitalPlat : https://www.digitalplat.org/", - "// Submitted by Edward Hsing ", - "us.kg", - ), - lines( - "// dnstrace.pro : https://dnstrace.pro/", - "// Submitted by Chris Partridge ", - "bci.dnstrace.pro", - ), - lines( - "// ECG Robotics, Inc: https://ecgrobotics.org", - "// Submitted by ", - "onred.one", - "staging.onred.one", - ), - lines( - "// Fedora : https://fedoraproject.org/", - "// submitted by Patrick Uiterwijk ", - "fedorainfracloud.org", - "fedorapeople.org", - "cloud.fedoraproject.org", - "app.os.fedoraproject.org", - "app.os.stg.fedoraproject.org", - ), - lines( - "// Frusky MEDIA&PR : https://www.frusky.de", - "// Submitted by Victor Pupynin ", - "*.frusky.de", - ), - lines( - "// RavPage : https://www.ravpage.co.il", - "// Submitted by Roni Horowitz ", - "ravpage.co.il", - ), - lines( - "// CDDO : https://www.gov.uk/guidance/get-an-api-domain-on-govuk", - "// Submitted by Jamie Tanna ", - "api.gov.uk", - ), - lines( - "// GOV.UK Platform as a Service : https://www.cloud.service.gov.uk/", - "// Submitted by Tom Whitwell ", - "cloudapps.digital", - "london.cloudapps.digital", - ), - lines( - "// GOV.UK Pay : https://www.payments.service.gov.uk/", - "// Submitted by Richard Baker ", - "pymnt.uk", - ), - lines( - "// Helio Networks : https://heliohost.org", - "// Submitted by Ben Frede ", - "helioho.st", - "heliohost.us", - ), - lines( - "// Häkkinen.fi", - "// Submitted by Eero Häkkinen ", - "häkkinen.fi", - ), - lines( - "// is-a.dev : https://www.is-a.dev", - "// Submitted by William Harrison ", - "is-a.dev", - ), - lines( - "// I-O DATA DEVICE, INC. : http://www.iodata.com/", - "// Submitted by Yuji Minagawa ", - "iobb.net", - ), - lines( - "// KUROKU LTD : https://kuroku.ltd/", - "// Submitted by DisposaBoy ", - "oya.to", - ), - lines( - "// Katholieke Universiteit Leuven: https://www.kuleuven.be", - "// Submitted by Abuse KU Leuven ", - "ezproxy.kuleuven.be", - "kuleuven.cloud", - ), - lines( - "// .KRD : http://nic.krd/data/krd/Registration%20Policy.pdf", - "co.krd", - "edu.krd", - ), - lines( - "// Lokalized : https://lokalized.nl", - "// Submitted by Noah Taheij ", - "servers.run", - ), - lines( - "// May First - People Link : https://mayfirst.org/", - "// Submitted by Jamie McClelland ", - "mayfirst.info", - "mayfirst.org", - ), - lines( - "// mcpe.me : https://mcpe.me", - "// Submitted by Noa Heyl ", - "mcpe.me", - ), - lines( - "// NFSN, Inc. 
: https://www.NearlyFreeSpeech.NET/", - "// Submitted by Jeff Wheelhouse ", - "nfshost.com", - ), - lines( - "// NFT.Storage : https://nft.storage/", - "// Submitted by Vasco Santos or ", - "ipfs.nftstorage.link", - ), - lines( - "// No-IP.com : https://noip.com/", - "// Submitted by Deven Reza ", - "mmafan.biz", - "myftp.biz", - "no-ip.biz", - "no-ip.ca", - "fantasyleague.cc", - "gotdns.ch", - "3utilities.com", - "blogsyte.com", - "ciscofreak.com", - "damnserver.com", - "ddnsking.com", - "ditchyourip.com", - "dnsiskinky.com", - "dynns.com", - "geekgalaxy.com", - "health-carereform.com", - "homesecuritymac.com", - "homesecuritypc.com", - "myactivedirectory.com", - "mysecuritycamera.com", - "myvnc.com", - "net-freaks.com", - "onthewifi.com", - "point2this.com", - "quicksytes.com", - "securitytactics.com", - "servebeer.com", - "servecounterstrike.com", - "serveexchange.com", - "serveftp.com", - "servegame.com", - "servehalflife.com", - "servehttp.com", - "servehumour.com", - "serveirc.com", - "servemp3.com", - "servep2p.com", - "servepics.com", - "servequake.com", - "servesarcasm.com", - "stufftoread.com", - "unusualperson.com", - "workisboring.com", - "dvrcam.info", - "ilovecollege.info", - "no-ip.info", - "brasilia.me", - "ddns.me", - "dnsfor.me", - "hopto.me", - "loginto.me", - "noip.me", - "webhop.me", - "bounceme.net", - "ddns.net", - "eating-organic.net", - "mydissent.net", - "myeffect.net", - "mymediapc.net", - "mypsx.net", - "mysecuritycamera.net", - "nhlfan.net", - "no-ip.net", - "pgafan.net", - "privatizehealthinsurance.net", - "redirectme.net", - "serveblog.net", - "serveminecraft.net", - "sytes.net", - "cable-modem.org", - "collegefan.org", - "couchpotatofries.org", - "hopto.org", - "mlbfan.org", - "myftp.org", - "mysecuritycamera.org", - "nflfan.org", - "no-ip.org", - "read-books.org", - "ufcfan.org", - "zapto.org", - "no-ip.co.uk", - "golffan.us", - "noip.us", - "pointto.us", - ), - lines( - "// NodeArt : https://nodeart.io", - "// Submitted by Konstantin Nosov ", - "stage.nodeart.io", - ), - lines( - "// One.com: https://www.one.com/", - "// Submitted by Jacob Bunk Nielsen ", - "123webseite.at", - "123website.be", - "simplesite.com.br", - "123website.ch", - "simplesite.com", - "123webseite.de", - "123hjemmeside.dk", - "123miweb.es", - "123kotisivu.fi", - "123siteweb.fr", - "simplesite.gr", - "123homepage.it", - "123website.lu", - "123website.nl", - "123hjemmeside.no", - "service.one", - "simplesite.pl", - "123paginaweb.pt", - "123minsida.se", - ), - lines( - "// .pl domains (grandfathered)", - "art.pl", - "gliwice.pl", - "krakow.pl", - "poznan.pl", - "wroc.pl", - "zakopane.pl", - ), - lines( - "// Pantheon Systems, Inc. 
: https://pantheon.io/", - "// Submitted by Gary Dylina ", - "gotpantheon.com", - "pantheonsite.io", - ), - lines( - "// PE Ulyanov Kirill Sergeevich : https://airy.host", - "// Submitted by Kirill Ulyanov ", - "lk3.ru", - ), - lines( - "// Rad Web Hosting: https://radwebhosting.com", - "// Submitted by Scott Claeys ", - "cloudsite.builders", - "myradweb.net", - "servername.us", - ), - lines( - "// Raidboxes GmbH : https://raidboxes.de", - "// Submitted by Auke Tembrink ", - "myrdbx.io", - "site.rb-hosting.io", - ), - lines( - "// Redgate Software: https://red-gate.com", - "// Submitted by Andrew Farries ", - "instances.spawn.cc", - ), - lines( - "// Redstar Consultants : https://www.redstarconsultants.com/", - "// Submitted by Jons Slemmer ", - "instantcloud.cn", - ), - lines( - "// Russian Academy of Sciences", - "// Submitted by Tech Support ", - "ras.ru", - ), - lines( - "// QA2", - "// Submitted by Daniel Dent (https://www.danieldent.com/)", - "qa2.com", - ), - lines( - "// QCX", - "// Submitted by Cassandra Beelen ", - "qcx.io", - "*.sys.qcx.io", - ), - lines( - "// QNAP System Inc : https://www.qnap.com", - "// Submitted by Nick Chang ", - "myqnapcloud.cn", - "alpha-myqnapcloud.com", - "dev-myqnapcloud.com", - "mycloudnas.com", - "mynascloud.com", - "myqnapcloud.com", - ), - lines( - "// Senseering GmbH : https://www.senseering.de", - "// Submitted by Felix Mönckemeyer ", - "senseering.net", - ), - lines( - "// Smallregistry by Promopixel SARL: https://www.smallregistry.net", - "// Former AFNIC's SLDs", - "// Submitted by Jérôme Lipowicz ", - "aeroport.fr", - "avocat.fr", - "chambagri.fr", - "chirurgiens-dentistes.fr", - "experts-comptables.fr", - "medecin.fr", - "notaires.fr", - "pharmacien.fr", - "port.fr", - "veterinaire.fr", - ), - lines( - "// staticland : https://static.land", - "// Submitted by Seth Vincent ", - "static.land", - "dev.static.land", - "sites.static.land", - ), - lines( - "// Storebase : https://www.storebase.io", - "// Submitted by Tony Schirmer ", - "storebase.store", - ), - lines( - "// Strapi : https://strapi.io/", - "// Submitted by Florent Baldino ", - "strapiapp.com", - "media.strapiapp.com", - ), - lines( - "// Strategic System Consulting (eApps Hosting): https://www.eapps.com/", - "// Submitted by Alex Oancea ", - "vps-host.net", - "atl.jelastic.vps-host.net", - "njs.jelastic.vps-host.net", - "ric.jelastic.vps-host.net", - ), - lines( - "// Sony Interactive Entertainment LLC : https://sie.com/", - "// Submitted by David Coles ", - "playstation-cloud.com", - ), - lines( - "// SourceLair PC : https://www.sourcelair.com", - "// Submitted by Antonis Kalipetis ", - "apps.lair.io", - "*.stolos.io", - ), - lines( - "// SpaceKit : https://www.spacekit.io/", - "// Submitted by Reza Akhavan ", - "spacekit.io", - ), - lines( - "// SpeedPartner GmbH: https://www.speedpartner.de/", - "// Submitted by Stefan Neufeind ", - "customer.speedpartner.de", - ), - lines( - "// Spreadshop (sprd.net AG) : https://www.spreadshop.com/", - "// Submitted by Martin Breest ", - "myspreadshop.at", - "myspreadshop.com.au", - "myspreadshop.be", - "myspreadshop.ca", - "myspreadshop.ch", - "myspreadshop.com", - "myspreadshop.de", - "myspreadshop.dk", - "myspreadshop.es", - "myspreadshop.fi", - "myspreadshop.fr", - "myspreadshop.ie", - "myspreadshop.it", - "myspreadshop.net", - "myspreadshop.nl", - "myspreadshop.no", - "myspreadshop.pl", - "myspreadshop.se", - "myspreadshop.co.uk", - ), - lines( - "// Studenten Net Twente : http://www.snt.utwente.nl/", - "// Submitted by Silke Hofstra ", 
- "utwente.io", - ), - lines( - "// UNIVERSAL DOMAIN REGISTRY : https://www.udr.org.yt/", - "// see also: whois -h whois.udr.org.yt help", - "// Submitted by Atanunu Igbunuroghene ", - "name.pm", - "sch.tf", - "biz.wf", - "sch.wf", - "org.yt", - ), - lines( - "// .US", - "// Submitted by Ed Moore ", - "lib.de.us", - ), - lines( - "// VeryPositive SIA : http://very.lv", - "// Submitted by Danko Aleksejevs ", - "2038.io", - ), - lines( - "// V.UA Domain Administrator : https://domain.v.ua/", - "// Submitted by Serhii Rostilo ", - "v.ua", - ), + "AAA workspace", + "University of Banja Luka", + "University of Bielsko-Biala regional domain", + "No longer operated by CentralNic, these entries should be adopted and/or removed by current operators", + "Africa.com Web Solutions Ltd", + "iDOT Services Limited", + "Radix FZC", + "US REGISTRY LLC", + "co.com Registry, LLC", + "Roar Domains LLC", + "BRS Media", + "c.la", + "Clever Cloud", + "co.ca", + "Co & Co", + "i-registry s.r.o.", + "CDN77.com", + "Cloud DNS Ltd", + "Daplie, Inc", + "Datto, Inc.", + "Bip", + "bitbridge.net", + "ddnss.de", + "Definima", + "DigitalOcean App Platform", + "DigitalOcean Spaces", + "DigitalPlat", + "dnstrace.pro", + "ECG Robotics, Inc", + "Fedora", + "Frusky MEDIA&PR", + "RavPage", + "CDDO", + "GOV.UK Platform as a Service", + "GOV.UK Pay", + "Helio Networks", + "Häkkinen.fi", + "is-a.dev", + "I-O DATA DEVICE, INC.", + "KUROKU LTD", + "Katholieke Universiteit Leuven", + ".KRD", + "Lokalized", + "May First - People Link", + "mcpe.me", + "NFSN, Inc.", + "NFT.Storage", + "No-IP.com", + "NodeArt", + "One.com", + ".pl domains (grandfathered)", + "Pantheon Systems, Inc.", + "PE Ulyanov Kirill Sergeevich", + "Rad Web Hosting", + "Raidboxes GmbH", + "Redgate Software", + "Redstar Consultants", + "Russian Academy of Sciences", + "QA2", + "QCX", + "QNAP System Inc", + "Senseering GmbH", + "Smallregistry by Promopixel SARL", + "staticland", + "Storebase", + "Strapi", + "Strategic System Consulting (eApps Hosting)", + "Sony Interactive Entertainment LLC", + "SourceLair PC", + "SpaceKit", + "SpeedPartner GmbH", + "Spreadshop (sprd.net AG)", + "Studenten Net Twente", + "UNIVERSAL DOMAIN REGISTRY", + ".US", + "VeryPositive SIA", + "V.UA Domain Administrator", } diff --git a/tools/internal/parser/file.go b/tools/internal/parser/file.go index 4663ff4d9..af15e9ff3 100644 --- a/tools/internal/parser/file.go +++ b/tools/internal/parser/file.go @@ -1,114 +1,62 @@ package parser import ( - "fmt" "net/mail" "net/url" ) -// File is a parsed PSL file. -// A PSL file consists of blocks separated by an empty line. Most -// blocks are annotated lists of suffixes, but some are plain -// top-level comments or delimiters for sections of the file. -type File struct { - // Blocks are the data blocks of the file, in the order they - // appear. +// List is a parsed public suffix list. +type List struct { + SourceRange + + // Blocks are the top-level elements of the list, in the order + // they appear. Blocks []Block - // Errors are parse errors encountered while reading the - // file. This includes fatal validation errors, not just malformed - // syntax. - Errors []error - // Warnings are errors that were downgraded to just - // warnings. Warnings are a concession to old PSL entries that now - // have validation errors, due to PSL policy changes. As long as - // the entries in question don't change, their preexisting - // validation errors are downgraded to lint warnings. - Warnings []error } -// AllSuffixBlocks returns all suffix blocks in f. 
-func (f *File) AllSuffixBlocks() []*Suffixes {
-	var ret []*Suffixes
-
-	for _, block := range f.Blocks {
-		switch v := block.(type) {
-		case *Suffixes:
-			ret = append(ret, v)
-		}
-	}
+func (l *List) Children() []Block { return l.Blocks }
 
-	return ret
+// A Block is a parsed chunk of a PSL file. Each block is one of the
+// concrete types Blank, Comment, Section, Suffixes, Suffix, or
+// Wildcard.
+type Block interface {
+	// SrcRange returns the block's SourceRange.
+	SrcRange() SourceRange
+	// Children returns the block's direct children, if any.
+	Children() []Block
 }
 
-// SuffixBlocksInSection returns all suffix blocks within the named
-// file section (for example, "ICANN DOMAINS" or "PRIVATE DOMAINS").
-func (f *File) SuffixBlocksInSection(name string) []*Suffixes {
-	var ret []*Suffixes
-
-	var curSection string
-	for _, block := range f.Blocks {
-		switch v := block.(type) {
-		case *StartSection:
-			curSection = v.Name
-		case *EndSection:
-			if curSection == name {
-				return ret
-			}
-			curSection = ""
-		case *Suffixes:
-			if curSection == name {
-				ret = append(ret, v)
-			}
-		}
-	}
-	return ret
+// Blank is a set of one or more consecutive blank lines.
+type Blank struct {
+	SourceRange
 }
 
-// A Block is a parsed chunk of a PSL file.
-// In Parse's output, a Block is one of the following concrete types:
-// Comment, StartSection, EndSection, Suffixes.
-type Block interface {
-	source() Source
-}
+func (b *Blank) Children() []Block { return nil }
 
-// Comment is a standalone top-level comment block.
+// Comment is a comment block, consisting of one or more contiguous
+// lines of commented text.
 type Comment struct {
-	Source
+	SourceRange
+	// Text is the unprocessed content of the comment lines, with the
+	// leading comment syntax removed.
+	Text []string
 }
 
-func (c *Comment) source() Source { return c.Source }
+func (c *Comment) Children() []Block { return nil }
 
-// StartSection is a top-level marker that indicates the start of a
-// logical section, such as ICANN suffixes or privately managed
-// domains.
-//
-// Sections cannot be nested, at any one point in a file you are
-// either not in any logical section, or within a single section. In
-// a File that has no parse errors, StartSection and EndSection blocks
-// are correctly paired, and all sections are closed by an EndSection
-// before any following StartSection.
-type StartSection struct {
-	Source
-	Name string // section name, e.g. "ICANN DOMAINS", "PRIVATE DOMAINS"
-}
+// Section is a named part of a PSL file, containing suffixes which
+// behave similarly.
+type Section struct {
+	SourceRange
 
-func (b *StartSection) source() Source { return b.Source }
-
-// EndSection is a top-level marker that indicates the end of a
-// logical section, such as ICANN suffixes or privately managed
-// domains.
-//
-// Sections cannot be nested, at any one point in a file you are
-// either not in any logical section, or within a single section. In
-// a File that has no parse errors, StartSection and EndSection blocks
-// are correctly paired, and all sections are closed by an EndSection
-// before any following StartSection.
-type EndSection struct {
-	Source
-	Name string // e.g. "ICANN DOMAINS", "PRIVATE DOMAINS"
+	// Name is the section name. In a well-formed PSL file, the
+	// names are "ICANN DOMAINS" and "PRIVATE DOMAINS".
+	Name string
+	// Blocks are the child blocks contained within the section.
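+	// Typically these are Blank, Comment, and Suffixes blocks.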
+	Blocks []Block
 }
 
-func (b *EndSection) source() Source { return b.Source }
+func (s *Section) Children() []Block { return s.Blocks }
 
 // Suffixes is a list of PSL domain suffixes with optional additional
 // metadata.
@@ -118,24 +66,7 @@ func (b *EndSection) source() Source { return b.Source }
 // domain suffixes. The suffix list may contain additional
 // unstructured inline comments.
 type Suffixes struct {
-	Source
-
-	// Header lists the comment lines that appear before the first
-	// domain suffix. Any structured data they contain is also parsed
-	// into separate fields.
-	Header []Source
-	// Entries lists the lines that contain domain suffixes. In an
-	// error-free PSL file, each slice element is a single suffix.
-	Entries []Source
-	// InlineComments lists the comment lines that appear between
-	// suffix lines, rather than as part of the header. These are
-	// uncommon in the PSL overall, but some suffix blocks
-	// (particularly hand-curated ICANN blocks) feature some guidance
-	// comments to guide future maintainers.
-	InlineComments []Source
-
-	// The following fields are extracted from Header, if available.
-
+	SourceRange
 	// Entity is the name of the entity responsible for this block of
 	// suffixes.
 	//
@@ -166,15 +97,38 @@ type Suffixes struct {
 	// This field may be nil if the block header doesn't have email
 	// contact information.
 	Submitter *mail.Address
+
+	// Blocks are the child blocks contained within the suffix block.
+	Blocks []Block
 }
 
-func (s *Suffixes) source() Source { return s.Source }
+func (s *Suffixes) Children() []Block { return s.Blocks }
 
-// shortName returns either the quoted name of the responsible Entity,
-// or a generic descriptor of this suffix block if Entity is unset.
-func (s *Suffixes) shortName() string {
-	if s.Entity != "" {
-		return fmt.Sprintf("%q", s.Entity)
-	}
-	return fmt.Sprintf("%d unowned suffixes", len(s.Entries))
+// Suffix is one public suffix, represented in the standard domain
+// name format.
+type Suffix struct {
+	SourceRange
+
+	// Labels are the DNS labels of the public suffix.
+	Labels []string
+}
+
+func (s *Suffix) Children() []Block { return nil }
+
+// Wildcard is a wildcard public suffix, along with any exceptions to
+// that wildcard.
+type Wildcard struct {
+	SourceRange
+
+	// Labels are the DNS labels of the public suffix, without the
+	// leading "*" label.
+	Labels []string
+	// Exceptions are the DNS label values that, when they appear in
+	// the wildcard position, cause an FQDN to _not_ match this
+	// wildcard. For example, if Labels=[foo, com] and
+	// Exceptions=[bar, qux], zot.foo.com is a public suffix, but
+	// bar.foo.com and qux.foo.com are not.
+	Exceptions []string
 }
+
+func (w *Wildcard) Children() []Block { return nil }
diff --git a/tools/internal/parser/metadata.go b/tools/internal/parser/metadata.go
index 8c1f6110c..62f63a42a 100644
--- a/tools/internal/parser/metadata.go
+++ b/tools/internal/parser/metadata.go
@@ -8,11 +8,7 @@ import (
 
-// enrichSuffixes extracts structured metadata from metadata and
+// enrichSuffixes extracts structured metadata from comment and
 // populates the appropriate fields of suffixes.
-func enrichSuffixes(suffixes *Suffixes, metadata []string) {
-	if len(metadata) == 0 {
-		return
-	}
-
+func enrichSuffixes(suffixes *Suffixes, comment *Comment) {
 	// Try to find an entity name in the header. There are a few
 	// possible ways this can appear, but the canonical is a first
 	// header line of the form "<name>: <url>".
 	//
 	// validation errors in future, but currently do not.
 	//
 	// See splitNameish for a list of accepted alternate forms.
-	for _, line := range metadata {
+	for _, line := range comment.Text {
 		name, url, contact := splitNameish(line)
 		if name == "" {
 			continue
@@ -41,7 +37,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) {
 	if suffixes.Entity == "" {
 		// Assume the first line is the entity name, if it's not
 		// obviously something else.
-		first := metadata[0]
+		first := comment.Text[0]
 		// "see also" is the first line of a number of ICANN TLD
 		// sections.
 		if getSubmitter(first) == nil && getURL(first) == nil && first != "see also" {
@@ -54,7 +50,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) {
 	// "Submitted by <contact email>", or failing that a parseable RFC5322
 	// email on a line by itself.
 	if suffixes.Submitter == nil {
-		for _, line := range metadata {
+		for _, line := range comment.Text {
 			if submitter := getSubmitter(line); submitter != nil {
 				suffixes.Submitter = submitter
 				break
@@ -62,7 +58,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) {
 		}
 	}
 	if suffixes.Submitter == nil {
-		for _, line := range metadata {
+		for _, line := range comment.Text {
 			if submitter, err := mail.ParseAddress(line); err == nil {
 				suffixes.Submitter = submitter
 				break
@@ -74,7 +70,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) {
 	// only remaining format we understand is a line with a URL by
 	// itself.
 	if suffixes.URL == nil {
-		for _, line := range metadata {
+		for _, line := range comment.Text {
 			if u := getURL(line); u != nil {
 				suffixes.URL = u
 				break
diff --git a/tools/internal/parser/parser.go b/tools/internal/parser/parser.go
index 28daf787e..e825f1f4c 100644
--- a/tools/internal/parser/parser.go
+++ b/tools/internal/parser/parser.go
@@ -2,7 +2,11 @@
 package parser
 
 import (
+	"fmt"
+	"slices"
 	"strings"
+
+	"golang.org/x/net/idna"
 )
 
 // Parse parses bs as a PSL file and returns the parse result.
@@ -17,230 +21,433 @@ import (
 // submission guidelines
-// (https://github.com/publicsuffix/list/wiki/Guidelines). A File with
-// errors should not be used to calculate public suffixes for FQDNs.
+// (https://github.com/publicsuffix/list/wiki/Guidelines). A List with
+// errors should not be used to calculate public suffixes for FQDNs.
-func Parse(bs []byte) *File {
-	return &parseWithExceptions(bs, downgradeToWarning, true).File
-}
-
-func parseWithExceptions(bs []byte, downgradeToWarning func(error) bool, validate bool) *parser {
-	src, errs := newSource(bs)
-	p := parser{
-		downgradeToWarning: downgradeToWarning,
+func Parse(bs []byte) (*List, []error) {
+	lines, errs := normalizeToUTF8Lines(bs)
+	p := &parser{
+		input:     lines,
+		inputLine: 0,
 	}
 	for _, err := range errs {
 		p.addError(err)
 	}
-	p.Parse(src)
-	if validate {
-		p.Validate()
-	}
-	return &p
+	ret := p.parseTopLevel()
+	return ret, p.errs
 }
 
 // parser is the state for a single PSL file parse.
 type parser struct {
-	// currentSection is the logical file section the parser is
-	// currently in. This is used to verify that StartSection and
-	// EndSection blocks are paired correctly, and may be nil when the
-	// parser is not currently within a logical section.
-	currentSection *StartSection
-
-	// downgradeToWarning is a function that reports whether an error
-	// should be recorded as a non-fatal warning. See exceptions.go
-	// for the normal implementation. It's a struct field so that
-	// tests can replace the normal list of exceptions with something
-	// else for testing.
-	downgradeToWarning func(error) bool
-
-	// File is the parser's output.
-	File
-}
-
-// Parse parses src as a PSL file and returns the parse result.
-func (p *parser) Parse(src Source) {
-	blankLine := func(line Source) bool { return line.Text() == "" }
-	blocks := src.split(blankLine)
-
-	for _, block := range blocks {
-		// Does this block have any non-comments in it? If so, it's a
-		// suffix block, otherwise it's a comment/section marker
-		// block.
-		notComment := func(line Source) bool { return !strings.HasPrefix(line.Text(), "//") }
-		comment, rest, hasSuffixes := block.cut(notComment)
-		if hasSuffixes {
-			p.processSuffixes(block, comment, rest)
-		} else {
-			p.processTopLevelComment(comment)
-		}
+	// input is the remaining unparsed and untokenized source text.
+	input []string
+	// inputLine is the offset for input[0]. That is, input[0] is line
+	// number inputLine of the source text.
+	inputLine int
+	// peekBuf is a buffer containing zero or one input tokens.
+	peekBuf any
+	// errs are the accumulated parse errors so far.
+	errs []error
+}
+
+// addError records err as a parse/validation error.
+func (p *parser) addError(err error) {
+	p.errs = append(p.errs, err)
+}
+
+// The following types and functions are the lexer portion of the
+// parsing logic. This is a very simplistic lexer, since
+// normalizeToUTF8Lines has already done a lot of heavy lifting to
+// clean up the input. Each line of input is converted to a token for
+// that line's content. The parser then assembles that stream of
+// tokens into multiline blocks, and eventually into a parse tree.
+
+const (
+	sectionStartPrefix = "// ===BEGIN "
+	sectionEndPrefix   = "// ===END "
+	sectionPrefix      = "// ==="
+	commentPrefix      = "// "
+	wildcardPrefix     = "*."
+	exceptionPrefix    = "!"
+)
+
+type line struct {
+	SourceRange
+	Text string
+}
+type tokenEOF struct{}
+type tokenBlank struct{ line }
+type tokenComment struct{ line }
+type tokenSectionUnknown struct{ line }
+type tokenSectionStart struct {
+	line
+	Name string
+}
+type tokenSectionEnd struct {
+	line
+	Name string
+}
+type tokenSuffix struct{ line }
+type tokenWildcard struct {
+	line
+	Suffix string
+}
+type tokenException struct {
+	line
+	Suffix string
+}
+
+// next lexes the next token of input and returns it.
+func (p *parser) next() (ret any) {
+	if p.peekBuf != nil {
+		ret := p.peekBuf
+		p.peekBuf = nil
+		return ret
 	}
 
-	// At EOF with an open section.
-	if p.currentSection != nil {
-		p.addError(UnclosedSectionError{
-			Start: p.currentSection,
-		})
+	if len(p.input) == 0 {
+		return tokenEOF{}
 	}
-}
 
-// processSuffixes parses a block that consists of domain suffixes and
-// a metadata header.
-func (p *parser) processSuffixes(block, header, rest Source) {
-	s := &Suffixes{
-		Source: block,
+	// No matter what, next is going to emit the next line of p.input;
+	// the rest of the function just determines what kind of token to
+	// return.
+	src := line{
+		SourceRange: SourceRange{p.inputLine, p.inputLine + 1},
+		Text:        p.input[0],
 	}
+	p.input = p.input[1:]
+	p.inputLine++
 
-	var metadataSrc []string
-	for _, line := range header.lineSources() {
-		// TODO: s.Header should be a single Source for the entire
-		// comment.
-		s.Header = append(s.Header, line)
-		if strings.HasPrefix(line.Text(), sectionMarkerPrefix) {
-			p.addError(SectionInSuffixBlock{line})
-		} else {
-			// Trim the comment prefix in two steps, because some PSL
-			// comments don't have whitepace between the // and the
-			// following text.
- metadataSrc = append(metadataSrc, strings.TrimSpace(strings.TrimPrefix(line.Text(), "//"))) + switch { + case src.Text == "": + return tokenBlank{src} + + case strings.HasPrefix(src.Text, sectionStartPrefix): + // To avoid repeated string processing in different portions + // of the parser code, the lexer tears apart section markers + // here to extract the section name. + name := strings.TrimPrefix(src.Text, sectionStartPrefix) + name, ok := strings.CutSuffix(name, "===") + if !ok { + return tokenSectionUnknown{src} + } + return tokenSectionStart{src, name} + case strings.HasPrefix(src.Text, sectionEndPrefix): + name := strings.TrimPrefix(src.Text, sectionEndPrefix) + name, ok := strings.CutSuffix(name, "===") + if !ok { + return tokenSectionUnknown{src} } + return tokenSectionEnd{src, name} + case strings.HasPrefix(src.Text, sectionPrefix): + return tokenSectionUnknown{src} + + case strings.HasPrefix(src.Text, commentPrefix): + // Similarly, the following do some light processing of the + // input so that this doesn't need to be repeated in several + // portions of the parser. + src.Text = strings.TrimPrefix(src.Text, "// ") + return tokenComment{src} + case strings.HasPrefix(src.Text, wildcardPrefix): + return tokenWildcard{src, strings.TrimPrefix(src.Text, wildcardPrefix)} + case strings.HasPrefix(src.Text, exceptionPrefix): + return tokenException{src, strings.TrimPrefix(src.Text, exceptionPrefix)} + + default: + return tokenSuffix{src} } +} + +// peek returns the next token of input, without consuming it. +func (p *parser) peek() any { + if p.peekBuf == nil { + p.peekBuf = p.next() + } + return p.peekBuf +} + +// The rest of this file is the parser itself. It follows the common +// recursive descent structure. + +// blockEmitter returns a function that appends blocks to a given +// output list, and also updates an output SourceRange to cover the +// superset of all emitted blocks. +// +// This is a helper to make the functions that parse intermediate AST +// nodes (which have to accumulate a list of children) more readable. +func blockEmitter(out *[]Block, srcRange *SourceRange) func(...Block) { - // rest consists of suffixes and possibly inline comments. - commentLine := func(line Source) bool { return strings.HasPrefix(line.Text(), "//") } - rest.forEachRun(commentLine, func(block Source, isComment bool) { - if isComment { - for _, line := range block.lineSources() { - if strings.HasPrefix(line.Text(), sectionMarkerPrefix) { - p.addError(SectionInSuffixBlock{line}) - } + return func(bs ...Block) { + for _, b := range bs { + if b == nil { + // Sub-parsers sometimes return nil to indicate the + // thing they tried to parse was bad and they have + // nothing to contribute to the output. + continue } - s.InlineComments = append(s.InlineComments, block) - } else { - // TODO: parse entries properly, for how we just - // accumulate them as individual Sources, one per suffix. - for _, entry := range block.lineSources() { - s.Entries = append(s.Entries, entry) + + *out = append(*out, b) + + if srcRange == nil { + continue + } else if *srcRange == (SourceRange{}) { + // Zero value, this is the first emitted block. + *srcRange = b.SrcRange() + } else { + *srcRange = (*srcRange).merge(b.SrcRange()) } } - }) - - enrichSuffixes(s, metadataSrc) - p.addBlock(s) + } } -const sectionMarkerPrefix = "// ===" +// parseTopLevel parses the top level of a PSL file. 
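+//
+// The top level is a sequence of blank lines, comments, sections,
+// and suffix blocks, in any order. Section end markers that have no
+// matching start are reported as errors and skipped.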
+func (p *parser) parseTopLevel() *List { + ret := &List{} + emit := blockEmitter(&ret.Blocks, nil) -// processTopLevelComment parses a block that has only comment lines, -// no suffixes. Some of those comments may be markers for the -// start/end of file sections. -func (p *parser) processTopLevelComment(block Source) { - sectionLine := func(line Source) bool { - return strings.HasPrefix(line.Text(), sectionMarkerPrefix) + for { + switch tok := p.peek().(type) { + case tokenEOF: + return ret + case tokenBlank: + emit(p.parseBlank()) + case tokenComment: + emit(p.parseCommentOrSuffixBlock()) + case tokenSectionStart: + emit(p.parseSection()) + case tokenSectionEnd: + p.addError(ErrUnstartedSection{tok.SourceRange, tok.Name}) + p.next() + case tokenSectionUnknown: + p.addError(ErrUnknownSectionMarker{tok.SourceRange}) + p.next() + case tokenSuffix, tokenWildcard, tokenException: + emit(p.parseSuffixBlock(nil)) + default: + panic("unhandled token") + } + } +} + +// parseSection parses the contents of a PSL file section. +func (p *parser) parseSection() *Section { + // Initialize with the start-of-section marker's data. + start := p.next().(tokenSectionStart) + ret := &Section{ + SourceRange: start.SourceRange, + Name: start.Name, } - block.forEachRun(sectionLine, func(block Source, isSectionLine bool) { - if isSectionLine { - for _, line := range block.lineSources() { - p.processSectionMarker(line) + emit := blockEmitter(&ret.Blocks, &ret.SourceRange) + + for { + switch tok := p.peek().(type) { + case tokenEOF: + p.addError(ErrUnclosedSection{ret}) + return ret + case tokenBlank: + emit(p.parseBlank()) + case tokenComment: + emit(p.parseCommentOrSuffixBlock()) + case tokenSectionStart: + // The PSL doesn't allow nested sections, so we pretend + // like the inner section never existed and grab all its + // blocks for ourselves. Still record an error for the + // nested section though. + inner := p.parseSection() + emit(inner.Blocks...) + p.addError(ErrNestedSection{inner.SourceRange, inner.Name, ret}) + case tokenSectionEnd: + p.next() + if tok.Name != ret.Name { + p.addError(ErrMismatchedSection{tok.SourceRange, tok.Name, ret}) } - } else { - p.addBlock(&Comment{block}) + ret.SourceRange.LastLine = tok.SourceRange.LastLine + return ret + case tokenSectionUnknown: + p.next() + p.addError(ErrUnknownSectionMarker{tok.SourceRange}) + case tokenSuffix, tokenWildcard, tokenException: + emit(p.parseSuffixBlock(nil)) + default: + panic("unhandled token") } - }) -} - -// processSectionMarker parses line as a file section marker, and -// enforces correct start/end pairing. -func (p *parser) processSectionMarker(line Source) { - // Trim here rather than in the caller, so that we still have the - // complete input line available to use in errors. - marker := strings.TrimPrefix(line.Text(), sectionMarkerPrefix) - - // Note hasTrailer gets used below to report an error if the - // trailing "===" is missing. We delay reporting the error so that - // if the entire line is invalid, we don't report both a - // whole-line error and also an unterminated marker error. - marker, hasTrailer := strings.CutSuffix(marker, "===") - - markerType, name, ok := strings.Cut(marker, " ") - if !ok { - // There are no spaces, markerType is the whole text between - // the ===. Clear it out, so that the switch below goes to the - // error case, otherwise "===BEGIN===" would be accepted as a - // no-name section start. 
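parseTopLevel and parseSection above are driven entirely by the single-token lookahead that next and peek implement: peek fills peekBuf without consuming, and next drains the buffer before touching the input. A minimal standalone sketch of that contract, with illustrative names that are not part of this change:

package main

import "fmt"

type lexer struct {
	input   []string
	peekBuf any // zero or one buffered tokens
}

// next consumes and returns the buffered token if present, otherwise
// the next line of input; nil signals EOF.
func (l *lexer) next() any {
	if l.peekBuf != nil {
		tok := l.peekBuf
		l.peekBuf = nil
		return tok
	}
	if len(l.input) == 0 {
		return nil
	}
	tok := l.input[0]
	l.input = l.input[1:]
	return tok
}

// peek returns the upcoming token without consuming it.
func (l *lexer) peek() any {
	if l.peekBuf == nil {
		l.peekBuf = l.next()
	}
	return l.peekBuf
}

func main() {
	l := &lexer{input: []string{"com", "org"}}
	fmt.Println(l.peek(), l.next(), l.next(), l.next()) // com com org <nil>
}

Because peek routes through next, the two can never disagree about which token comes next, which is what lets the parse loops switch on p.peek() and only commit with p.next() inside each case.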
-		markerType = ""
-	}
-
-	// No matter what, we're going to output something that needs to
-	// reference this line.
-	src := line
-
-	switch markerType {
-	case "BEGIN":
-		start := &StartSection{
-			Source: src,
-			Name:   name,
-		}
-		if p.currentSection != nil {
-			// Nested sections aren't allowed. Note the error and
-			// continue parsing as if the previous section was closed
-			// correctly before this one started.
-			p.addError(NestedSectionError{
-				Outer: p.currentSection,
-				Inner: start,
-			})
-		}
-		if !hasTrailer {
-			p.addError(UnterminatedSectionMarker{src})
+	}
+}
+
+// parseCommentOrSuffixBlock parses a comment, then either returns it
+// as a lone comment or chains into suffix block parsing, depending on
+// what follows the comment.
+//
+// This is used to resolve an ambiguity in the PSL format when parsing
+// linearly: if we see a comment, that could be a standalone comment,
+// or it could be the beginning of a suffix block. In the latter case,
+// it's very important to attach the comment to the suffix block,
+// since it contains metadata about those suffixes.
+func (p *parser) parseCommentOrSuffixBlock() Block {
+	comment := p.parseComment()
+	switch p.peek().(type) {
+	case tokenSuffix, tokenWildcard, tokenException:
+		return p.parseSuffixBlock(comment)
+	default:
+		return comment
+	}
+}
+
+// parseSuffixBlock parses a suffix block, starting with the provided
+// optional initial comment.
+func (p *parser) parseSuffixBlock(initialComment *Comment) *Suffixes {
+	ret := &Suffixes{}
+	emit := blockEmitter(&ret.Blocks, &ret.SourceRange)
+
+	if initialComment != nil {
+		emit(initialComment)
+		enrichSuffixes(ret, initialComment)
+	}
+
+	for {
+		switch tok := p.peek().(type) {
+		case tokenBlank:
+			return ret
+		case tokenComment:
+			emit(p.parseComment())
+		case tokenSectionUnknown:
+			p.next()
+			p.addError(ErrUnknownSectionMarker{tok.SourceRange})
+		case tokenSectionStart:
+			p.next()
+			p.addError(ErrSectionInSuffixBlock{tok.SourceRange})
+		case tokenSectionEnd:
+			p.next()
+			p.addError(ErrSectionInSuffixBlock{tok.SourceRange})
+		case tokenSuffix:
+			emit(p.parseSuffix())
+		case tokenWildcard:
+			emit(p.parseWildcard())
+		case tokenException:
+			// Note we don't emit here, exceptions receive a list of
+			// existing blocks and attach the exception to the
+			// corresponding wildcard entry.
+			p.parseException(ret.Blocks)
+		case tokenEOF:
+			return ret
+		default:
+			panic("unhandled token")
 		}
-		p.currentSection = start
-		p.addBlock(start)
-	case "END":
-		end := &EndSection{
-			Source: src,
-			Name:   name,
+	}
+}
+
+// parseSuffix parses a basic public suffix entry (i.e. not a wildcard
+// or an exception).
+func (p *parser) parseSuffix() Block {
+	tok := p.next().(tokenSuffix)
+
+	labels, err := parseDomainString(tok.Text)
+	if err != nil {
+		p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Text, err})
+		return nil
+	}
+
+	return &Suffix{
+		SourceRange: tok.SourceRange,
+		Labels:      labels,
+	}
+}
+
+// parseWildcard parses a public suffix wildcard entry, of the form
+// "*.example.com".
+func (p *parser) parseWildcard() Block {
+	tok := p.next().(tokenWildcard)
+
+	labels, err := parseDomainString(tok.Suffix)
+	if err != nil {
+		p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err})
+		return nil
+	}
+
+	return &Wildcard{
+		SourceRange: tok.SourceRange,
+		Labels:      labels,
+	}
+}
+
+// parseException parses a public suffix wildcard exception, of the
+// form "!foo.example.com". The parsed exception is attached to the
+// related Wildcard block in previous.
If no such block exists, the +// exception is dropped and a parse error recorded. +func (p *parser) parseException(previous []Block) { + tok := p.next().(tokenException) + + labels, err := parseDomainString(tok.Suffix) + if err != nil { + p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) + return + } + + for _, block := range previous { + w, ok := block.(*Wildcard) + if !ok { + continue } - if p.currentSection == nil { - // Rogue end marker. Note and continue parsing as if this - // section name was correctly opened earlier. - p.addError(UnstartedSectionError{ - End: end, - }) - } else if p.currentSection.Name != name { - // Mismatched start/end. - p.addError(MismatchedSectionError{ - Start: p.currentSection, - End: end, - }) + + if len(labels) == len(w.Labels)+1 && slices.Equal(labels[1:], w.Labels) { + w.Exceptions = append(w.Exceptions, labels[0]) + return } - if !hasTrailer { - p.addError(UnterminatedSectionMarker{src}) + } + p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, fmt.Errorf("exception %q does not match any wildcard", tok.Suffix)}) +} + +// parseComment parses a multiline comment block. +func (p *parser) parseComment() *Comment { + tok := p.next().(tokenComment) + ret := &Comment{ + SourceRange: tok.SourceRange, + Text: []string{tok.Text}, + } + for { + if tok, ok := p.peek().(tokenComment); ok { + p.next() + ret.SourceRange = ret.SourceRange.merge(tok.SourceRange) + ret.Text = append(ret.Text, tok.Text) + } else { + return ret } - p.currentSection = nil - p.addBlock(end) - default: - // Unknown kind of marker - // - // We want all non-whitespace bytes to be present in the - // parsed output somewhere, so record this malformed line as a - // Comment. Top-level comments are just freeform text, which - // is technically correct here since this isn't a valid - // section marker. - p.addError(UnknownSectionMarker{src}) - p.addBlock(&Comment{src}) } } -// addBlock adds b to p.File.Blocks. -func (p *parser) addBlock(b Block) { - p.File.Blocks = append(p.File.Blocks, b) +// parseBlank parses a run of empty lines. +func (p *parser) parseBlank() Block { + tok := p.next().(tokenBlank) + ret := &Blank{tok.SourceRange} + for { + if tok, ok := p.peek().(tokenBlank); ok { + p.next() + ret.SourceRange = ret.SourceRange.merge(tok.SourceRange) + } else { + return ret + } + } } -// addError records err as a parse/validation error. -// -// If err matches a legacy exemption from current validation rules, -// err is recorded as a non-fatal warning instead. -func (p *parser) addError(err error) { - if p.downgradeToWarning(err) { - p.File.Warnings = append(p.File.Warnings, err) - } else { - p.File.Errors = append(p.File.Errors, err) +// parseDomainString parses a DNS domain string into its component +// labels, validated and normalized to IDNA ascii representation. +func parseDomainString(domain string) (labels []string, err error) { + cleaned, err := idna.Registration.ToUnicode(domain) + if err != nil { + return nil, err + } else if cleaned != domain { + return nil, fmt.Errorf("not in canonical form, should be %q", cleaned) } + + // TODO: the parse tree normalizes to the ASCII (aka punycode) + // representation. Should it normalize to the unicode + // representation instead, to keep parity with the policy of the + // source text? 
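+	// As an illustration of the round trip (a standard punycode
+	// example, not a value from the PSL itself): the canonical
+	// Unicode form "bücher.example" maps to the ASCII form
+	// "xn--bcher-kva.example", and ToUnicode inverts that mapping.
+	// Since cleaned is already the canonical Unicode form, the
+	// ToASCII call below is not expected to fail.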
+ puny, err := idna.Registration.ToASCII(cleaned) + if err != nil { + panic("punycode translation error on canonical unicode value") + } + + return strings.Split(puny, "."), nil } diff --git a/tools/internal/parser/parser_test.go b/tools/internal/parser/parser_test.go index 207105537..864427cda 100644 --- a/tools/internal/parser/parser_test.go +++ b/tools/internal/parser/parser_test.go @@ -1,16 +1,11 @@ package parser import ( - "bytes" - "cmp" "net/mail" "net/url" "os" - "slices" "strings" "testing" - - diff "github.com/google/go-cmp/cmp" ) // TestParser runs a battery of synthetic parse and validation tests. @@ -29,12 +24,13 @@ func TestParser(t *testing.T) { name string psl []byte downgradeToWarning func(error) bool - want File + want *List + wantErrs []error }{ { name: "empty", psl: byteLines(""), - want: File{}, + want: list(), }, { @@ -44,45 +40,35 @@ func TestParser(t *testing.T) { "", "// Here is a second comment.", ), - want: File{ - Blocks: []Block{ - &Comment{Source: mkSrc(0, "// This is an empty PSL file.")}, - &Comment{Source: mkSrc(2, "// Here is a second comment.")}, - }, - }, + want: list( + comment(0, "This is an empty PSL file."), + blank(1, 2), + comment(2, "Here is a second comment."), + ), }, { - name: "just_suffixes", + name: "just_suffixes_in_block", psl: byteLines( + "// ===BEGIN PRIVATE DOMAINS===", + "", "example.com", "other.example.com", "*.example.org", + "", + "// ===END PRIVATE DOMAINS===", + ), + want: list( + section(0, 7, "PRIVATE DOMAINS", + blank(1, 2), + suffixes(2, 5, "", "", "", + suffix(2, "example.com"), + suffix(3, "other.example.com"), + wildcard(4, 5, "example.org"), + ), + blank(5, 6), + ), ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, "example.com", "other.example.com", "*.example.org"), - Entries: []Source{ - mkSrc(0, "example.com"), - mkSrc(1, "other.example.com"), - mkSrc(2, "*.example.org"), - }, - }, - }, - Errors: []error{ - MissingEntityName{ - Suffixes: &Suffixes{ - Source: mkSrc(0, "example.com", "other.example.com", "*.example.org"), - Entries: []Source{ - mkSrc(0, "example.com"), - mkSrc(1, "other.example.com"), - mkSrc(2, "*.example.org"), - }, - }, - }, - }, - }, }, { @@ -94,26 +80,12 @@ func TestParser(t *testing.T) { "// ===BEGIN FAKE DOMAINS===", "// ===END FAKE DOMAINS===", ), - want: File{ - Blocks: []Block{ - &StartSection{ - Source: mkSrc(0, "// ===BEGIN IMAGINARY DOMAINS==="), - Name: "IMAGINARY DOMAINS", - }, - &EndSection{ - Source: mkSrc(2, "// ===END IMAGINARY DOMAINS==="), - Name: "IMAGINARY DOMAINS", - }, - &StartSection{ - Source: mkSrc(3, "// ===BEGIN FAKE DOMAINS==="), - Name: "FAKE DOMAINS", - }, - &EndSection{ - Source: mkSrc(4, "// ===END FAKE DOMAINS==="), - Name: "FAKE DOMAINS", - }, - }, - }, + want: list( + section(0, 3, "IMAGINARY DOMAINS", // TEST RIGHT, CODE WRONG + blank(1, 2), + ), + section(3, 5, "FAKE DOMAINS"), + ), }, { @@ -121,21 +93,11 @@ func TestParser(t *testing.T) { psl: byteLines( "// ===BEGIN ICANN DOMAINS===", ), - want: File{ - Blocks: []Block{ - &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - Errors: []error{ - UnclosedSectionError{ - Start: &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - }, + want: list( + section(0, 1, "ICANN DOMAINS"), + ), + wantErrs: []error{ + ErrUnclosedSection{section(0, 1, "ICANN DOMAINS")}, }, }, @@ -147,74 +109,15 @@ func TestParser(t *testing.T) { "// ===END SECRET DOMAINS===", "// ===END ICANN DOMAINS===", ), - want: File{ - 
Blocks: []Block{ - &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - &StartSection{ - Source: mkSrc(1, "// ===BEGIN SECRET DOMAINS==="), - Name: "SECRET DOMAINS", - }, - &EndSection{ - Source: mkSrc(2, "// ===END SECRET DOMAINS==="), - Name: "SECRET DOMAINS", - }, - &EndSection{ - Source: mkSrc(3, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - Errors: []error{ - NestedSectionError{ - Outer: &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - Inner: &StartSection{ - Source: mkSrc(1, "// ===BEGIN SECRET DOMAINS==="), - Name: "SECRET DOMAINS", - }, - }, - UnstartedSectionError{ - &EndSection{ - Source: mkSrc(3, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - }, - }, - }, - { - name: "mismatched_sections", - psl: byteLines( - "// ===BEGIN ICANN DOMAINS===", - "", - "// ===END PRIVATE DOMAINS===", + want: list( + section(0, 4, "ICANN DOMAINS"), ), - want: File{ - Blocks: []Block{ - &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - &EndSection{ - Source: mkSrc(2, "// ===END PRIVATE DOMAINS==="), - Name: "PRIVATE DOMAINS", - }, - }, - Errors: []error{ - MismatchedSectionError{ - Start: &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - End: &EndSection{ - Source: mkSrc(2, "// ===END PRIVATE DOMAINS==="), - Name: "PRIVATE DOMAINS", - }, - }, + + wantErrs: []error{ + ErrNestedSection{ + SourceRange: mkSrc(1, 3), + Name: "SECRET DOMAINS", + Section: section(0, 4, "ICANN DOMAINS"), }, }, }, @@ -224,22 +127,14 @@ func TestParser(t *testing.T) { psl: byteLines( "// ===TRANSFORM DOMAINS===", ), - want: File{ - Blocks: []Block{ - &Comment{ - Source: mkSrc(0, "// ===TRANSFORM DOMAINS==="), - }, - }, - Errors: []error{ - UnknownSectionMarker{ - Line: mkSrc(0, "// ===TRANSFORM DOMAINS==="), - }, - }, + want: list(), + wantErrs: []error{ + ErrUnknownSectionMarker{mkSrc(0, 1)}, }, }, { - name: "suffixes_with_section_markers_in_header", + name: "suffixes_with_section_marker_in_header", psl: byteLines( "// Just some suffixes", "// ===BEGIN ICANN DOMAINS===", @@ -248,105 +143,45 @@ func TestParser(t *testing.T) { "", "// ===END ICANN DOMAINS===", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// Just some suffixes", - "// ===BEGIN ICANN DOMAINS===", - "com", - "org", - ), - Header: []Source{ - mkSrc(0, "// Just some suffixes"), - mkSrc(1, "// ===BEGIN ICANN DOMAINS==="), - }, - Entries: []Source{ - mkSrc(2, "com"), - mkSrc(3, "org"), - }, - Entity: "Just some suffixes", - }, - &EndSection{ - Source: mkSrc(5, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - Errors: []error{ - SectionInSuffixBlock{ - Line: mkSrc(1, "// ===BEGIN ICANN DOMAINS==="), - }, - // Note: trying to gracefully parse the - // StartSection would require splitting the suffix - // block in two, which would need more code and - // also result in additional spurious validation - // errors. Instead this tests that section markers - // within suffix blocks are ignored for section - // validation. 
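The rewritten fixtures rely on the half-open [FirstLine:LastLine) convention throughout, so mkSrc(1, 3) in the nested-sections case above covers exactly the two SECRET DOMAINS marker lines. A standalone sketch of that line arithmetic (illustrative only):

package main

import "fmt"

type SourceRange struct{ FirstLine, LastLine int }

func main() {
	lines := []string{
		"// ===BEGIN ICANN DOMAINS===",  // line 0
		"// ===BEGIN SECRET DOMAINS===", // line 1
		"// ===END SECRET DOMAINS===",   // line 2
		"// ===END ICANN DOMAINS===",    // line 3
	}
	nested := SourceRange{FirstLine: 1, LastLine: 3}
	// Half-open slice semantics: lines 1 and 2, but not line 3.
	fmt.Println(lines[nested.FirstLine:nested.LastLine])
}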
- UnstartedSectionError{ - End: &EndSection{ - Source: mkSrc(5, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - }, - }, + want: list( + comment(0, "Just some suffixes"), + section(1, 6, "ICANN DOMAINS", + suffixes(2, 4, "", "", "", + suffix(2, "com"), + suffix(3, "org"), + ), + blank(4, 5), + ), + ), }, { name: "suffixes_with_section_markers_inline", psl: byteLines( + "// ===BEGIN ICANN DOMAINS===", "// Just some suffixes", "com", - "// ===BEGIN ICANN DOMAINS===", + "// ===BEGIN OTHER DOMAINS===", "org", + "// ===END OTHER DOMAINS===", + "net", "", "// ===END ICANN DOMAINS===", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// Just some suffixes", - "com", - "// ===BEGIN ICANN DOMAINS===", - "org", - ), - Header: []Source{ - mkSrc(0, "// Just some suffixes"), - }, - Entries: []Source{ - mkSrc(1, "com"), - mkSrc(3, "org"), - }, - InlineComments: []Source{ - mkSrc(2, "// ===BEGIN ICANN DOMAINS==="), - }, - Entity: "Just some suffixes", - }, - &EndSection{ - Source: mkSrc(5, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - Errors: []error{ - SectionInSuffixBlock{ - Line: mkSrc(2, "// ===BEGIN ICANN DOMAINS==="), - }, - // Note: trying to gracefully parse the - // StartSection would require splitting the suffix - // block in two, which would need more code and - // also result in additional spurious validation - // errors. Instead this tests that section markers - // within suffix blocks are ignored for section - // validation. - UnstartedSectionError{ - End: &EndSection{ - Source: mkSrc(5, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - }, + want: list( + section(0, 9, "ICANN DOMAINS", + suffixes(1, 7, "Just some suffixes", "", "", + comment(1, "Just some suffixes"), + suffix(2, "com"), + suffix(4, "org"), + suffix(6, "net"), + ), + blank(7, 8), + ), + ), + wantErrs: []error{ + ErrSectionInSuffixBlock{mkSrc(3, 4)}, + ErrSectionInSuffixBlock{mkSrc(5, 6)}, }, }, @@ -358,27 +193,13 @@ func TestParser(t *testing.T) { "example.com", "example.org", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// Unstructured header.", - "// I'm just going on about random things.", - "example.com", - "example.org", - ), - Header: []Source{ - mkSrc(0, "// Unstructured header."), - mkSrc(1, "// I'm just going on about random things."), - }, - Entries: []Source{ - mkSrc(2, "example.com"), - mkSrc(3, "example.org"), - }, - Entity: "Unstructured header.", - }, - }, - }, + want: list( + suffixes(0, 4, "Unstructured header.", "", "", + comment(0, "Unstructured header.", "I'm just going on about random things."), + suffix(2, "example.com"), + suffix(3, "example.org"), + ), + ), }, { @@ -390,31 +211,17 @@ func TestParser(t *testing.T) { "example.com", "example.org", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// DuckCorp Inc: https://example.com", - "// Submitted by Not A Duck ", - "// Seriously, not a duck", - "example.com", - "example.org", - ), - Header: []Source{ - mkSrc(0, "// DuckCorp Inc: https://example.com"), - mkSrc(1, "// Submitted by Not A Duck "), - mkSrc(2, "// Seriously, not a duck"), - }, - Entries: []Source{ - mkSrc(3, "example.com"), - mkSrc(4, "example.org"), - }, - Entity: "DuckCorp Inc", - URL: mustURL("https://example.com"), - Submitter: mustEmail("Not A Duck "), - }, - }, - }, + want: list( + suffixes(0, 5, + "DuckCorp Inc", + "https://example.com", + `"Not A Duck" `, + comment(0, "DuckCorp Inc: https://example.com", "Submitted by Not A Duck ", + "Seriously, not a 
duck"), + suffix(3, "example.com"), + suffix(4, "example.org"), + ), + ), }, { @@ -423,24 +230,15 @@ func TestParser(t *testing.T) { "// DuckCorp Inc: submitted by Not A Duck ", "example.com", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// DuckCorp Inc: submitted by Not A Duck ", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// DuckCorp Inc: submitted by Not A Duck "), - }, - Entries: []Source{ - mkSrc(1, "example.com"), - }, - Entity: "DuckCorp Inc", - Submitter: mustEmail("Not A Duck "), - }, - }, - }, + want: list( + suffixes(0, 2, + "DuckCorp Inc", + "", + `"Not A Duck" `, + comment(0, "DuckCorp Inc: submitted by Not A Duck "), + suffix(1, "example.com"), + ), + ), }, { @@ -451,29 +249,15 @@ func TestParser(t *testing.T) { "// Submitted by Not A Duck ", "example.com", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// DuckCorp Inc", - "// https://example.com", - "// Submitted by Not A Duck ", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// DuckCorp Inc"), - mkSrc(1, "// https://example.com"), - mkSrc(2, "// Submitted by Not A Duck "), - }, - Entries: []Source{ - mkSrc(3, "example.com"), - }, - Entity: "DuckCorp Inc", - URL: mustURL("https://example.com"), - Submitter: mustEmail("Not A Duck "), - }, - }, - }, + want: list( + suffixes(0, 4, + "DuckCorp Inc", + "https://example.com", + `"Not A Duck" `, + comment(0, "DuckCorp Inc", "https://example.com", `Submitted by Not A Duck `), + suffix(3, "example.com"), + ), + ), }, { @@ -483,27 +267,17 @@ func TestParser(t *testing.T) { "// DuckCorp Inc: https://example.com", "example.com", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// Submitted by Not A Duck ", - "// DuckCorp Inc: https://example.com", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// Submitted by Not A Duck "), - mkSrc(1, "// DuckCorp Inc: https://example.com"), - }, - Entries: []Source{ - mkSrc(2, "example.com"), - }, - Entity: "DuckCorp Inc", - URL: mustURL("https://example.com"), - Submitter: mustEmail("Not A Duck "), - }, - }, - }, + want: list( + suffixes(0, 3, + "DuckCorp Inc", + "https://example.com", + `"Not A Duck" `, + comment(0, + "Submitted by Not A Duck ", + "DuckCorp Inc: https://example.com"), + suffix(2, "example.com"), + ), + ), }, { @@ -514,74 +288,17 @@ func TestParser(t *testing.T) { "// Submitted by Not A Duck ", "example.com", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// This is an unstructured comment.", - "// DuckCorp Inc: https://example.com", - "// Submitted by Not A Duck ", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// This is an unstructured comment."), - mkSrc(1, "// DuckCorp Inc: https://example.com"), - mkSrc(2, "// Submitted by Not A Duck "), - }, - Entries: []Source{ - mkSrc(3, "example.com"), - }, - Entity: "DuckCorp Inc", - URL: mustURL("https://example.com"), - Submitter: mustEmail("Not A Duck "), - }, - }, - }, - }, - - { - name: "legacy_error_downgrade", - psl: byteLines( - "// https://example.com", - "example.com", + want: list( + suffixes(0, 4, + "DuckCorp Inc", + "https://example.com", + `"Not A Duck" `, + comment(0, "This is an unstructured comment.", + "DuckCorp Inc: https://example.com", + "Submitted by Not A Duck "), + suffix(3, "example.com"), + ), ), - downgradeToWarning: func(e error) bool { - return true - }, - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// https://example.com", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// https://example.com"), - }, - 
Entries: []Source{ - mkSrc(1, "example.com"), - }, - URL: mustURL("https://example.com"), - }, - }, - Warnings: []error{ - MissingEntityName{ - Suffixes: &Suffixes{ - Source: mkSrc(0, - "// https://example.com", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// https://example.com"), - }, - Entries: []Source{ - mkSrc(1, "example.com"), - }, - URL: mustURL("https://example.com"), - }, - }, - }, - }, }, { @@ -592,21 +309,12 @@ func TestParser(t *testing.T) { "// Parens Appreciation Society (https://example.org)", "example.com", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, "// Parens Appreciation Society (https://example.org)", "example.com"), - Header: []Source{ - mkSrc(0, "// Parens Appreciation Society (https://example.org)"), - }, - Entries: []Source{ - mkSrc(1, "example.com"), - }, - Entity: "Parens Appreciation Society", - URL: mustURL("https://example.org"), - }, - }, - }, + want: list( + suffixes(0, 2, "Parens Appreciation Society", "https://example.org", "", + comment(0, "Parens Appreciation Society (https://example.org)"), + suffix(1, "example.com"), + ), + ), }, { @@ -621,67 +329,28 @@ func TestParser(t *testing.T) { "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", "cd", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// cd : https://en.wikipedia.org/wiki/.cd", - "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", - "cd", - ), - Header: []Source{ - mkSrc(0, "// cd : https://en.wikipedia.org/wiki/.cd"), - mkSrc(1, "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"), - }, - Entries: []Source{ - mkSrc(2, "cd"), - }, - Entity: "cd", - URL: mustURL("https://en.wikipedia.org/wiki/.cd"), - }, - }, - }, + want: list( + suffixes(0, 3, "cd", "https://en.wikipedia.org/wiki/.cd", "", + comment(0, "cd : https://en.wikipedia.org/wiki/.cd", + "see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"), + suffix(2, "cd"), + ), + ), }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - exc := test.downgradeToWarning - if exc == nil { - // use real exceptions if the test doesn't provide something else - exc = downgradeToWarning - } - got := parseWithExceptions(test.psl, exc, true).File + got, errs := Parse(test.psl) checkDiff(t, "parse result", got, test.want) + checkDiff(t, "parse errors", errs, test.wantErrs) }) } } -// mustURL returns the given string as a URL, or panics if not a URL. -func mustURL(s string) *url.URL { - u, err := url.Parse(s) - if err != nil { - panic(err) - } - return u -} - -// mustEmail returns the given string as an RFC 5322 address, or -// panics if the parse fails. -func mustEmail(s string) *mail.Address { - a, err := mail.ParseAddress(s) - if err != nil { - panic(err) - } - return a -} - -// mkSrc returns a Source with the given start, end, and dedented text. -func mkSrc(start int, lines ...string) Source { - return Source{ - lineOffset: start, - lines: lines, - } +// mkSrc returns a SourceRange with the given start and end. 
+func mkSrc(start, end int) SourceRange { + return SourceRange{start, end} } // TestParseRealList checks that the real public suffix list can parse @@ -692,133 +361,81 @@ func TestParseRealList(t *testing.T) { t.Fatal(err) } - f := Parse(bs) + _, errs := Parse(bs) - for _, err := range f.Errors { + for _, err := range errs { t.Errorf("Parse error: %v", err) } } -// TestRoundtripRealList checks that concatenating the source text of -// all top-level blocks, with appropriate additional blank lines, -// exactly reproduces the source text that was parsed. Effectively, -// this is a "prove that the parser didn't discard any bytes" check. -func TestRoundtripRealList(t *testing.T) { - bs, err := os.ReadFile("../../../public_suffix_list.dat") - if err != nil { - t.Fatal(err) - } - f := Parse(bs) - - if len(f.Errors) > 0 { - t.Fatal("Parse errors, not attempting to roundtrip") +func list(blocks ...Block) *List { + return &List{ + Blocks: blocks, } +} - prevLine := 0 - var rebuilt bytes.Buffer - for _, block := range f.Blocks { - src := block.source() - if src.lineOffset < prevLine { - t.Fatalf("ordering error: previous block ended at %d but this block starts at %d:\n%s", prevLine, src.lineOffset, src.Text()) - } - for prevLine < src.lineOffset { - rebuilt.WriteByte('\n') - prevLine++ - } - rebuilt.WriteString(src.Text()) - rebuilt.WriteByte('\n') - prevLine = src.lineOffset + len(src.lines) +func blank(start, end int) *Blank { + return &Blank{ + SourceRange: mkSrc(start, end), } +} - got := strings.Split(strings.TrimSpace(rebuilt.String()), "\n") - want := strings.Split(strings.TrimSpace(string(bs)), "\n") - - if diff := diff.Diff(want, got); diff != "" { - t.Errorf("roundtrip failed (-want +got):\n%s", diff) +func comment(start int, lines ...string) *Comment { + return &Comment{ + SourceRange: mkSrc(start, start+len(lines)), + Text: lines, } } -// TestRoundtripRealListDetailed is like the prior round-tripping -// test, but Suffix blocks are written out using their -// Header/Entries/InlineComments fields, again as proof that no suffix -// block elements were lost during parsing. 
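The list, blank, and comment constructors above (together with section, suffixes, suffix, and wildcard below) let expected parse trees read like the files they describe. A hypothetical in-package example, not one of the real test cases:

// A fixture for a file whose first four lines are a one-line
// comment, a blank line, and a two-line comment; each helper's
// range arguments follow the half-open line convention.
func exampleFixture() *List {
	return list(
		comment(0, "A hypothetical header."),
		blank(1, 2),
		comment(2, "Another comment.", "It spans two lines."),
	)
}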
-func TestRoundtripRealListDetailed(t *testing.T) { - bs, err := os.ReadFile("../../../public_suffix_list.dat") - if err != nil { - t.Fatal(err) +func section(start, end int, name string, blocks ...Block) *Section { + if len(blocks) == 0 { + return &Section{ + SourceRange: mkSrc(start, end), + Name: name, + } } - f := Parse(bs) - if len(f.Errors) > 0 { - t.Fatal("Parse errors, not attempting to roundtrip") + return &Section{ + SourceRange: mkSrc(start, end), + Name: name, + Blocks: blocks, } +} - prevLine := 0 - var rebuilt bytes.Buffer - for _, block := range f.Blocks { - srcs := []Source{block.source()} - if v, ok := block.(*Suffixes); ok { - srcs = []Source{} - for _, h := range v.Header { - srcs = append(srcs, h) - } - for _, e := range v.Entries { - srcs = append(srcs, e) - } - for _, c := range v.InlineComments { - srcs = append(srcs, c) - } - slices.SortFunc(srcs, func(a, b Source) int { - return cmp.Compare(a.lineOffset, b.lineOffset) - }) - } - - for _, src := range srcs { - if src.lineOffset < prevLine { - t.Fatalf("ordering error: previous block ended at %d but this block starts at %d:\n%s", prevLine, src.lineOffset, src.Text()) - } - for prevLine < src.lineOffset { - rebuilt.WriteByte('\n') - prevLine++ - } - rebuilt.WriteString(src.Text()) - rebuilt.WriteByte('\n') - prevLine = src.lineOffset + len(src.lines) +func suffixes(start, end int, entity string, urlStr string, email string, blocks ...Block) *Suffixes { + ret := &Suffixes{ + SourceRange: mkSrc(start, end), + Entity: entity, + Blocks: blocks, + } + if urlStr != "" { + u, err := url.Parse(urlStr) + if err != nil { + panic(err) } + ret.URL = u } - - got := strings.Split(strings.TrimSpace(rebuilt.String()), "\n") - want := strings.Split(strings.TrimSpace(string(bs)), "\n") - - if diff := diff.Diff(want, got); diff != "" { - t.Errorf("roundtrip failed (-want +got):\n%s", diff) + if email != "" { + e, err := mail.ParseAddress(email) + if err != nil { + panic(err) + } + ret.Submitter = e } + return ret } -// TestExceptionsStillNecessary checks that all the exceptions in -// exeptions.go are still needed to parse the PSL without errors. -func TestExceptionsStillNecessary(t *testing.T) { - bs, err := os.ReadFile("../../../public_suffix_list.dat") - if err != nil { - t.Fatal(err) +func suffix(line int, domain string) *Suffix { + return &Suffix{ + SourceRange: mkSrc(line, line+1), + Labels: strings.Split(domain, "."), } - - forEachOmitted(missingEmail, func(omitted string, trimmed []string) { - old := missingEmail - defer func() { missingEmail = old }() - missingEmail = trimmed - - f := Parse(bs) - if len(f.Errors) == 0 { - t.Errorf("missingEmail exception no longer necessary:\n%s", omitted) - } - }) } -func forEachOmitted(exceptions []string, fn func(string, []string)) { - for i := range exceptions { - next := append([]string(nil), exceptions[:i]...) - next = append(next, exceptions[i+1:]...) - fn(exceptions[i], next) +func wildcard(start, end int, base string, exceptions ...string) *Wildcard { + return &Wildcard{ + SourceRange: mkSrc(start, end), + Labels: strings.Split(base, "."), + Exceptions: exceptions, } } diff --git a/tools/internal/parser/text.go b/tools/internal/parser/text.go index 42644a821..4ef48cd66 100644 --- a/tools/internal/parser/text.go +++ b/tools/internal/parser/text.go @@ -11,158 +11,52 @@ import ( xunicode "golang.org/x/text/encoding/unicode" ) -// Source is a piece of source text with location information. 
-// -// A Source is effectively a slice of the input file's lines, with -// some extra information attached. As such, the start/end indexes -// behave the same as in Go slices, and select the half-open interval -// [start:end). -type Source struct { - // The lines of source text, sanitized to valid UTF-8 and with - // leading and trailing whitespace removed. - lines []string - // lineOffset is how many lines are before the beginning of lines, - // for sources that represent a subset of the input. - lineOffset int -} - -// newSource returns a source for bs, along with a preliminary set of -// input validation errors. -// -// source always returns a usable, non-nil result, even when it -// returns errors. -func newSource(bs []byte) (Source, []error) { - lines, errs := normalizeToUTF8Lines(bs) - - ret := Source{ - lines: lines, - lineOffset: 0, - } - - return ret, errs +// SourceRange describes a slice of lines from an unparsed source +// file. FirstLine and LastLine behave like normal slice offsets, +// i.e. they represent the half-open range [FirstLine:LastLine). +type SourceRange struct { + FirstLine int + LastLine int } -// Text returns the source text of s as a string. -func (s Source) Text() string { - if len(s.lines) == 1 { - return s.lines[0] - } - return strings.Join(s.lines, "\n") -} - -// LocationString returns a short string describing the source -// location. -func (s Source) LocationString() string { - // For printing diagnostics, 0-indexed [start:end) is confusing - // and not how editors present text to people. Adjust the offsets - // to be 1-indexed [start:end] instead. - start := s.lineOffset + 1 - end := s.lineOffset + len(s.lines) - - if end < start { - // Zero line Source. We can sometimes produce these internally - // during parsing, but they should not escape outside the - // package. We still print them gracefully instead of - // panicking, because it's useful for debugging the parser. - return fmt.Sprintf("", start) - } - - if start == end { - return fmt.Sprintf("line %d", start) - } - return fmt.Sprintf("lines %d-%d", start, end) -} - -// slice returns the slice of s between startLine and endLine. -// -// startLine and endLine behave like normal slice offsets, i.e. they -// represent the half-open range [startLine:endLine). -func (s Source) slice(startLine, endLine int) Source { - if startLine < 0 || startLine > len(s.lines) || endLine < startLine || endLine > len(s.lines) { - panic("invalid input to slice") - } - return Source{ - lines: s.lines[startLine:endLine], - lineOffset: s.lineOffset + startLine, +// NumLines returns the number of source lines described by +// SourceRange. +func (s SourceRange) NumLines() int { + if s.FirstLine >= s.LastLine { + return 0 } + return s.LastLine - s.FirstLine } -// line returns the nth line of s. -func (s Source) line(n int) Source { - return s.slice(n, n+1) -} - -// lineSources slices s into one Source per line. -func (s Source) lineSources() []Source { - if len(s.lines) == 1 { - return []Source{s} - } - - ret := make([]Source, len(s.lines)) - for i := range s.lines { - ret[i] = s.slice(i, i+1) +// LocationString prints a human-readable description of the +// SourceRange. +func (s SourceRange) LocationString() string { + switch { + case s.LastLine <= s.FirstLine: + return "" + case s.LastLine == s.FirstLine+1: + return fmt.Sprintf("line %d", s.FirstLine+1) + default: + return fmt.Sprintf("lines %d-%d", s.FirstLine+1, s.LastLine) } - return ret } -// cut slices s at the first cut line, as determined by cutHere. 
It
-// returns two Source blocks: the part of s before the cut line, and
-// the rest of s including the cut line. The found result reports
-// whether a cut was found. If s does not contain a cut line, cut
-// returns s, , false.
-func (s Source) cut(cutHere func(Source) bool) (before Source, rest Source, found bool) {
-	for i := range s.lines {
-		if cutHere(s.line(i)) {
-			return s.slice(0, i), s.slice(i, len(s.lines)), true
-		}
+// merge returns a SourceRange that contains both s and other. If s
+// and other are not contiguous or overlapping, the returned
+// SourceRange also spans unrelated lines, but always covers both s
+// and other.
+func (s SourceRange) merge(other SourceRange) SourceRange {
+	return SourceRange{
+		FirstLine: min(s.FirstLine, other.FirstLine),
+		LastLine:  max(s.LastLine, other.LastLine),
 	}
-	return s, Source{}, false
 }
 
-// split slices s into all sub-blocks separated by lines identified by
-// isSeparator, and returns a slice of the non-empty blocks between
-// those separators.
-//
-// Note the semantics are different from strings.Split: sub-blocks
-// that contain no lines are not returned. This works better for what
-// the PSL format needs.
-func (s Source) split(isSeparator func(line Source) bool) []Source {
-	ret := []Source{}
-	s.forEachRun(isSeparator, func(block Source, isSep bool) {
-		if isSep {
-			return
-		}
-		ret = append(ret, block)
-	})
-	return ret
-}
-
-// forEachRun calls processBlock for every run of consecutive lines
-// where classify returns the same result.
-//
-// For example, if classify returns true on lines starting with "//",
-// processBlock gets called with alternating blocks consisting of only
-// comments, or only non-comments.
-func (s Source) forEachRun(classify func(line Source) bool, processBlock func(block Source, classifyResult bool)) {
-	if len(s.lines) == 0 {
-		return
-	}
-
-	currentBlock := 0
-	currentVal := classify(s.line(0))
-	for i := range s.lines[1:] {
-		line := i + 1
-		v := classify(s.line(line))
-		if v != currentVal {
-			processBlock(s.slice(currentBlock, line), currentVal)
-			currentVal = v
-			currentBlock = line
-		}
-	}
-	if currentBlock != len(s.lines) {
-		processBlock(s.slice(currentBlock, len(s.lines)), currentVal)
-	}
-}
+
+// SrcRange returns the SourceRange. This looks a little strange, but
+// it's to satisfy the Block interface. This allows other code to
+// retrieve the SourceRange of any Block without having to typeswitch
+// all the possible sub-types.
+func (s SourceRange) SrcRange() SourceRange { return s } const ( bomUTF8 = "\xEF\xBB\xBF" @@ -198,20 +92,20 @@ func normalizeToUTF8Lines(bs []byte) ([]string, []error) { enc := utf8Transform switch { case bytes.HasPrefix(bs, []byte(bomUTF8)): - errs = append(errs, UTF8BOMError{}) + errs = append(errs, ErrUTF8BOM{}) case bytes.HasPrefix(bs, []byte(bomUTF16BE)): enc = utf16BigEndianTransform - errs = append(errs, InvalidEncodingError{"UTF-16BE"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16BE"}) case bytes.HasPrefix(bs, []byte(bomUTF16LE)): enc = utf16LittleEndianTransform - errs = append(errs, InvalidEncodingError{"UTF-16LE"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16LE"}) default: enc = guessUTFVariant(bs) switch enc { case utf16BigEndianTransform: - errs = append(errs, InvalidEncodingError{"UTF-16BE (guessed)"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16BE (guessed)"}) case utf16LittleEndianTransform: - errs = append(errs, InvalidEncodingError{"UTF-16LE (guessed)"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16LE (guessed)"}) } } @@ -239,27 +133,24 @@ func normalizeToUTF8Lines(bs []byte) ([]string, []error) { // replacement character is a distinctive shape that stands // out, it should provide enough hints as to where any invalid // byte sequences are. - src := Source{ - lineOffset: i, - lines: []string{line}, - } + src := SourceRange{i, i + 1} if strings.ContainsRune(line, utf8.RuneError) { - errs = append(errs, InvalidUTF8Error{src}) + errs = append(errs, ErrInvalidUTF8{src}) } line, ok := strings.CutSuffix(line, "\r") if ok { ret[i] = line - errs = append(errs, DOSNewlineError{src}) + errs = append(errs, ErrDOSNewline{src}) } if ln := strings.TrimRightFunc(line, unicode.IsSpace); ln != line { line = ln ret[i] = line - errs = append(errs, TrailingWhitespaceError{src}) + errs = append(errs, ErrTrailingWhitespace{src}) } if ln := strings.TrimLeftFunc(line, unicode.IsSpace); ln != line { line = ln ret[i] = line - errs = append(errs, LeadingWhitespaceError{src}) + errs = append(errs, ErrLeadingWhitespace{src}) } } diff --git a/tools/internal/parser/text_test.go b/tools/internal/parser/text_test.go index 5e42851fd..ee1a6dfbf 100644 --- a/tools/internal/parser/text_test.go +++ b/tools/internal/parser/text_test.go @@ -3,8 +3,6 @@ package parser import ( "bytes" "fmt" - "strconv" - "strings" "testing" "github.com/google/go-cmp/cmp" @@ -51,31 +49,31 @@ func TestNormalize(t *testing.T) { name: "utf16be_input_with_bom", in: utf16BigWithBOM("utf-16 text"), want: []string{"utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16BE"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16BE"}}, }, { name: "utf16le_input_with_bom", in: utf16LittleWithBOM("utf-16 text"), want: []string{"utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16LE"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16LE"}}, }, { name: "utf16be_input", in: utf16Big("utf-16 text utf-16 text utf-16 text"), want: []string{"utf-16 text utf-16 text utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16BE (guessed)"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16BE (guessed)"}}, }, { name: "utf16le_input", in: utf16Little("utf-16 text utf-16 text utf-16 text"), want: []string{"utf-16 text utf-16 text utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16LE (guessed)"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16LE (guessed)"}}, }, { name: "utf8_with_bom", in: utf8WithBOM("utf-8 text"), want: []string{"utf-8 text"}, - wantErrs: []error{UTF8BOMError{}}, + wantErrs: 
[]error{ErrUTF8BOM{}}, }, { name: "utf8_with_garbage", @@ -107,10 +105,10 @@ func TestNormalize(t *testing.T) { "this line is ok", }, wantErrs: []error{ - InvalidUTF8Error{mkSrc(1, "bad1: \uFFFDabc")}, - InvalidUTF8Error{mkSrc(2, "bad2: \uFFFDabc")}, - InvalidUTF8Error{mkSrc(3, "bad3: \uFFFDabc")}, - InvalidUTF8Error{mkSrc(4, "bad4: \uFFFD\uFFFDabc")}, + ErrInvalidUTF8{mkSrc(1, 2)}, + ErrInvalidUTF8{mkSrc(2, 3)}, + ErrInvalidUTF8{mkSrc(3, 4)}, + ErrInvalidUTF8{mkSrc(4, 5)}, }, }, { @@ -125,12 +123,8 @@ func TestNormalize(t *testing.T) { "end like it's 1991", }, wantErrs: []error{ - DOSNewlineError{ - Line: mkSrc(0, "normal file\r"), - }, - DOSNewlineError{ - Line: mkSrc(1, "except the lines\r"), - }, + ErrDOSNewline{mkSrc(0, 1)}, + ErrDOSNewline{mkSrc(1, 2)}, }, }, { @@ -152,18 +146,10 @@ func TestNormalize(t *testing.T) { "and one good line", }, wantErrs: []error{ - TrailingWhitespaceError{ - Line: mkSrc(0, "a file "), - }, - TrailingWhitespaceError{ - Line: mkSrc(1, "with all kinds\t\t"), - }, - TrailingWhitespaceError{ - Line: mkSrc(2, " \r\t"), - }, - TrailingWhitespaceError{ - Line: mkSrc(3, "of trailing space\u2003\u3000\u205f"), - }, + ErrTrailingWhitespace{mkSrc(0, 1)}, + ErrTrailingWhitespace{mkSrc(1, 2)}, + ErrTrailingWhitespace{mkSrc(2, 3)}, + ErrTrailingWhitespace{mkSrc(3, 4)}, }, }, { @@ -185,18 +171,10 @@ func TestNormalize(t *testing.T) { "and one good line", }, wantErrs: []error{ - LeadingWhitespaceError{ - Line: mkSrc(0, " a file"), - }, - LeadingWhitespaceError{ - Line: mkSrc(1, "\t\twith all kinds"), - }, - TrailingWhitespaceError{ - Line: mkSrc(2, " \r\t"), - }, - LeadingWhitespaceError{ - Line: mkSrc(3, "\u2003\u3000\u205fof leading space"), - }, + ErrLeadingWhitespace{mkSrc(0, 1)}, + ErrLeadingWhitespace{mkSrc(1, 2)}, + ErrTrailingWhitespace{mkSrc(2, 3)}, + ErrLeadingWhitespace{mkSrc(3, 4)}, }, }, { @@ -204,386 +182,20 @@ func TestNormalize(t *testing.T) { in: byteLines("\xef\xbb\xbf \t // Hello\xc3\x28 very broken line\t \r"), want: []string{"// Hello\uFFFD( very broken line"}, wantErrs: []error{ - UTF8BOMError{}, - InvalidUTF8Error{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, - DOSNewlineError{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, - TrailingWhitespaceError{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, - LeadingWhitespaceError{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, + ErrUTF8BOM{}, + ErrInvalidUTF8{mkSrc(0, 1)}, + ErrDOSNewline{mkSrc(0, 1)}, + ErrTrailingWhitespace{mkSrc(0, 1)}, + ErrLeadingWhitespace{mkSrc(0, 1)}, }, }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - src, errs := newSource(tc.in) + lines, errs := normalizeToUTF8Lines(tc.in) checkDiff(t, "newSource error set", errs, tc.wantErrs) - checkDiff(t, "newSource result", src.lines, tc.want) - }) - } -} - -func TestLineSlicing(t *testing.T) { - t.Parallel() - - lines := []string{"abc", "def", "ghi", "jkl"} - src := mkSrc(0, lines...) - - wantLines := []Source{ - mkSrc(0, "abc"), - mkSrc(1, "def"), - mkSrc(2, "ghi"), - mkSrc(3, "jkl"), - } - checkDiff(t, "src.lineSources()", src.lineSources(), wantLines) - - // slice and line are internal helpers, but if they behave - // incorrectly some higher level methods have very confusing - // behavior, so test explicitly as well. 
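The normalization cases above pin down the cleanup order applied to each line: DOS newline first, then trailing whitespace, then leading whitespace. A reduced standalone sketch of just that trimming step (the real normalizeToUTF8Lines also handles encodings, BOMs, and invalid UTF-8):

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// cleanLine mirrors the per-line trimming order: DOS newline first,
// then trailing and leading whitespace, reporting what it found.
func cleanLine(line string) (string, []string) {
	var issues []string
	if l, ok := strings.CutSuffix(line, "\r"); ok {
		line = l
		issues = append(issues, "DOS newline")
	}
	if l := strings.TrimRightFunc(line, unicode.IsSpace); l != line {
		line = l
		issues = append(issues, "trailing whitespace")
	}
	if l := strings.TrimLeftFunc(line, unicode.IsSpace); l != line {
		line = l
		issues = append(issues, "leading whitespace")
	}
	return line, issues
}

func main() {
	// Prints: // hello [DOS newline trailing whitespace leading whitespace]
	fmt.Println(cleanLine(" \t// hello \r"))
}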
- for i, wantLine := range wantLines { - checkDiff(t, fmt.Sprintf("src.line(%d)", i), src.line(i), wantLine) - } - - for start := 0; start <= len(lines); start++ { - for end := start + 1; end <= len(lines); end++ { - t.Run(fmt.Sprintf("slice_%d_to_%d", start, end), func(t *testing.T) { - want := mkSrc(start, lines[start:end]...) - checkDiff(t, fmt.Sprintf("src.slice(%d, %d)", start, end), src.slice(start, end), want) - }) - } - } -} - -func TestSourceText(t *testing.T) { - t.Parallel() - - tests := []struct { - src Source - wantText string - wantLocation string - }{ - { - src: mkSrc(0), - wantText: "", - wantLocation: "", - }, - { - src: mkSrc(0, "abc"), - wantText: "abc", - wantLocation: "line 1", - }, - { - src: mkSrc(0, "abc", "def"), - wantText: "abc\ndef", - wantLocation: "lines 1-2", - }, - { - src: mkSrc(0, "abc", "def").line(0), - wantText: "abc", - wantLocation: "line 1", - }, - { - src: mkSrc(0, "abc", "def").line(1), - wantText: "def", - wantLocation: "line 2", - }, - } - - for i, tc := range tests { - t.Run(strconv.Itoa(i), func(t *testing.T) { - checkDiff(t, "src.Text()", tc.src.Text(), tc.wantText) - checkDiff(t, "mkSrc().LocationString()", tc.src.LocationString(), tc.wantLocation) - }) - } -} - -func TestForEachRun(t *testing.T) { - t.Parallel() - - isComment := func(line Source) bool { - return strings.HasPrefix(line.Text(), "// ") - } - // some weird arbitrary classifier, to verify that forEachRun is - // using the classifier correctly - groupCnt := 0 - groupsOf2And1 := func(line Source) bool { - groupCnt = (groupCnt + 1) % 3 - return groupCnt == 0 - } - - type Run struct { - IsMatch bool - Block Source - } - tests := []struct { - name string - src Source - classify func(Source) bool - want []Run - }{ - { - name: "comments", - src: mkSrc(0, - "// foo", - "// bar", - "abc", - "def", - "// other", - "ghi", - ), - classify: isComment, - want: []Run{ - {true, mkSrc(0, "// foo", "// bar")}, - {false, mkSrc(2, "abc", "def")}, - {true, mkSrc(4, "// other")}, - {false, mkSrc(5, "ghi")}, - }, - }, - { - name: "only_comments", - src: mkSrc(0, - "// abc", - "// def", - "// ghi", - ), - classify: isComment, - want: []Run{ - {true, mkSrc(0, "// abc", "// def", "// ghi")}, - }, - }, - { - name: "comment_at_end", - src: mkSrc(0, - "// abc", - "def", - "// ghi", - ), - classify: isComment, - want: []Run{ - {true, mkSrc(0, "// abc")}, - {false, mkSrc(1, "def")}, - {true, mkSrc(2, "// ghi")}, - }, - }, - { - name: "no_comments", - src: mkSrc(0, - "abc", - "def", - "ghi", - ), - classify: isComment, - want: []Run{ - {false, mkSrc(0, "abc", "def", "ghi")}, - }, - }, - { - name: "weird_classifier", - src: mkSrc(0, - "abc", - "def", - "ghi", - "jkl", - "mno", - "pqr", - "stu", - ), - classify: groupsOf2And1, - want: []Run{ - {false, mkSrc(0, "abc", "def")}, - {true, mkSrc(2, "ghi")}, - {false, mkSrc(3, "jkl", "mno")}, - {true, mkSrc(5, "pqr")}, - {false, mkSrc(6, "stu")}, // truncated final group - }, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - var got []Run - tc.src.forEachRun(tc.classify, func(block Source, isMatch bool) { - got = append(got, Run{isMatch, block}) - }) - checkDiff(t, "forEachRun", got, tc.want) - }) - } -} - -func TestSplit(t *testing.T) { - t.Parallel() - - lines := mkSrc(0, - "// comment", - "abc", - "", - "// other", - "def", - "", - "// end", - "ghi", - ) - - exact := func(s string) func(Source) bool { - return func(line Source) bool { - return line.Text() == s - } - } - prefix := func(s string) func(Source) bool { - return func(line 
Source) bool { - return strings.HasPrefix(line.Text(), s) - } - } - - tests := []struct { - name string - src Source - fn func(Source) bool - want []Source - }{ - { - name: "simple", - src: lines, - fn: exact("abc"), - want: []Source{ - mkSrc(0, "// comment"), - mkSrc(2, "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "start", - src: lines, - fn: exact("// comment"), - want: []Source{ - mkSrc(1, "abc", "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "end", - src: lines, - fn: exact("ghi"), - want: []Source{ - mkSrc(0, "// comment", "abc", "", "// other", "def", "", "// end"), - }, - }, - { - name: "no_match", - src: lines, - fn: exact("xyz"), - want: []Source{ - mkSrc(0, "// comment", "abc", "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "prefix", - src: lines, - fn: prefix("ab"), - want: []Source{ - mkSrc(0, "// comment"), - mkSrc(2, "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "prefix_comment", - src: lines, - fn: prefix("// "), - want: []Source{ - mkSrc(1, "abc", ""), - mkSrc(4, "def", ""), - mkSrc(7, "ghi"), - }, - }, - - { - name: "empty", - src: mkSrc(0), - fn: exact("xyz"), - want: []Source{}, - }, - { - name: "empty_split_blank", - src: mkSrc(0), - fn: exact(""), - want: []Source{}, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - got := tc.src.split(tc.fn) - checkDiff(t, "split", got, tc.want) - }) - } -} - -func TestCut(t *testing.T) { - t.Parallel() - - exact := func(s string) func(Source) bool { - return func(line Source) bool { - return line.Text() == s - } - } - prefix := func(s string) func(Source) bool { - return func(line Source) bool { - return strings.HasPrefix(line.Text(), s) - } - } - - tests := []struct { - name string - src Source - fn func(Source) bool - before, rest Source - found bool - }{ - { - name: "simple", - src: mkSrc(0, "abc", "def", "ghi"), - fn: exact("def"), - before: mkSrc(0, "abc"), - rest: mkSrc(1, "def", "ghi"), - found: true, - }, - { - name: "cut_on_first", - src: mkSrc(0, - "abc", - "// def", - "ghi", - "// jkl", - "mno", - ), - fn: prefix("// "), - before: mkSrc(0, "abc"), - rest: mkSrc(1, "// def", "ghi", "// jkl", "mno"), - found: true, - }, - { - name: "no_match", - src: mkSrc(0, "abc", "def", "ghi"), - fn: exact("xyz"), - before: mkSrc(0, "abc", "def", "ghi"), - rest: Source{}, - found: false, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - gotBefore, gotRest, gotFound := tc.src.cut(tc.fn) - checkDiff(t, "cut() before", gotBefore, tc.before) - checkDiff(t, "cut() after", gotRest, tc.rest) - if gotFound != tc.found { - t.Errorf("cut() found=%v, want %v", gotFound, tc.found) - } + checkDiff(t, "newSource result", lines, tc.want) }) } } @@ -635,7 +247,7 @@ func utf8WithBOM(s string) []byte { func checkDiff(t *testing.T, whatIsBeingDiffed string, got, want any) { t.Helper() - if diff := cmp.Diff(got, want, cmp.AllowUnexported(Source{})); diff != "" { + if diff := cmp.Diff(got, want); diff != "" { t.Errorf("%s is wrong (-got+want):\n%s", whatIsBeingDiffed, diff) } } diff --git a/tools/internal/parser/validate.go b/tools/internal/parser/validate.go index aa330ea40..01709f04f 100644 --- a/tools/internal/parser/validate.go +++ b/tools/internal/parser/validate.go @@ -8,43 +8,39 @@ import ( "github.com/creachadair/mds/slice" ) -// Validate runs validations on a parsed File. -// -// Validation only runs on a file that does not yet have any -// errors. 
The presence of errors may indicate structural issues that
-// can break some validations.
-func (p *parser) Validate() {
-	if len(p.Errors) > 0 {
-		return
+// ValidateOffline runs offline validations on a parsed PSL.
+func ValidateOffline(l *List) []error {
+	var ret []error
+
+	for _, block := range blocksOfType[*Section](l) {
+		if block.Name == "PRIVATE DOMAINS" {
+			ret = append(ret, validateEntityMetadata(block)...)
+			if err := validatePrivateSectionOrder(block); err != nil {
+				ret = append(ret, err)
+			}
+			break
+		}
 	}
-	p.requireEntityNames()
-	p.requirePrivateDomainEmailContact()
-	p.requireSortedPrivateSection()
+	return ret
 }
 
-// requireEntityNames verifies that all Suffix blocks have some kind
-// of entity name.
-func (p *parser) requireEntityNames() {
-	for _, block := range p.AllSuffixBlocks() {
+// validateEntityMetadata verifies that all suffix blocks have some
+// kind of entity name, and that they also carry submitter contact
+// information unless exempted.
+func validateEntityMetadata(block *Section) []error {
+	var ret []error
+	for _, block := range blocksOfType[*Suffixes](block) {
 		if block.Entity == "" {
-			p.addError(MissingEntityName{
+			ret = append(ret, ErrMissingEntityName{
 				Suffixes: block,
 			})
-		}
-	}
-}
-
-// requirePrivateDomainEmailContact verifies that all Suffix blocks in
-// the private section have email contact information.
-func (p *parser) requirePrivateDomainEmailContact() {
-	for _, block := range p.File.SuffixBlocksInSection("PRIVATE DOMAINS") {
-		if block.Submitter == nil {
-			p.addError(MissingEntityEmail{
+		} else if block.Submitter == nil && !exemptFromContactInfo(block.Entity) {
+			ret = append(ret, ErrMissingEntityEmail{
 				Suffixes: block,
 			})
 		}
 	}
+	return ret
 }
 
 const (
@@ -52,9 +48,9 @@ const (
 	amazonSuperblockEnd   = "concludes Amazon"
 )
 
-// requireSortedPrivateSection verifies that the blocks in the private
+// validatePrivateSectionOrder verifies that the blocks in the private
 // domains section is sorted according to PSL policy.
-func (p *parser) requireSortedPrivateSection() {
+func validatePrivateSectionOrder(block *Section) error {
 	// Amazon has a semi-automated "superblock" of suffix blocks,
 	// which are in the PSL at the correct sort location for "Amazon",
 	// but are not correctly interleaved with other non-Amazon
@@ -78,44 +74,28 @@
 	var blocks []superblock
 	inAmazonSuperblock := false
 
-	for _, block := range allBlocksInPrivateSection(&p.File) {
-		if comm, ok := block.(*Comment); ok {
-			if !inAmazonSuperblock && strings.Contains(comm.Text(), amazonSuperblockStart) {
+	for _, block := range block.Children() {
+		switch v := block.(type) {
+		case *Comment:
+			if !inAmazonSuperblock && strings.Contains(v.Text[0], amazonSuperblockStart) {
 				// Start of the Amazon superblock. We will accumulate
 				// suffix blocks into here further down.
 				inAmazonSuperblock = true
 				blocks = append(blocks, superblock{
 					Name: "Amazon",
 				})
-			} else if inAmazonSuperblock && strings.Contains(comm.Text(), amazonSuperblockEnd) {
+			} else if inAmazonSuperblock && strings.Contains(v.Text[0], amazonSuperblockEnd) {
 				// End of Amazon superblock, go back to normal
 				// behavior.
 				inAmazonSuperblock = false
 			}
-			continue
-		}
-
-		// Aside from the Amazon superblock comments, we only care
-		// about Suffix blocks in this validation.
-		suffixes, ok := block.(*Suffixes)
-		if !ok {
-			continue
-		}
-
-		// While we're inside the Amazon superblock, all suffix blocks
-		// get grouped into one. Outside of the Amazon superblock,
-		// each suffix block gets its own superblock.
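The ordering check that continues below keeps the longest run of superblocks that is already sorted and asks everything outside that run to move. A toy standalone version of that idea on plain strings (the real code compares entity names under the PSL's own sorting rules):

package main

import "fmt"

// longestSortedRun returns the indices of one longest non-decreasing
// subsequence of names; entries outside it are the ones to move.
func longestSortedRun(names []string) []int {
	n := len(names)
	if n == 0 {
		return nil
	}
	best := make([]int, n) // length of the best run ending at i
	prev := make([]int, n) // backlink for reconstruction
	end := 0
	for i := 0; i < n; i++ {
		best[i], prev[i] = 1, -1
		for j := 0; j < i; j++ {
			if names[j] <= names[i] && best[j]+1 > best[i] {
				best[i], prev[i] = best[j]+1, j
			}
		}
		if best[i] > best[end] {
			end = i
		}
	}
	var run []int
	for i := end; i >= 0; i = prev[i] {
		run = append([]int{i}, run...)
	}
	return run
}

func main() {
	// "bbb" and "ccc" already form a sorted run, so "aaa" is the
	// single block reported as out of place.
	fmt.Println(longestSortedRun([]string{"bbb", "aaa", "ccc"})) // [0 2]
}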
- if inAmazonSuperblock { - last := len(blocks) - 1 - blocks[last].Suffixes = append(blocks[last].Suffixes, suffixes) - continue - } else if exemptFromSorting(suffixes.Source) { - continue - } else { - blocks = append(blocks, superblock{ - Name: suffixes.Entity, - Suffixes: []*Suffixes{suffixes}, - }) + case *Suffixes: + if inAmazonSuperblock { + last := len(blocks) - 1 + blocks[last].Suffixes = append(blocks[last].Suffixes, v) + } else if !exemptFromSorting(v.Entity) { + blocks = append(blocks, superblock{v.Entity, []*Suffixes{v}}) + } } } @@ -137,7 +117,7 @@ func (p *parser) requireSortedPrivateSection() { if len(sorted) == len(blocks) { // Already sorted, we're done. - return + return nil } // Scan through the superblocks and find where the incorrectly @@ -171,7 +151,7 @@ func (p *parser) requireSortedPrivateSection() { fixed := make([]superblock, 0, len(blocks)) fixed = append(fixed, sorted...) - err := SuffixBlocksInWrongPlace{ + err := ErrSuffixBlocksInWrongPlace{ EditScript: make([]MoveSuffixBlock, 0, len(blocks)-len(sorted)), } @@ -223,28 +203,34 @@ func (p *parser) requireSortedPrivateSection() { blocksIdx++ } - // At last, we can report the ordering error. - p.addError(err) + return err } -func allBlocksInPrivateSection(f *File) []Block { - start := 0 - for i, block := range f.Blocks { - switch v := block.(type) { - case *StartSection: - if v.Name != "PRIVATE DOMAINS" { - continue - } - start = i + 1 - case *EndSection: - if v.Name != "PRIVATE DOMAINS" { - continue - } - return f.Blocks[start:i] +// A childrener can return a list of its children. +// Yes, the interface name sounds a bit silly, but it's the +// conventional Go name given what it does. +type childrener interface { + Children() []Block +} + +// blocksOfType recursively walks the subtree rooted at c and returns +// all tree nodes of concrete block type T. +// +// For example, blocksOfType[*Comment](n) returns all comment nodes +// under n. +func blocksOfType[T Block](c childrener) []T { + var ret []T + + var rec func(childrener) + rec = func(c childrener) { + if v, ok := c.(T); ok { + ret = append(ret, v) + } + for _, child := range c.Children() { + rec(child) } } - // We can only get here if there's no private section (so nothing - // to validate), or if the file has structural issues (but we - // don't run validations in that case). - return []Block{} + rec(c) + + return ret } diff --git a/tools/internal/parser/validate_test.go b/tools/internal/parser/validate_test.go index 805b289e3..77c2fcbc3 100644 --- a/tools/internal/parser/validate_test.go +++ b/tools/internal/parser/validate_test.go @@ -1,185 +1,137 @@ package parser import ( - "bytes" - "errors" - "fmt" "testing" ) func TestRequireSortedPrivateSection(t *testing.T) { - // Shorthand for a simple suffix block with the right source data. - suffixBlock := func(lineOffset int, name, suffix string) Suffixes { - // For this test, every suffix block just has one suffix. - src := mkSrc(lineOffset, fmt.Sprintf("// %s", name), suffix) - return Suffixes{ - Source: src, - Header: []Source{src.slice(0, 1)}, - Entries: []Source{src.slice(1, 2)}, - Entity: name, - } - } - // Shorthand for an input file containing a series of suffixes. 
- suffixBlocks := func(suffixes ...Suffixes) []byte { - var ret bytes.Buffer - ret.WriteString("// ===BEGIN PRIVATE DOMAINS===\n\n") - for _, block := range suffixes { - for _, ln := range block.lineSources() { - ret.WriteString(ln.Text()) - ret.WriteByte('\n') - } - ret.WriteByte('\n') - } - ret.WriteString("// ===END PRIVATE DOMAINS===\n") - return ret.Bytes() - } - - aaa := suffixBlock(0, "AAA Corp", "aaa.com") - bbb := suffixBlock(0, "BBB Inc", "bbb.net") - ccc := suffixBlock(0, "CCC Ltd", "ccc.org") - dddLeadingDot := suffixBlock(0, ".DDD GmbH", "ddd.de") - aaaUmlaut := suffixBlock(0, "AÄA", "aaa.de") - aaaUmlautShort := suffixBlock(0, "AÄ", "aaa.ee") - aaaUmlautLong := suffixBlock(0, "AÄAA", "aaa.sk") - a3b := suffixBlock(0, "a3b", "a3b.com") - a24b := suffixBlock(0, "a24b", "a24b.com") + aaa := suffixes(0, 1, "AAA Corp", "", "", suffix(0, "aaa.com")) + bbb := suffixes(0, 1, "BBB Inc", "", "", suffix(0, "bbb.net")) + ccc := suffixes(0, 1, "CCC Ltd", "", "", suffix(0, "ccc.org")) + dddLeadingDot := suffixes(0, 1, ".DDD GmbH", "", "", suffix(0, "ddd.de")) + aaaUmlaut := suffixes(0, 1, "AÄA", "", "", suffix(0, "aaa.de")) + aaaUmlautShort := suffixes(0, 1, "AÄ", "", "", suffix(0, "aaa.ee")) + aaaUmlautLong := suffixes(0, 1, "AÄAA", "", "", suffix(0, "aaa.sk")) + a3b := suffixes(0, 1, "a3b", "", "", suffix(0, "a3b.com")) + a24b := suffixes(0, 1, "a24b", "", "", suffix(0, "a24b.com")) tests := []struct { name string - in []byte - want []error + in *Section + want error }{ { name: "easy_correct_order", - in: suffixBlocks(aaa, bbb, ccc), + in: section(0, 0, "", aaa, bbb, ccc), }, + { name: "easy_wrong_order", // correct order: aaa, bbb, ccc - in: suffixBlocks(bbb, aaa, ccc), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: bbb.Entity, - InsertAfter: aaa.Entity, - }, + in: section(0, 0, "", bbb, aaa, ccc), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: bbb.Entity, + InsertAfter: aaa.Entity, }, }, }, }, + { name: "reversed", // correct order: aaa, bbb, ccc - in: suffixBlocks(ccc, bbb, aaa), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: ccc.Entity, - InsertAfter: aaa.Entity, - }, - { - Name: bbb.Entity, - InsertAfter: aaa.Entity, - }, + in: section(0, 0, "", ccc, bbb, aaa), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: ccc.Entity, + InsertAfter: aaa.Entity, + }, + { + Name: bbb.Entity, + InsertAfter: aaa.Entity, }, }, }, }, + { name: "leading_punctuation", // correct order: dddLeadingDot, aaa, bbb, ccc - in: suffixBlocks(aaa, bbb, ccc, dddLeadingDot), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: dddLeadingDot.Entity, - InsertAfter: "", - }, + in: section(0, 0, "", aaa, bbb, ccc, dddLeadingDot), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: dddLeadingDot.Entity, + InsertAfter: "", }, }, }, }, + { name: "diacritics", // correct order: aaaUmlautShort, aaaUmlaut, aaa, aaaUmlautLong, bbb, ccc - in: suffixBlocks(aaa, bbb, ccc, aaaUmlaut, aaaUmlautShort, aaaUmlautLong), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: aaaUmlaut.Entity, - InsertAfter: "", - }, - { - Name: aaaUmlautShort.Entity, - InsertAfter: "", - }, - { - Name: aaaUmlautLong.Entity, - InsertAfter: aaa.Entity, - }, + in: section(0, 0, "", aaa, bbb, ccc, aaaUmlaut, aaaUmlautShort, aaaUmlautLong), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: 
[]MoveSuffixBlock{ + { + Name: aaaUmlaut.Entity, + InsertAfter: "", + }, + { + Name: aaaUmlautShort.Entity, + InsertAfter: "", + }, + { + Name: aaaUmlautLong.Entity, + InsertAfter: aaa.Entity, }, }, }, }, + { name: "numbers", // correct order: a24b, a3b, aaa, bbb - in: suffixBlocks(aaa, a3b, a24b, bbb), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: aaa.Entity, - InsertAfter: a24b.Entity, - }, - { - Name: a3b.Entity, - InsertAfter: a24b.Entity, - }, + in: section(0, 0, "", aaa, a3b, a24b, bbb), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: aaa.Entity, + InsertAfter: a24b.Entity, + }, + { + Name: a3b.Entity, + InsertAfter: a24b.Entity, }, }, }, }, + { name: "amazon_superblock", - in: byteLines( - "// ===BEGIN PRIVATE DOMAINS===", - "", - "// AA Ltd", - "aa.com", - "", - "// Amazon : https://www.amazon.com", - "// several blocks follow", - "", - // note: incorrect order, but ignored because in Amazon superblock - "// eero", - "eero.com", - "", - "// AWS", - "aws.com", - "", - "// concludes Amazon", - "", - // note: out of order, not ignored - "// Altavista", - "altavista.com", - "", - "// BB Ltd", - "bb.com", - "", - "// ===END PRIVATE DOMAINS===", + in: section(0, 23, "", + suffixes(2, 4, "AA Ltd", "", "", suffix(3, "aa.com")), + + comment(5, "Amazon : https://www.amazon.com", "several blocks follow"), + // Note, incorrect sort, but ignored because it's in + // the Amazon superblock. + suffixes(8, 10, "eero", "", "", suffix(9, "eero.com")), + suffixes(11, 13, "AWS", "", "", suffix(12, "aws.com")), + comment(14, "concludes Amazon"), + + suffixes(16, 18, "Altavista", "", "", suffix(17, "altavista.com")), + + suffixes(19, 21, "BB Ltd", "", "", suffix(20, "bb.com")), ), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: `Amazon (all blocks until "concludes ..." comment)`, - InsertAfter: "Altavista", - }, + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: `Amazon (all blocks until "concludes ..." comment)`, + InsertAfter: "Altavista", }, }, }, @@ -188,13 +140,8 @@ func TestRequireSortedPrivateSection(t *testing.T) { for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - p := parseWithExceptions(tc.in, downgradeToWarning, false) - if len(p.File.Errors) > 0 { - t.Fatalf("parse error before attempting validation: %v", errors.Join(p.File.Errors...)) - } - p.requireSortedPrivateSection() - - checkDiff(t, "validation result", p.File.Errors, tc.want) + errs := validatePrivateSectionOrder(tc.in) + checkDiff(t, "validation result", errs, tc.want) }) } }
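
Editor's note on the generic walker: blocksOfType[T] above replaces the
bespoke allBlocksInPrivateSection scan with a single typed recursive walk
over the childrener tree. The following self-contained sketch shows the
same pattern in miniature; Node, Leaf, Group, and nodesOfType are
hypothetical stand-ins for the parser package's Block, Suffix/Comment,
and Section types, not identifiers from this change.

package main

import "fmt"

// Node plays the role of parser.Block combined with the childrener
// interface: a tree node that can enumerate its children.
type Node interface {
	Children() []Node
}

// Leaf and Group are hypothetical node kinds, standing in for concrete
// types like *parser.Suffix and *parser.Section.
type Leaf struct{ Name string }

func (Leaf) Children() []Node { return nil }

type Group struct{ Kids []Node }

func (g Group) Children() []Node { return g.Kids }

// nodesOfType mirrors blocksOfType: recursively walk the subtree
// rooted at n and collect every node whose concrete type is T.
func nodesOfType[T Node](n Node) []T {
	var ret []T
	var rec func(Node)
	rec = func(n Node) {
		if v, ok := n.(T); ok {
			ret = append(ret, v)
		}
		for _, child := range n.Children() {
			rec(child)
		}
	}
	rec(n)
	return ret
}

func main() {
	tree := Group{Kids: []Node{
		Leaf{Name: "aaa.com"},
		Group{Kids: []Node{Leaf{Name: "bbb.net"}}},
	}}
	// One call filters the whole tree by type; adding a new node kind
	// needs no new per-type accessor methods.
	for _, l := range nodesOfType[Leaf](tree) {
		fmt.Println(l.Name) // prints "aaa.com" then "bbb.net"
	}
}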