From 12b354d3767ebd9b27f1e4c459dd8edec4eb9315 Mon Sep 17 00:00:00 2001
From: Dave Anderson
Date: Thu, 4 Jul 2024 04:26:56 -0700
Subject: [PATCH] tools/internal/parser: rewrite parser to output a syntax tree (#2025)

This restructures the code into a much more standard-looking
recursive descent parser, and spends less effort on reporting
egregiously invalid files in exchange for better code readability in
the more common case: files that are structurally valid but fail on
policy/lint issues.
---
 tools/go.mod                           |    4 +-
 tools/go.sum                           |    2 +
 tools/govalidate/govalidate.go         |   93 +-
 tools/internal/parser/errors.go        |  172 ++--
 tools/internal/parser/exceptions.go    | 1082 +++--------------------
 tools/internal/parser/file.go          |  182 ++--
 tools/internal/parser/metadata.go      |   16 +-
 tools/internal/parser/parser.go        |  587 ++++++++-----
 tools/internal/parser/parser_test.go   |  787 +++++------------
 tools/internal/parser/text.go          |  199 +----
 tools/internal/parser/text_test.go     |  442 +---------
 tools/internal/parser/validate.go      |  142 ++--
 tools/internal/parser/validate_test.go |  221 ++---
 13 files changed, 1169 insertions(+), 2760 deletions(-)

diff --git a/tools/go.mod b/tools/go.mod
index fd3434f7e..ad4a6134d 100644
--- a/tools/go.mod
+++ b/tools/go.mod
@@ -3,8 +3,8 @@ module github.com/publicsuffix/list/tools
 go 1.21
 
 require (
+	github.com/creachadair/mds v0.15.0
 	github.com/google/go-cmp v0.6.0
+	golang.org/x/net v0.26.0
 	golang.org/x/text v0.16.0
 )
-
-require github.com/creachadair/mds v0.15.0 // indirect
diff --git a/tools/go.sum b/tools/go.sum
index a00b0d317..33f7c4f53 100644
--- a/tools/go.sum
+++ b/tools/go.sum
@@ -2,5 +2,7 @@ github.com/creachadair/mds v0.15.0 h1:St6HvUcrX1UJ517Zha6GKxVibGyRDBDtInOjuaaHOr
 github.com/creachadair/mds v0.15.0/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo=
 github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
+golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
 golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
 golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
diff --git a/tools/govalidate/govalidate.go b/tools/govalidate/govalidate.go
index f4cd6ead9..c0001ffb9 100644
--- a/tools/govalidate/govalidate.go
+++ b/tools/govalidate/govalidate.go
@@ -6,12 +6,15 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"strconv"
+	"strings"
 
 	"github.com/publicsuffix/list/tools/internal/parser"
 )
 
 func main() {
-	warnings := flag.Bool("with-warnings", false, "also print errors that were downgraded to warnings")
+	debugPrintTree := flag.Bool("debug-print", false, "print the parse tree for debugging")
+
 	flag.Usage = func() {
 		fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] pslfile\n", os.Args[0])
 		flag.PrintDefaults()
@@ -30,19 +33,91 @@ func main() {
 		os.Exit(1)
 	}
 
-	psl := parser.Parse(bs)
+	psl, errs := parser.Parse(bs)
+
+	if *debugPrintTree {
+		debugPrint(psl)
+	}
 
-	for _, err := range psl.Errors {
+	for _, err := range errs {
 		fmt.Println(err)
 	}
-	if *warnings {
-		for _, err := range psl.Warnings {
-			fmt.Println(err, "(warning)")
-		}
+
+	verrs := parser.ValidateOffline(psl)
+	for _, err := range verrs {
+		fmt.Println(err)
 	}
-	if len(psl.Errors) > 0 {
+
+	if total := len(errs) + len(verrs); total > 0 {
+		fmt.Printf("\nFile has %d errors.\n", total)
 		os.Exit(1)
 	} else {
-		fmt.Printf("%q seems to be a valid PSL file.\n", file)
+		fmt.Println("\nFile is valid.")
fmt.Println("\nFile is valid.") + } +} + +// debugPrint prints out a PSL syntax tree in a private, subject to +// change text format. +func debugPrint(p *parser.List) { + fmt.Println("List {") + for _, b := range p.Blocks { + debugPrintRec(b, " ") + } + fmt.Println("}") +} + +func debugPrintRec(b parser.Block, indent string) { + nextIndent := indent + " " + f := func(msg string, args ...any) { + fmt.Printf(indent+msg+"\n", args...) + } + src := b.SrcRange() + loc := fmt.Sprintf("[%d:%d]", src.FirstLine, src.LastLine) + if src.FirstLine+1 == src.LastLine { + loc = strconv.Itoa(src.FirstLine) + } + + switch v := b.(type) { + case *parser.Blank: + f("Blank(%s)", loc) + case *parser.Comment: + f("Comment(%s) {", loc) + for _, t := range v.Text { + f(" %q,", t) + } + f("}") + case *parser.Section: + f("Section(%s, %q) {", loc, v.Name) + for _, b := range v.Blocks { + debugPrintRec(b, nextIndent) + } + f("}") + case *parser.Suffixes: + items := []string{loc} + if v.Entity != "" { + items = append(items, fmt.Sprintf("name=%q", v.Entity)) + } + if v.URL != nil { + items = append(items, fmt.Sprintf("url=%q", v.URL)) + } + if v.Submitter != nil { + items = append(items, fmt.Sprintf("contact=%q", v.Submitter)) + } + + f("SuffixBlock(%s) {", strings.Join(items, fmt.Sprintf(",\n%s ", indent))) + for _, b := range v.Blocks { + debugPrintRec(b, nextIndent) + } + f("}") + case *parser.Suffix: + f("Suffix(%s, %q)", loc, strings.Join(v.Labels, ".")) + case *parser.Wildcard: + if len(v.Exceptions) > 0 { + f("Wildcard(%s, %q, except=%v)", loc, strings.Join(v.Labels, "."), v.Exceptions) + } else { + f("Wildcard(%s, %q)", loc, strings.Join(v.Labels, ".")) + } + default: + panic("unknown block type") } } diff --git a/tools/internal/parser/errors.go b/tools/internal/parser/errors.go index 544201388..05bc25b5a 100644 --- a/tools/internal/parser/errors.go +++ b/tools/internal/parser/errors.go @@ -5,158 +5,148 @@ import ( "strings" ) -// InvalidEncodingError reports that the input is encoded with +// ErrInvalidEncoding reports that the input is encoded with // something other than UTF-8. -type InvalidEncodingError struct { +type ErrInvalidEncoding struct { Encoding string } -func (e InvalidEncodingError) Error() string { - return fmt.Sprintf("file uses invalid character encoding %s", e.Encoding) +func (e ErrInvalidEncoding) Error() string { + return fmt.Sprintf("invalid character encoding %s", e.Encoding) } -// UTF8BOMError reports that the input has an unnecessary UTF-8 byte +// ErrUTF8BOM reports that the input has an unnecessary UTF-8 byte // order mark (BOM) at the start. -type UTF8BOMError struct{} +type ErrUTF8BOM struct{} -func (e UTF8BOMError) Error() string { - return "file starts with an unnecessary UTF-8 BOM (byte order mark)" -} +func (e ErrUTF8BOM) Error() string { return "file has a UTF-8 byte order mark (BOM)" } -// InvalidUTF8Error reports that a line contains bytes that are not +// ErrInvalidUTF8 reports that a line contains bytes that are not // valid UTF-8. -type InvalidUTF8Error struct { - Line Source +type ErrInvalidUTF8 struct { + SourceRange } -func (e InvalidUTF8Error) Error() string { - return fmt.Sprintf("found non UTF-8 bytes at %s", e.Line.LocationString()) +func (e ErrInvalidUTF8) Error() string { + return fmt.Sprintf("%s: invalid UTF-8 bytes", e.SourceRange.LocationString()) } -// DOSNewlineError reports that a line has a DOS style line ending. -type DOSNewlineError struct { - Line Source +// ErrDOSNewline reports that a line has a DOS style line ending. 
+type ErrDOSNewline struct { + SourceRange } -func (e DOSNewlineError) Error() string { - return fmt.Sprintf("%s has a DOS line ending (\\r\\n instead of just \\n)", e.Line.LocationString()) +func (e ErrDOSNewline) Error() string { + return fmt.Sprintf("%s: found DOS line ending (\\r\\n instead of just \\n)", e.SourceRange.LocationString()) } -// TrailingWhitespaceError reports that a line has trailing whitespace. -type TrailingWhitespaceError struct { - Line Source +// ErrTrailingWhitespace reports that a line has trailing whitespace. +type ErrTrailingWhitespace struct { + SourceRange } -func (e TrailingWhitespaceError) Error() string { - return fmt.Sprintf("%s has trailing whitespace", e.Line.LocationString()) +func (e ErrTrailingWhitespace) Error() string { + return fmt.Sprintf("%s: trailing whitespace", e.SourceRange.LocationString()) } -// LeadingWhitespaceError reports that a line has leading whitespace. -type LeadingWhitespaceError struct { - Line Source +// ErrLeadingWhitespace reports that a line has leading whitespace. +type ErrLeadingWhitespace struct { + SourceRange } -func (e LeadingWhitespaceError) Error() string { - return fmt.Sprintf("%s has leading whitespace", e.Line.LocationString()) +func (e ErrLeadingWhitespace) Error() string { + return fmt.Sprintf("%s: leading whitespace", e.SourceRange.LocationString()) } -// SectionInSuffixBlock reports that a comment within a block of -// suffixes contains a section delimiter. -type SectionInSuffixBlock struct { - Line Source +// ErrSectionInSuffixBlock reports that a comment within a suffix +// block contains a section delimiter. +type ErrSectionInSuffixBlock struct { + SourceRange } -func (e SectionInSuffixBlock) Error() string { - return fmt.Sprintf("section delimiters are not allowed in suffix block comment at %s", e.Line.LocationString()) +func (e ErrSectionInSuffixBlock) Error() string { + return fmt.Sprintf("%s: section delimiter not allowed in suffix block comment", e.SourceRange.LocationString()) } -// UnclosedSectionError reports that a file section was not closed +// ErrUnclosedSection reports that a file section was not closed // properly before EOF. -type UnclosedSectionError struct { - Start *StartSection // The unpaired section start +type ErrUnclosedSection struct { + Section *Section } -func (e UnclosedSectionError) Error() string { - return fmt.Sprintf("section %q started at %s, but is never closed", e.Start.Name, e.Start.LocationString()) +func (e ErrUnclosedSection) Error() string { + return fmt.Sprintf("%s: section %q is missing its closing marker", e.Section.SourceRange.LocationString(), e.Section.Name) } -// NestedSectionError reports that a file section is being started -// while already within a section, which the PSL format does not -// allow. -type NestedSectionError struct { - Outer *StartSection - Inner *StartSection +// ErrNestedSection reports that a file section is being started while +// already within a section. 
+type ErrNestedSection struct {
+	SourceRange
+	Name    string
+	Section *Section
 }
 
-func (e NestedSectionError) Error() string {
-	return fmt.Sprintf("new section %q started at %s while still in section %q (started at %s)", e.Inner.Name, e.Inner.LocationString(), e.Outer.Name, e.Outer.LocationString())
+func (e ErrNestedSection) Error() string {
+	return fmt.Sprintf("%s: section %q is nested inside section %q (%s)", e.SourceRange.LocationString(), e.Name, e.Section.Name, e.Section.SourceRange.LocationString())
 }
 
-// UnstartedSectionError reports that a file section end marker was
-// found without a corresponding start.
-type UnstartedSectionError struct {
-	End *EndSection
+// ErrUnstartedSection reports that a section end marker was found
+// without a corresponding start.
+type ErrUnstartedSection struct {
+	SourceRange
+	Name string
 }
 
-func (e UnstartedSectionError) Error() string {
-	return fmt.Sprintf("section %q closed at %s but was not started", e.End.Name, e.End.LocationString())
+func (e ErrUnstartedSection) Error() string {
+	return fmt.Sprintf("%s: end marker for non-existent section %q", e.SourceRange.LocationString(), e.Name)
 }
 
-// MismatchedSectionError reports that a file section was started
+// ErrMismatchedSection reports that a file section was started
 // under one name but ended under another.
-type MismatchedSectionError struct {
-	Start *StartSection
-	End   *EndSection
+type ErrMismatchedSection struct {
+	SourceRange
+	EndName string
+	Section *Section
 }
 
-func (e MismatchedSectionError) Error() string {
-	return fmt.Sprintf("section %q closed at %s while in section %q (started at %s)", e.End.Name, e.End.LocationString(), e.Start.Name, e.Start.LocationString())
+func (e ErrMismatchedSection) Error() string {
+	return fmt.Sprintf("%s: section %q (%s) closed with wrong name %q", e.SourceRange.LocationString(), e.Section.Name, e.Section.SourceRange.LocationString(), e.EndName)
 }
 
-// UnknownSectionMarker reports that a line looks like a file section
+// ErrUnknownSectionMarker reports that a line looks like a file section
 // marker (e.g. "===BEGIN ICANN DOMAINS==="), but is not one of the
 // recognized kinds of marker.
-type UnknownSectionMarker struct {
-	Line Source
-}
-
-func (e UnknownSectionMarker) Error() string {
-	return fmt.Sprintf("unknown kind of section marker %q at %s", e.Line.Text(), e.Line.LocationString())
-}
-
-// UnterminatedSectionMarker reports that a section marker is missing
-// the required trailing "===", e.g. "===BEGIN ICANN DOMAINS".
-type UnterminatedSectionMarker struct {
-	Line Source
+type ErrUnknownSectionMarker struct {
+	SourceRange
 }
 
-func (e UnterminatedSectionMarker) Error() string {
-	return fmt.Sprintf(`section marker %q at %s is missing trailing "==="`, e.Line.Text(), e.Line.LocationString())
+func (e ErrUnknownSectionMarker) Error() string {
+	return fmt.Sprintf("%s: unknown kind of section marker", e.SourceRange.LocationString())
 }
 
 // MissingEntityName reports that a block of suffixes does not have a
 // parseable owner name in its header comment.
-type MissingEntityName struct {
+type ErrMissingEntityName struct {
 	Suffixes *Suffixes
 }
 
-func (e MissingEntityName) Error() string {
-	return fmt.Sprintf("could not find entity name for %s at %s", e.Suffixes.shortName(), e.Suffixes.LocationString())
+func (e ErrMissingEntityName) Error() string {
+	return fmt.Sprintf("%s: suffix block has no owner name", e.Suffixes.SourceRange.LocationString())
 }
 
-// MissingEntityEmail reports that a block of suffixes does not have a
+// ErrMissingEntityEmail reports that a block of suffixes does not have a
 // parseable contact email address in its header comment.
-type MissingEntityEmail struct {
+type ErrMissingEntityEmail struct {
 	Suffixes *Suffixes
 }
 
-func (e MissingEntityEmail) Error() string {
-	return fmt.Sprintf("could not find a contact email for %s at %s", e.Suffixes.shortName(), e.Suffixes.LocationString())
+func (e ErrMissingEntityEmail) Error() string {
+	return fmt.Sprintf("%s: suffix block has no contact email", e.Suffixes.SourceRange.LocationString())
 }
 
-// SuffixBlocksInWrongPlace reports that some suffix blocks of the
+// ErrSuffixBlocksInWrongPlace reports that some suffix blocks of the
 // private section are in the wrong sort order.
-type SuffixBlocksInWrongPlace struct {
+type ErrSuffixBlocksInWrongPlace struct {
 	// EditScript is a list of suffix block movements to put the
 	// private domains section in the correct order. Note that each
 	// step assumes that the previous steps have already been done.
@@ -174,7 +164,7 @@ type MoveSuffixBlock struct {
 	InsertAfter string
 }
 
-func (e SuffixBlocksInWrongPlace) Error() string {
+func (e ErrSuffixBlocksInWrongPlace) Error() string {
 	if len(e.EditScript) == 1 {
 		after := e.EditScript[0].InsertAfter
 		if after == "" {
@@ -198,3 +188,15 @@ func (e SuffixBlocksInWrongPlace) Error() string {
 
 	return ret.String()
 }
+
+// ErrInvalidSuffix reports that a suffix is not a valid PSL entry.
+type ErrInvalidSuffix struct {
+	SourceRange
+	Suffix string
+	Err    error
+}
+
+func (e ErrInvalidSuffix) Error() string {
+	return fmt.Sprintf("%s: invalid suffix %q: %v", e.SourceRange.LocationString(), e.Suffix, e.Err)
+}
diff --git a/tools/internal/parser/exceptions.go b/tools/internal/parser/exceptions.go
index 63c3fbcef..18adf3da6 100644
--- a/tools/internal/parser/exceptions.go
+++ b/tools/internal/parser/exceptions.go
@@ -1,1003 +1,123 @@
 package parser
 
-import "strings"
+import "slices"
 
-// Exceptions are blocks of the PSL that would fail current validation
+// Exceptions are parts of the PSL that would fail current validation
 // and stylistic requirements, but are exempted due to predating those
 // rules.
 //
-// These exceptions are deliberately built to be brittle: editing a
-// block revokes its exemptions and requires the block to pass all
-// modern validations (or the exceptions below need to be
-// updated). This hopefully ratchets the PSL to always become more
-// conformant with current policy, while not requiring that all
-// existing lint be fixed immediately.
-//
 // See the bottom of this file for the exceptions themselves.
 
-// downgradeToWarning reports whether e is a legacy exception to
-// normal parsing and validation rules, and should be reported as a
-// warning rather than a validation error.
-func downgradeToWarning(e error) bool { - switch v := e.(type) { - case MissingEntityEmail: - return sourceIsExempted(missingEmail, v.Suffixes.Text()) - } - return false +// exemptFromContactInfo reports whether the block owned by entity is +// exempt from the requirement to have a contact email address. +func exemptFromContactInfo(entity string) bool { + return slices.Contains(missingEmail, entity) } // exemptFromSorting reports whether the block owned by entity is // exempt from the sorting requirement that normally applies in the // private domains section. -func exemptFromSorting(source Source) bool { - return sourceIsExempted(incorrectSort, source.Text()) -} - -func sourceIsExempted(exceptions []string, source string) bool { - for _, exc := range exceptions { - if exc == source { - return true - } - } - return false -} - -func lines(lines ...string) string { - return strings.Join(lines, "\n") +func exemptFromSorting(entity string) bool { + return slices.Contains(incorrectSort, entity) } // missingEmail are source code blocks in the private domains section // that are allowed to lack email contact information. var missingEmail = []string{ - lines( - "// 611coin : https://611project.org/", - "611.to", - ), - lines( - "// c.la : http://www.c.la/", - "c.la", - ), - lines( - "// co.ca : http://registry.co.ca/", - "co.ca", - ), - lines( - "// DynDNS.com : http://www.dyndns.com/services/dns/dyndns/", - "dyndns.biz", - "for-better.biz", - "for-more.biz", - "for-some.biz", - "for-the.biz", - "selfip.biz", - "webhop.biz", - "ftpaccess.cc", - "game-server.cc", - "myphotos.cc", - "scrapping.cc", - "blogdns.com", - "cechire.com", - "dnsalias.com", - "dnsdojo.com", - "doesntexist.com", - "dontexist.com", - "doomdns.com", - "dyn-o-saur.com", - "dynalias.com", - "dyndns-at-home.com", - "dyndns-at-work.com", - "dyndns-blog.com", - "dyndns-free.com", - "dyndns-home.com", - "dyndns-ip.com", - "dyndns-mail.com", - "dyndns-office.com", - "dyndns-pics.com", - "dyndns-remote.com", - "dyndns-server.com", - "dyndns-web.com", - "dyndns-wiki.com", - "dyndns-work.com", - "est-a-la-maison.com", - "est-a-la-masion.com", - "est-le-patron.com", - "est-mon-blogueur.com", - "from-ak.com", - "from-al.com", - "from-ar.com", - "from-ca.com", - "from-ct.com", - "from-dc.com", - "from-de.com", - "from-fl.com", - "from-ga.com", - "from-hi.com", - "from-ia.com", - "from-id.com", - "from-il.com", - "from-in.com", - "from-ks.com", - "from-ky.com", - "from-ma.com", - "from-md.com", - "from-mi.com", - "from-mn.com", - "from-mo.com", - "from-ms.com", - "from-mt.com", - "from-nc.com", - "from-nd.com", - "from-ne.com", - "from-nh.com", - "from-nj.com", - "from-nm.com", - "from-nv.com", - "from-oh.com", - "from-ok.com", - "from-or.com", - "from-pa.com", - "from-pr.com", - "from-ri.com", - "from-sc.com", - "from-sd.com", - "from-tn.com", - "from-tx.com", - "from-ut.com", - "from-va.com", - "from-vt.com", - "from-wa.com", - "from-wi.com", - "from-wv.com", - "from-wy.com", - "getmyip.com", - "gotdns.com", - "hobby-site.com", - "homelinux.com", - "homeunix.com", - "iamallama.com", - "is-a-anarchist.com", - "is-a-blogger.com", - "is-a-bookkeeper.com", - "is-a-bulls-fan.com", - "is-a-caterer.com", - "is-a-chef.com", - "is-a-conservative.com", - "is-a-cpa.com", - "is-a-cubicle-slave.com", - "is-a-democrat.com", - "is-a-designer.com", - "is-a-doctor.com", - "is-a-financialadvisor.com", - "is-a-geek.com", - "is-a-green.com", - "is-a-guru.com", - "is-a-hard-worker.com", - "is-a-hunter.com", - "is-a-landscaper.com", - "is-a-lawyer.com", - 
"is-a-liberal.com", - "is-a-libertarian.com", - "is-a-llama.com", - "is-a-musician.com", - "is-a-nascarfan.com", - "is-a-nurse.com", - "is-a-painter.com", - "is-a-personaltrainer.com", - "is-a-photographer.com", - "is-a-player.com", - "is-a-republican.com", - "is-a-rockstar.com", - "is-a-socialist.com", - "is-a-student.com", - "is-a-teacher.com", - "is-a-techie.com", - "is-a-therapist.com", - "is-an-accountant.com", - "is-an-actor.com", - "is-an-actress.com", - "is-an-anarchist.com", - "is-an-artist.com", - "is-an-engineer.com", - "is-an-entertainer.com", - "is-certified.com", - "is-gone.com", - "is-into-anime.com", - "is-into-cars.com", - "is-into-cartoons.com", - "is-into-games.com", - "is-leet.com", - "is-not-certified.com", - "is-slick.com", - "is-uberleet.com", - "is-with-theband.com", - "isa-geek.com", - "isa-hockeynut.com", - "issmarterthanyou.com", - "likes-pie.com", - "likescandy.com", - "neat-url.com", - "saves-the-whales.com", - "selfip.com", - "sells-for-less.com", - "sells-for-u.com", - "servebbs.com", - "simple-url.com", - "space-to-rent.com", - "teaches-yoga.com", - "writesthisblog.com", - "ath.cx", - "fuettertdasnetz.de", - "isteingeek.de", - "istmein.de", - "lebtimnetz.de", - "leitungsen.de", - "traeumtgerade.de", - "barrel-of-knowledge.info", - "barrell-of-knowledge.info", - "dyndns.info", - "for-our.info", - "groks-the.info", - "groks-this.info", - "here-for-more.info", - "knowsitall.info", - "selfip.info", - "webhop.info", - "forgot.her.name", - "forgot.his.name", - "at-band-camp.net", - "blogdns.net", - "broke-it.net", - "buyshouses.net", - "dnsalias.net", - "dnsdojo.net", - "does-it.net", - "dontexist.net", - "dynalias.net", - "dynathome.net", - "endofinternet.net", - "from-az.net", - "from-co.net", - "from-la.net", - "from-ny.net", - "gets-it.net", - "ham-radio-op.net", - "homeftp.net", - "homeip.net", - "homelinux.net", - "homeunix.net", - "in-the-band.net", - "is-a-chef.net", - "is-a-geek.net", - "isa-geek.net", - "kicks-ass.net", - "office-on-the.net", - "podzone.net", - "scrapper-site.net", - "selfip.net", - "sells-it.net", - "servebbs.net", - "serveftp.net", - "thruhere.net", - "webhop.net", - "merseine.nu", - "mine.nu", - "shacknet.nu", - "blogdns.org", - "blogsite.org", - "boldlygoingnowhere.org", - "dnsalias.org", - "dnsdojo.org", - "doesntexist.org", - "dontexist.org", - "doomdns.org", - "dvrdns.org", - "dynalias.org", - "dyndns.org", - "go.dyndns.org", - "home.dyndns.org", - "endofinternet.org", - "endoftheinternet.org", - "from-me.org", - "game-host.org", - "gotdns.org", - "hobby-site.org", - "homedns.org", - "homeftp.org", - "homelinux.org", - "homeunix.org", - "is-a-bruinsfan.org", - "is-a-candidate.org", - "is-a-celticsfan.org", - "is-a-chef.org", - "is-a-geek.org", - "is-a-knight.org", - "is-a-linux-user.org", - "is-a-patsfan.org", - "is-a-soxfan.org", - "is-found.org", - "is-lost.org", - "is-saved.org", - "is-very-bad.org", - "is-very-evil.org", - "is-very-good.org", - "is-very-nice.org", - "is-very-sweet.org", - "isa-geek.org", - "kicks-ass.org", - "misconfused.org", - "podzone.org", - "readmyblog.org", - "selfip.org", - "sellsyourhome.org", - "servebbs.org", - "serveftp.org", - "servegame.org", - "stuff-4-sale.org", - "webhop.org", - "better-than.tv", - "dyndns.tv", - "on-the-web.tv", - "worse-than.tv", - "is-by.us", - "land-4-sale.us", - "stuff-4-sale.us", - "dyndns.ws", - "mypets.ws", - ), - lines( - "// Hashbang : https://hashbang.sh", - "hashbang.sh", - ), - lines( - "// HostyHosting (https://hostyhosting.com)", - "hostyhosting.io", - ), - 
lines( - "// info.at : http://www.info.at/", - "biz.at", - "info.at", - ), - lines( - "// .KRD : http://nic.krd/data/krd/Registration%20Policy.pdf", - "co.krd", - "edu.krd", - ), - lines( - "// Michau Enterprises Limited : http://www.co.pl/", - "co.pl", - ), - lines( - "// Nicolaus Copernicus University in Torun - MSK TORMAN (https://www.man.torun.pl)", - "torun.pl", - ), - lines( - "// TASK geographical domains (https://www.task.gda.pl/uslugi/dns)", - "gda.pl", - "gdansk.pl", - "gdynia.pl", - "med.pl", - "sopot.pl", - ), - lines( - "// CoDNS B.V.", - "co.nl", - "co.no", - ), - lines( - "// .pl domains (grandfathered)", - "art.pl", - "gliwice.pl", - "krakow.pl", - "poznan.pl", - "wroc.pl", - "zakopane.pl", - ), - lines( - "// QA2", - "// Submitted by Daniel Dent (https://www.danieldent.com/)", - "qa2.com", - ), + "611coin", + "c.la", + "co.ca", + "DynDNS.com", + "Hashbang", + "HostyHosting", + "info.at", + ".KRD", + "Michau Enterprises Limited", + "Nicolaus Copernicus University in Torun - MSK TORMAN", + "TASK geographical domains", + "CoDNS B.V.", + ".pl domains (grandfathered)", + "QA2", } -// incorrectSort are source code blocks in the private domains section -// that are allowed to be in the wrong sort order. +// incorrectSort are entities in the private domains section that are +// allowed to be in the wrong sort order. var incorrectSort = []string{ - lines( - "// AAA workspace : https://aaa.vodka", - "// Submitted by Kirill Rezraf ", - "aaa.vodka", - ), - lines( - "// University of Banja Luka : https://unibl.org", - "// Domains for Republic of Srpska administrative entity.", - "// Submitted by Marko Ivanovic ", - "rs.ba", - ), - lines( - "// University of Bielsko-Biala regional domain: http://dns.bielsko.pl/", - "// Submitted by Marcin ", - "bielsko.pl", - ), - lines( - "// No longer operated by CentralNic, these entries should be adopted and/or removed by current operators", - "// Submitted by Gavin Brown ", - "ar.com", - "hu.com", - "kr.com", - "no.com", - "qc.com", - "uy.com", - ), - lines( - "// Africa.com Web Solutions Ltd : https://registry.africa.com", - "// Submitted by Gavin Brown ", - "africa.com", - ), - lines( - "// iDOT Services Limited : http://www.domain.gr.com", - "// Submitted by Gavin Brown ", - "gr.com", - ), - lines( - "// Radix FZC : http://domains.in.net", - "// Submitted by Gavin Brown ", - "web.in", - "in.net", - ), - lines( - "// US REGISTRY LLC : http://us.org", - "// Submitted by Gavin Brown ", - "us.org", - ), - lines( - "// co.com Registry, LLC : https://registry.co.com", - "// Submitted by Gavin Brown ", - "co.com", - ), - lines( - "// Roar Domains LLC : https://roar.basketball/", - "// Submitted by Gavin Brown ", - "aus.basketball", - "nz.basketball", - ), - lines( - "// BRS Media : https://brsmedia.com/", - "// Submitted by Gavin Brown ", - "radio.am", - "radio.fm", - ), - lines( - "// c.la : http://www.c.la/", - "c.la", - ), - lines( - "// Clever Cloud : https://www.clever-cloud.com/", - "// Submitted by Quentin Adam ", - "cleverapps.cc", - "*.services.clever-cloud.com", - "cleverapps.io", - "cleverapps.tech", - ), - lines( - "// co.ca : http://registry.co.ca/", - "co.ca", - ), - lines( - "// Co & Co : https://co-co.nl/", - "// Submitted by Govert Versluis ", - "*.otap.co", - ), - lines( - "// i-registry s.r.o. 
: http://www.i-registry.cz/", - "// Submitted by Martin Semrad ", - "co.cz", - ), - lines( - "// CDN77.com : http://www.cdn77.com", - "// Submitted by Jan Krpes ", - "cdn77-storage.com", - "rsc.contentproxy9.cz", - "r.cdn77.net", - "cdn77-ssl.net", - "c.cdn77.org", - "rsc.cdn77.org", - "ssl.origin.cdn77-secure.org", - ), - lines( - "// Cloud DNS Ltd : http://www.cloudns.net", - "// Submitted by Aleksander Hristov & Boyan Peychev ", - "cloudns.asia", - "cloudns.be", - "cloudns.biz", - "cloudns.cc", - "cloudns.ch", - "cloudns.cl", - "cloudns.club", - "dnsabr.com", - "cloudns.cx", - "cloudns.eu", - "cloudns.in", - "cloudns.info", - "dns-cloud.net", - "dns-dynamic.net", - "cloudns.nz", - "cloudns.org", - "cloudns.ph", - "cloudns.pro", - "cloudns.pw", - "cloudns.us", - ), - lines( - "// Daplie, Inc : https://daplie.com", - "// Submitted by AJ ONeal ", - "daplie.me", - "localhost.daplie.me", - ), - lines( - "// Datto, Inc. : https://www.datto.com/", - "// Submitted by Philipp Heckel ", - "dattolocal.com", - "dattorelay.com", - "dattoweb.com", - "mydatto.com", - "dattolocal.net", - "mydatto.net", - ), - lines( - "// Bip : https://bip.sh", - "// Submitted by Joel Kennedy ", - "bip.sh", - ), - lines( - "// bitbridge.net : Submitted by Craig Welch, abeliidev@gmail.com", - "bitbridge.net", - ), - lines( - "// ddnss.de : https://www.ddnss.de/", - "// Submitted by Robert Niedziela ", - "ddnss.de", - "dyn.ddnss.de", - "dyndns.ddnss.de", - "dyn-ip24.de", - "dyndns1.de", - "home-webserver.de", - "dyn.home-webserver.de", - "myhome-server.de", - "ddnss.org", - ), - lines( - "// Definima : http://www.definima.com/", - "// Submitted by Maxence Bitterli ", - "definima.io", - "definima.net", - ), - lines( - "// DigitalOcean App Platform : https://www.digitalocean.com/products/app-platform/", - "// Submitted by Braxton Huggins ", - "ondigitalocean.app", - ), - lines( - "// DigitalOcean Spaces : https://www.digitalocean.com/products/spaces/", - "// Submitted by Robin H. 
Johnson ", - "*.digitaloceanspaces.com", - ), - lines( - "// DigitalPlat : https://www.digitalplat.org/", - "// Submitted by Edward Hsing ", - "us.kg", - ), - lines( - "// dnstrace.pro : https://dnstrace.pro/", - "// Submitted by Chris Partridge ", - "bci.dnstrace.pro", - ), - lines( - "// ECG Robotics, Inc: https://ecgrobotics.org", - "// Submitted by ", - "onred.one", - "staging.onred.one", - ), - lines( - "// Fedora : https://fedoraproject.org/", - "// submitted by Patrick Uiterwijk ", - "fedorainfracloud.org", - "fedorapeople.org", - "cloud.fedoraproject.org", - "app.os.fedoraproject.org", - "app.os.stg.fedoraproject.org", - ), - lines( - "// Frusky MEDIA&PR : https://www.frusky.de", - "// Submitted by Victor Pupynin ", - "*.frusky.de", - ), - lines( - "// RavPage : https://www.ravpage.co.il", - "// Submitted by Roni Horowitz ", - "ravpage.co.il", - ), - lines( - "// CDDO : https://www.gov.uk/guidance/get-an-api-domain-on-govuk", - "// Submitted by Jamie Tanna ", - "api.gov.uk", - ), - lines( - "// GOV.UK Platform as a Service : https://www.cloud.service.gov.uk/", - "// Submitted by Tom Whitwell ", - "cloudapps.digital", - "london.cloudapps.digital", - ), - lines( - "// GOV.UK Pay : https://www.payments.service.gov.uk/", - "// Submitted by Richard Baker ", - "pymnt.uk", - ), - lines( - "// Helio Networks : https://heliohost.org", - "// Submitted by Ben Frede ", - "helioho.st", - "heliohost.us", - ), - lines( - "// Häkkinen.fi", - "// Submitted by Eero Häkkinen ", - "häkkinen.fi", - ), - lines( - "// is-a.dev : https://www.is-a.dev", - "// Submitted by William Harrison ", - "is-a.dev", - ), - lines( - "// I-O DATA DEVICE, INC. : http://www.iodata.com/", - "// Submitted by Yuji Minagawa ", - "iobb.net", - ), - lines( - "// KUROKU LTD : https://kuroku.ltd/", - "// Submitted by DisposaBoy ", - "oya.to", - ), - lines( - "// Katholieke Universiteit Leuven: https://www.kuleuven.be", - "// Submitted by Abuse KU Leuven ", - "ezproxy.kuleuven.be", - "kuleuven.cloud", - ), - lines( - "// .KRD : http://nic.krd/data/krd/Registration%20Policy.pdf", - "co.krd", - "edu.krd", - ), - lines( - "// Lokalized : https://lokalized.nl", - "// Submitted by Noah Taheij ", - "servers.run", - ), - lines( - "// May First - People Link : https://mayfirst.org/", - "// Submitted by Jamie McClelland ", - "mayfirst.info", - "mayfirst.org", - ), - lines( - "// mcpe.me : https://mcpe.me", - "// Submitted by Noa Heyl ", - "mcpe.me", - ), - lines( - "// NFSN, Inc. 
: https://www.NearlyFreeSpeech.NET/", - "// Submitted by Jeff Wheelhouse ", - "nfshost.com", - ), - lines( - "// NFT.Storage : https://nft.storage/", - "// Submitted by Vasco Santos or ", - "ipfs.nftstorage.link", - ), - lines( - "// No-IP.com : https://noip.com/", - "// Submitted by Deven Reza ", - "mmafan.biz", - "myftp.biz", - "no-ip.biz", - "no-ip.ca", - "fantasyleague.cc", - "gotdns.ch", - "3utilities.com", - "blogsyte.com", - "ciscofreak.com", - "damnserver.com", - "ddnsking.com", - "ditchyourip.com", - "dnsiskinky.com", - "dynns.com", - "geekgalaxy.com", - "health-carereform.com", - "homesecuritymac.com", - "homesecuritypc.com", - "myactivedirectory.com", - "mysecuritycamera.com", - "myvnc.com", - "net-freaks.com", - "onthewifi.com", - "point2this.com", - "quicksytes.com", - "securitytactics.com", - "servebeer.com", - "servecounterstrike.com", - "serveexchange.com", - "serveftp.com", - "servegame.com", - "servehalflife.com", - "servehttp.com", - "servehumour.com", - "serveirc.com", - "servemp3.com", - "servep2p.com", - "servepics.com", - "servequake.com", - "servesarcasm.com", - "stufftoread.com", - "unusualperson.com", - "workisboring.com", - "dvrcam.info", - "ilovecollege.info", - "no-ip.info", - "brasilia.me", - "ddns.me", - "dnsfor.me", - "hopto.me", - "loginto.me", - "noip.me", - "webhop.me", - "bounceme.net", - "ddns.net", - "eating-organic.net", - "mydissent.net", - "myeffect.net", - "mymediapc.net", - "mypsx.net", - "mysecuritycamera.net", - "nhlfan.net", - "no-ip.net", - "pgafan.net", - "privatizehealthinsurance.net", - "redirectme.net", - "serveblog.net", - "serveminecraft.net", - "sytes.net", - "cable-modem.org", - "collegefan.org", - "couchpotatofries.org", - "hopto.org", - "mlbfan.org", - "myftp.org", - "mysecuritycamera.org", - "nflfan.org", - "no-ip.org", - "read-books.org", - "ufcfan.org", - "zapto.org", - "no-ip.co.uk", - "golffan.us", - "noip.us", - "pointto.us", - ), - lines( - "// NodeArt : https://nodeart.io", - "// Submitted by Konstantin Nosov ", - "stage.nodeart.io", - ), - lines( - "// One.com: https://www.one.com/", - "// Submitted by Jacob Bunk Nielsen ", - "123webseite.at", - "123website.be", - "simplesite.com.br", - "123website.ch", - "simplesite.com", - "123webseite.de", - "123hjemmeside.dk", - "123miweb.es", - "123kotisivu.fi", - "123siteweb.fr", - "simplesite.gr", - "123homepage.it", - "123website.lu", - "123website.nl", - "123hjemmeside.no", - "service.one", - "simplesite.pl", - "123paginaweb.pt", - "123minsida.se", - ), - lines( - "// .pl domains (grandfathered)", - "art.pl", - "gliwice.pl", - "krakow.pl", - "poznan.pl", - "wroc.pl", - "zakopane.pl", - ), - lines( - "// Pantheon Systems, Inc. 
: https://pantheon.io/", - "// Submitted by Gary Dylina ", - "gotpantheon.com", - "pantheonsite.io", - ), - lines( - "// PE Ulyanov Kirill Sergeevich : https://airy.host", - "// Submitted by Kirill Ulyanov ", - "lk3.ru", - ), - lines( - "// Rad Web Hosting: https://radwebhosting.com", - "// Submitted by Scott Claeys ", - "cloudsite.builders", - "myradweb.net", - "servername.us", - ), - lines( - "// Raidboxes GmbH : https://raidboxes.de", - "// Submitted by Auke Tembrink ", - "myrdbx.io", - "site.rb-hosting.io", - ), - lines( - "// Redgate Software: https://red-gate.com", - "// Submitted by Andrew Farries ", - "instances.spawn.cc", - ), - lines( - "// Redstar Consultants : https://www.redstarconsultants.com/", - "// Submitted by Jons Slemmer ", - "instantcloud.cn", - ), - lines( - "// Russian Academy of Sciences", - "// Submitted by Tech Support ", - "ras.ru", - ), - lines( - "// QA2", - "// Submitted by Daniel Dent (https://www.danieldent.com/)", - "qa2.com", - ), - lines( - "// QCX", - "// Submitted by Cassandra Beelen ", - "qcx.io", - "*.sys.qcx.io", - ), - lines( - "// QNAP System Inc : https://www.qnap.com", - "// Submitted by Nick Chang ", - "myqnapcloud.cn", - "alpha-myqnapcloud.com", - "dev-myqnapcloud.com", - "mycloudnas.com", - "mynascloud.com", - "myqnapcloud.com", - ), - lines( - "// Senseering GmbH : https://www.senseering.de", - "// Submitted by Felix Mönckemeyer ", - "senseering.net", - ), - lines( - "// Smallregistry by Promopixel SARL: https://www.smallregistry.net", - "// Former AFNIC's SLDs", - "// Submitted by Jérôme Lipowicz ", - "aeroport.fr", - "avocat.fr", - "chambagri.fr", - "chirurgiens-dentistes.fr", - "experts-comptables.fr", - "medecin.fr", - "notaires.fr", - "pharmacien.fr", - "port.fr", - "veterinaire.fr", - ), - lines( - "// staticland : https://static.land", - "// Submitted by Seth Vincent ", - "static.land", - "dev.static.land", - "sites.static.land", - ), - lines( - "// Storebase : https://www.storebase.io", - "// Submitted by Tony Schirmer ", - "storebase.store", - ), - lines( - "// Strapi : https://strapi.io/", - "// Submitted by Florent Baldino ", - "strapiapp.com", - "media.strapiapp.com", - ), - lines( - "// Strategic System Consulting (eApps Hosting): https://www.eapps.com/", - "// Submitted by Alex Oancea ", - "vps-host.net", - "atl.jelastic.vps-host.net", - "njs.jelastic.vps-host.net", - "ric.jelastic.vps-host.net", - ), - lines( - "// Sony Interactive Entertainment LLC : https://sie.com/", - "// Submitted by David Coles ", - "playstation-cloud.com", - ), - lines( - "// SourceLair PC : https://www.sourcelair.com", - "// Submitted by Antonis Kalipetis ", - "apps.lair.io", - "*.stolos.io", - ), - lines( - "// SpaceKit : https://www.spacekit.io/", - "// Submitted by Reza Akhavan ", - "spacekit.io", - ), - lines( - "// SpeedPartner GmbH: https://www.speedpartner.de/", - "// Submitted by Stefan Neufeind ", - "customer.speedpartner.de", - ), - lines( - "// Spreadshop (sprd.net AG) : https://www.spreadshop.com/", - "// Submitted by Martin Breest ", - "myspreadshop.at", - "myspreadshop.com.au", - "myspreadshop.be", - "myspreadshop.ca", - "myspreadshop.ch", - "myspreadshop.com", - "myspreadshop.de", - "myspreadshop.dk", - "myspreadshop.es", - "myspreadshop.fi", - "myspreadshop.fr", - "myspreadshop.ie", - "myspreadshop.it", - "myspreadshop.net", - "myspreadshop.nl", - "myspreadshop.no", - "myspreadshop.pl", - "myspreadshop.se", - "myspreadshop.co.uk", - ), - lines( - "// Studenten Net Twente : http://www.snt.utwente.nl/", - "// Submitted by Silke Hofstra ", 
- "utwente.io", - ), - lines( - "// UNIVERSAL DOMAIN REGISTRY : https://www.udr.org.yt/", - "// see also: whois -h whois.udr.org.yt help", - "// Submitted by Atanunu Igbunuroghene ", - "name.pm", - "sch.tf", - "biz.wf", - "sch.wf", - "org.yt", - ), - lines( - "// .US", - "// Submitted by Ed Moore ", - "lib.de.us", - ), - lines( - "// VeryPositive SIA : http://very.lv", - "// Submitted by Danko Aleksejevs ", - "2038.io", - ), - lines( - "// V.UA Domain Administrator : https://domain.v.ua/", - "// Submitted by Serhii Rostilo ", - "v.ua", - ), + "AAA workspace", + "University of Banja Luka", + "University of Bielsko-Biala regional domain", + "No longer operated by CentralNic, these entries should be adopted and/or removed by current operators", + "Africa.com Web Solutions Ltd", + "iDOT Services Limited", + "Radix FZC", + "US REGISTRY LLC", + "co.com Registry, LLC", + "Roar Domains LLC", + "BRS Media", + "c.la", + "Clever Cloud", + "co.ca", + "Co & Co", + "i-registry s.r.o.", + "CDN77.com", + "Cloud DNS Ltd", + "Daplie, Inc", + "Datto, Inc.", + "Bip", + "bitbridge.net", + "ddnss.de", + "Definima", + "DigitalOcean App Platform", + "DigitalOcean Spaces", + "DigitalPlat", + "dnstrace.pro", + "ECG Robotics, Inc", + "Fedora", + "Frusky MEDIA&PR", + "RavPage", + "CDDO", + "GOV.UK Platform as a Service", + "GOV.UK Pay", + "Helio Networks", + "Häkkinen.fi", + "is-a.dev", + "I-O DATA DEVICE, INC.", + "KUROKU LTD", + "Katholieke Universiteit Leuven", + ".KRD", + "Lokalized", + "May First - People Link", + "mcpe.me", + "NFSN, Inc.", + "NFT.Storage", + "No-IP.com", + "NodeArt", + "One.com", + ".pl domains (grandfathered)", + "Pantheon Systems, Inc.", + "PE Ulyanov Kirill Sergeevich", + "Rad Web Hosting", + "Raidboxes GmbH", + "Redgate Software", + "Redstar Consultants", + "Russian Academy of Sciences", + "QA2", + "QCX", + "QNAP System Inc", + "Senseering GmbH", + "Smallregistry by Promopixel SARL", + "staticland", + "Storebase", + "Strapi", + "Strategic System Consulting (eApps Hosting)", + "Sony Interactive Entertainment LLC", + "SourceLair PC", + "SpaceKit", + "SpeedPartner GmbH", + "Spreadshop (sprd.net AG)", + "Studenten Net Twente", + "UNIVERSAL DOMAIN REGISTRY", + ".US", + "VeryPositive SIA", + "V.UA Domain Administrator", } diff --git a/tools/internal/parser/file.go b/tools/internal/parser/file.go index 4663ff4d9..af15e9ff3 100644 --- a/tools/internal/parser/file.go +++ b/tools/internal/parser/file.go @@ -1,114 +1,62 @@ package parser import ( - "fmt" "net/mail" "net/url" ) -// File is a parsed PSL file. -// A PSL file consists of blocks separated by an empty line. Most -// blocks are annotated lists of suffixes, but some are plain -// top-level comments or delimiters for sections of the file. -type File struct { - // Blocks are the data blocks of the file, in the order they - // appear. +// List is a parsed public suffix list. +type List struct { + SourceRange + + // Blocks are the top-level elements of the list, in the order + // they appear. Blocks []Block - // Errors are parse errors encountered while reading the - // file. This includes fatal validation errors, not just malformed - // syntax. - Errors []error - // Warnings are errors that were downgraded to just - // warnings. Warnings are a concession to old PSL entries that now - // have validation errors, due to PSL policy changes. As long as - // the entries in question don't change, their preexisting - // validation errors are downgraded to lint warnings. - Warnings []error } -// AllSuffixBlocks returns all suffix blocks in f. 
-func (f *File) AllSuffixBlocks() []*Suffixes {
-	var ret []*Suffixes
-
-	for _, block := range f.Blocks {
-		switch v := block.(type) {
-		case *Suffixes:
-			ret = append(ret, v)
-		}
-	}
+func (l *List) Children() []Block { return l.Blocks }
 
-	return ret
+// A Block is a parsed chunk of a PSL file. Each block is one of the
+// concrete types Blank, Comment, Section, Suffixes, Suffix, or
+// Wildcard.
+type Block interface {
+	// SrcRange returns the block's SourceRange.
+	SrcRange() SourceRange
+	// Children returns the block's direct children, if any.
+	Children() []Block
 }
 
-// SuffixBlocksInSection returns all suffix blocks within the named
-// file section (for example, "ICANN DOMAINS" or "PRIVATE DOMAINS").
-func (f *File) SuffixBlocksInSection(name string) []*Suffixes {
-	var ret []*Suffixes
-
-	var curSection string
-	for _, block := range f.Blocks {
-		switch v := block.(type) {
-		case *StartSection:
-			curSection = v.Name
-		case *EndSection:
-			if curSection == name {
-				return ret
-			}
-			curSection = ""
-		case *Suffixes:
-			if curSection == name {
-				ret = append(ret, v)
-			}
-		}
-	}
-	return ret
+// Blank is a set of one or more consecutive blank lines.
+type Blank struct {
+	SourceRange
 }
 
-// A Block is a parsed chunk of a PSL file.
-// In Parse's output, a Block is one of the following concrete types:
-// Comment, StartSection, EndSection, Suffixes.
-type Block interface {
-	source() Source
-}
+func (b *Blank) Children() []Block { return nil }
 
-// Comment is a standalone top-level comment block.
+// Comment is a comment block, consisting of one or more contiguous
+// lines of commented text.
 type Comment struct {
-	Source
+	SourceRange
+	// Text is the unprocessed content of the comment lines, with the
+	// leading comment syntax removed.
+	Text []string
 }
 
-func (c *Comment) source() Source { return c.Source }
+func (c *Comment) Children() []Block { return nil }
 
-// StartSection is a top-level marker that indicates the start of a
-// logical section, such as ICANN suffixes or privately managed
-// domains.
-//
-// Sections cannot be nested, at any one point in a file you are
-// either not in any logical section, or within a single section. In
-// a File that has no parse errors, StartSection and EndSection blocks
-// are correctly paired, and all sections are closed by an EndSection
-// before any following StartSection.
-type StartSection struct {
-	Source
-	Name string // section name, e.g. "ICANN DOMAINS", "PRIVATE DOMAINS"
-}
+// Section is a named part of a PSL file, containing suffixes which
+// behave similarly.
+type Section struct {
+	SourceRange
 
-func (b *StartSection) source() Source { return b.Source }
-
-// EndSection is a top-level marker that indicates the end of a
-// logical section, such as ICANN suffixes or privately managed
-// domains.
-//
-// Sections cannot be nested, at any one point in a file you are
-// either not in any logical section, or within a single section. In
-// a File that has no parse errors, StartSection and EndSection blocks
-// are correctly paired, and all sections are closed by an EndSection
-// before any following StartSection.
-type EndSection struct {
-	Source
-	Name string // e.g. "ICANN DOMAINS", "PRIVATE DOMAINS"
+	// Name is the section name. In a normal well-formed PSL file, the
+	// names are "ICANN DOMAINS" and "PRIVATE DOMAINS".
+	Name string
+	// Blocks are the child blocks contained within the section.
+	Blocks []Block
 }
 
-func (b *EndSection) source() Source { return b.Source }
+func (s *Section) Children() []Block { return s.Blocks }
 
 // Suffixes is a list of PSL domain suffixes with optional additional
 // metadata.
@@ -118,24 +66,7 @@ func (b *EndSection) source() Source { return b.Source }
 // domain suffixes. The suffix list may contain additional
 // unstructured inline comments.
 type Suffixes struct {
-	Source
-
-	// Header lists the comment lines that appear before the first
-	// domain suffix. Any structured data they contain is also parsed
-	// into separate fields.
-	Header []Source
-	// Entries lists the lines that contain domain suffixes. In an
-	// error-free PSL file, each slice element is a single suffix.
-	Entries []Source
-	// InlineComments lists the comment lines that appear between
-	// suffix lines, rather than as part of the header. These are
-	// uncommon in the PSL overall, but some suffix blocks
-	// (particularly hand-curated ICANN blocks) feature some guidance
-	// comments to guide future maintainers.
-	InlineComments []Source
-
-	// The following fields are extracted from Header, if available.
-
+	SourceRange
 	// Entity is the name of the entity responsible for this block of
 	// suffixes.
 	//
@@ -166,15 +97,38 @@ type Suffixes struct {
 	// This field may be nil if the block header doesn't have email
 	// contact information.
 	Submitter *mail.Address
+
+	// Blocks are the child blocks contained within the suffix block.
+	Blocks []Block
 }
 
-func (s *Suffixes) source() Source { return s.Source }
+func (s *Suffixes) Children() []Block { return s.Blocks }
 
-// shortName returns either the quoted name of the responsible Entity,
-// or a generic descriptor of this suffix block if Entity is unset.
-func (s *Suffixes) shortName() string {
-	if s.Entity != "" {
-		return fmt.Sprintf("%q", s.Entity)
-	}
-	return fmt.Sprintf("%d unowned suffixes", len(s.Entries))
+// Suffix is one public suffix, represented in the standard domain
+// name format.
+type Suffix struct {
+	SourceRange
+
+	// Labels are the DNS labels of the public suffix.
+	Labels []string
+}
+
+func (s *Suffix) Children() []Block { return nil }
+
+// Wildcard is a wildcard public suffix, along with any exceptions to
+// that wildcard.
+type Wildcard struct {
+	SourceRange
+
+	// Labels are the DNS labels of the public suffix, without the
+	// leading "*" label.
+	Labels []string
+	// Exceptions are the DNS label values that, when they appear in
+	// the wildcard position, cause an FQDN to _not_ match this
+	// wildcard. For example, if Labels=[foo, com] and
+	// Exceptions=[bar, qux], zot.foo.com is a public suffix, but
+	// bar.foo.com and qux.foo.com are not.
+	Exceptions []string
 }
+
+func (w *Wildcard) Children() []Block { return nil }
diff --git a/tools/internal/parser/metadata.go b/tools/internal/parser/metadata.go
index 8c1f6110c..62f63a42a 100644
--- a/tools/internal/parser/metadata.go
+++ b/tools/internal/parser/metadata.go
@@ -8,11 +8,7 @@ import (
 
 // enrichSuffixes extracts structured metadata from metadata and
 // populates the appropriate fields of suffixes.
-func enrichSuffixes(suffixes *Suffixes, metadata []string) {
-	if len(metadata) == 0 {
-		return
-	}
-
+func enrichSuffixes(suffixes *Suffixes, comment *Comment) {
 	// Try to find an entity name in the header. There are a few
 	// possible ways this can appear, but the canonical is a first
 	// header line of the form ": ".
@@ -23,7 +19,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) {
 	// validation errors in future, but currently do not.
// // See splitNameish for a list of accepted alternate forms. - for _, line := range metadata { + for _, line := range comment.Text { name, url, contact := splitNameish(line) if name == "" { continue @@ -41,7 +37,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) { if suffixes.Entity == "" { // Assume the first line is the entity name, if it's not // obviously something else. - first := metadata[0] + first := comment.Text[0] // "see also" is the first line of a number of ICANN TLD // sections. if getSubmitter(first) == nil && getURL(first) == nil && first != "see also" { @@ -54,7 +50,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) { // "Submitted by ", or failing that a parseable RFC5322 // email on a line by itself. if suffixes.Submitter == nil { - for _, line := range metadata { + for _, line := range comment.Text { if submitter := getSubmitter(line); submitter != nil { suffixes.Submitter = submitter break @@ -62,7 +58,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) { } } if suffixes.Submitter == nil { - for _, line := range metadata { + for _, line := range comment.Text { if submitter, err := mail.ParseAddress(line); err == nil { suffixes.Submitter = submitter break @@ -74,7 +70,7 @@ func enrichSuffixes(suffixes *Suffixes, metadata []string) { // only remaining format we understand is a line with a URL by // itself. if suffixes.URL == nil { - for _, line := range metadata { + for _, line := range comment.Text { if u := getURL(line); u != nil { suffixes.URL = u break diff --git a/tools/internal/parser/parser.go b/tools/internal/parser/parser.go index 28daf787e..e825f1f4c 100644 --- a/tools/internal/parser/parser.go +++ b/tools/internal/parser/parser.go @@ -2,7 +2,11 @@ package parser import ( + "fmt" + "slices" "strings" + + "golang.org/x/net/idna" ) // Parse parses bs as a PSL file and returns the parse result. @@ -17,230 +21,433 @@ import ( // submission guidelines // (https://github.com/publicsuffix/list/wiki/Guidelines). A File with // errors should not be used to calculate public suffixes for FQDNs. -func Parse(bs []byte) *File { - return &parseWithExceptions(bs, downgradeToWarning, true).File -} - -func parseWithExceptions(bs []byte, downgradeToWarning func(error) bool, validate bool) *parser { - src, errs := newSource(bs) - p := parser{ - downgradeToWarning: downgradeToWarning, +func Parse(bs []byte) (*List, []error) { + lines, errs := normalizeToUTF8Lines(bs) + p := &parser{ + input: lines, + inputLine: 0, } for _, err := range errs { p.addError(err) } - p.Parse(src) - if validate { - p.Validate() - } - return &p + ret := p.parseTopLevel() + return ret, p.errs } // parser is the state for a single PSL file parse. type parser struct { - // currentSection is the logical file section the parser is - // currently in. This is used to verify that StartSection and - // EndSection blocks are paired correctly, and may be nil when the - // parser is not currently within a logical section. - currentSection *StartSection - - // downgradeToWarning is a function that reports whether an error - // should be recorded as a non-fatal warning. See exceptions.go - // for the normal implementation. It's a struct field so that - // tests can replace the normal list of exceptions with something - // else for testing. - downgradeToWarning func(error) bool - - // File is the parser's output. - File -} - -// Parse parses src as a PSL file and returns the parse result. 
-func (p *parser) Parse(src Source) {
-	blankLine := func(line Source) bool { return line.Text() == "" }
-	blocks := src.split(blankLine)
-
-	for _, block := range blocks {
-		// Does this block have any non-comments in it? If so, it's a
-		// suffix block, otherwise it's a comment/section marker
-		// block.
-		notComment := func(line Source) bool { return !strings.HasPrefix(line.Text(), "//") }
-		comment, rest, hasSuffixes := block.cut(notComment)
-		if hasSuffixes {
-			p.processSuffixes(block, comment, rest)
-		} else {
-			p.processTopLevelComment(comment)
-		}
+	// input is the remaining unparsed and untokenized source text.
+	input []string
+	// inputLine is the offset for input[0]. That is, input[0] is line
+	// number inputLine of the source text.
+	inputLine int
+	// peekBuf is a buffer containing zero or one input tokens.
+	peekBuf any
+	// errs are the accumulated parse errors so far.
+	errs []error
+}
+
+// addError records err as a parse/validation error.
+func (p *parser) addError(err error) {
+	p.errs = append(p.errs, err)
+}
+
+// The following types and functions are the lexer portion of the
+// parsing logic. This is a very simplistic lexer, since
+// normalizeToUTF8Lines has already done a lot of heavy lifting to
+// clean up the input. Each line of input is converted to a token for
+// that line's content. The parser then assembles that stream of
+// tokens into multiline blocks, and eventually into a parse tree.
+
+const (
+	sectionStartPrefix = "// ===BEGIN "
+	sectionEndPrefix   = "// ===END "
+	sectionPrefix      = "// ==="
+	commentPrefix      = "// "
+	wildcardPrefix     = "*."
+	exceptionPrefix    = "!"
+)
+
+type line struct {
+	SourceRange
+	Text string
+}
+type tokenEOF struct{}
+type tokenBlank struct{ line }
+type tokenComment struct{ line }
+type tokenSectionUnknown struct{ line }
+type tokenSectionStart struct {
+	line
+	Name string
+}
+type tokenSectionEnd struct {
+	line
+	Name string
+}
+type tokenSuffix struct{ line }
+type tokenWildcard struct {
+	line
+	Suffix string
+}
+type tokenException struct {
+	line
+	Suffix string
+}
+
+// next lexes the next token of input and returns it.
+func (p *parser) next() (ret any) {
+	if p.peekBuf != nil {
+		ret := p.peekBuf
+		p.peekBuf = nil
+		return ret
 	}
-	// At EOF with an open section.
-	if p.currentSection != nil {
-		p.addError(UnclosedSectionError{
-			Start: p.currentSection,
-		})
+	if len(p.input) == 0 {
+		return tokenEOF{}
 	}
-}
 
-// processSuffixes parses a block that consists of domain suffixes and
-// a metadata header.
-func (p *parser) processSuffixes(block, header, rest Source) {
-	s := &Suffixes{
-		Source: block,
+	// No matter what, next is going to emit the next line of p.input;
+	// the rest of the function just determines what kind of token to
+	// return.
+	src := line{
+		SourceRange: SourceRange{p.inputLine, p.inputLine + 1},
+		Text:        p.input[0],
 	}
+	p.input = p.input[1:]
+	p.inputLine++
 
-	var metadataSrc []string
-	for _, line := range header.lineSources() {
-		// TODO: s.Header should be a single Source for the entire
-		// comment.
-		s.Header = append(s.Header, line)
-		if strings.HasPrefix(line.Text(), sectionMarkerPrefix) {
-			p.addError(SectionInSuffixBlock{line})
-		} else {
-			// Trim the comment prefix in two steps, because some PSL
-			// comments don't have whitepace between the // and the
-			// following text.
- metadataSrc = append(metadataSrc, strings.TrimSpace(strings.TrimPrefix(line.Text(), "//"))) + switch { + case src.Text == "": + return tokenBlank{src} + + case strings.HasPrefix(src.Text, sectionStartPrefix): + // To avoid repeated string processing in different portions + // of the parser code, the lexer tears apart section markers + // here to extract the section name. + name := strings.TrimPrefix(src.Text, sectionStartPrefix) + name, ok := strings.CutSuffix(name, "===") + if !ok { + return tokenSectionUnknown{src} + } + return tokenSectionStart{src, name} + case strings.HasPrefix(src.Text, sectionEndPrefix): + name := strings.TrimPrefix(src.Text, sectionEndPrefix) + name, ok := strings.CutSuffix(name, "===") + if !ok { + return tokenSectionUnknown{src} } + return tokenSectionEnd{src, name} + case strings.HasPrefix(src.Text, sectionPrefix): + return tokenSectionUnknown{src} + + case strings.HasPrefix(src.Text, commentPrefix): + // Similarly, the following do some light processing of the + // input so that this doesn't need to be repeated in several + // portions of the parser. + src.Text = strings.TrimPrefix(src.Text, "// ") + return tokenComment{src} + case strings.HasPrefix(src.Text, wildcardPrefix): + return tokenWildcard{src, strings.TrimPrefix(src.Text, wildcardPrefix)} + case strings.HasPrefix(src.Text, exceptionPrefix): + return tokenException{src, strings.TrimPrefix(src.Text, exceptionPrefix)} + + default: + return tokenSuffix{src} } +} + +// peek returns the next token of input, without consuming it. +func (p *parser) peek() any { + if p.peekBuf == nil { + p.peekBuf = p.next() + } + return p.peekBuf +} + +// The rest of this file is the parser itself. It follows the common +// recursive descent structure. + +// blockEmitter returns a function that appends blocks to a given +// output list, and also updates an output SourceRange to cover the +// superset of all emitted blocks. +// +// This is a helper to make the functions that parse intermediate AST +// nodes (which have to accumulate a list of children) more readable. +func blockEmitter(out *[]Block, srcRange *SourceRange) func(...Block) { - // rest consists of suffixes and possibly inline comments. - commentLine := func(line Source) bool { return strings.HasPrefix(line.Text(), "//") } - rest.forEachRun(commentLine, func(block Source, isComment bool) { - if isComment { - for _, line := range block.lineSources() { - if strings.HasPrefix(line.Text(), sectionMarkerPrefix) { - p.addError(SectionInSuffixBlock{line}) - } + return func(bs ...Block) { + for _, b := range bs { + if b == nil { + // Sub-parsers sometimes return nil to indicate the + // thing they tried to parse was bad and they have + // nothing to contribute to the output. + continue } - s.InlineComments = append(s.InlineComments, block) - } else { - // TODO: parse entries properly, for how we just - // accumulate them as individual Sources, one per suffix. - for _, entry := range block.lineSources() { - s.Entries = append(s.Entries, entry) + + *out = append(*out, b) + + if srcRange == nil { + continue + } else if *srcRange == (SourceRange{}) { + // Zero value, this is the first emitted block. + *srcRange = b.SrcRange() + } else { + *srcRange = (*srcRange).merge(b.SrcRange()) } } - }) - - enrichSuffixes(s, metadataSrc) - p.addBlock(s) + } } -const sectionMarkerPrefix = "// ===" +// parseTopLevel parses the top level of a PSL file. 
+func (p *parser) parseTopLevel() *List { + ret := &List{} + emit := blockEmitter(&ret.Blocks, nil) -// processTopLevelComment parses a block that has only comment lines, -// no suffixes. Some of those comments may be markers for the -// start/end of file sections. -func (p *parser) processTopLevelComment(block Source) { - sectionLine := func(line Source) bool { - return strings.HasPrefix(line.Text(), sectionMarkerPrefix) + for { + switch tok := p.peek().(type) { + case tokenEOF: + return ret + case tokenBlank: + emit(p.parseBlank()) + case tokenComment: + emit(p.parseCommentOrSuffixBlock()) + case tokenSectionStart: + emit(p.parseSection()) + case tokenSectionEnd: + p.addError(ErrUnstartedSection{tok.SourceRange, tok.Name}) + p.next() + case tokenSectionUnknown: + p.addError(ErrUnknownSectionMarker{tok.SourceRange}) + p.next() + case tokenSuffix, tokenWildcard, tokenException: + emit(p.parseSuffixBlock(nil)) + default: + panic("unhandled token") + } + } +} + +// parseSection parses the contents of a PSL file section. +func (p *parser) parseSection() *Section { + // Initialize with the start-of-section marker's data. + start := p.next().(tokenSectionStart) + ret := &Section{ + SourceRange: start.SourceRange, + Name: start.Name, } - block.forEachRun(sectionLine, func(block Source, isSectionLine bool) { - if isSectionLine { - for _, line := range block.lineSources() { - p.processSectionMarker(line) + emit := blockEmitter(&ret.Blocks, &ret.SourceRange) + + for { + switch tok := p.peek().(type) { + case tokenEOF: + p.addError(ErrUnclosedSection{ret}) + return ret + case tokenBlank: + emit(p.parseBlank()) + case tokenComment: + emit(p.parseCommentOrSuffixBlock()) + case tokenSectionStart: + // The PSL doesn't allow nested sections, so we pretend + // like the inner section never existed and grab all its + // blocks for ourselves. Still record an error for the + // nested section though. + inner := p.parseSection() + emit(inner.Blocks...) + p.addError(ErrNestedSection{inner.SourceRange, inner.Name, ret}) + case tokenSectionEnd: + p.next() + if tok.Name != ret.Name { + p.addError(ErrMismatchedSection{tok.SourceRange, tok.Name, ret}) } - } else { - p.addBlock(&Comment{block}) + ret.SourceRange.LastLine = tok.SourceRange.LastLine + return ret + case tokenSectionUnknown: + p.next() + p.addError(ErrUnknownSectionMarker{tok.SourceRange}) + case tokenSuffix, tokenWildcard, tokenException: + emit(p.parseSuffixBlock(nil)) + default: + panic("unhandled token") } - }) -} - -// processSectionMarker parses line as a file section marker, and -// enforces correct start/end pairing. -func (p *parser) processSectionMarker(line Source) { - // Trim here rather than in the caller, so that we still have the - // complete input line available to use in errors. - marker := strings.TrimPrefix(line.Text(), sectionMarkerPrefix) - - // Note hasTrailer gets used below to report an error if the - // trailing "===" is missing. We delay reporting the error so that - // if the entire line is invalid, we don't report both a - // whole-line error and also an unterminated marker error. - marker, hasTrailer := strings.CutSuffix(marker, "===") - - markerType, name, ok := strings.Cut(marker, " ") - if !ok { - // There are no spaces, markerType is the whole text between - // the ===. Clear it out, so that the switch below goes to the - // error case, otherwise "===BEGIN===" would be accepted as a - // no-name section start. 
- markerType = "" - } - - // No matter what, we're going to output something that needs to - // reference this line. - src := line - - switch markerType { - case "BEGIN": - start := &StartSection{ - Source: src, - Name: name, - } - if p.currentSection != nil { - // Nested sections aren't allowed. Note the error and - // continue parsing as if the previous section was closed - // correctly before this one started. - p.addError(NestedSectionError{ - Outer: p.currentSection, - Inner: start, - }) - } - if !hasTrailer { - p.addError(UnterminatedSectionMarker{src}) + } +} + +// parseCommentOrSuffixBlock parses a comment, then either returns it +// as a lone comment or chains into suffix block parsing, depending on +// what follows the comment. +// +// This is used to resolve an ambiguity in the PSL format when parsing +// linearly: if we see a comment, that could be a standalone comment, +// or it could be the beginning of a suffix block. In the latter case, +// it's very important to attach the comment to the suffix block, +// since it contains metadata about those suffixes. +func (p *parser) parseCommentOrSuffixBlock() Block { + comment := p.parseComment() + switch p.peek().(type) { + case tokenSuffix, tokenWildcard, tokenException: + return p.parseSuffixBlock(comment) + default: + return comment + } +} + +// parseSuffixBlock parses a suffix block, starting with the provided +// optional initial comment. +func (p *parser) parseSuffixBlock(initialComment *Comment) *Suffixes { + ret := &Suffixes{} + emit := blockEmitter(&ret.Blocks, &ret.SourceRange) + + if initialComment != nil { + emit(initialComment) + enrichSuffixes(ret, initialComment) + } + + for { + switch tok := p.peek().(type) { + case tokenBlank: + return ret + case tokenComment: + emit(p.parseComment()) + case tokenSectionUnknown: + p.next() + p.addError(ErrUnknownSectionMarker{tok.SourceRange}) + case tokenSectionStart: + p.next() + p.addError(ErrSectionInSuffixBlock{tok.SourceRange}) + case tokenSectionEnd: + p.next() + p.addError(ErrSectionInSuffixBlock{tok.SourceRange}) + case tokenSuffix: + emit(p.parseSuffix()) + case tokenWildcard: + emit(p.parseWildcard()) + case tokenException: + // Note we don't emit here, exceptions receive a list of + // existing blocks and attach the exception to the + // corresponding wildcard entry. + p.parseException(ret.Blocks) + case tokenEOF: + return ret + default: + panic("unhandled token") } - p.currentSection = start - p.addBlock(start) - case "END": - end := &EndSection{ - Source: src, - Name: name, + } +} + +// parseSuffix parses a basic public suffix entry (i.e. not a wildcard +// or an exception. +func (p *parser) parseSuffix() Block { + tok := p.next().(tokenSuffix) + + labels, err := parseDomainString(tok.Text) + if err != nil { + p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Text, err}) + return nil + } + + return &Suffix{ + SourceRange: tok.SourceRange, + Labels: labels, + } +} + +// parseWildcard parses a public suffix wildcard entry, of the form +// "*.example.com". +func (p *parser) parseWildcard() Block { + tok := p.next().(tokenWildcard) + + labels, err := parseDomainString(tok.Suffix) + if err != nil { + p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) + return nil + } + + return &Wildcard{ + SourceRange: tok.SourceRange, + Labels: labels, + } +} + +// parseException parses a public suffix wildcard exception, of the +// form "!foo.example.com". The parsed exception is attached to the +// related Wildcard block in previous. 
If no such block exists, the +// exception is dropped and a parse error recorded. +func (p *parser) parseException(previous []Block) { + tok := p.next().(tokenException) + + labels, err := parseDomainString(tok.Suffix) + if err != nil { + p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, err}) + return + } + + for _, block := range previous { + w, ok := block.(*Wildcard) + if !ok { + continue } - if p.currentSection == nil { - // Rogue end marker. Note and continue parsing as if this - // section name was correctly opened earlier. - p.addError(UnstartedSectionError{ - End: end, - }) - } else if p.currentSection.Name != name { - // Mismatched start/end. - p.addError(MismatchedSectionError{ - Start: p.currentSection, - End: end, - }) + + if len(labels) == len(w.Labels)+1 && slices.Equal(labels[1:], w.Labels) { + w.Exceptions = append(w.Exceptions, labels[0]) + return } - if !hasTrailer { - p.addError(UnterminatedSectionMarker{src}) + } + p.addError(ErrInvalidSuffix{tok.SourceRange, tok.Suffix, fmt.Errorf("exception %q does not match any wildcard", tok.Suffix)}) +} + +// parseComment parses a multiline comment block. +func (p *parser) parseComment() *Comment { + tok := p.next().(tokenComment) + ret := &Comment{ + SourceRange: tok.SourceRange, + Text: []string{tok.Text}, + } + for { + if tok, ok := p.peek().(tokenComment); ok { + p.next() + ret.SourceRange = ret.SourceRange.merge(tok.SourceRange) + ret.Text = append(ret.Text, tok.Text) + } else { + return ret } - p.currentSection = nil - p.addBlock(end) - default: - // Unknown kind of marker - // - // We want all non-whitespace bytes to be present in the - // parsed output somewhere, so record this malformed line as a - // Comment. Top-level comments are just freeform text, which - // is technically correct here since this isn't a valid - // section marker. - p.addError(UnknownSectionMarker{src}) - p.addBlock(&Comment{src}) } } -// addBlock adds b to p.File.Blocks. -func (p *parser) addBlock(b Block) { - p.File.Blocks = append(p.File.Blocks, b) +// parseBlank parses a run of empty lines. +func (p *parser) parseBlank() Block { + tok := p.next().(tokenBlank) + ret := &Blank{tok.SourceRange} + for { + if tok, ok := p.peek().(tokenBlank); ok { + p.next() + ret.SourceRange = ret.SourceRange.merge(tok.SourceRange) + } else { + return ret + } + } } -// addError records err as a parse/validation error. -// -// If err matches a legacy exemption from current validation rules, -// err is recorded as a non-fatal warning instead. -func (p *parser) addError(err error) { - if p.downgradeToWarning(err) { - p.File.Warnings = append(p.File.Warnings, err) - } else { - p.File.Errors = append(p.File.Errors, err) +// parseDomainString parses a DNS domain string into its component +// labels, validated and normalized to IDNA ascii representation. +func parseDomainString(domain string) (labels []string, err error) { + cleaned, err := idna.Registration.ToUnicode(domain) + if err != nil { + return nil, err + } else if cleaned != domain { + return nil, fmt.Errorf("not in canonical form, should be %q", cleaned) } + + // TODO: the parse tree normalizes to the ASCII (aka punycode) + // representation. Should it normalize to the unicode + // representation instead, to keep parity with the policy of the + // source text? 
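+	//
+	// Editor's illustration, not part of the original patch: for the
+	// input "bücher.example", ToUnicode above returns the string
+	// unchanged, so the canonical-form check passes; ToASCII below
+	// then yields "xn--bcher-kva.example" and the function returns
+	// ["xn--bcher-kva", "example"]. The already-punycode input
+	// "xn--bcher-kva.example" instead fails the check above, because
+	// its canonical Unicode form differs from the input.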
+	puny, err := idna.Registration.ToASCII(cleaned)
+	if err != nil {
+		panic("punycode translation error on canonical unicode value")
+	}
+
+	return strings.Split(puny, "."), nil
 }
diff --git a/tools/internal/parser/parser_test.go b/tools/internal/parser/parser_test.go
index 207105537..864427cda 100644
--- a/tools/internal/parser/parser_test.go
+++ b/tools/internal/parser/parser_test.go
@@ -1,16 +1,11 @@
 package parser

 import (
-	"bytes"
-	"cmp"
 	"net/mail"
 	"net/url"
 	"os"
-	"slices"
 	"strings"
 	"testing"
-
-	diff "github.com/google/go-cmp/cmp"
 )

 // TestParser runs a battery of synthetic parse and validation tests.
@@ -29,12 +24,13 @@ func TestParser(t *testing.T) {
 		name               string
 		psl                []byte
 		downgradeToWarning func(error) bool
-		want               File
+		want               *List
+		wantErrs           []error
 	}{
 		{
 			name: "empty",
 			psl:  byteLines(""),
-			want: File{},
+			want: list(),
 		},

 		{
 			psl: byteLines(
 				"// This is an empty PSL file.",
 				"",
 				"// Here is a second comment.",
 			),
-			want: File{
-				Blocks: []Block{
-					&Comment{Source: mkSrc(0, "// This is an empty PSL file.")},
-					&Comment{Source: mkSrc(2, "// Here is a second comment.")},
-				},
-			},
+			want: list(
+				comment(0, "This is an empty PSL file."),
+				blank(1, 2),
+				comment(2, "Here is a second comment."),
+			),
 		},

 		{
 			name: "just_suffixes_in_block",
 			psl: byteLines(
+				"// ===BEGIN PRIVATE DOMAINS===",
+				"",
 				"example.com",
 				"other.example.com",
 				"*.example.org",
+				"",
+				"// ===END PRIVATE DOMAINS===",
+			),
+			want: list(
+				section(0, 7, "PRIVATE DOMAINS",
+					blank(1, 2),
+					suffixes(2, 5, "", "", "",
+						suffix(2, "example.com"),
+						suffix(3, "other.example.com"),
+						wildcard(4, 5, "example.org"),
+					),
+					blank(5, 6),
+				),
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0, "example.com", "other.example.com", "*.example.org"),
-						Entries: []Source{
-							mkSrc(0, "example.com"),
-							mkSrc(1, "other.example.com"),
-							mkSrc(2, "*.example.org"),
-						},
-					},
-				},
-				Errors: []error{
-					MissingEntityName{
-						Suffixes: &Suffixes{
-							Source: mkSrc(0, "example.com", "other.example.com", "*.example.org"),
-							Entries: []Source{
-								mkSrc(0, "example.com"),
-								mkSrc(1, "other.example.com"),
-								mkSrc(2, "*.example.org"),
-							},
-						},
-					},
-				},
-			},
 		},

 		{
 			psl: byteLines(
 				"// ===BEGIN IMAGINARY DOMAINS===",
 				"",
 				"// ===END IMAGINARY DOMAINS===",
 				"// ===BEGIN FAKE DOMAINS===",
 				"// ===END FAKE DOMAINS===",
 			),
-			want: File{
-				Blocks: []Block{
-					&StartSection{
-						Source: mkSrc(0, "// ===BEGIN IMAGINARY DOMAINS==="),
-						Name:   "IMAGINARY DOMAINS",
-					},
-					&EndSection{
-						Source: mkSrc(2, "// ===END IMAGINARY DOMAINS==="),
-						Name:   "IMAGINARY DOMAINS",
-					},
-					&StartSection{
-						Source: mkSrc(3, "// ===BEGIN FAKE DOMAINS==="),
-						Name:   "FAKE DOMAINS",
-					},
-					&EndSection{
-						Source: mkSrc(4, "// ===END FAKE DOMAINS==="),
-						Name:   "FAKE DOMAINS",
-					},
-				},
-			},
+			want: list(
+				section(0, 3, "IMAGINARY DOMAINS",
+					blank(1, 2),
+				),
+				section(3, 5, "FAKE DOMAINS"),
+			),
 		},

 		{
 			psl: byteLines(
 				"// ===BEGIN ICANN DOMAINS===",
 			),
-			want: File{
-				Blocks: []Block{
-					&StartSection{
-						Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="),
-						Name:   "ICANN DOMAINS",
-					},
-				},
-				Errors: []error{
-					UnclosedSectionError{
-						Start: &StartSection{
-							Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="),
-							Name:   "ICANN DOMAINS",
-						},
-					},
-				},
+			want: list(
+				section(0, 1, "ICANN DOMAINS"),
+			),
+			wantErrs: []error{
+				ErrUnclosedSection{section(0, 1, "ICANN DOMAINS")},
 			},
 		},

 		{
 			psl: byteLines(
 				"// ===BEGIN ICANN DOMAINS===",
 				"// ===BEGIN SECRET DOMAINS===",
 				"// ===END SECRET DOMAINS===",
 				"// ===END ICANN DOMAINS===",
 			),
-			want: File{
-
Blocks: []Block{ - &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - &StartSection{ - Source: mkSrc(1, "// ===BEGIN SECRET DOMAINS==="), - Name: "SECRET DOMAINS", - }, - &EndSection{ - Source: mkSrc(2, "// ===END SECRET DOMAINS==="), - Name: "SECRET DOMAINS", - }, - &EndSection{ - Source: mkSrc(3, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - Errors: []error{ - NestedSectionError{ - Outer: &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - Inner: &StartSection{ - Source: mkSrc(1, "// ===BEGIN SECRET DOMAINS==="), - Name: "SECRET DOMAINS", - }, - }, - UnstartedSectionError{ - &EndSection{ - Source: mkSrc(3, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - }, - }, - }, - { - name: "mismatched_sections", - psl: byteLines( - "// ===BEGIN ICANN DOMAINS===", - "", - "// ===END PRIVATE DOMAINS===", + want: list( + section(0, 4, "ICANN DOMAINS"), ), - want: File{ - Blocks: []Block{ - &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - &EndSection{ - Source: mkSrc(2, "// ===END PRIVATE DOMAINS==="), - Name: "PRIVATE DOMAINS", - }, - }, - Errors: []error{ - MismatchedSectionError{ - Start: &StartSection{ - Source: mkSrc(0, "// ===BEGIN ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - End: &EndSection{ - Source: mkSrc(2, "// ===END PRIVATE DOMAINS==="), - Name: "PRIVATE DOMAINS", - }, - }, + + wantErrs: []error{ + ErrNestedSection{ + SourceRange: mkSrc(1, 3), + Name: "SECRET DOMAINS", + Section: section(0, 4, "ICANN DOMAINS"), }, }, }, @@ -224,22 +127,14 @@ func TestParser(t *testing.T) { psl: byteLines( "// ===TRANSFORM DOMAINS===", ), - want: File{ - Blocks: []Block{ - &Comment{ - Source: mkSrc(0, "// ===TRANSFORM DOMAINS==="), - }, - }, - Errors: []error{ - UnknownSectionMarker{ - Line: mkSrc(0, "// ===TRANSFORM DOMAINS==="), - }, - }, + want: list(), + wantErrs: []error{ + ErrUnknownSectionMarker{mkSrc(0, 1)}, }, }, { - name: "suffixes_with_section_markers_in_header", + name: "suffixes_with_section_marker_in_header", psl: byteLines( "// Just some suffixes", "// ===BEGIN ICANN DOMAINS===", @@ -248,105 +143,45 @@ func TestParser(t *testing.T) { "", "// ===END ICANN DOMAINS===", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// Just some suffixes", - "// ===BEGIN ICANN DOMAINS===", - "com", - "org", - ), - Header: []Source{ - mkSrc(0, "// Just some suffixes"), - mkSrc(1, "// ===BEGIN ICANN DOMAINS==="), - }, - Entries: []Source{ - mkSrc(2, "com"), - mkSrc(3, "org"), - }, - Entity: "Just some suffixes", - }, - &EndSection{ - Source: mkSrc(5, "// ===END ICANN DOMAINS==="), - Name: "ICANN DOMAINS", - }, - }, - Errors: []error{ - SectionInSuffixBlock{ - Line: mkSrc(1, "// ===BEGIN ICANN DOMAINS==="), - }, - // Note: trying to gracefully parse the - // StartSection would require splitting the suffix - // block in two, which would need more code and - // also result in additional spurious validation - // errors. Instead this tests that section markers - // within suffix blocks are ignored for section - // validation. 
-					UnstartedSectionError{
-						End: &EndSection{
-							Source: mkSrc(5, "// ===END ICANN DOMAINS==="),
-							Name:   "ICANN DOMAINS",
-						},
-					},
-				},
-			},
+			want: list(
+				comment(0, "Just some suffixes"),
+				section(1, 6, "ICANN DOMAINS",
+					suffixes(2, 4, "", "", "",
+						suffix(2, "com"),
+						suffix(3, "org"),
+					),
+					blank(4, 5),
+				),
+			),
 		},

 		{
 			name: "suffixes_with_section_markers_inline",
 			psl: byteLines(
+				"// ===BEGIN ICANN DOMAINS===",
 				"// Just some suffixes",
 				"com",
-				"// ===BEGIN ICANN DOMAINS===",
+				"// ===BEGIN OTHER DOMAINS===",
 				"org",
+				"// ===END OTHER DOMAINS===",
+				"net",
 				"",
 				"// ===END ICANN DOMAINS===",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// Just some suffixes",
-							"com",
-							"// ===BEGIN ICANN DOMAINS===",
-							"org",
-						),
-						Header: []Source{
-							mkSrc(0, "// Just some suffixes"),
-						},
-						Entries: []Source{
-							mkSrc(1, "com"),
-							mkSrc(3, "org"),
-						},
-						InlineComments: []Source{
-							mkSrc(2, "// ===BEGIN ICANN DOMAINS==="),
-						},
-						Entity: "Just some suffixes",
-					},
-					&EndSection{
-						Source: mkSrc(5, "// ===END ICANN DOMAINS==="),
-						Name:   "ICANN DOMAINS",
-					},
-				},
-				Errors: []error{
-					SectionInSuffixBlock{
-						Line: mkSrc(2, "// ===BEGIN ICANN DOMAINS==="),
-					},
-					// Note: trying to gracefully parse the
-					// StartSection would require splitting the suffix
-					// block in two, which would need more code and
-					// also result in additional spurious validation
-					// errors. Instead this tests that section markers
-					// within suffix blocks are ignored for section
-					// validation.
-					UnstartedSectionError{
-						End: &EndSection{
-							Source: mkSrc(5, "// ===END ICANN DOMAINS==="),
-							Name:   "ICANN DOMAINS",
-						},
-					},
-				},
+			want: list(
+				section(0, 9, "ICANN DOMAINS",
+					suffixes(1, 7, "Just some suffixes", "", "",
+						comment(1, "Just some suffixes"),
+						suffix(2, "com"),
+						suffix(4, "org"),
+						suffix(6, "net"),
+					),
+					blank(7, 8),
+				),
+			),
+			wantErrs: []error{
+				ErrSectionInSuffixBlock{mkSrc(3, 4)},
+				ErrSectionInSuffixBlock{mkSrc(5, 6)},
 			},
 		},

 		{
 			psl: byteLines(
 				"// Unstructured header.",
 				"// I'm just going on about random things.",
 				"example.com",
 				"example.org",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// Unstructured header.",
-							"// I'm just going on about random things.",
-							"example.com",
-							"example.org",
-						),
-						Header: []Source{
-							mkSrc(0, "// Unstructured header."),
-							mkSrc(1, "// I'm just going on about random things."),
-						},
-						Entries: []Source{
-							mkSrc(2, "example.com"),
-							mkSrc(3, "example.org"),
-						},
-						Entity: "Unstructured header.",
-					},
-				},
-			},
+			want: list(
+				suffixes(0, 4, "Unstructured header.", "", "",
+					comment(0, "Unstructured header.", "I'm just going on about random things."),
+					suffix(2, "example.com"),
+					suffix(3, "example.org"),
+				),
+			),
 		},

 		{
 			psl: byteLines(
 				"// DuckCorp Inc: https://example.com",
 				"// Submitted by Not A Duck <duck@example.com>",
 				"// Seriously, not a duck",
 				"example.com",
 				"example.org",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// DuckCorp Inc: https://example.com",
-							"// Submitted by Not A Duck <duck@example.com>",
-							"// Seriously, not a duck",
-							"example.com",
-							"example.org",
-						),
-						Header: []Source{
-							mkSrc(0, "// DuckCorp Inc: https://example.com"),
-							mkSrc(1, "// Submitted by Not A Duck <duck@example.com>"),
-							mkSrc(2, "// Seriously, not a duck"),
-						},
-						Entries: []Source{
-							mkSrc(3, "example.com"),
-							mkSrc(4, "example.org"),
-						},
-						Entity:    "DuckCorp Inc",
-						URL:       mustURL("https://example.com"),
-						Submitter: mustEmail("Not A Duck <duck@example.com>"),
-					},
-				},
-			},
+			want: list(
+				suffixes(0, 5,
+					"DuckCorp Inc",
+					"https://example.com",
+					`"Not A Duck" <duck@example.com>`,
+					comment(0, "DuckCorp Inc: https://example.com", "Submitted by Not A Duck <duck@example.com>",
+						"Seriously, not a duck"),
+					suffix(3, "example.com"),
+					suffix(4, "example.org"),
+				),
+			),
 		},

 		{
 			psl: byteLines(
 				"// DuckCorp Inc: submitted by Not A Duck <duck@example.com>",
 				"example.com",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// DuckCorp Inc: submitted by Not A Duck <duck@example.com>",
-							"example.com",
-						),
-						Header: []Source{
-							mkSrc(0, "// DuckCorp Inc: submitted by Not A Duck <duck@example.com>"),
-						},
-						Entries: []Source{
-							mkSrc(1, "example.com"),
-						},
-						Entity:    "DuckCorp Inc",
-						Submitter: mustEmail("Not A Duck <duck@example.com>"),
-					},
-				},
-			},
+			want: list(
+				suffixes(0, 2,
+					"DuckCorp Inc",
+					"",
+					`"Not A Duck" <duck@example.com>`,
+					comment(0, "DuckCorp Inc: submitted by Not A Duck <duck@example.com>"),
+					suffix(1, "example.com"),
+				),
+			),
 		},

 		{
 			psl: byteLines(
 				"// DuckCorp Inc",
 				"// https://example.com",
 				"// Submitted by Not A Duck <duck@example.com>",
 				"example.com",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// DuckCorp Inc",
-							"// https://example.com",
-							"// Submitted by Not A Duck <duck@example.com>",
-							"example.com",
-						),
-						Header: []Source{
-							mkSrc(0, "// DuckCorp Inc"),
-							mkSrc(1, "// https://example.com"),
-							mkSrc(2, "// Submitted by Not A Duck <duck@example.com>"),
-						},
-						Entries: []Source{
-							mkSrc(3, "example.com"),
-						},
-						Entity:    "DuckCorp Inc",
-						URL:       mustURL("https://example.com"),
-						Submitter: mustEmail("Not A Duck <duck@example.com>"),
-					},
-				},
-			},
+			want: list(
+				suffixes(0, 4,
+					"DuckCorp Inc",
+					"https://example.com",
+					`"Not A Duck" <duck@example.com>`,
+					comment(0, "DuckCorp Inc", "https://example.com", `Submitted by Not A Duck <duck@example.com>`),
+					suffix(3, "example.com"),
+				),
+			),
 		},

 		{
 			psl: byteLines(
 				"// Submitted by Not A Duck <duck@example.com>",
 				"// DuckCorp Inc: https://example.com",
 				"example.com",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// Submitted by Not A Duck <duck@example.com>",
-							"// DuckCorp Inc: https://example.com",
-							"example.com",
-						),
-						Header: []Source{
-							mkSrc(0, "// Submitted by Not A Duck <duck@example.com>"),
-							mkSrc(1, "// DuckCorp Inc: https://example.com"),
-						},
-						Entries: []Source{
-							mkSrc(2, "example.com"),
-						},
-						Entity:    "DuckCorp Inc",
-						URL:       mustURL("https://example.com"),
-						Submitter: mustEmail("Not A Duck <duck@example.com>"),
-					},
-				},
-			},
+			want: list(
+				suffixes(0, 3,
+					"DuckCorp Inc",
+					"https://example.com",
+					`"Not A Duck" <duck@example.com>`,
+					comment(0,
+						"Submitted by Not A Duck <duck@example.com>",
+						"DuckCorp Inc: https://example.com"),
+					suffix(2, "example.com"),
+				),
+			),
 		},

 		{
 			psl: byteLines(
 				"// This is an unstructured comment.",
 				"// DuckCorp Inc: https://example.com",
 				"// Submitted by Not A Duck <duck@example.com>",
 				"example.com",
 			),
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// This is an unstructured comment.",
-							"// DuckCorp Inc: https://example.com",
-							"// Submitted by Not A Duck <duck@example.com>",
-							"example.com",
-						),
-						Header: []Source{
-							mkSrc(0, "// This is an unstructured comment."),
-							mkSrc(1, "// DuckCorp Inc: https://example.com"),
-							mkSrc(2, "// Submitted by Not A Duck <duck@example.com>"),
-						},
-						Entries: []Source{
-							mkSrc(3, "example.com"),
-						},
-						Entity:    "DuckCorp Inc",
-						URL:       mustURL("https://example.com"),
-						Submitter: mustEmail("Not A Duck <duck@example.com>"),
-					},
-				},
-			},
-		},
-
-		{
-			name: "legacy_error_downgrade",
-			psl: byteLines(
-				"// https://example.com",
-				"example.com",
+			want: list(
+				suffixes(0, 4,
+					"DuckCorp Inc",
+					"https://example.com",
+					`"Not A Duck" <duck@example.com>`,
+					comment(0, "This is an unstructured comment.",
+						"DuckCorp Inc: https://example.com",
+						"Submitted by Not A Duck <duck@example.com>"),
+					suffix(3, "example.com"),
+				),
 			),
-			downgradeToWarning: func(e error) bool {
-				return true
-			},
-			want: File{
-				Blocks: []Block{
-					&Suffixes{
-						Source: mkSrc(0,
-							"// https://example.com",
-							"example.com",
-						),
-						Header: []Source{
-							mkSrc(0, "// https://example.com"),
-						},
-
Entries: []Source{ - mkSrc(1, "example.com"), - }, - URL: mustURL("https://example.com"), - }, - }, - Warnings: []error{ - MissingEntityName{ - Suffixes: &Suffixes{ - Source: mkSrc(0, - "// https://example.com", - "example.com", - ), - Header: []Source{ - mkSrc(0, "// https://example.com"), - }, - Entries: []Source{ - mkSrc(1, "example.com"), - }, - URL: mustURL("https://example.com"), - }, - }, - }, - }, }, { @@ -592,21 +309,12 @@ func TestParser(t *testing.T) { "// Parens Appreciation Society (https://example.org)", "example.com", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, "// Parens Appreciation Society (https://example.org)", "example.com"), - Header: []Source{ - mkSrc(0, "// Parens Appreciation Society (https://example.org)"), - }, - Entries: []Source{ - mkSrc(1, "example.com"), - }, - Entity: "Parens Appreciation Society", - URL: mustURL("https://example.org"), - }, - }, - }, + want: list( + suffixes(0, 2, "Parens Appreciation Society", "https://example.org", "", + comment(0, "Parens Appreciation Society (https://example.org)"), + suffix(1, "example.com"), + ), + ), }, { @@ -621,67 +329,28 @@ func TestParser(t *testing.T) { "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", "cd", ), - want: File{ - Blocks: []Block{ - &Suffixes{ - Source: mkSrc(0, - "// cd : https://en.wikipedia.org/wiki/.cd", - "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1", - "cd", - ), - Header: []Source{ - mkSrc(0, "// cd : https://en.wikipedia.org/wiki/.cd"), - mkSrc(1, "// see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"), - }, - Entries: []Source{ - mkSrc(2, "cd"), - }, - Entity: "cd", - URL: mustURL("https://en.wikipedia.org/wiki/.cd"), - }, - }, - }, + want: list( + suffixes(0, 3, "cd", "https://en.wikipedia.org/wiki/.cd", "", + comment(0, "cd : https://en.wikipedia.org/wiki/.cd", + "see also: https://www.nic.cd/domain/insertDomain_2.jsp?act=1"), + suffix(2, "cd"), + ), + ), }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - exc := test.downgradeToWarning - if exc == nil { - // use real exceptions if the test doesn't provide something else - exc = downgradeToWarning - } - got := parseWithExceptions(test.psl, exc, true).File + got, errs := Parse(test.psl) checkDiff(t, "parse result", got, test.want) + checkDiff(t, "parse errors", errs, test.wantErrs) }) } } -// mustURL returns the given string as a URL, or panics if not a URL. -func mustURL(s string) *url.URL { - u, err := url.Parse(s) - if err != nil { - panic(err) - } - return u -} - -// mustEmail returns the given string as an RFC 5322 address, or -// panics if the parse fails. -func mustEmail(s string) *mail.Address { - a, err := mail.ParseAddress(s) - if err != nil { - panic(err) - } - return a -} - -// mkSrc returns a Source with the given start, end, and dedented text. -func mkSrc(start int, lines ...string) Source { - return Source{ - lineOffset: start, - lines: lines, - } +// mkSrc returns a SourceRange with the given start and end. 
+func mkSrc(start, end int) SourceRange { + return SourceRange{start, end} } // TestParseRealList checks that the real public suffix list can parse @@ -692,133 +361,81 @@ func TestParseRealList(t *testing.T) { t.Fatal(err) } - f := Parse(bs) + _, errs := Parse(bs) - for _, err := range f.Errors { + for _, err := range errs { t.Errorf("Parse error: %v", err) } } -// TestRoundtripRealList checks that concatenating the source text of -// all top-level blocks, with appropriate additional blank lines, -// exactly reproduces the source text that was parsed. Effectively, -// this is a "prove that the parser didn't discard any bytes" check. -func TestRoundtripRealList(t *testing.T) { - bs, err := os.ReadFile("../../../public_suffix_list.dat") - if err != nil { - t.Fatal(err) - } - f := Parse(bs) - - if len(f.Errors) > 0 { - t.Fatal("Parse errors, not attempting to roundtrip") +func list(blocks ...Block) *List { + return &List{ + Blocks: blocks, } +} - prevLine := 0 - var rebuilt bytes.Buffer - for _, block := range f.Blocks { - src := block.source() - if src.lineOffset < prevLine { - t.Fatalf("ordering error: previous block ended at %d but this block starts at %d:\n%s", prevLine, src.lineOffset, src.Text()) - } - for prevLine < src.lineOffset { - rebuilt.WriteByte('\n') - prevLine++ - } - rebuilt.WriteString(src.Text()) - rebuilt.WriteByte('\n') - prevLine = src.lineOffset + len(src.lines) +func blank(start, end int) *Blank { + return &Blank{ + SourceRange: mkSrc(start, end), } +} - got := strings.Split(strings.TrimSpace(rebuilt.String()), "\n") - want := strings.Split(strings.TrimSpace(string(bs)), "\n") - - if diff := diff.Diff(want, got); diff != "" { - t.Errorf("roundtrip failed (-want +got):\n%s", diff) +func comment(start int, lines ...string) *Comment { + return &Comment{ + SourceRange: mkSrc(start, start+len(lines)), + Text: lines, } } -// TestRoundtripRealListDetailed is like the prior round-tripping -// test, but Suffix blocks are written out using their -// Header/Entries/InlineComments fields, again as proof that no suffix -// block elements were lost during parsing. 
-func TestRoundtripRealListDetailed(t *testing.T) { - bs, err := os.ReadFile("../../../public_suffix_list.dat") - if err != nil { - t.Fatal(err) +func section(start, end int, name string, blocks ...Block) *Section { + if len(blocks) == 0 { + return &Section{ + SourceRange: mkSrc(start, end), + Name: name, + } } - f := Parse(bs) - if len(f.Errors) > 0 { - t.Fatal("Parse errors, not attempting to roundtrip") + return &Section{ + SourceRange: mkSrc(start, end), + Name: name, + Blocks: blocks, } +} - prevLine := 0 - var rebuilt bytes.Buffer - for _, block := range f.Blocks { - srcs := []Source{block.source()} - if v, ok := block.(*Suffixes); ok { - srcs = []Source{} - for _, h := range v.Header { - srcs = append(srcs, h) - } - for _, e := range v.Entries { - srcs = append(srcs, e) - } - for _, c := range v.InlineComments { - srcs = append(srcs, c) - } - slices.SortFunc(srcs, func(a, b Source) int { - return cmp.Compare(a.lineOffset, b.lineOffset) - }) - } - - for _, src := range srcs { - if src.lineOffset < prevLine { - t.Fatalf("ordering error: previous block ended at %d but this block starts at %d:\n%s", prevLine, src.lineOffset, src.Text()) - } - for prevLine < src.lineOffset { - rebuilt.WriteByte('\n') - prevLine++ - } - rebuilt.WriteString(src.Text()) - rebuilt.WriteByte('\n') - prevLine = src.lineOffset + len(src.lines) +func suffixes(start, end int, entity string, urlStr string, email string, blocks ...Block) *Suffixes { + ret := &Suffixes{ + SourceRange: mkSrc(start, end), + Entity: entity, + Blocks: blocks, + } + if urlStr != "" { + u, err := url.Parse(urlStr) + if err != nil { + panic(err) } + ret.URL = u } - - got := strings.Split(strings.TrimSpace(rebuilt.String()), "\n") - want := strings.Split(strings.TrimSpace(string(bs)), "\n") - - if diff := diff.Diff(want, got); diff != "" { - t.Errorf("roundtrip failed (-want +got):\n%s", diff) + if email != "" { + e, err := mail.ParseAddress(email) + if err != nil { + panic(err) + } + ret.Submitter = e } + return ret } -// TestExceptionsStillNecessary checks that all the exceptions in -// exeptions.go are still needed to parse the PSL without errors. -func TestExceptionsStillNecessary(t *testing.T) { - bs, err := os.ReadFile("../../../public_suffix_list.dat") - if err != nil { - t.Fatal(err) +func suffix(line int, domain string) *Suffix { + return &Suffix{ + SourceRange: mkSrc(line, line+1), + Labels: strings.Split(domain, "."), } - - forEachOmitted(missingEmail, func(omitted string, trimmed []string) { - old := missingEmail - defer func() { missingEmail = old }() - missingEmail = trimmed - - f := Parse(bs) - if len(f.Errors) == 0 { - t.Errorf("missingEmail exception no longer necessary:\n%s", omitted) - } - }) } -func forEachOmitted(exceptions []string, fn func(string, []string)) { - for i := range exceptions { - next := append([]string(nil), exceptions[:i]...) - next = append(next, exceptions[i+1:]...) - fn(exceptions[i], next) +func wildcard(start, end int, base string, exceptions ...string) *Wildcard { + return &Wildcard{ + SourceRange: mkSrc(start, end), + Labels: strings.Split(base, "."), + Exceptions: exceptions, } } diff --git a/tools/internal/parser/text.go b/tools/internal/parser/text.go index 42644a821..4ef48cd66 100644 --- a/tools/internal/parser/text.go +++ b/tools/internal/parser/text.go @@ -11,158 +11,52 @@ import ( xunicode "golang.org/x/text/encoding/unicode" ) -// Source is a piece of source text with location information. 
-//
-// A Source is effectively a slice of the input file's lines, with
-// some extra information attached. As such, the start/end indexes
-// behave the same as in Go slices, and select the half-open interval
-// [start:end).
-type Source struct {
-	// The lines of source text, sanitized to valid UTF-8 and with
-	// leading and trailing whitespace removed.
-	lines []string
-	// lineOffset is how many lines are before the beginning of lines,
-	// for sources that represent a subset of the input.
-	lineOffset int
-}
-
-// newSource returns a source for bs, along with a preliminary set of
-// input validation errors.
-//
-// source always returns a usable, non-nil result, even when it
-// returns errors.
-func newSource(bs []byte) (Source, []error) {
-	lines, errs := normalizeToUTF8Lines(bs)
-
-	ret := Source{
-		lines:      lines,
-		lineOffset: 0,
-	}
-
-	return ret, errs
+// SourceRange describes a slice of lines from an unparsed source
+// file. FirstLine and LastLine behave like normal slice offsets,
+// i.e. they represent the half-open range [FirstLine:LastLine).
+type SourceRange struct {
+	FirstLine int
+	LastLine  int
 }

-// Text returns the source text of s as a string.
-func (s Source) Text() string {
-	if len(s.lines) == 1 {
-		return s.lines[0]
-	}
-	return strings.Join(s.lines, "\n")
-}
-
-// LocationString returns a short string describing the source
-// location.
-func (s Source) LocationString() string {
-	// For printing diagnostics, 0-indexed [start:end) is confusing
-	// and not how editors present text to people. Adjust the offsets
-	// to be 1-indexed [start:end] instead.
-	start := s.lineOffset + 1
-	end := s.lineOffset + len(s.lines)
-
-	if end < start {
-		// Zero line Source. We can sometimes produce these internally
-		// during parsing, but they should not escape outside the
-		// package. We still print them gracefully instead of
-		// panicking, because it's useful for debugging the parser.
-		return fmt.Sprintf("<invalid Source, line %d>", start)
-	}
-
-	if start == end {
-		return fmt.Sprintf("line %d", start)
-	}
-	return fmt.Sprintf("lines %d-%d", start, end)
-}
-
-// slice returns the slice of s between startLine and endLine.
-//
-// startLine and endLine behave like normal slice offsets, i.e. they
-// represent the half-open range [startLine:endLine).
-func (s Source) slice(startLine, endLine int) Source {
-	if startLine < 0 || startLine > len(s.lines) || endLine < startLine || endLine > len(s.lines) {
-		panic("invalid input to slice")
-	}
-	return Source{
-		lines:      s.lines[startLine:endLine],
-		lineOffset: s.lineOffset + startLine,
+// NumLines returns the number of source lines described by
+// SourceRange.
+func (s SourceRange) NumLines() int {
+	if s.FirstLine >= s.LastLine {
+		return 0
 	}
+	return s.LastLine - s.FirstLine
 }

-// line returns the nth line of s.
-func (s Source) line(n int) Source {
-	return s.slice(n, n+1)
-}
-
-// lineSources slices s into one Source per line.
-func (s Source) lineSources() []Source {
-	if len(s.lines) == 1 {
-		return []Source{s}
-	}
-
-	ret := make([]Source, len(s.lines))
-	for i := range s.lines {
-		ret[i] = s.slice(i, i+1)
+// LocationString prints a human-readable description of the
+// SourceRange.
+func (s SourceRange) LocationString() string {
+	switch {
+	case s.LastLine <= s.FirstLine:
+		return "<invalid SourceRange>"
+	case s.LastLine == s.FirstLine+1:
+		return fmt.Sprintf("line %d", s.FirstLine+1)
+	default:
+		return fmt.Sprintf("lines %d-%d", s.FirstLine+1, s.LastLine)
 	}
-	return ret
 }

-// cut slices s at the first cut line, as determined by cutHere. It
-// returns two Source blocks: the part of s before the cut line, and
-// the rest of s including the cut line. The found result reports
-// whether a cut was found. If s does not contain a cut line, cut
-// returns s, an empty Source, and false.
-func (s Source) cut(cutHere func(Source) bool) (before Source, rest Source, found bool) {
-	for i := range s.lines {
-		if cutHere(s.line(i)) {
-			return s.slice(0, i), s.slice(i, len(s.lines)), true
-		}
+// merge returns a SourceRange that contains both s and other. If s
+// and other are not contiguous or overlapping, the returned
+// SourceRange also spans unrelated lines, but always covers both s
+// and other.
+func (s SourceRange) merge(other SourceRange) SourceRange {
+	return SourceRange{
+		FirstLine: min(s.FirstLine, other.FirstLine),
+		LastLine:  max(s.LastLine, other.LastLine),
 	}
-	return s, Source{}, false
 }

-// split slices s into all sub-blocks separated by lines identified by
-// isSeparator, and returns a slice of the non-empty blocks between
-// those separators.
-//
-// Note the semantics are different from strings.Split: sub-blocks
-// that contain no lines are not returned. This works better for what
-// the PSL format needs.
-func (s Source) split(isSeparator func(line Source) bool) []Source {
-	ret := []Source{}
-	s.forEachRun(isSeparator, func(block Source, isSep bool) {
-		if isSep {
-			return
-		}
-		ret = append(ret, block)
-	})
-	return ret
-}
-
-// forEachRun calls processBlock for every run of consecutive lines
-// where classify returns the same result.
-//
-// For example, if classify returns true on lines starting with "//",
-// processBlock gets called with alternating blocks consisting of only
-// comments, or only non-comments.
-func (s Source) forEachRun(classify func(line Source) bool, processBlock func(block Source, classifyResult bool)) {
-	if len(s.lines) == 0 {
-		return
-	}
-
-	currentBlock := 0
-	currentVal := classify(s.line(0))
-	for i := range s.lines[1:] {
-		line := i + 1
-		v := classify(s.line(line))
-		if v != currentVal {
-			processBlock(s.slice(currentBlock, line), currentVal)
-			currentVal = v
-			currentBlock = line
-		}
-	}
-	if currentBlock != len(s.lines) {
-		processBlock(s.slice(currentBlock, len(s.lines)), currentVal)
-	}
-}
+// SrcRange returns the SourceRange. This looks a little strange, but
+// it's to satisfy the Block interface. This allows other code to
+// retrieve the SourceRange of any Block without having to typeswitch
+// all the possible sub-types.
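+//
+// (Editor's worked example, not part of the original patch: a comment
+// occupying 0-indexed source lines 3 and 4 has SourceRange{3, 5};
+// merging it with a suffix on line 5, SourceRange{5, 6}, gives
+// SourceRange{3, 6}, which LocationString renders 1-indexed as
+// "lines 4-6".)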
+func (s SourceRange) SrcRange() SourceRange { return s } const ( bomUTF8 = "\xEF\xBB\xBF" @@ -198,20 +92,20 @@ func normalizeToUTF8Lines(bs []byte) ([]string, []error) { enc := utf8Transform switch { case bytes.HasPrefix(bs, []byte(bomUTF8)): - errs = append(errs, UTF8BOMError{}) + errs = append(errs, ErrUTF8BOM{}) case bytes.HasPrefix(bs, []byte(bomUTF16BE)): enc = utf16BigEndianTransform - errs = append(errs, InvalidEncodingError{"UTF-16BE"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16BE"}) case bytes.HasPrefix(bs, []byte(bomUTF16LE)): enc = utf16LittleEndianTransform - errs = append(errs, InvalidEncodingError{"UTF-16LE"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16LE"}) default: enc = guessUTFVariant(bs) switch enc { case utf16BigEndianTransform: - errs = append(errs, InvalidEncodingError{"UTF-16BE (guessed)"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16BE (guessed)"}) case utf16LittleEndianTransform: - errs = append(errs, InvalidEncodingError{"UTF-16LE (guessed)"}) + errs = append(errs, ErrInvalidEncoding{"UTF-16LE (guessed)"}) } } @@ -239,27 +133,24 @@ func normalizeToUTF8Lines(bs []byte) ([]string, []error) { // replacement character is a distinctive shape that stands // out, it should provide enough hints as to where any invalid // byte sequences are. - src := Source{ - lineOffset: i, - lines: []string{line}, - } + src := SourceRange{i, i + 1} if strings.ContainsRune(line, utf8.RuneError) { - errs = append(errs, InvalidUTF8Error{src}) + errs = append(errs, ErrInvalidUTF8{src}) } line, ok := strings.CutSuffix(line, "\r") if ok { ret[i] = line - errs = append(errs, DOSNewlineError{src}) + errs = append(errs, ErrDOSNewline{src}) } if ln := strings.TrimRightFunc(line, unicode.IsSpace); ln != line { line = ln ret[i] = line - errs = append(errs, TrailingWhitespaceError{src}) + errs = append(errs, ErrTrailingWhitespace{src}) } if ln := strings.TrimLeftFunc(line, unicode.IsSpace); ln != line { line = ln ret[i] = line - errs = append(errs, LeadingWhitespaceError{src}) + errs = append(errs, ErrLeadingWhitespace{src}) } } diff --git a/tools/internal/parser/text_test.go b/tools/internal/parser/text_test.go index 5e42851fd..ee1a6dfbf 100644 --- a/tools/internal/parser/text_test.go +++ b/tools/internal/parser/text_test.go @@ -3,8 +3,6 @@ package parser import ( "bytes" "fmt" - "strconv" - "strings" "testing" "github.com/google/go-cmp/cmp" @@ -51,31 +49,31 @@ func TestNormalize(t *testing.T) { name: "utf16be_input_with_bom", in: utf16BigWithBOM("utf-16 text"), want: []string{"utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16BE"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16BE"}}, }, { name: "utf16le_input_with_bom", in: utf16LittleWithBOM("utf-16 text"), want: []string{"utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16LE"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16LE"}}, }, { name: "utf16be_input", in: utf16Big("utf-16 text utf-16 text utf-16 text"), want: []string{"utf-16 text utf-16 text utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16BE (guessed)"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16BE (guessed)"}}, }, { name: "utf16le_input", in: utf16Little("utf-16 text utf-16 text utf-16 text"), want: []string{"utf-16 text utf-16 text utf-16 text"}, - wantErrs: []error{InvalidEncodingError{"UTF-16LE (guessed)"}}, + wantErrs: []error{ErrInvalidEncoding{"UTF-16LE (guessed)"}}, }, { name: "utf8_with_bom", in: utf8WithBOM("utf-8 text"), want: []string{"utf-8 text"}, - wantErrs: []error{UTF8BOMError{}}, + wantErrs: 
[]error{ErrUTF8BOM{}}, }, { name: "utf8_with_garbage", @@ -107,10 +105,10 @@ func TestNormalize(t *testing.T) { "this line is ok", }, wantErrs: []error{ - InvalidUTF8Error{mkSrc(1, "bad1: \uFFFDabc")}, - InvalidUTF8Error{mkSrc(2, "bad2: \uFFFDabc")}, - InvalidUTF8Error{mkSrc(3, "bad3: \uFFFDabc")}, - InvalidUTF8Error{mkSrc(4, "bad4: \uFFFD\uFFFDabc")}, + ErrInvalidUTF8{mkSrc(1, 2)}, + ErrInvalidUTF8{mkSrc(2, 3)}, + ErrInvalidUTF8{mkSrc(3, 4)}, + ErrInvalidUTF8{mkSrc(4, 5)}, }, }, { @@ -125,12 +123,8 @@ func TestNormalize(t *testing.T) { "end like it's 1991", }, wantErrs: []error{ - DOSNewlineError{ - Line: mkSrc(0, "normal file\r"), - }, - DOSNewlineError{ - Line: mkSrc(1, "except the lines\r"), - }, + ErrDOSNewline{mkSrc(0, 1)}, + ErrDOSNewline{mkSrc(1, 2)}, }, }, { @@ -152,18 +146,10 @@ func TestNormalize(t *testing.T) { "and one good line", }, wantErrs: []error{ - TrailingWhitespaceError{ - Line: mkSrc(0, "a file "), - }, - TrailingWhitespaceError{ - Line: mkSrc(1, "with all kinds\t\t"), - }, - TrailingWhitespaceError{ - Line: mkSrc(2, " \r\t"), - }, - TrailingWhitespaceError{ - Line: mkSrc(3, "of trailing space\u2003\u3000\u205f"), - }, + ErrTrailingWhitespace{mkSrc(0, 1)}, + ErrTrailingWhitespace{mkSrc(1, 2)}, + ErrTrailingWhitespace{mkSrc(2, 3)}, + ErrTrailingWhitespace{mkSrc(3, 4)}, }, }, { @@ -185,18 +171,10 @@ func TestNormalize(t *testing.T) { "and one good line", }, wantErrs: []error{ - LeadingWhitespaceError{ - Line: mkSrc(0, " a file"), - }, - LeadingWhitespaceError{ - Line: mkSrc(1, "\t\twith all kinds"), - }, - TrailingWhitespaceError{ - Line: mkSrc(2, " \r\t"), - }, - LeadingWhitespaceError{ - Line: mkSrc(3, "\u2003\u3000\u205fof leading space"), - }, + ErrLeadingWhitespace{mkSrc(0, 1)}, + ErrLeadingWhitespace{mkSrc(1, 2)}, + ErrTrailingWhitespace{mkSrc(2, 3)}, + ErrLeadingWhitespace{mkSrc(3, 4)}, }, }, { @@ -204,386 +182,20 @@ func TestNormalize(t *testing.T) { in: byteLines("\xef\xbb\xbf \t // Hello\xc3\x28 very broken line\t \r"), want: []string{"// Hello\uFFFD( very broken line"}, wantErrs: []error{ - UTF8BOMError{}, - InvalidUTF8Error{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, - DOSNewlineError{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, - TrailingWhitespaceError{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, - LeadingWhitespaceError{ - Line: mkSrc(0, " \t // Hello\uFFFD( very broken line\t \r"), - }, + ErrUTF8BOM{}, + ErrInvalidUTF8{mkSrc(0, 1)}, + ErrDOSNewline{mkSrc(0, 1)}, + ErrTrailingWhitespace{mkSrc(0, 1)}, + ErrLeadingWhitespace{mkSrc(0, 1)}, }, }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - src, errs := newSource(tc.in) + lines, errs := normalizeToUTF8Lines(tc.in) checkDiff(t, "newSource error set", errs, tc.wantErrs) - checkDiff(t, "newSource result", src.lines, tc.want) - }) - } -} - -func TestLineSlicing(t *testing.T) { - t.Parallel() - - lines := []string{"abc", "def", "ghi", "jkl"} - src := mkSrc(0, lines...) - - wantLines := []Source{ - mkSrc(0, "abc"), - mkSrc(1, "def"), - mkSrc(2, "ghi"), - mkSrc(3, "jkl"), - } - checkDiff(t, "src.lineSources()", src.lineSources(), wantLines) - - // slice and line are internal helpers, but if they behave - // incorrectly some higher level methods have very confusing - // behavior, so test explicitly as well. 
- for i, wantLine := range wantLines { - checkDiff(t, fmt.Sprintf("src.line(%d)", i), src.line(i), wantLine) - } - - for start := 0; start <= len(lines); start++ { - for end := start + 1; end <= len(lines); end++ { - t.Run(fmt.Sprintf("slice_%d_to_%d", start, end), func(t *testing.T) { - want := mkSrc(start, lines[start:end]...) - checkDiff(t, fmt.Sprintf("src.slice(%d, %d)", start, end), src.slice(start, end), want) - }) - } - } -} - -func TestSourceText(t *testing.T) { - t.Parallel() - - tests := []struct { - src Source - wantText string - wantLocation string - }{ - { - src: mkSrc(0), - wantText: "", - wantLocation: "", - }, - { - src: mkSrc(0, "abc"), - wantText: "abc", - wantLocation: "line 1", - }, - { - src: mkSrc(0, "abc", "def"), - wantText: "abc\ndef", - wantLocation: "lines 1-2", - }, - { - src: mkSrc(0, "abc", "def").line(0), - wantText: "abc", - wantLocation: "line 1", - }, - { - src: mkSrc(0, "abc", "def").line(1), - wantText: "def", - wantLocation: "line 2", - }, - } - - for i, tc := range tests { - t.Run(strconv.Itoa(i), func(t *testing.T) { - checkDiff(t, "src.Text()", tc.src.Text(), tc.wantText) - checkDiff(t, "mkSrc().LocationString()", tc.src.LocationString(), tc.wantLocation) - }) - } -} - -func TestForEachRun(t *testing.T) { - t.Parallel() - - isComment := func(line Source) bool { - return strings.HasPrefix(line.Text(), "// ") - } - // some weird arbitrary classifier, to verify that forEachRun is - // using the classifier correctly - groupCnt := 0 - groupsOf2And1 := func(line Source) bool { - groupCnt = (groupCnt + 1) % 3 - return groupCnt == 0 - } - - type Run struct { - IsMatch bool - Block Source - } - tests := []struct { - name string - src Source - classify func(Source) bool - want []Run - }{ - { - name: "comments", - src: mkSrc(0, - "// foo", - "// bar", - "abc", - "def", - "// other", - "ghi", - ), - classify: isComment, - want: []Run{ - {true, mkSrc(0, "// foo", "// bar")}, - {false, mkSrc(2, "abc", "def")}, - {true, mkSrc(4, "// other")}, - {false, mkSrc(5, "ghi")}, - }, - }, - { - name: "only_comments", - src: mkSrc(0, - "// abc", - "// def", - "// ghi", - ), - classify: isComment, - want: []Run{ - {true, mkSrc(0, "// abc", "// def", "// ghi")}, - }, - }, - { - name: "comment_at_end", - src: mkSrc(0, - "// abc", - "def", - "// ghi", - ), - classify: isComment, - want: []Run{ - {true, mkSrc(0, "// abc")}, - {false, mkSrc(1, "def")}, - {true, mkSrc(2, "// ghi")}, - }, - }, - { - name: "no_comments", - src: mkSrc(0, - "abc", - "def", - "ghi", - ), - classify: isComment, - want: []Run{ - {false, mkSrc(0, "abc", "def", "ghi")}, - }, - }, - { - name: "weird_classifier", - src: mkSrc(0, - "abc", - "def", - "ghi", - "jkl", - "mno", - "pqr", - "stu", - ), - classify: groupsOf2And1, - want: []Run{ - {false, mkSrc(0, "abc", "def")}, - {true, mkSrc(2, "ghi")}, - {false, mkSrc(3, "jkl", "mno")}, - {true, mkSrc(5, "pqr")}, - {false, mkSrc(6, "stu")}, // truncated final group - }, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - var got []Run - tc.src.forEachRun(tc.classify, func(block Source, isMatch bool) { - got = append(got, Run{isMatch, block}) - }) - checkDiff(t, "forEachRun", got, tc.want) - }) - } -} - -func TestSplit(t *testing.T) { - t.Parallel() - - lines := mkSrc(0, - "// comment", - "abc", - "", - "// other", - "def", - "", - "// end", - "ghi", - ) - - exact := func(s string) func(Source) bool { - return func(line Source) bool { - return line.Text() == s - } - } - prefix := func(s string) func(Source) bool { - return func(line 
Source) bool { - return strings.HasPrefix(line.Text(), s) - } - } - - tests := []struct { - name string - src Source - fn func(Source) bool - want []Source - }{ - { - name: "simple", - src: lines, - fn: exact("abc"), - want: []Source{ - mkSrc(0, "// comment"), - mkSrc(2, "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "start", - src: lines, - fn: exact("// comment"), - want: []Source{ - mkSrc(1, "abc", "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "end", - src: lines, - fn: exact("ghi"), - want: []Source{ - mkSrc(0, "// comment", "abc", "", "// other", "def", "", "// end"), - }, - }, - { - name: "no_match", - src: lines, - fn: exact("xyz"), - want: []Source{ - mkSrc(0, "// comment", "abc", "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "prefix", - src: lines, - fn: prefix("ab"), - want: []Source{ - mkSrc(0, "// comment"), - mkSrc(2, "", "// other", "def", "", "// end", "ghi"), - }, - }, - { - name: "prefix_comment", - src: lines, - fn: prefix("// "), - want: []Source{ - mkSrc(1, "abc", ""), - mkSrc(4, "def", ""), - mkSrc(7, "ghi"), - }, - }, - - { - name: "empty", - src: mkSrc(0), - fn: exact("xyz"), - want: []Source{}, - }, - { - name: "empty_split_blank", - src: mkSrc(0), - fn: exact(""), - want: []Source{}, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - got := tc.src.split(tc.fn) - checkDiff(t, "split", got, tc.want) - }) - } -} - -func TestCut(t *testing.T) { - t.Parallel() - - exact := func(s string) func(Source) bool { - return func(line Source) bool { - return line.Text() == s - } - } - prefix := func(s string) func(Source) bool { - return func(line Source) bool { - return strings.HasPrefix(line.Text(), s) - } - } - - tests := []struct { - name string - src Source - fn func(Source) bool - before, rest Source - found bool - }{ - { - name: "simple", - src: mkSrc(0, "abc", "def", "ghi"), - fn: exact("def"), - before: mkSrc(0, "abc"), - rest: mkSrc(1, "def", "ghi"), - found: true, - }, - { - name: "cut_on_first", - src: mkSrc(0, - "abc", - "// def", - "ghi", - "// jkl", - "mno", - ), - fn: prefix("// "), - before: mkSrc(0, "abc"), - rest: mkSrc(1, "// def", "ghi", "// jkl", "mno"), - found: true, - }, - { - name: "no_match", - src: mkSrc(0, "abc", "def", "ghi"), - fn: exact("xyz"), - before: mkSrc(0, "abc", "def", "ghi"), - rest: Source{}, - found: false, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - gotBefore, gotRest, gotFound := tc.src.cut(tc.fn) - checkDiff(t, "cut() before", gotBefore, tc.before) - checkDiff(t, "cut() after", gotRest, tc.rest) - if gotFound != tc.found { - t.Errorf("cut() found=%v, want %v", gotFound, tc.found) - } + checkDiff(t, "newSource result", lines, tc.want) }) } } @@ -635,7 +247,7 @@ func utf8WithBOM(s string) []byte { func checkDiff(t *testing.T, whatIsBeingDiffed string, got, want any) { t.Helper() - if diff := cmp.Diff(got, want, cmp.AllowUnexported(Source{})); diff != "" { + if diff := cmp.Diff(got, want); diff != "" { t.Errorf("%s is wrong (-got+want):\n%s", whatIsBeingDiffed, diff) } } diff --git a/tools/internal/parser/validate.go b/tools/internal/parser/validate.go index aa330ea40..01709f04f 100644 --- a/tools/internal/parser/validate.go +++ b/tools/internal/parser/validate.go @@ -8,43 +8,39 @@ import ( "github.com/creachadair/mds/slice" ) -// Validate runs validations on a parsed File. -// -// Validation only runs on a file that does not yet have any -// errors. 
The presence of errors may indicate structural issues that -// can break some validations. -func (p *parser) Validate() { - if len(p.Errors) > 0 { - return +// ValidateOffline runs offline validations on a parsed PSL. +func ValidateOffline(l *List) []error { + var ret []error + + for _, block := range blocksOfType[*Section](l) { + if block.Name == "PRIVATE DOMAINS" { + ret = append(ret, validateEntityMetadata(block)...) + if err := validatePrivateSectionOrder(block); err != nil { + ret = append(ret, err) + } + break + } } - p.requireEntityNames() - p.requirePrivateDomainEmailContact() - p.requireSortedPrivateSection() + return ret } -// requireEntityNames verifies that all Suffix blocks have some kind -// of entity name. -func (p *parser) requireEntityNames() { - for _, block := range p.AllSuffixBlocks() { +// validateEntityMetadata verifies that all suffix blocks have some +// kind of entity name. +func validateEntityMetadata(block *Section) []error { + var ret []error + for _, block := range blocksOfType[*Suffixes](block) { if block.Entity == "" { - p.addError(MissingEntityName{ + ret = append(ret, ErrMissingEntityName{ Suffixes: block, }) - } - } -} - -// requirePrivateDomainEmailContact verifies that all Suffix blocks in -// the private section have email contact information. -func (p *parser) requirePrivateDomainEmailContact() { - for _, block := range p.File.SuffixBlocksInSection("PRIVATE DOMAINS") { - if block.Submitter == nil { - p.addError(MissingEntityEmail{ + } else if block.Submitter == nil && !exemptFromContactInfo(block.Entity) { + ret = append(ret, ErrMissingEntityEmail{ Suffixes: block, }) } } + return ret } const ( @@ -52,9 +48,9 @@ const ( amazonSuperblockEnd = "concludes Amazon" ) -// requireSortedPrivateSection verifies that the blocks in the private +// validatePrivateSectionOrder verifies that the blocks in the private // domains section is sorted according to PSL policy. -func (p *parser) requireSortedPrivateSection() { +func validatePrivateSectionOrder(block *Section) error { // Amazon has a semi-automated "superblock" of suffix blocks, // which are in the PSL at the correct sort location for "Amazon", // but are not correctly interleaved with other non-Amazon @@ -78,44 +74,28 @@ func (p *parser) requireSortedPrivateSection() { var blocks []superblock inAmazonSuperblock := false - for _, block := range allBlocksInPrivateSection(&p.File) { - if comm, ok := block.(*Comment); ok { - if !inAmazonSuperblock && strings.Contains(comm.Text(), amazonSuperblockStart) { + for _, block := range block.Children() { + switch v := block.(type) { + case *Comment: + if !inAmazonSuperblock && strings.Contains(v.Text[0], amazonSuperblockStart) { // Start of the Amazon superblock. We will accumulate // suffix blocks into here further down. inAmazonSuperblock = true blocks = append(blocks, superblock{ Name: "Amazon", }) - } else if inAmazonSuperblock && strings.Contains(comm.Text(), amazonSuperblockEnd) { + } else if inAmazonSuperblock && strings.Contains(v.Text[0], amazonSuperblockEnd) { // End of Amazon superblock, go back to normal // behavior. inAmazonSuperblock = false } - continue - } - - // Aside from the Amazon superblock comments, we only care - // about Suffix blocks in this validation. - suffixes, ok := block.(*Suffixes) - if !ok { - continue - } - - // While we're inside the Amazon superblock, all suffix blocks - // get grouped into one. Outside of the Amazon superblock, - // each suffix block gets its own superblock. 
- if inAmazonSuperblock { - last := len(blocks) - 1 - blocks[last].Suffixes = append(blocks[last].Suffixes, suffixes) - continue - } else if exemptFromSorting(suffixes.Source) { - continue - } else { - blocks = append(blocks, superblock{ - Name: suffixes.Entity, - Suffixes: []*Suffixes{suffixes}, - }) + case *Suffixes: + if inAmazonSuperblock { + last := len(blocks) - 1 + blocks[last].Suffixes = append(blocks[last].Suffixes, v) + } else if !exemptFromSorting(v.Entity) { + blocks = append(blocks, superblock{v.Entity, []*Suffixes{v}}) + } } } @@ -137,7 +117,7 @@ func (p *parser) requireSortedPrivateSection() { if len(sorted) == len(blocks) { // Already sorted, we're done. - return + return nil } // Scan through the superblocks and find where the incorrectly @@ -171,7 +151,7 @@ func (p *parser) requireSortedPrivateSection() { fixed := make([]superblock, 0, len(blocks)) fixed = append(fixed, sorted...) - err := SuffixBlocksInWrongPlace{ + err := ErrSuffixBlocksInWrongPlace{ EditScript: make([]MoveSuffixBlock, 0, len(blocks)-len(sorted)), } @@ -223,28 +203,34 @@ func (p *parser) requireSortedPrivateSection() { blocksIdx++ } - // At last, we can report the ordering error. - p.addError(err) + return err } -func allBlocksInPrivateSection(f *File) []Block { - start := 0 - for i, block := range f.Blocks { - switch v := block.(type) { - case *StartSection: - if v.Name != "PRIVATE DOMAINS" { - continue - } - start = i + 1 - case *EndSection: - if v.Name != "PRIVATE DOMAINS" { - continue - } - return f.Blocks[start:i] +// A childrener can return a list of its children. +// Yes, the interface name sounds a bit silly, but it's the +// conventional Go name given what it does. +type childrener interface { + Children() []Block +} + +// blocksOfType recursively walks the subtree rooted at c and returns +// all tree nodes of concrete block type T. +// +// For example, blocksOfType[*Comment](n) returns all comment nodes +// under n. +func blocksOfType[T Block](c childrener) []T { + var ret []T + + var rec func(childrener) + rec = func(c childrener) { + if v, ok := c.(T); ok { + ret = append(ret, v) + } + for _, child := range c.Children() { + rec(child) } } - // We can only get here if there's no private section (so nothing - // to validate), or if the file has structural issues (but we - // don't run validations in that case). - return []Block{} + rec(c) + + return ret } diff --git a/tools/internal/parser/validate_test.go b/tools/internal/parser/validate_test.go index 805b289e3..77c2fcbc3 100644 --- a/tools/internal/parser/validate_test.go +++ b/tools/internal/parser/validate_test.go @@ -1,185 +1,137 @@ package parser import ( - "bytes" - "errors" - "fmt" "testing" ) func TestRequireSortedPrivateSection(t *testing.T) { - // Shorthand for a simple suffix block with the right source data. - suffixBlock := func(lineOffset int, name, suffix string) Suffixes { - // For this test, every suffix block just has one suffix. - src := mkSrc(lineOffset, fmt.Sprintf("// %s", name), suffix) - return Suffixes{ - Source: src, - Header: []Source{src.slice(0, 1)}, - Entries: []Source{src.slice(1, 2)}, - Entity: name, - } - } - // Shorthand for an input file containing a series of suffixes. 
- suffixBlocks := func(suffixes ...Suffixes) []byte { - var ret bytes.Buffer - ret.WriteString("// ===BEGIN PRIVATE DOMAINS===\n\n") - for _, block := range suffixes { - for _, ln := range block.lineSources() { - ret.WriteString(ln.Text()) - ret.WriteByte('\n') - } - ret.WriteByte('\n') - } - ret.WriteString("// ===END PRIVATE DOMAINS===\n") - return ret.Bytes() - } - - aaa := suffixBlock(0, "AAA Corp", "aaa.com") - bbb := suffixBlock(0, "BBB Inc", "bbb.net") - ccc := suffixBlock(0, "CCC Ltd", "ccc.org") - dddLeadingDot := suffixBlock(0, ".DDD GmbH", "ddd.de") - aaaUmlaut := suffixBlock(0, "AÄA", "aaa.de") - aaaUmlautShort := suffixBlock(0, "AÄ", "aaa.ee") - aaaUmlautLong := suffixBlock(0, "AÄAA", "aaa.sk") - a3b := suffixBlock(0, "a3b", "a3b.com") - a24b := suffixBlock(0, "a24b", "a24b.com") + aaa := suffixes(0, 1, "AAA Corp", "", "", suffix(0, "aaa.com")) + bbb := suffixes(0, 1, "BBB Inc", "", "", suffix(0, "bbb.net")) + ccc := suffixes(0, 1, "CCC Ltd", "", "", suffix(0, "ccc.org")) + dddLeadingDot := suffixes(0, 1, ".DDD GmbH", "", "", suffix(0, "ddd.de")) + aaaUmlaut := suffixes(0, 1, "AÄA", "", "", suffix(0, "aaa.de")) + aaaUmlautShort := suffixes(0, 1, "AÄ", "", "", suffix(0, "aaa.ee")) + aaaUmlautLong := suffixes(0, 1, "AÄAA", "", "", suffix(0, "aaa.sk")) + a3b := suffixes(0, 1, "a3b", "", "", suffix(0, "a3b.com")) + a24b := suffixes(0, 1, "a24b", "", "", suffix(0, "a24b.com")) tests := []struct { name string - in []byte - want []error + in *Section + want error }{ { name: "easy_correct_order", - in: suffixBlocks(aaa, bbb, ccc), + in: section(0, 0, "", aaa, bbb, ccc), }, + { name: "easy_wrong_order", // correct order: aaa, bbb, ccc - in: suffixBlocks(bbb, aaa, ccc), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: bbb.Entity, - InsertAfter: aaa.Entity, - }, + in: section(0, 0, "", bbb, aaa, ccc), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: bbb.Entity, + InsertAfter: aaa.Entity, }, }, }, }, + { name: "reversed", // correct order: aaa, bbb, ccc - in: suffixBlocks(ccc, bbb, aaa), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: ccc.Entity, - InsertAfter: aaa.Entity, - }, - { - Name: bbb.Entity, - InsertAfter: aaa.Entity, - }, + in: section(0, 0, "", ccc, bbb, aaa), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: ccc.Entity, + InsertAfter: aaa.Entity, + }, + { + Name: bbb.Entity, + InsertAfter: aaa.Entity, }, }, }, }, + { name: "leading_punctuation", // correct order: dddLeadingDot, aaa, bbb, ccc - in: suffixBlocks(aaa, bbb, ccc, dddLeadingDot), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: dddLeadingDot.Entity, - InsertAfter: "", - }, + in: section(0, 0, "", aaa, bbb, ccc, dddLeadingDot), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: []MoveSuffixBlock{ + { + Name: dddLeadingDot.Entity, + InsertAfter: "", }, }, }, }, + { name: "diacritics", // correct order: aaaUmlautShort, aaaUmlaut, aaa, aaaUmlautLong, bbb, ccc - in: suffixBlocks(aaa, bbb, ccc, aaaUmlaut, aaaUmlautShort, aaaUmlautLong), - want: []error{ - SuffixBlocksInWrongPlace{ - EditScript: []MoveSuffixBlock{ - { - Name: aaaUmlaut.Entity, - InsertAfter: "", - }, - { - Name: aaaUmlautShort.Entity, - InsertAfter: "", - }, - { - Name: aaaUmlautLong.Entity, - InsertAfter: aaa.Entity, - }, + in: section(0, 0, "", aaa, bbb, ccc, aaaUmlaut, aaaUmlautShort, aaaUmlautLong), + want: ErrSuffixBlocksInWrongPlace{ + EditScript: 
[]MoveSuffixBlock{
+					{
+						Name:        aaaUmlaut.Entity,
+						InsertAfter: "",
+					},
+					{
+						Name:        aaaUmlautShort.Entity,
+						InsertAfter: "",
+					},
+					{
+						Name:        aaaUmlautLong.Entity,
+						InsertAfter: aaa.Entity,
 					},
 				},
 			},
 		},
+
 		{
 			name: "numbers",
 			// correct order: a24b, a3b, aaa, bbb
-			in:   suffixBlocks(aaa, a3b, a24b, bbb),
-			want: []error{
-				SuffixBlocksInWrongPlace{
-					EditScript: []MoveSuffixBlock{
-						{
-							Name:        aaa.Entity,
-							InsertAfter: a24b.Entity,
-						},
-						{
-							Name:        a3b.Entity,
-							InsertAfter: a24b.Entity,
-						},
+			in:   section(0, 0, "", aaa, a3b, a24b, bbb),
+			want: ErrSuffixBlocksInWrongPlace{
+				EditScript: []MoveSuffixBlock{
+					{
+						Name:        aaa.Entity,
+						InsertAfter: a24b.Entity,
+					},
+					{
+						Name:        a3b.Entity,
+						InsertAfter: a24b.Entity,
 					},
 				},
 			},
 		},
+
 		{
 			name: "amazon_superblock",
-			in: byteLines(
-				"// ===BEGIN PRIVATE DOMAINS===",
-				"",
-				"// AA Ltd",
-				"aa.com",
-				"",
-				"// Amazon : https://www.amazon.com",
-				"// several blocks follow",
-				"",
-				// note: incorrect order, but ignored because in Amazon superblock
-				"// eero",
-				"eero.com",
-				"",
-				"// AWS",
-				"aws.com",
-				"",
-				"// concludes Amazon",
-				"",
-				// note: out of order, not ignored
-				"// Altavista",
-				"altavista.com",
-				"",
-				"// BB Ltd",
-				"bb.com",
-				"",
-				"// ===END PRIVATE DOMAINS===",
+			in: section(0, 23, "",
+				suffixes(2, 4, "AA Ltd", "", "", suffix(3, "aa.com")),
+
+				comment(5, "Amazon : https://www.amazon.com", "several blocks follow"),
+				// Note: incorrect sort order, but ignored because
+				// it's in the Amazon superblock.
+				suffixes(8, 10, "eero", "", "", suffix(9, "eero.com")),
+				suffixes(11, 13, "AWS", "", "", suffix(12, "aws.com")),
+				comment(14, "concludes Amazon"),
+
+				suffixes(16, 18, "Altavista", "", "", suffix(17, "altavista.com")),
+
+				suffixes(19, 21, "BB Ltd", "", "", suffix(20, "bb.com")),
 			),
-			want: []error{
-				SuffixBlocksInWrongPlace{
-					EditScript: []MoveSuffixBlock{
-						{
-							Name:        `Amazon (all blocks until "concludes ..." comment)`,
-							InsertAfter: "Altavista",
-						},
+			want: ErrSuffixBlocksInWrongPlace{
+				EditScript: []MoveSuffixBlock{
+					{
+						Name:        `Amazon (all blocks until "concludes ..." comment)`,
+						InsertAfter: "Altavista",
 					},
 				},
 			},
@@ -188,13 +140,8 @@ func TestRequireSortedPrivateSection(t *testing.T) {
 
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			p := parseWithExceptions(tc.in, downgradeToWarning, false)
-			if len(p.File.Errors) > 0 {
-				t.Fatalf("parse error before attempting validation: %v", errors.Join(p.File.Errors...))
-			}
-			p.requireSortedPrivateSection()
-
-			checkDiff(t, "validation result", p.File.Errors, tc.want)
+			err := validatePrivateSectionOrder(tc.in)
+			checkDiff(t, "validation result", err, tc.want)
 		})
 	}
 }