Skip to content

Commit

Permalink
tools/internal/parser: rewrite parser to output a syntax tree (#2025)
Browse files Browse the repository at this point in the history
This makes the layout into a much more standard-looking recursive
descent parser, and lightens the amount of effort spent on reporting
egregiously invalid files in exchange for better code readability
for the more common situation of structurally valid but failing
on policy/lint issues.
  • Loading branch information
danderson authored Jul 4, 2024
1 parent afef4b8 commit 12b354d
Show file tree
Hide file tree
Showing 13 changed files with 1,169 additions and 2,760 deletions.
4 changes: 2 additions & 2 deletions tools/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ module github.com/publicsuffix/list/tools
go 1.21

require (
github.com/creachadair/mds v0.15.0
github.com/google/go-cmp v0.6.0
golang.org/x/net v0.26.0
golang.org/x/text v0.16.0
)

require github.com/creachadair/mds v0.15.0 // indirect
2 changes: 2 additions & 0 deletions tools/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@ github.com/creachadair/mds v0.15.0 h1:St6HvUcrX1UJ517Zha6GKxVibGyRDBDtInOjuaaHOr
github.com/creachadair/mds v0.15.0/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
93 changes: 84 additions & 9 deletions tools/govalidate/govalidate.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ import (
"flag"
"fmt"
"os"
"strconv"
"strings"

"github.com/publicsuffix/list/tools/internal/parser"
)

func main() {
warnings := flag.Bool("with-warnings", false, "also print errors that were downgraded to warnings")
debugPrintTree := flag.Bool("debug-print", false, "print the parse tree for debugging")

flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] pslfile\n", os.Args[0])
flag.PrintDefaults()
Expand All @@ -30,19 +33,91 @@ func main() {
os.Exit(1)
}

psl := parser.Parse(bs)
psl, errs := parser.Parse(bs)

if *debugPrintTree {
debugPrint(psl)
}

for _, err := range psl.Errors {
for _, err := range errs {
fmt.Println(err)
}
if *warnings {
for _, err := range psl.Warnings {
fmt.Println(err, "(warning)")
}

verrs := parser.ValidateOffline(psl)
for _, err := range verrs {
fmt.Println(err)
}
if len(psl.Errors) > 0 {

if total := len(errs) + len(verrs); total > 0 {
fmt.Printf("\nFile has %d errors.\n", total)
os.Exit(1)
} else {
fmt.Printf("%q seems to be a valid PSL file.\n", file)
fmt.Println("\nFile is valid.")
}
}

// debugPrint writes a PSL syntax tree to standard output. The text
// format is private to this tool and may change at any time.
func debugPrint(p *parser.List) {
	// Indent applied to the top-level blocks inside the "List { ... }" wrapper.
	const topIndent = " "

	fmt.Println("List {")
	for i := range p.Blocks {
		debugPrintRec(p.Blocks[i], topIndent)
	}
	fmt.Println("}")
}

// debugPrintRec prints b to standard output, prefixing every line with
// indent. For blocks that contain other blocks (sections and suffix
// blocks) it recurses into the children with a deeper indent.
//
// It panics if b has a dynamic type that this function does not handle.
func debugPrintRec(b parser.Block, indent string) {
	nextIndent := indent + " "
	// f prints a single formatted line at the current indent level.
	f := func(msg string, args ...any) {
		fmt.Printf(indent+msg+"\n", args...)
	}

	// Every block is labelled with its source line range, collapsed to a
	// single number when FirstLine+1 == LastLine.
	src := b.SrcRange()
	loc := fmt.Sprintf("[%d:%d]", src.FirstLine, src.LastLine)
	if src.FirstLine+1 == src.LastLine {
		loc = strconv.Itoa(src.FirstLine)
	}

	switch v := b.(type) {
	case *parser.Blank:
		f("Blank(%s)", loc)
	case *parser.Comment:
		f("Comment(%s) {", loc)
		for _, t := range v.Text {
			f(" %q,", t)
		}
		f("}")
	case *parser.Section:
		f("Section(%s, %q) {", loc, v.Name)
		for _, child := range v.Blocks {
			debugPrintRec(child, nextIndent)
		}
		f("}")
	case *parser.Suffixes:
		// Only the metadata fields that are actually set are printed,
		// one per line after the location.
		items := []string{loc}
		if v.Entity != "" {
			items = append(items, fmt.Sprintf("name=%q", v.Entity))
		}
		if v.URL != nil {
			items = append(items, fmt.Sprintf("url=%q", v.URL))
		}
		if v.Submitter != nil {
			items = append(items, fmt.Sprintf("contact=%q", v.Submitter))
		}

		f("SuffixBlock(%s) {", strings.Join(items, fmt.Sprintf(",\n%s ", indent)))
		for _, child := range v.Blocks {
			debugPrintRec(child, nextIndent)
		}
		f("}")
	case *parser.Suffix:
		f("Suffix(%s, %q)", loc, strings.Join(v.Labels, "."))
	case *parser.Wildcard:
		if len(v.Exceptions) > 0 {
			f("Wildcard(%s, %q, except=%v)", loc, strings.Join(v.Labels, "."), v.Exceptions)
		} else {
			f("Wildcard(%s, %q)", loc, strings.Join(v.Labels, "."))
		}
	default:
		// Name the dynamic type so that a block kind missing from this
		// switch can be identified from the panic message alone.
		panic(fmt.Sprintf("unknown block type %T", b))
	}
}
172 changes: 87 additions & 85 deletions tools/internal/parser/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,158 +5,148 @@ import (
"strings"
)

// InvalidEncodingError reports that the input is encoded with
// ErrInvalidEncoding reports that the input is encoded with
// something other than UTF-8.
type InvalidEncodingError struct {
type ErrInvalidEncoding struct {
Encoding string
}

func (e InvalidEncodingError) Error() string {
return fmt.Sprintf("file uses invalid character encoding %s", e.Encoding)
func (e ErrInvalidEncoding) Error() string {
return fmt.Sprintf("invalid character encoding %s", e.Encoding)
}

// UTF8BOMError reports that the input has an unnecessary UTF-8 byte
// ErrUTF8BOM reports that the input has an unnecessary UTF-8 byte
// order mark (BOM) at the start.
type UTF8BOMError struct{}
type ErrUTF8BOM struct{}

func (e UTF8BOMError) Error() string {
return "file starts with an unnecessary UTF-8 BOM (byte order mark)"
}
func (e ErrUTF8BOM) Error() string { return "file has a UTF-8 byte order mark (BOM)" }

// InvalidUTF8Error reports that a line contains bytes that are not
// ErrInvalidUTF8 reports that a line contains bytes that are not
// valid UTF-8.
type InvalidUTF8Error struct {
Line Source
type ErrInvalidUTF8 struct {
SourceRange
}

func (e InvalidUTF8Error) Error() string {
return fmt.Sprintf("found non UTF-8 bytes at %s", e.Line.LocationString())
func (e ErrInvalidUTF8) Error() string {
return fmt.Sprintf("%s: invalid UTF-8 bytes", e.SourceRange.LocationString())
}

// DOSNewlineError reports that a line has a DOS style line ending.
type DOSNewlineError struct {
Line Source
// ErrDOSNewline reports that a line has a DOS style line ending.
type ErrDOSNewline struct {
SourceRange
}

func (e DOSNewlineError) Error() string {
return fmt.Sprintf("%s has a DOS line ending (\\r\\n instead of just \\n)", e.Line.LocationString())
func (e ErrDOSNewline) Error() string {
return fmt.Sprintf("%s: found DOS line ending (\\r\\n instead of just \\n)", e.SourceRange.LocationString())
}

// TrailingWhitespaceError reports that a line has trailing whitespace.
type TrailingWhitespaceError struct {
Line Source
// ErrTrailingWhitespace reports that a line has trailing whitespace.
type ErrTrailingWhitespace struct {
SourceRange
}

func (e TrailingWhitespaceError) Error() string {
return fmt.Sprintf("%s has trailing whitespace", e.Line.LocationString())
func (e ErrTrailingWhitespace) Error() string {
return fmt.Sprintf("%s: trailing whitespace", e.SourceRange.LocationString())
}

// LeadingWhitespaceError reports that a line has leading whitespace.
type LeadingWhitespaceError struct {
Line Source
// ErrLeadingWhitespace reports that a line has leading whitespace.
type ErrLeadingWhitespace struct {
SourceRange
}

func (e LeadingWhitespaceError) Error() string {
return fmt.Sprintf("%s has leading whitespace", e.Line.LocationString())
func (e ErrLeadingWhitespace) Error() string {
return fmt.Sprintf("%s: leading whitespace", e.SourceRange.LocationString())
}

// SectionInSuffixBlock reports that a comment within a block of
// suffixes contains a section delimiter.
type SectionInSuffixBlock struct {
Line Source
// ErrSectionInSuffixBlock reports that a comment within a suffix
// block contains a section delimiter.
type ErrSectionInSuffixBlock struct {
SourceRange
}

func (e SectionInSuffixBlock) Error() string {
return fmt.Sprintf("section delimiters are not allowed in suffix block comment at %s", e.Line.LocationString())
func (e ErrSectionInSuffixBlock) Error() string {
return fmt.Sprintf("%s: section delimiter not allowed in suffix block comment", e.SourceRange.LocationString())
}

// UnclosedSectionError reports that a file section was not closed
// ErrUnclosedSection reports that a file section was not closed
// properly before EOF.
type UnclosedSectionError struct {
Start *StartSection // The unpaired section start
type ErrUnclosedSection struct {
Section *Section
}

func (e UnclosedSectionError) Error() string {
return fmt.Sprintf("section %q started at %s, but is never closed", e.Start.Name, e.Start.LocationString())
func (e ErrUnclosedSection) Error() string {
return fmt.Sprintf("%s: section %q is missing its closing marker", e.Section.SourceRange.LocationString(), e.Section.Name)
}

// NestedSectionError reports that a file section is being started
// while already within a section, which the PSL format does not
// allow.
type NestedSectionError struct {
Outer *StartSection
Inner *StartSection
// ErrNestedSection reports that a file section is being started while
// already within a section.
type ErrNestedSection struct {
SourceRange
Name string
Section *Section
}

func (e NestedSectionError) Error() string {
return fmt.Sprintf("new section %q started at %s while still in section %q (started at %s)", e.Inner.Name, e.Inner.LocationString(), e.Outer.Name, e.Outer.LocationString())
func (e ErrNestedSection) Error() string {
return fmt.Sprintf("%s: section %q is nested inside section %q (%s)", e.SourceRange.LocationString(), e.Name, e.Section.Name, e.Section.SourceRange.LocationString())
}

// UnstartedSectionError reports that a file section end marker was
// found without a corresponding start.
type UnstartedSectionError struct {
End *EndSection
// ErrUnstartedSection reports that a section end marker was found
// without a corresponding start.
type ErrUnstartedSection struct {
SourceRange
Name string
}

func (e UnstartedSectionError) Error() string {
return fmt.Sprintf("section %q closed at %s but was not started", e.End.Name, e.End.LocationString())
func (e ErrUnstartedSection) Error() string {
return fmt.Sprintf("%s: end marker for non-existent section %q", e.SourceRange.LocationString(), e.Name)
}

// MismatchedSectionError reports that a file section was started
// ErrMismatchedSection reports that a file section was started
// under one name but ended under another.
type MismatchedSectionError struct {
Start *StartSection
End *EndSection
type ErrMismatchedSection struct {
SourceRange
EndName string
Section *Section
}

func (e MismatchedSectionError) Error() string {
return fmt.Sprintf("section %q closed at %s while in section %q (started at %s)", e.End.Name, e.End.LocationString(), e.Start.Name, e.Start.LocationString())
func (e ErrMismatchedSection) Error() string {
return fmt.Sprintf("%s: section %q (%s) closed with wrong name %q", e.SourceRange.LocationString(), e.Section.Name, e.Section.SourceRange.LocationString(), e.EndName)
}

// UnknownSectionMarker reports that a line looks like a file section
// ErrUnknownSectionMarker reports that a line looks like a file section
// marker (e.g. "===BEGIN ICANN DOMAINS==="), but is not one of the
// recognized kinds of marker.
type UnknownSectionMarker struct {
Line Source
}

func (e UnknownSectionMarker) Error() string {
return fmt.Sprintf("unknown kind of section marker %q at %s", e.Line.Text(), e.Line.LocationString())
}

// UnterminatedSectionMarker reports that a section marker is missing
// the required trailing "===", e.g. "===BEGIN ICANN DOMAINS".
type UnterminatedSectionMarker struct {
Line Source
type ErrUnknownSectionMarker struct {
SourceRange
}

func (e UnterminatedSectionMarker) Error() string {
return fmt.Sprintf(`section marker %q at %s is missing trailing "==="`, e.Line.Text(), e.Line.LocationString())
func (e ErrUnknownSectionMarker) Error() string {
return fmt.Sprintf("%s: unknown kind of section marker", e.SourceRange.LocationString())
}

// ErrMissingEntityName reports that a block of suffixes does not have a
// parseable owner name in its header comment.
type MissingEntityName struct {
type ErrMissingEntityName struct {
Suffixes *Suffixes
}

func (e MissingEntityName) Error() string {
return fmt.Sprintf("could not find entity name for %s at %s", e.Suffixes.shortName(), e.Suffixes.LocationString())
func (e ErrMissingEntityName) Error() string {
return fmt.Sprintf("%s: suffix block has no owner name", e.Suffixes.SourceRange.LocationString())
}

// MissingEntityEmail reports that a block of suffixes does not have a
// ErrMissingEntityEmail reports that a block of suffixes does not have a
// parseable contact email address in its header comment.
type MissingEntityEmail struct {
type ErrMissingEntityEmail struct {
Suffixes *Suffixes
}

func (e MissingEntityEmail) Error() string {
return fmt.Sprintf("could not find a contact email for %s at %s", e.Suffixes.shortName(), e.Suffixes.LocationString())
func (e ErrMissingEntityEmail) Error() string {
return fmt.Sprintf("%s: suffix block has no contact email", e.Suffixes.SourceRange.LocationString())
}

// SuffixBlocksInWrongPlace reports that some suffix blocks of the
// ErrSuffixBlocksInWrongPlace reports that some suffix blocks of the
// private section are in the wrong sort order.
type SuffixBlocksInWrongPlace struct {
type ErrSuffixBlocksInWrongPlace struct {
// EditScript is a list of suffix block movements to put the
// private domains section in the correct order. Note that each
// step assumes that the previous steps have already been done.
Expand All @@ -174,7 +164,7 @@ type MoveSuffixBlock struct {
InsertAfter string
}

func (e SuffixBlocksInWrongPlace) Error() string {
func (e ErrSuffixBlocksInWrongPlace) Error() string {
if len(e.EditScript) == 1 {
after := e.EditScript[0].InsertAfter
if after == "" {
Expand All @@ -198,3 +188,15 @@ func (e SuffixBlocksInWrongPlace) Error() string {

return ret.String()
}

// ErrInvalidSuffix reports that a suffix is not a valid PSL entry.
type ErrInvalidSuffix struct {
	SourceRange
	// Suffix is the offending suffix text, quoted in the error message.
	Suffix string
	// Err is the underlying reason the suffix was rejected.
	Err error
}

// Error formats the error as "<location>: invalid suffix <suffix>: <cause>".
func (e ErrInvalidSuffix) Error() string {
	return fmt.Sprintf("%s: invalid suffix %q: %v", e.SourceRange.LocationString(), e.Suffix, e.Err)
}

// Unwrap returns the underlying cause, so that errors.Is and errors.As
// can inspect it through an ErrInvalidSuffix.
func (e ErrInvalidSuffix) Unwrap() error { return e.Err }
Loading

0 comments on commit 12b354d

Please sign in to comment.