Skip to content

Commit

Permalink
tools/internal/parser: implement automatic reformatting (#2036)
Browse files Browse the repository at this point in the history
* tools/internal/parser: remove Blank from the parsed output

This node is format-specific, and so should not exist in the abstract
syntax tree. The _lexer token_ for a blank is important to disambiguate
during parsing of the PSL format, but it's not necessary after the
tree has been assembled.

* tools/internal/domain: factor out parsing and comparing of domain names

The new domain subpackage enforces all IDNA2008 validation rules, which
cover a superset of the PSL style rules. One exception is that it corrects
unambiguous fixable errors like a browser would, instead of rejecting
non-canonical input. This is to support automated reformatting and authoring
help in a follow-up change.

* tools/internal/parser: implement AST cleaning

The Clean method fixes lint issues that can be fixed by a machine, such
as the sorting of domain names and suffix blocks. These fixes should not
alter the meaning of a parsed PSL; they make cosmetic changes only.

* tools/internal/parser: silently fix some whitespace/encoding mistakes

Now that there is a reformatter, leading/trailing whitespace, unnecessary
UTF-8 BOMs, explicitly declared UTF-16 encoding and DOS-style line endings
can all be fixed automatically by the formatter, so there is no need to
bother the user about them.

* tools/internal/parser: support outputting a parsed PSL back to text

govalidate now has extra flags to control formatting. By default, if
there are no parse errors (only validation/lint), it automatically
overwrites the input file with a reformatted copy.

* tools/internal/parser: support outputting a parsed PSL to debug form

This just pulls the ad-hoc debug printer from govalidate into the parser
library.
  • Loading branch information
danderson authored Jul 18, 2024
1 parent 8997e31 commit 461e2c2
Show file tree
Hide file tree
Showing 20 changed files with 8,578 additions and 638 deletions.
3 changes: 2 additions & 1 deletion tools/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ module github.com/publicsuffix/list/tools
go 1.21

require (
github.com/creachadair/mds v0.15.0
github.com/google/go-cmp v0.6.0
golang.org/x/net v0.26.0
golang.org/x/text v0.16.0
)

require github.com/natefinch/atomic v1.0.1
4 changes: 2 additions & 2 deletions tools/go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
github.com/creachadair/mds v0.15.0 h1:St6HvUcrX1UJ517Zha6GKxVibGyRDBDtInOjuaaHOrQ=
github.com/creachadair/mds v0.15.0/go.mod h1:4vrFYUzTXMJpMBU+OA292I6IUxKWCCfZkgXg+/kBZMo=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0A=
github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM=
golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
Expand Down
119 changes: 38 additions & 81 deletions tools/govalidate/govalidate.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@
package main

import (
"bytes"
"flag"
"fmt"
"os"
"strconv"
"strings"

"github.com/natefinch/atomic"
"github.com/publicsuffix/list/tools/internal/parser"
)

func main() {
debugPrintTree := flag.Bool("debug-print", false, "print the parse tree for debugging")
reformat := flag.Bool("reformat", true, "if input is valid, fix formatting errors")

flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] pslfile\n", os.Args[0])
Expand All @@ -35,94 +36,50 @@ func main() {

psl, errs := parser.Parse(bs)

// Errors during the base parse means we may have thrown away
// information, and that means we can't round-trip the file back
// to disk without potentially destroying stuff.
safeToRewrite := len(errs) == 0

errs = append(errs, psl.Clean()...)
errs = append(errs, parser.ValidateOffline(psl)...)

if *debugPrintTree {
debugPrint(psl)
bs := psl.MarshalDebug()
os.Stdout.Write(bs)
fmt.Println("")
}

for _, err := range errs {
fmt.Println(err)
// Maybe write out the reformatted file.
out := psl.MarshalPSL()
changed := !bytes.Equal(bs, out)
switch {
case !safeToRewrite:
// Can't rewrite without potentially destroying information, do
// nothing.
case !changed:
// No changes needed, don't rewrite so that timestamps etc. don't
// change.
case !*reformat:
// We were ordered to not reformat, and format is wrong.
errs = append(errs, fmt.Errorf("file has formatting errors, rerun with --reformat=true to fix"))
default:
if err := atomic.WriteFile(file, bytes.NewReader(out)); err != nil {
errs = append(errs, fmt.Errorf("formatting %q: %v", file, err))
}
}

verrs := parser.ValidateOffline(psl)
for _, err := range verrs {
for _, err := range errs {
fmt.Println(err)
}
fmt.Println("")

if total := len(errs) + len(verrs); total > 0 {
fmt.Printf("\nFile has %d errors.\n", total)
if total := len(errs); total > 0 {
fmt.Printf("File has %d errors.\n", total)
os.Exit(1)
} else if changed {
fmt.Println("File is valid, rewrote to canonical format.")
} else {
fmt.Println("\nFile is valid.")
}
}

// debugPrint prints out a PSL syntax tree in a private, subject to
// change text format.
func debugPrint(b parser.Block) {
debugPrintRec(b, "")
}

func debugPrintRec(b parser.Block, indent string) {
nextIndent := indent + " "
f := func(msg string, args ...any) {
fmt.Printf(indent+msg+"\n", args...)
}
src := b.SrcRange()
loc := fmt.Sprintf("[%d:%d]", src.FirstLine, src.LastLine)
if src.FirstLine+1 == src.LastLine {
loc = strconv.Itoa(src.FirstLine)
}

switch v := b.(type) {
case *parser.List:
f("List(%s) {", loc)
for _, b := range v.Blocks {
debugPrintRec(b, nextIndent)
}
f("}")
case *parser.Blank:
f("Blank(%s)", loc)
case *parser.Comment:
f("Comment(%s) {", loc)
for _, t := range v.Text {
f(" %q,", t)
}
f("}")
case *parser.Section:
f("Section(%s, %q) {", loc, v.Name)
for _, b := range v.Blocks {
debugPrintRec(b, nextIndent)
}
f("}")
case *parser.Suffixes:
items := []string{loc, fmt.Sprintf("editable=%v", v.Info.MachineEditable)}
if v.Info.Name != "" {
items = append(items, fmt.Sprintf("name=%q", v.Info.Name))
}
for _, u := range v.Info.URLs {
items = append(items, fmt.Sprintf("url=%q", u))
}
for _, e := range v.Info.Maintainers {
items = append(items, fmt.Sprintf("contact=%q", e))
}
for _, o := range v.Info.Other {
items = append(items, fmt.Sprintf("other=%q", o))
}

f("SuffixBlock(%s) {", strings.Join(items, fmt.Sprintf(",\n%s ", indent)))
for _, b := range v.Blocks {
debugPrintRec(b, nextIndent)
}
f("}")
case *parser.Suffix:
f("Suffix(%s, %q)", loc, strings.Join(v.Labels, "."))
case *parser.Wildcard:
if len(v.Exceptions) > 0 {
f("Wildcard(%s, %q, except=%v)", loc, strings.Join(v.Labels, "."), v.Exceptions)
} else {
f("Wildcard(%s, %q)", loc, strings.Join(v.Labels, "."))
}
default:
panic("unknown block type")
fmt.Println("File is valid.")
}
}
Loading

0 comments on commit 461e2c2

Please sign in to comment.