From dac421a4a9183a77888e2c37d95c4e7ca5ce88d2 Mon Sep 17 00:00:00 2001
From: Brandon Liu
Date: Tue, 12 Sep 2023 15:33:32 +0800
Subject: [PATCH] Experimental stats output [#70] (#75)

* Output a .tsv.gz of all non-duplicate ZXY tiles with their compressed length.
---
 .gitignore       |   1 +
 main.go          |  19 +++++++-
 pmtiles/stats.go | 118 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 pmtiles/stats.go

diff --git a/.gitignore b/.gitignore
index 09db075..f93b9dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ dist/
 go-pmtiles
 *.pmtiles
 *.geojson
+*.tsv.gz
diff --git a/main.go b/main.go
index f2e2151..7697162 100644
--- a/main.go
+++ b/main.go
@@ -55,6 +55,16 @@ var cli struct {
         Overfetch float32 `default:0.05 help:"What ratio of extra data to download to minimize # requests; 0.2 is 20%"`
     } `cmd:"" help:"Create an archive from a larger archive for a subset of zoom levels or geographic region."`
 
+    Makesync struct {
+        Input        string `arg:"" type:"existingfile"`
+        BlockSize    int    `default:1000 help:"The block size, in # of tiles."`
+        HashFunction string `default:fnv1a help:"The hash function."`
+    } `cmd:"" help:"Generates an **experimental** sync control file (.pmtiles.sync) for a local archive."`
+
+    Stats struct {
+        Input string `arg:"" type:"existingfile"`
+    } `cmd:"" help:"Add a vector tile statistics file (.stats.tsv.gz) used for further analysis with DuckDB."`
+
     Verify struct {
         Input string `arg:"" help:"Input archive." type:"existingfile"`
     } `cmd:"" help:"Verifies that a local archive is valid."`
@@ -91,12 +101,12 @@ func main() {
     case "show <path>":
         err := pmtiles.Show(logger, cli.Show.Bucket, cli.Show.Path, false, 0, 0, 0)
         if err != nil {
-            logger.Fatalf("Failed to show database, %v", err)
+            logger.Fatalf("Failed to show archive, %v", err)
         }
     case "tile <path> <z> <x> <y>":
         err := pmtiles.Show(logger, cli.Tile.Bucket, cli.Tile.Path, true, cli.Tile.Z, cli.Tile.X, cli.Tile.Y)
         if err != nil {
-            logger.Fatalf("Failed to show database, %v", err)
+            logger.Fatalf("Failed to show tile, %v", err)
         }
     case "serve <path>":
         server, err := pmtiles.NewServer(cli.Serve.Bucket, cli.Serve.Path, logger, cli.Serve.CacheSize, cli.Serve.Cors, cli.Serve.PublicHostname)
@@ -125,6 +135,11 @@ func main() {
         if err != nil {
             logger.Fatalf("Failed to extract, %v", err)
         }
+    case "stats <input>":
+        err := pmtiles.Stats(logger, cli.Stats.Input)
+        if err != nil {
+            logger.Fatalf("Failed to compute stats for archive, %v", err)
+        }
     case "convert <input> <output>":
         path := cli.Convert.Input
         output := cli.Convert.Output
diff --git a/pmtiles/stats.go b/pmtiles/stats.go
new file mode 100644
index 0000000..cefa678
--- /dev/null
+++ b/pmtiles/stats.go
@@ -0,0 +1,118 @@
+package pmtiles
+
+import (
+    "bytes"
+    "compress/gzip"
+    "context"
+    "encoding/csv"
+    "fmt"
+    "github.com/RoaringBitmap/roaring/roaring64"
+    "io"
+    "log"
+    "os"
+    "strconv"
+    "time"
+)
+
+func Stats(logger *log.Logger, file string) error {
+    start := time.Now()
+    ctx := context.Background()
+
+    bucketURL, key, err := NormalizeBucketKey("", "", file)
+
+    if err != nil {
+        return err
+    }
+
+    bucket, err := OpenBucket(ctx, bucketURL, "")
+
+    if err != nil {
+        return fmt.Errorf("Failed to open bucket for %s, %w", bucketURL, err)
+    }
+    defer bucket.Close()
+
+    r, err := bucket.NewRangeReader(ctx, key, 0, 16384)
+
+    if err != nil {
+        return fmt.Errorf("Failed to create range reader for %s, %w", key, err)
+    }
+    b, err := io.ReadAll(r)
+    if err != nil {
+        return fmt.Errorf("Failed to read %s, %w", key, err)
+    }
+    r.Close()
+
+    header, err := deserialize_header(b[0:HEADERV3_LEN_BYTES])
+
+    if header.TileType != Mvt {
+        return fmt.Errorf("Stats only works on MVT vector tilesets.")
+    }
+
+    // Pass 1: walk the entire entry set, finding all non-duplicated tiles.
+
+    var CollectEntries func(uint64, uint64, func(EntryV3))
+
+    CollectEntries = func(dir_offset uint64, dir_length uint64, f func(EntryV3)) {
+        dirbytes, err := bucket.NewRangeReader(ctx, key, int64(dir_offset), int64(dir_length))
+        if err != nil {
+            panic(fmt.Errorf("I/O error: %w", err))
+        }
+        defer dirbytes.Close()
+        b, err = io.ReadAll(dirbytes)
+        if err != nil {
+            panic(fmt.Errorf("I/O error: %w", err))
+        }
+
+        directory := deserialize_entries(bytes.NewBuffer(b))
+        for _, entry := range directory {
+            if entry.RunLength > 0 {
+                f(entry)
+            } else {
+                CollectEntries(header.LeafDirectoryOffset+entry.Offset, uint64(entry.Length), f)
+            }
+        }
+    }
+
+    seen_once := roaring64.New()
+    seen_twice := roaring64.New()
+    CollectEntries(header.RootOffset, header.RootLength, func(e EntryV3) {
+        if seen_once.Contains(e.Offset) {
+            seen_twice.Add(e.Offset)
+        }
+        seen_once.Add(e.Offset)
+    })
+
+    seen_once.AndNot(seen_twice)
+    fmt.Println("Non-duplicate tiles:", seen_once.GetCardinality())
+
+    // Pass 2: walk the entries again and write one TSV row per non-duplicate tile.
+
+    output, err := os.Create(file + ".stats.tsv.gz")
+    if err != nil {
+        return fmt.Errorf("Failed to create output")
+    }
+    defer output.Close()
+
+    gzWriter := gzip.NewWriter(output)
+    defer gzWriter.Close()
+
+    csvWriter := csv.NewWriter(gzWriter)
+    csvWriter.Comma = '\t'
+    defer csvWriter.Flush()
+    if err := csvWriter.Write([]string{"z", "x", "y", "bytes_compressed"}); err != nil {
+        return fmt.Errorf("Failed to write header to TSV: %v", err)
+    }
+
+    CollectEntries(header.RootOffset, header.RootLength, func(e EntryV3) {
+        if seen_once.Contains(e.Offset) {
+            z, x, y := IdToZxy(e.TileId)
+            row := []string{strconv.FormatUint(uint64(z), 10), strconv.FormatUint(uint64(x), 10), strconv.FormatUint(uint64(y), 10), strconv.FormatUint(uint64(e.Length), 10)}
+            if err := csvWriter.Write(row); err != nil {
+                panic(fmt.Errorf("Failed to write record to TSV: %v", err))
+            }
+        }
+    })
+
+    fmt.Printf("Completed stats in %v.\n", time.Since(start))
+    return nil
+}
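
The Stats help text above points at downstream analysis of the generated TSV (it names DuckDB). As a rough illustration of that kind of follow-up, the short standalone Go program below aggregates tile counts and compressed bytes per zoom level from the z, x, y, bytes_compressed columns. It is a sketch, not part of the patch: the input file name example.pmtiles.stats.tsv.gz is hypothetical and stands in for whatever <input>.stats.tsv.gz the stats command produced.

package main

import (
    "compress/gzip"
    "encoding/csv"
    "fmt"
    "io"
    "log"
    "os"
    "sort"
    "strconv"
)

func main() {
    // Hypothetical file name; substitute the <input>.stats.tsv.gz written by `pmtiles stats`.
    f, err := os.Open("example.pmtiles.stats.tsv.gz")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    gzReader, err := gzip.NewReader(f)
    if err != nil {
        log.Fatal(err)
    }
    defer gzReader.Close()

    tsv := csv.NewReader(gzReader)
    tsv.Comma = '\t'

    // Skip the header row (z, x, y, bytes_compressed) written by the stats command.
    if _, err := tsv.Read(); err != nil {
        log.Fatal(err)
    }

    tileCount := map[int]int{}    // zoom level -> number of non-duplicate tiles
    byteTotal := map[int]uint64{} // zoom level -> total compressed bytes

    for {
        row, err := tsv.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            log.Fatal(err)
        }
        z, err := strconv.Atoi(row[0])
        if err != nil {
            log.Fatal(err)
        }
        n, err := strconv.ParseUint(row[3], 10, 64)
        if err != nil {
            log.Fatal(err)
        }
        tileCount[z]++
        byteTotal[z] += n
    }

    // Report one line per zoom level, in ascending order.
    zooms := make([]int, 0, len(tileCount))
    for z := range tileCount {
        zooms = append(zooms, z)
    }
    sort.Ints(zooms)
    for _, z := range zooms {
        fmt.Printf("z%d: %d tiles, %d bytes compressed\n", z, tileCount[z], byteTotal[z])
    }
}

Because the stats file holds one row per non-duplicate tile, the per-zoom totals indicate where the archive's unique tile bytes are concentrated.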