Skip to content

Commit

Permalink
Iniitial geoparquet support (#888)
Browse files Browse the repository at this point in the history
  • Loading branch information
msbarry authored May 22, 2024
1 parent 2b878fa commit fb1d0e3
Show file tree
Hide file tree
Showing 47 changed files with 3,492 additions and 23 deletions.
3 changes: 3 additions & 0 deletions NOTICE.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The `planetiler-core` module includes the following software:
, [EPSG](https://github.com/geotools/geotools/blob/main/licenses/EPSG.md))
- org.msgpack:msgpack-core (Apache license)
- org.xerial:sqlite-jdbc (Apache license)
- org.xerial.snappy:snappy-java (Apache license)
- com.ibm.icu:icu4j ([ICU license](https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE))
- com.google.guava:guava (Apache license)
- com.google.protobuf:protobuf-java (BSD 3-Clause License)
Expand All @@ -29,6 +30,7 @@ The `planetiler-core` module includes the following software:
- org.snakeyaml:snakeyaml-engine (Apache license)
- org.commonmark:commonmark (BSD 2-clause license)
- org.tukaani:xz (public domain)
- blue.strategic.parquet:parquet-floor (Apache license)
- Adapted code:
- `DouglasPeuckerSimplifier` from [JTS](https://github.com/locationtech/jts) (EDL)
- `OsmMultipolygon` from [imposm3](https://github.com/omniscale/imposm3) (Apache license)
Expand Down Expand Up @@ -65,4 +67,5 @@ The `planetiler-core` module includes the following software:
| OSM Lakelines | [MIT](https://github.com/lukasmartinelli/osm-lakelines), data from OSM [ODBL](https://www.openstreetmap.org/copyright) | yes | no |
| OSM Water Polygons | [acknowledgement](https://osmdata.openstreetmap.de/info/license.html), data from OSM [ODBL](https://www.openstreetmap.org/copyright) | yes | yes |
| Wikidata name translations | [CCO](https://www.wikidata.org/wiki/Wikidata:Licensing) | no | no |
| Overture Maps | [Various](https://docs.overturemaps.org/attribution) | no | yes |

2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ Planetiler is made possible by these awesome open source projects:
Google's [Common Expression Language](https://github.com/google/cel-spec) that powers dynamic expressions embedded in
schema config files.
- [PMTiles](https://github.com/protomaps/PMTiles) optimized tile storage format
- [Apache Parquet](https://github.com/apache/parquet-mr) to support reading geoparquet files in java (with dependencies
minimized by [parquet-floor](https://github.com/strategicblue/parquet-floor))

See [NOTICE.md](NOTICE.md) for a full list and license details.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package com.onthegomap.planetiler.benchmarks;

import com.onthegomap.planetiler.config.Arguments;
import com.onthegomap.planetiler.config.Bounds;
import com.onthegomap.planetiler.reader.parquet.ParquetInputFile;
import java.nio.file.Path;

public class BenchmarkParquetRead {

public static void main(String[] args) {
var arguments = Arguments.fromArgs(args);
var path =
arguments.inputFile("parquet", "parquet file to read", Path.of("data", "sources", "locality.zstd.parquet"));
long c = 0;
var file = new ParquetInputFile("parquet", "locality", path, null, Bounds.WORLD, null, tags -> tags.get("id"));
for (int i = 0; i < 20; i++) {
long start = System.currentTimeMillis();
for (var block : file.get()) {
for (var item : block) {
c += item.tags().size();
}
}
System.err.println(System.currentTimeMillis() - start);
}

System.err.println(c);
}
}
22 changes: 17 additions & 5 deletions planetiler-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,24 @@
<artifactId>geopackage</artifactId>
<version>${geopackage.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-nop</artifactId>
</exclusion>
</exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-nop</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Pin transitive snappy dependency to more recent version without vulnerability -->
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
<version>1.1.10.5</version>
</dependency>
<dependency>
<groupId>blue.strategic.parquet</groupId>
<artifactId>parquet-floor</artifactId>
<version>1.41</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
import com.onthegomap.planetiler.reader.GeoPackageReader;
import com.onthegomap.planetiler.reader.NaturalEarthReader;
import com.onthegomap.planetiler.reader.ShapefileReader;
import com.onthegomap.planetiler.reader.SourceFeature;
import com.onthegomap.planetiler.reader.osm.OsmInputFile;
import com.onthegomap.planetiler.reader.osm.OsmNodeBoundsProvider;
import com.onthegomap.planetiler.reader.osm.OsmReader;
import com.onthegomap.planetiler.reader.parquet.ParquetReader;
import com.onthegomap.planetiler.stats.ProcessInfo;
import com.onthegomap.planetiler.stats.Stats;
import com.onthegomap.planetiler.stats.Timers;
Expand All @@ -39,9 +41,12 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -469,6 +474,53 @@ public Planetiler addNaturalEarthSource(String name, Path defaultPath, String de
config, profile, stats, keepUnzipped)));
}


/**
* Adds a new <a href="https://github.com/opengeospatial/geoparquet">geoparquet</a> source that will be processed when
* {@link #run()} is called.
*
* @param name string to use in stats and logs to identify this stage
* @param paths paths to the geoparquet files to read.
* @param hivePartitioning Set to true to parse extra feature tags from the file path, for example
* {@code {them="buildings", type="part"}} from
* {@code base/theme=buildings/type=part/file.parquet}
* @param getId function that extracts a unique vector tile feature ID from each input feature, string or
* binary features will be hashed to a {@code long}.
* @param getLayer function that extracts {@link SourceFeature#getSourceLayer()} from the properties of each
* input feature
* @return this runner instance for chaining
* @see GeoPackageReader
*/
public Planetiler addParquetSource(String name, List<Path> paths, boolean hivePartitioning,
Function<Map<String, Object>, Object> getId, Function<Map<String, Object>, Object> getLayer) {
// TODO handle auto-downloading
for (var path : paths) {
inputPaths.add(new InputPath(name, path, false));
}
var separator = Pattern.quote(paths.isEmpty() ? "/" : paths.getFirst().getFileSystem().getSeparator());
String prefix = StringUtils.getCommonPrefix(paths.stream().map(Path::toString).toArray(String[]::new))
.replaceAll(separator + "[^" + separator + "]*$", "");
return addStage(name, "Process features in " + (prefix.isEmpty() ? (paths.size() + " files") : prefix),
ifSourceUsed(name, () -> new ParquetReader(name, profile, stats, getId, getLayer, hivePartitioning)
.process(paths, featureGroup, config)));
}

/**
* Alias for {@link #addParquetSource(String, List, boolean, Function, Function)} using the default layer and ID
* extractors.
*/
public Planetiler addParquetSource(String name, List<Path> paths, boolean hivePartitioning) {
return addParquetSource(name, paths, hivePartitioning, null, null);
}

/**
* Alias for {@link #addParquetSource(String, List, boolean, Function, Function)} without hive partitioning and using
* the default layer and ID extractors.
*/
public Planetiler addParquetSource(String name, List<Path> paths) {
return addParquetSource(name, paths, false);
}

/**
* Adds a new stage that will be invoked when {@link #run()} is called.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import org.locationtech.jts.geom.LineSegment;
import org.locationtech.jts.geom.LineString;
import org.locationtech.jts.geom.LinearRing;
import org.locationtech.jts.geom.MultiLineString;
import org.locationtech.jts.geom.MultiPoint;
import org.locationtech.jts.geom.MultiPolygon;
import org.locationtech.jts.geom.Point;
import org.locationtech.jts.geom.Polygon;
Expand All @@ -27,6 +29,7 @@
import org.locationtech.jts.geom.util.GeometryFixer;
import org.locationtech.jts.geom.util.GeometryTransformer;
import org.locationtech.jts.io.WKBReader;
import org.locationtech.jts.io.WKTReader;
import org.locationtech.jts.precision.GeometryPrecisionReducer;

/**
Expand All @@ -40,8 +43,9 @@ public class GeoUtils {
/** Rounding precision for 256x256px tiles encoded using 4096 values. */
public static final PrecisionModel TILE_PRECISION = new PrecisionModel(4096d / 256d);
public static final GeometryFactory JTS_FACTORY = new GeometryFactory(PackedCoordinateSequenceFactory.DOUBLE_FACTORY);
public static final WKBReader WKB_READER = new WKBReader(JTS_FACTORY);

public static final Geometry EMPTY_GEOMETRY = JTS_FACTORY.createGeometryCollection();
public static final CoordinateSequence EMPTY_COORDINATE_SEQUENCE = new PackedCoordinateSequence.Double(0, 2, 0);
public static final Point EMPTY_POINT = JTS_FACTORY.createPoint();
public static final LineString EMPTY_LINE = JTS_FACTORY.createLineString();
public static final Polygon EMPTY_POLYGON = JTS_FACTORY.createPolygon();
Expand Down Expand Up @@ -247,11 +251,11 @@ public static Point point(Coordinate coord) {
return JTS_FACTORY.createPoint(coord);
}

public static Geometry createMultiLineString(List<LineString> lineStrings) {
public static MultiLineString createMultiLineString(List<LineString> lineStrings) {
return JTS_FACTORY.createMultiLineString(lineStrings.toArray(EMPTY_LINE_STRING_ARRAY));
}

public static Geometry createMultiPolygon(List<Polygon> polygon) {
public static MultiPolygon createMultiPolygon(List<Polygon> polygon) {
return JTS_FACTORY.createMultiPolygon(polygon.toArray(EMPTY_POLYGON_ARRAY));
}

Expand Down Expand Up @@ -370,7 +374,7 @@ public static CoordinateSequence coordinateSequence(double... coords) {
return new PackedCoordinateSequence.Double(coords, 2, 0);
}

public static Geometry createMultiPoint(List<Point> points) {
public static MultiPoint createMultiPoint(List<Point> points) {
return JTS_FACTORY.createMultiPoint(points.toArray(EMPTY_POINT_ARRAY));
}

Expand Down Expand Up @@ -548,6 +552,14 @@ public static int minZoomForPixelSize(double worldGeometrySize, double minPixelS
PlanetilerConfig.MAX_MAXZOOM);
}

public static WKBReader wkbReader() {
return new WKBReader(JTS_FACTORY);
}

public static WKTReader wktReader() {
return new WKTReader(JTS_FACTORY);
}

/** Helper class to sort polygons by area of their outer shell. */
private record PolyAndArea(Polygon poly, double area) implements Comparable<PolyAndArea> {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,15 @@ public void readFeatures(Consumer<SimpleFeature> next) throws Exception {
}
}
if (geometryColumn >= 0) {
var wkbReader = GeoUtils.wkbReader();
while (rs.next()) {
byte[] geometry = rs.getBytes(geometryColumn + 1);
if (geometry == null) {
continue;
}

// create the feature and pass to next stage
Geometry latLonGeometry = GeoUtils.WKB_READER.read(geometry);
Geometry latLonGeometry = wkbReader.read(geometry);
SimpleFeature readerGeometry = SimpleFeature.create(latLonGeometry, HashMap.newHashMap(column.length - 1),
sourceName, table, ++id);
for (int c = 0; c < column.length; c++) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package com.onthegomap.planetiler.reader.parquet;

import com.onthegomap.planetiler.geo.GeoUtils;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import org.locationtech.jts.geom.CoordinateSequence;
import org.locationtech.jts.geom.LineString;
import org.locationtech.jts.geom.LinearRing;
import org.locationtech.jts.geom.MultiLineString;
import org.locationtech.jts.geom.MultiPoint;
import org.locationtech.jts.geom.MultiPolygon;
import org.locationtech.jts.geom.Point;
import org.locationtech.jts.geom.Polygon;
import org.locationtech.jts.geom.impl.PackedCoordinateSequence;

/**
* Utilities for converting nested <a href=
* "https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#native-encodings-based-on-geoarrow">geoarrow</a>
* coordinate lists to JTS geometries.
*/
class GeoArrow {
private GeoArrow() {}

// TODO create packed coordinate arrays while reading parquet values to avoid creating so many intermediate objects
static MultiPolygon multipolygon(List<List<List<Object>>> list) {
return GeoUtils.createMultiPolygon(map(list, GeoArrow::polygon));
}

static Polygon polygon(List<List<Object>> input) {
return GeoUtils.createPolygon(ring(input.getFirst()), input.stream().skip(1).map(GeoArrow::ring).toList());
}

static MultiPoint multipoint(List<Object> input) {
return GeoUtils.createMultiPoint(map(input, GeoArrow::point));
}

static Point point(Object input) {
int dims = input instanceof List<?> l ? l.size() : input instanceof Map<?, ?> m ? m.size() : 0;
CoordinateSequence result =
new PackedCoordinateSequence.Double(1, dims, dims == 4 ? 1 : 0);
coordinate(input, result, 0);
return GeoUtils.JTS_FACTORY.createPoint(result);
}

static MultiLineString multilinestring(List<List<Object>> input) {
return GeoUtils.createMultiLineString(map(input, GeoArrow::linestring));
}

static LineString linestring(List<Object> input) {
return GeoUtils.JTS_FACTORY.createLineString(coordinateSequence(input));
}


private static CoordinateSequence coordinateSequence(List<Object> input) {
if (input.isEmpty()) {
return GeoUtils.EMPTY_COORDINATE_SEQUENCE;
}
Object first = input.getFirst();
int dims = first instanceof List<?> l ? l.size() : first instanceof Map<?, ?> m ? m.size() : 0;
CoordinateSequence result =
new PackedCoordinateSequence.Double(input.size(), dims, dims == 4 ? 1 : 0);
for (int i = 0; i < input.size(); i++) {
Object item = input.get(i);
coordinate(item, result, i);
}
return result;
}

private static LinearRing ring(List<Object> input) {
return GeoUtils.JTS_FACTORY.createLinearRing(coordinateSequence(input));
}

private static void coordinate(Object input, CoordinateSequence result, int index) {
switch (input) {
case List<?> list -> {
List<Number> l = (List<Number>) list;
for (int i = 0; i < l.size(); i++) {
result.setOrdinate(index, i, l.get(i).doubleValue());
}
}
case Map<?, ?> map -> {
Map<String, Number> m = (Map<String, Number>) map;

for (var entry : m.entrySet()) {
int ordinateIndex = switch (entry.getKey()) {
case "x" -> 0;
case "y" -> 1;
case "z" -> 2;
case "m" -> 3;
case null, default -> throw new IllegalArgumentException("Bad coordinate key: " + entry.getKey());
};
result.setOrdinate(index, ordinateIndex, entry.getValue().doubleValue());
}
}
default -> throw new IllegalArgumentException("Expecting map or list, got: " + input);
}
}

private static <I, O> List<O> map(List<I> in, Function<I, O> remap) {
return in.stream().map(remap).toList();
}

}
Loading

0 comments on commit fb1d0e3

Please sign in to comment.