Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Iniitial geoparquet support #888

Merged
merged 24 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NOTICE.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ The `planetiler-core` module includes the following software:
, [EPSG](https://github.com/geotools/geotools/blob/main/licenses/EPSG.md))
- org.msgpack:msgpack-core (Apache license)
- org.xerial:sqlite-jdbc (Apache license)
- org.xerial.snappy:snappy-java (Apache license)
- com.ibm.icu:icu4j ([ICU license](https://github.com/unicode-org/icu/blob/main/icu4c/LICENSE))
- com.google.guava:guava (Apache license)
- com.google.protobuf:protobuf-java (BSD 3-Clause License)
Expand All @@ -29,6 +30,7 @@ The `planetiler-core` module includes the following software:
- org.snakeyaml:snakeyaml-engine (Apache license)
- org.commonmark:commonmark (BSD 2-clause license)
- org.tukaani:xz (public domain)
- blue.strategic.parquet:parquet-floor (Apache license)
- Adapted code:
- `DouglasPeuckerSimplifier` from [JTS](https://github.com/locationtech/jts) (EDL)
- `OsmMultipolygon` from [imposm3](https://github.com/omniscale/imposm3) (Apache license)
Expand Down Expand Up @@ -65,4 +67,5 @@ The `planetiler-core` module includes the following software:
| OSM Lakelines | [MIT](https://github.com/lukasmartinelli/osm-lakelines), data from OSM [ODBL](https://www.openstreetmap.org/copyright) | yes | no |
| OSM Water Polygons | [acknowledgement](https://osmdata.openstreetmap.de/info/license.html), data from OSM [ODBL](https://www.openstreetmap.org/copyright) | yes | yes |
| Wikidata name translations | [CCO](https://www.wikidata.org/wiki/Wikidata:Licensing) | no | no |
| Overture Maps | [Various](https://docs.overturemaps.org/attribution) | no | yes |

2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ Planetiler is made possible by these awesome open source projects:
Google's [Common Expression Language](https://github.com/google/cel-spec) that powers dynamic expressions embedded in
schema config files.
- [PMTiles](https://github.com/protomaps/PMTiles) optimized tile storage format
- [Apache Parquet](https://github.com/apache/parquet-mr) to support reading geoparquet files in java (with dependencies
minimized by [parquet-floor](https://github.com/strategicblue/parquet-floor))

See [NOTICE.md](NOTICE.md) for a full list and license details.

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package com.onthegomap.planetiler.benchmarks;

import com.onthegomap.planetiler.config.Arguments;
import com.onthegomap.planetiler.config.Bounds;
import com.onthegomap.planetiler.reader.parquet.ParquetInputFile;
import java.nio.file.Path;

public class BenchmarkParquetRead {

public static void main(String[] args) {
var arguments = Arguments.fromArgs(args);
var path =
arguments.inputFile("parquet", "parquet file to read", Path.of("data", "sources", "locality.zstd.parquet"));
long c = 0;
var file = new ParquetInputFile("parquet", "locality", path, null, Bounds.WORLD, null, tags -> tags.get("id"));
for (int i = 0; i < 20; i++) {
long start = System.currentTimeMillis();
for (var block : file.get()) {
for (var item : block) {
c += item.tags().size();
}
}
System.err.println(System.currentTimeMillis() - start);
}

System.err.println(c);
}
}
22 changes: 17 additions & 5 deletions planetiler-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,24 @@
<artifactId>geopackage</artifactId>
<version>${geopackage.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-nop</artifactId>
</exclusion>
</exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-nop</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Pin transitive snappy dependency to more recent version without vulnerability -->
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
<version>1.1.10.5</version>
</dependency>
<dependency>
<groupId>blue.strategic.parquet</groupId>
<artifactId>parquet-floor</artifactId>
<version>1.41</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
import com.onthegomap.planetiler.reader.GeoPackageReader;
import com.onthegomap.planetiler.reader.NaturalEarthReader;
import com.onthegomap.planetiler.reader.ShapefileReader;
import com.onthegomap.planetiler.reader.SourceFeature;
import com.onthegomap.planetiler.reader.osm.OsmInputFile;
import com.onthegomap.planetiler.reader.osm.OsmNodeBoundsProvider;
import com.onthegomap.planetiler.reader.osm.OsmReader;
import com.onthegomap.planetiler.reader.parquet.ParquetReader;
import com.onthegomap.planetiler.stats.ProcessInfo;
import com.onthegomap.planetiler.stats.Stats;
import com.onthegomap.planetiler.stats.Timers;
Expand All @@ -39,9 +41,12 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.IntStream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -469,6 +474,53 @@ public Planetiler addNaturalEarthSource(String name, Path defaultPath, String de
config, profile, stats, keepUnzipped)));
}


/**
* Adds a new <a href="https://github.com/opengeospatial/geoparquet">geoparquet</a> source that will be processed when
* {@link #run()} is called.
*
* @param name string to use in stats and logs to identify this stage
* @param paths paths to the geoparquet files to read.
* @param hivePartitioning Set to true to parse extra feature tags from the file path, for example
* {@code {them="buildings", type="part"}} from
* {@code base/theme=buildings/type=part/file.parquet}
* @param getId function that extracts a unique vector tile feature ID from each input feature, string or
* binary features will be hashed to a {@code long}.
* @param getLayer function that extracts {@link SourceFeature#getSourceLayer()} from the properties of each
* input feature
* @return this runner instance for chaining
* @see GeoPackageReader
*/
public Planetiler addParquetSource(String name, List<Path> paths, boolean hivePartitioning,
Function<Map<String, Object>, Object> getId, Function<Map<String, Object>, Object> getLayer) {
// TODO handle auto-downloading
for (var path : paths) {
inputPaths.add(new InputPath(name, path, false));
}
var separator = Pattern.quote(paths.isEmpty() ? "/" : paths.getFirst().getFileSystem().getSeparator());
String prefix = StringUtils.getCommonPrefix(paths.stream().map(Path::toString).toArray(String[]::new))
.replaceAll(separator + "[^" + separator + "]*$", "");
return addStage(name, "Process features in " + (prefix.isEmpty() ? (paths.size() + " files") : prefix),
ifSourceUsed(name, () -> new ParquetReader(name, profile, stats, getId, getLayer, hivePartitioning)
.process(paths, featureGroup, config)));
}

/**
* Alias for {@link #addParquetSource(String, List, boolean, Function, Function)} using the default layer and ID
* extractors.
*/
public Planetiler addParquetSource(String name, List<Path> paths, boolean hivePartitioning) {
return addParquetSource(name, paths, hivePartitioning, null, null);
}

/**
* Alias for {@link #addParquetSource(String, List, boolean, Function, Function)} without hive partitioning and using
* the default layer and ID extractors.
*/
public Planetiler addParquetSource(String name, List<Path> paths) {
return addParquetSource(name, paths, false);
}

/**
* Adds a new stage that will be invoked when {@link #run()} is called.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import org.locationtech.jts.geom.LineSegment;
import org.locationtech.jts.geom.LineString;
import org.locationtech.jts.geom.LinearRing;
import org.locationtech.jts.geom.MultiLineString;
import org.locationtech.jts.geom.MultiPoint;
import org.locationtech.jts.geom.MultiPolygon;
import org.locationtech.jts.geom.Point;
import org.locationtech.jts.geom.Polygon;
Expand All @@ -27,6 +29,7 @@
import org.locationtech.jts.geom.util.GeometryFixer;
import org.locationtech.jts.geom.util.GeometryTransformer;
import org.locationtech.jts.io.WKBReader;
import org.locationtech.jts.io.WKTReader;
import org.locationtech.jts.precision.GeometryPrecisionReducer;

/**
Expand All @@ -40,8 +43,9 @@ public class GeoUtils {
/** Rounding precision for 256x256px tiles encoded using 4096 values. */
public static final PrecisionModel TILE_PRECISION = new PrecisionModel(4096d / 256d);
public static final GeometryFactory JTS_FACTORY = new GeometryFactory(PackedCoordinateSequenceFactory.DOUBLE_FACTORY);
public static final WKBReader WKB_READER = new WKBReader(JTS_FACTORY);

public static final Geometry EMPTY_GEOMETRY = JTS_FACTORY.createGeometryCollection();
public static final CoordinateSequence EMPTY_COORDINATE_SEQUENCE = new PackedCoordinateSequence.Double(0, 2, 0);
public static final Point EMPTY_POINT = JTS_FACTORY.createPoint();
public static final LineString EMPTY_LINE = JTS_FACTORY.createLineString();
public static final Polygon EMPTY_POLYGON = JTS_FACTORY.createPolygon();
Expand Down Expand Up @@ -247,11 +251,11 @@ public static Point point(Coordinate coord) {
return JTS_FACTORY.createPoint(coord);
}

public static Geometry createMultiLineString(List<LineString> lineStrings) {
public static MultiLineString createMultiLineString(List<LineString> lineStrings) {
return JTS_FACTORY.createMultiLineString(lineStrings.toArray(EMPTY_LINE_STRING_ARRAY));
}

public static Geometry createMultiPolygon(List<Polygon> polygon) {
public static MultiPolygon createMultiPolygon(List<Polygon> polygon) {
return JTS_FACTORY.createMultiPolygon(polygon.toArray(EMPTY_POLYGON_ARRAY));
}

Expand Down Expand Up @@ -370,7 +374,7 @@ public static CoordinateSequence coordinateSequence(double... coords) {
return new PackedCoordinateSequence.Double(coords, 2, 0);
}

public static Geometry createMultiPoint(List<Point> points) {
public static MultiPoint createMultiPoint(List<Point> points) {
return JTS_FACTORY.createMultiPoint(points.toArray(EMPTY_POINT_ARRAY));
}

Expand Down Expand Up @@ -548,6 +552,14 @@ public static int minZoomForPixelSize(double worldGeometrySize, double minPixelS
PlanetilerConfig.MAX_MAXZOOM);
}

public static WKBReader wkbReader() {
return new WKBReader(JTS_FACTORY);
}

public static WKTReader wktReader() {
return new WKTReader(JTS_FACTORY);
}

/** Helper class to sort polygons by area of their outer shell. */
private record PolyAndArea(Polygon poly, double area) implements Comparable<PolyAndArea> {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,15 @@ public void readFeatures(Consumer<SimpleFeature> next) throws Exception {
}
}
if (geometryColumn >= 0) {
var wkbReader = GeoUtils.wkbReader();
while (rs.next()) {
byte[] geometry = rs.getBytes(geometryColumn + 1);
if (geometry == null) {
continue;
}

// create the feature and pass to next stage
Geometry latLonGeometry = GeoUtils.WKB_READER.read(geometry);
Geometry latLonGeometry = wkbReader.read(geometry);
SimpleFeature readerGeometry = SimpleFeature.create(latLonGeometry, HashMap.newHashMap(column.length - 1),
sourceName, table, ++id);
for (int c = 0; c < column.length; c++) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
package com.onthegomap.planetiler.reader.parquet;

import com.onthegomap.planetiler.geo.GeoUtils;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import org.locationtech.jts.geom.CoordinateSequence;
import org.locationtech.jts.geom.LineString;
import org.locationtech.jts.geom.LinearRing;
import org.locationtech.jts.geom.MultiLineString;
import org.locationtech.jts.geom.MultiPoint;
import org.locationtech.jts.geom.MultiPolygon;
import org.locationtech.jts.geom.Point;
import org.locationtech.jts.geom.Polygon;
import org.locationtech.jts.geom.impl.PackedCoordinateSequence;

/**
* Utilities for converting nested <a href=
* "https://github.com/opengeospatial/geoparquet/blob/main/format-specs/geoparquet.md#native-encodings-based-on-geoarrow">geoarrow</a>
* coordinate lists to JTS geometries.
*/
class GeoArrow {
private GeoArrow() {}

// TODO create packed coordinate arrays while reading parquet values to avoid creating so many intermediate objects
static MultiPolygon multipolygon(List<List<List<Object>>> list) {
return GeoUtils.createMultiPolygon(map(list, GeoArrow::polygon));
}

static Polygon polygon(List<List<Object>> input) {
return GeoUtils.createPolygon(ring(input.getFirst()), input.stream().skip(1).map(GeoArrow::ring).toList());
}

static MultiPoint multipoint(List<Object> input) {
return GeoUtils.createMultiPoint(map(input, GeoArrow::point));
}

static Point point(Object input) {
int dims = input instanceof List<?> l ? l.size() : input instanceof Map<?, ?> m ? m.size() : 0;
CoordinateSequence result =
new PackedCoordinateSequence.Double(1, dims, dims == 4 ? 1 : 0);
coordinate(input, result, 0);
return GeoUtils.JTS_FACTORY.createPoint(result);
}

static MultiLineString multilinestring(List<List<Object>> input) {
return GeoUtils.createMultiLineString(map(input, GeoArrow::linestring));
}

static LineString linestring(List<Object> input) {
return GeoUtils.JTS_FACTORY.createLineString(coordinateSequence(input));
}


private static CoordinateSequence coordinateSequence(List<Object> input) {
if (input.isEmpty()) {
return GeoUtils.EMPTY_COORDINATE_SEQUENCE;
}
Object first = input.getFirst();
int dims = first instanceof List<?> l ? l.size() : first instanceof Map<?, ?> m ? m.size() : 0;
CoordinateSequence result =
new PackedCoordinateSequence.Double(input.size(), dims, dims == 4 ? 1 : 0);
for (int i = 0; i < input.size(); i++) {
Object item = input.get(i);
coordinate(item, result, i);
}
return result;
}

private static LinearRing ring(List<Object> input) {
return GeoUtils.JTS_FACTORY.createLinearRing(coordinateSequence(input));
}

private static void coordinate(Object input, CoordinateSequence result, int index) {
switch (input) {
case List<?> list -> {
List<Number> l = (List<Number>) list;
for (int i = 0; i < l.size(); i++) {
result.setOrdinate(index, i, l.get(i).doubleValue());
}
}
case Map<?, ?> map -> {
Map<String, Number> m = (Map<String, Number>) map;

for (var entry : m.entrySet()) {
int ordinateIndex = switch (entry.getKey()) {
case "x" -> 0;
case "y" -> 1;
case "z" -> 2;
case "m" -> 3;
case null, default -> throw new IllegalArgumentException("Bad coordinate key: " + entry.getKey());
};
result.setOrdinate(index, ordinateIndex, entry.getValue().doubleValue());
}
}
default -> throw new IllegalArgumentException("Expecting map or list, got: " + input);
}
}

private static <I, O> List<O> map(List<I> in, Function<I, O> remap) {
return in.stream().map(remap).toList();
}

}
Loading
Loading