From 50ad84db1639795048f38bd573ce96cca6c806ff Mon Sep 17 00:00:00 2001 From: Mike Barry Date: Sun, 19 May 2024 15:49:04 -0400 Subject: [PATCH] fix windows glob matching --- .../com/onthegomap/planetiler/Planetiler.java | 64 ++++++------------ .../onthegomap/planetiler/util/FileUtils.java | 36 ---------- .../com/onthegomap/planetiler/util/Glob.java | 55 ++++++++++++++++ .../planetiler/PlanetilerTests.java | 2 +- .../reader/parquet/ParquetInputFileTest.java | 4 +- .../reader/parquet/ParquetReaderTest.java | 4 +- .../planetiler/util/FileUtilsTest.java | 43 +----------- .../onthegomap/planetiler/util/GlobTest.java | 66 +++++++++++++++++++ .../examples/overture/OvertureBasemap.java | 5 +- 9 files changed, 153 insertions(+), 126 deletions(-) create mode 100644 planetiler-core/src/main/java/com/onthegomap/planetiler/util/Glob.java create mode 100644 planetiler-core/src/test/java/com/onthegomap/planetiler/util/GlobTest.java diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java index 86a39eb6ac..59eef64fb3 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/Planetiler.java @@ -478,8 +478,7 @@ public Planetiler addNaturalEarthSource(String name, Path defaultPath, String de * {@link #run()} is called. * * @param name string to use in stats and logs to identify this stage - * @param pattern path to the geoparquet file to read, possibly including - * {@linkplain FileSystem#getPathMatcher(String) glob patterns} + * @param paths paths to the geoparquet files to read. * @param hivePartitioning Set to true to parse extra feature tags from the file path, for example * {@code {them="buildings", type="part"}} from * {@code base/theme=buildings/type=part/file.parquet} @@ -490,31 +489,32 @@ public Planetiler addNaturalEarthSource(String name, Path defaultPath, String de * @return this runner instance for chaining * @see GeoPackageReader */ - public Planetiler addParquetSource(String name, Path pattern, boolean hivePartitioning, + public Planetiler addParquetSource(String name, List paths, boolean hivePartitioning, Function, Object> getId, Function, Object> getLayer) { // TODO handle auto-downloading - Path path = getPath(name, "parquet", pattern, null, true); - return addStage(name, "Process features in " + path, ifSourceUsed(name, () -> { - var sourcePaths = FileUtils.walkPathWithPattern(path).stream().filter(Files::isRegularFile).toList(); - new ParquetReader(name, profile, stats, getId, getLayer, hivePartitioning).process(sourcePaths, featureGroup, + for (var path : paths) { + inputPaths.add(new InputPath(name, path, false)); + } + return addStage(name, "Process features in " + paths, ifSourceUsed(name, () -> { + new ParquetReader(name, profile, stats, getId, getLayer, hivePartitioning).process(paths, featureGroup, config); })); } /** - * Alias for {@link #addParquetSource(String, Path, boolean, Function, Function)} using the default layer and ID + * Alias for {@link #addParquetSource(String, List, boolean, Function, Function)} using the default layer and ID * extractors. */ - public Planetiler addParquetSource(String name, Path pattern, boolean hivePartitioning) { - return addParquetSource(name, pattern, hivePartitioning, null, null); + public Planetiler addParquetSource(String name, List paths, boolean hivePartitioning) { + return addParquetSource(name, paths, hivePartitioning, null, null); } /** - * Alias for {@link #addParquetSource(String, Path, boolean, Function, Function)} without hive partitioning and using + * Alias for {@link #addParquetSource(String, List, boolean, Function, Function)} without hive partitioning and using * the default layer and ID extractors. */ - public Planetiler addParquetSource(String name, Path pattern) { - return addParquetSource(name, pattern, false); + public Planetiler addParquetSource(String name, List paths) { + return addParquetSource(name, paths, false); } /** @@ -818,7 +818,7 @@ public void run() throws Exception { for (var inputPath : inputPaths) { if (inputPath.freeAfterReading()) { LOGGER.info("Deleting {} ({}) to make room for output file", inputPath.id, inputPath.path); - inputPath.delete(); + FileUtils.delete(inputPath.path()); } } @@ -858,7 +858,7 @@ private void checkDiskSpace() { // if the user opts to remove an input source after reading to free up additional space for the output... for (var input : inputPaths) { if (input.freeAfterReading()) { - writePhase.addDisk(input.path, -input.size(), "delete " + input.id + " source after reading"); + writePhase.addDisk(input.path, -FileUtils.size(input.path), "delete " + input.id + " source after reading"); } } @@ -941,23 +941,18 @@ private RunnableThatThrows ifSourceUsed(String name, RunnableThatThrows task) { } private Path getPath(String name, String type, Path defaultPath, String defaultUrl) { - return getPath(name, type, defaultPath, defaultUrl, false); - } - - private Path getPath(String name, String type, Path defaultPath, String defaultUrl, boolean wildcard) { Path path = arguments.file(name + "_path", name + " " + type + " path", defaultPath); boolean refresh = arguments.getBoolean("refresh_" + name, "Download new version of " + name + " if changed", refreshSources); boolean freeAfterReading = arguments.getBoolean("free_" + name + "_after_read", "delete " + name + " input file after reading to make space for output (reduces peak disk usage)", false); - var inputPath = new InputPath(name, path, freeAfterReading, wildcard); - inputPaths.add(inputPath); if (downloadSources || refresh) { String url = arguments.getString(name + "_url", name + " " + type + " url", defaultUrl); - if ((refresh || inputPath.isEmpty()) && url != null) { - toDownload.add(new ToDownload(name, url, path, wildcard)); + if ((!Files.exists(path) || refresh) && url != null) { + toDownload.add(new ToDownload(name, url, path)); } } + inputPaths.add(new InputPath(name, path, freeAfterReading)); return path; } @@ -975,7 +970,7 @@ private void download() { private void ensureInputFilesExist() { for (InputPath inputPath : inputPaths) { - if (profile.caresAboutSource(inputPath.id) && inputPath.isEmpty()) { + if (profile.caresAboutSource(inputPath.id) && !Files.exists(inputPath.path)) { throw new IllegalArgumentException(inputPath.path + " does not exist. Run with --download to fetch it"); } } @@ -988,24 +983,7 @@ private record Stage(String id, List details, RunnableThatThrows task) { } } - private record ToDownload(String id, String url, Path path, boolean wildcard) {} - - private record InputPath(String id, Path path, boolean freeAfterReading, boolean wildcard) { - - public boolean isEmpty() { - return wildcard ? FileUtils.walkPathWithPattern(path).isEmpty() : !Files.exists(path); - } - - public long size() { - return wildcard ? FileUtils.size(FileUtils.getPatternBase(path)) : FileUtils.fileSize(path); - } + private record ToDownload(String id, String url, Path path) {} - public void delete() { - if (wildcard) { - FileUtils.delete(FileUtils.getPatternBase(path)); - } else { - FileUtils.delete(path); - } - } - } + private record InputPath(String id, Path path, boolean freeAfterReading) {} } diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java index 6874da68d1..431cc3ddea 100644 --- a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/FileUtils.java @@ -23,7 +23,6 @@ import java.util.List; import java.util.Objects; import java.util.function.Function; -import java.util.regex.Pattern; import java.util.stream.Stream; import java.util.stream.StreamSupport; import java.util.zip.ZipEntry; @@ -43,7 +42,6 @@ public class FileUtils { private static final double ZIP_THRESHOLD_RATIO = 1_000; private static final Logger LOGGER = LoggerFactory.getLogger(FileUtils.class); - private static final Pattern GLOB_PATTERN = Pattern.compile("[?*{\\[].*$"); private FileUtils() {} @@ -115,38 +113,6 @@ public static List walkPathWithPattern(Path basePath, String pattern) { return walkPathWithPattern(basePath, pattern, List::of); } - /** - * Returns list of paths matching {@param pathWithPattern} where {@param pathWithPattern} can contain glob patterns. - * - * @param pathWithPattern path that can contain glob patterns - */ - public static List walkPathWithPattern(Path pathWithPattern) { - var parsed = parsePattern(pathWithPattern); - return parsed.pattern == null ? List.of(parsed.base) : walkPathWithPattern(parsed.base, parsed.pattern, List::of); - } - - - /** - * Returns list of base of {@param pathWithPattern} before any glob patterns. - */ - public static Path getPatternBase(Path pathWithPattern) { - return parsePattern(pathWithPattern).base; - } - - static BaseWithPattern parsePattern(Path pattern) { - String string = pattern.toString(); - var matcher = GLOB_PATTERN.matcher(string); - if (!matcher.find()) { - return new BaseWithPattern(pattern, null); - } - matcher.reset(); - String base = matcher.replaceAll(""); - int idx = base.lastIndexOf(pattern.getFileSystem().getSeparator()); - if (idx > 0) { - base = base.substring(0, idx); - } - return new BaseWithPattern(Path.of(base), string.substring(idx + 1)); - } /** Returns true if {@code path} ends with ".extension" (case-insensitive). */ public static boolean hasExtension(Path path, String extension) { @@ -419,6 +385,4 @@ public static void setLength(Path path, long size) { throw new UncheckedIOException(e); } } - - record BaseWithPattern(Path base, String pattern) {} } diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Glob.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Glob.java new file mode 100644 index 0000000000..3406b00c5f --- /dev/null +++ b/planetiler-core/src/main/java/com/onthegomap/planetiler/util/Glob.java @@ -0,0 +1,55 @@ +package com.onthegomap.planetiler.util; + +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Pattern; + + +/** + * Utility for constructing base+glob paths for matching many files + */ +public record Glob(Path base, String pattern) { + + private static final Pattern GLOB_PATTERN = Pattern.compile("[?*{\\[].*$"); + + /** Wrap a base path with no globs in it yet. */ + public static Glob of(Path path) { + return new Glob(path, null); + } + + /** Resolves a subdirectory using parts separated by the platform file separator. */ + public Glob resolve(String... subPath) { + String separator = base.getFileSystem().getSeparator(); + if (pattern != null) { + return new Glob(base, pattern + separator + String.join(separator, subPath)); + } else if (subPath == null || subPath.length == 0) { + return this; + } else if (GLOB_PATTERN.matcher(subPath[0]).find()) { + return new Glob(base, String.join(separator, subPath)); + } else { + return of(base.resolve(subPath[0])).resolve(Arrays.copyOfRange(subPath, 1, subPath.length)); + } + } + + /** Parse a string containing platform-specific file separators into a base+glob pattern. */ + public static Glob parse(String path) { + var matcher = GLOB_PATTERN.matcher(path); + if (!matcher.find()) { + return of(Path.of(path)); + } + matcher.reset(); + String base = matcher.replaceAll(""); + String separator = Path.of(base).getFileSystem().getSeparator(); + int idx = base.lastIndexOf(separator); + if (idx > 0) { + base = base.substring(0, idx); + } + return of(Path.of(base)).resolve(path.substring(idx + 1).split(Pattern.quote(separator))); + } + + /** Search the filesystem for all files beneath {@link #base()} matching {@link #pattern()}. */ + public List find() { + return pattern == null ? List.of(base) : FileUtils.walkPathWithPattern(base, pattern); + } +} diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java index b37d4bb955..d8d71292f3 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/PlanetilerTests.java @@ -2266,7 +2266,7 @@ public void processFeature(SourceFeature source, FeatureCollector features) { .setAttr("id", source.getString("id")); } }) - .addParquetSource("parquet", TestUtils.pathToResource("parquet").resolve("boston.parquet")) + .addParquetSource("parquet", List.of(TestUtils.pathToResource("parquet").resolve("boston.parquet"))) .setOutput(mbtiles) .run(); diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetInputFileTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetInputFileTest.java index 46cdb470cf..389e031650 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetInputFileTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetInputFileTest.java @@ -7,7 +7,7 @@ import com.onthegomap.planetiler.TestUtils; import com.onthegomap.planetiler.config.Bounds; -import com.onthegomap.planetiler.util.FileUtils; +import com.onthegomap.planetiler.util.Glob; import java.nio.file.Path; import java.time.Instant; import java.time.LocalDate; @@ -28,7 +28,7 @@ class ParquetInputFileTest { static List bostons() { - return FileUtils.walkPathWithPattern(TestUtils.pathToResource("parquet").resolve("boston*.parquet")); + return Glob.of(TestUtils.pathToResource("parquet")).resolve("boston*.parquet").find(); } @ParameterizedTest diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetReaderTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetReaderTest.java index cbd9f04d3c..b01238c2da 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetReaderTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/reader/parquet/ParquetReaderTest.java @@ -15,7 +15,7 @@ import com.onthegomap.planetiler.geo.TileOrder; import com.onthegomap.planetiler.reader.SourceFeature; import com.onthegomap.planetiler.stats.Stats; -import com.onthegomap.planetiler.util.FileUtils; +import com.onthegomap.planetiler.util.Glob; import com.onthegomap.planetiler.util.Parse; import java.nio.file.Path; import java.util.List; @@ -32,7 +32,7 @@ class ParquetReaderTest { private final Stats stats = Stats.inMemory(); static List bostons() { - return FileUtils.walkPathWithPattern(TestUtils.pathToResource("parquet").resolve("boston*.parquet")); + return Glob.of(TestUtils.pathToResource("parquet")).resolve("boston*.parquet").find(); } @ParameterizedTest diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java index 42391b048b..9d08d2ba36 100644 --- a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/FileUtilsTest.java @@ -8,7 +8,6 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.charset.StandardCharsets; -import java.nio.file.FileSystems; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; @@ -18,8 +17,6 @@ import java.util.stream.Stream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; -import org.junit.jupiter.params.ParameterizedTest; -import org.junit.jupiter.params.provider.CsvSource; class FileUtilsTest { @@ -123,7 +120,7 @@ void testWalkPathWithPatternDirectory() throws IOException { matchingPaths.stream().sorted().toList() ); - matchingPaths = FileUtils.walkPathWithPattern(parent.resolve("*.txt")); + matchingPaths = Glob.of(parent).resolve("*.txt").find(); assertEquals( txtFiles.stream().sorted().toList(), @@ -152,7 +149,7 @@ void testWalkPathWithPatternDirectoryZip() throws IOException { FileUtils.walkPathWithPattern(parent, "*.zip", mockWalkZipFile)); - assertEquals(List.of(zipFile), FileUtils.walkPathWithPattern(parent.resolve("*.zip"))); + assertEquals(List.of(zipFile), Glob.of(parent).resolve("*.zip").find()); } @Test @@ -165,7 +162,7 @@ void testWalkPathWithPatternSingleZip() { List.of("/shapefile/stations.shp", "/shapefile/stations.shx"), matchingPaths.stream().map(Path::toString).sorted().toList()); - matchingPaths = FileUtils.walkPathWithPattern(zipPath.resolve("stations.sh[px]")); + matchingPaths = Glob.of(zipPath).resolve("stations.sh[px]").find(); assertEquals( List.of("/shapefile/stations.shp", "/shapefile/stations.shx"), @@ -178,38 +175,4 @@ void testExpandFile() throws IOException { FileUtils.setLength(path, 1000); assertEquals(1000, Files.size(path)); } - - @ParameterizedTest - @CsvSource(value = { - "a/b/c; a/b/c;", - "a/b/*; a/b; *", - "a/*/b; a; */b", - "*/b/*; ; */b/*", - "/*/test; /; */test", - "a/b={c,d}/other; a; b={c,d}/other", - "./a/b=?/other; ./a; b=?/other", - }, delimiter = ';') - void testParsePathWithPattern(String input, String base, String pattern) { - var separator = FileSystems.getDefault().getSeparator(); - input = input.replace("/", separator); - base = base == null ? "" : base.replace("/", separator); - pattern = pattern == null ? null : pattern.replace("/", separator); - assertEquals( - new FileUtils.BaseWithPattern( - Path.of(base), - pattern - ), - FileUtils.parsePattern(Path.of(input)) - ); - } - - @Test - void testWalkPathWithPattern() throws IOException { - var path = tmpDir.resolve("a").resolve("b").resolve("c.txt"); - FileUtils.createParentDirectories(path); - Files.writeString(path, "test"); - assertEquals(List.of(path), FileUtils.walkPathWithPattern(tmpDir.resolve(Path.of("a", "*", "c.txt")))); - assertEquals(List.of(path), FileUtils.walkPathWithPattern(tmpDir.resolve(Path.of("*", "*", "c.txt")))); - assertEquals(List.of(path), FileUtils.walkPathWithPattern(tmpDir.resolve(Path.of("a", "b", "c.txt")))); - } } diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/util/GlobTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/GlobTest.java new file mode 100644 index 0000000000..f9cba59913 --- /dev/null +++ b/planetiler-core/src/test/java/com/onthegomap/planetiler/util/GlobTest.java @@ -0,0 +1,66 @@ +package com.onthegomap.planetiler.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +class GlobTest { + @TempDir + Path tmpDir; + + @ParameterizedTest + @CsvSource(value = { + "a/b/c; a/b/c;", + "a/b/*; a/b; *", + "a/*/b; a; */b", + "*/b/*; ; */b/*", + "/*/test; /; */test", + "a/b={c,d}/other; a; b={c,d}/other", + "./a/b=?/other; ./a; b=?/other", + }, delimiter = ';') + void testParsePathWithPattern(String input, String base, String pattern) { + var separator = FileSystems.getDefault().getSeparator(); + input = input.replace("/", separator); + base = base == null ? "" : base.replace("/", separator); + pattern = pattern == null ? null : pattern.replace("/", separator); + assertEquals( + new Glob(Path.of(base), pattern), + Glob.parse(input) + ); + } + + @Test + void testWalkPathWithPattern() throws IOException { + var path = tmpDir.resolve("a").resolve("b").resolve("c.txt"); + FileUtils.createParentDirectories(path); + Files.writeString(path, "test"); + assertEquals(List.of(path), Glob.of(tmpDir).resolve("a", "*", "c.txt").find()); + assertEquals(List.of(path), Glob.of(tmpDir).resolve("*", "*", "c.txt").find()); + assertEquals(List.of(path), Glob.of(tmpDir).resolve("a", "b", "c.txt").find()); + } + + @Test + void testResolve() { + var base = Glob.of(Path.of("a", "b")); + var separator = base.base().getFileSystem().getSeparator(); + assertEquals(new Glob(Path.of("a", "b", "c"), null), base.resolve("c")); + assertEquals(new Glob(Path.of("a", "b", "c", "d"), null), base.resolve("c", "d")); + assertEquals(new Glob(Path.of("a", "b"), "*" + separator + "d"), base.resolve("*", "d")); + } + + @Test + void testParseAbsoluteString() { + var base = Glob.of(Path.of("a", "b")).resolve("*", "d"); + var separator = base.base().getFileSystem().getSeparator(); + assertEquals(new Glob(base.base().toAbsolutePath(), base.pattern()), + Glob.parse(base.base().toAbsolutePath() + separator + base.pattern())); + } +} diff --git a/planetiler-examples/src/main/java/com/onthegomap/planetiler/examples/overture/OvertureBasemap.java b/planetiler-examples/src/main/java/com/onthegomap/planetiler/examples/overture/OvertureBasemap.java index 7a6315b933..6b7a270f1e 100644 --- a/planetiler-examples/src/main/java/com/onthegomap/planetiler/examples/overture/OvertureBasemap.java +++ b/planetiler-examples/src/main/java/com/onthegomap/planetiler/examples/overture/OvertureBasemap.java @@ -5,6 +5,7 @@ import com.onthegomap.planetiler.Profile; import com.onthegomap.planetiler.config.Arguments; import com.onthegomap.planetiler.reader.SourceFeature; +import com.onthegomap.planetiler.util.Glob; import java.nio.file.Path; /** @@ -51,11 +52,11 @@ public static void main(String[] args) throws Exception { } static void run(Arguments args) throws Exception { - Path input = args.inputFile("base", "overture base directory", Path.of("data", "overture")); + Path base = args.inputFile("base", "overture base directory", Path.of("data", "overture")); Planetiler.create(args) .setProfile(new OvertureBasemap()) .addParquetSource("overture-buildings", - input.resolve(Path.of("*", "type=building", "*.parquet")), + Glob.of(base).resolve("*", "type=building", "*.parquet").find(), true, // hive-partitioning fields -> fields.get("id"), // hash the ID field to generate unique long IDs fields -> fields.get("type")) // extract "type={}" from the filename to get layer