From 3dd1f0df724700fff514790a35e470941a896995 Mon Sep 17 00:00:00 2001 From: simei94 Date: Wed, 15 May 2024 19:02:21 +0200 Subject: [PATCH] read first 5 lines of csv to check delimiter --- .../application/options/CsvOptions.java | 54 +++++++++++++------ .../application/options/CsvOptionsTest.java | 4 +- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/contribs/application/src/main/java/org/matsim/application/options/CsvOptions.java b/contribs/application/src/main/java/org/matsim/application/options/CsvOptions.java index 1b2d6575450..74c25f3a96b 100644 --- a/contribs/application/src/main/java/org/matsim/application/options/CsvOptions.java +++ b/contribs/application/src/main/java/org/matsim/application/options/CsvOptions.java @@ -23,7 +23,7 @@ public final class CsvOptions { @CommandLine.Option(names = "--csv-format", description = "CSV Format", defaultValue = "Default") private CSVFormat.Predefined csvFormat; - @CommandLine.Option(names = "--csv-delimiter", description = "CSV Delimiter", required = false) + @CommandLine.Option(names = "--csv-delimiter", description = "CSV Delimiter") private Character csvDelimiter; @CommandLine.Option(names = "--csv-charset", description = "CSV input encoding", defaultValue = "UTF8") @@ -56,24 +56,48 @@ public CsvOptions(CSVFormat.Predefined csvFormat, Character csvDelimiter, Charse */ public static Character detectDelimiter(String path) throws IOException { try (BufferedReader reader = IOUtils.getBufferedReader(path)) { - String firstLine = reader.readLine(); + int[] comma = new int[5]; + int[] semicolon = new int[5]; + int[] tab = new int[5]; + String[] lines = new String[5]; + +// check the first five lines for separator chars. It might be that the csv file has additional info in the first x lines (e.g. 
EPSG) + for (int i = 0; i < 5; i++) { + lines[i] = reader.readLine(); + if (lines[i] == null) { + comma[i] = 0; + semicolon[i] = 0; + tab[i] = 0; + } else { + comma[i] = StringUtils.countMatches(lines[i], ","); + semicolon[i] = StringUtils.countMatches(lines[i], ";"); + tab[i] = StringUtils.countMatches(lines[i], "\t"); + } + } - int comma = StringUtils.countMatches(firstLine, ","); - int semicolon = StringUtils.countMatches(firstLine, ";"); - int tab = StringUtils.countMatches(firstLine, "\t"); + Integer index = null; - if (comma == 0 && semicolon == 0 && tab == 0) { - throw new IllegalArgumentException("No delimiter found in the first line of the file."); + for (int i = 0; i < comma.length; i++) { +// only the first line that contains any separator char determines the result + if (index == null) { + if (!(comma[i] == 0 && semicolon[i] == 0 && tab[i] == 0)) { + index = i; + } + } } - // Comma is preferred as the more likely format - if (comma >= semicolon && comma >= tab) { - return ','; - } else if (tab >= semicolon) - return '\t'; - else - return ';'; - } + if (index == null) { + throw new IllegalArgumentException("No delimiter found in the first 5 lines of the file."); + } else { + // Comma is preferred as the more likely format + if (comma[index] >= semicolon[index] && comma[index] >= tab[index]) { + return ','; + } else if (tab[index] >= semicolon[index]) + return '\t'; + else + return ';'; + } + } } /** diff --git a/contribs/application/src/test/java/org/matsim/application/options/CsvOptionsTest.java b/contribs/application/src/test/java/org/matsim/application/options/CsvOptionsTest.java index a406deca398..70452d23138 100644 --- a/contribs/application/src/test/java/org/matsim/application/options/CsvOptionsTest.java +++ b/contribs/application/src/test/java/org/matsim/application/options/CsvOptionsTest.java @@ -37,10 +37,12 @@ void output() throws IOException { printer.printRecord("header", "column"); printer.printRecord("1", "2"); + printer.printRecord("3", "4"); + 
printer.printRecord("5", "6"); printer.close(); assertThat(tmp) - .hasContent("header" + delimiter + "column\n1" + delimiter + "2"); + .hasContent("header" + delimiter + "column\n1" + delimiter + "2" + "\n3" + delimiter + "4" + "\n5" + delimiter + "6"); assertThat(delimiter).isEqualTo(CsvOptions.detectDelimiter(tmp.toString()).toString()); }