Skip to content

Commit

Permalink
add method for detecting delimiter of csv file before reading it entirely
Browse files Browse the repository at this point in the history
  • Loading branch information
simei94 committed Feb 26, 2024
1 parent 1a32f41 commit 636c7bf
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.locationtech.jts.geom.Point;
import org.matsim.application.CommandSpec;
import org.matsim.application.MATSimAppCommand;
import org.matsim.application.options.CsvOptions;
import org.matsim.application.options.InputOptions;
import org.matsim.application.options.OutputOptions;
import org.matsim.application.options.ShpOptions;
Expand Down Expand Up @@ -94,7 +95,7 @@ public Integer call() throws Exception {
Table persons = Table.read().csv(CsvReadOptions.builder(IOUtils.getBufferedReader(input.getPath("persons.csv")))
.columnTypesPartial(Map.of("person", ColumnType.TEXT))
.sample(false)
.separator(detectDelimiter(input.getPath("persons.csv"))).build());
.separator(new CsvOptions().detectDelimiter(input.getPath("persons.csv"))).build());

int total = persons.rowCount();

Expand Down Expand Up @@ -135,7 +136,7 @@ public Integer call() throws Exception {
Table trips = Table.read().csv(CsvReadOptions.builder(IOUtils.getBufferedReader(input.getPath("trips.csv")))
.columnTypesPartial(columnTypes)
.sample(false)
.separator(detectDelimiter(input.getPath("trips.csv"))).build());
.separator(new CsvOptions().detectDelimiter(input.getPath("trips.csv"))).build());

// Trip filter with start and end
if (shp.isDefined() && filter == LocationFilter.trip_start_and_end) {
Expand Down Expand Up @@ -188,45 +189,6 @@ public Integer call() throws Exception {
return 0;
}

/**
 * Guesses the column delimiter of a CSV file by inspecting only its first line.
 *
 * <p>The candidates are comma, semicolon and tab. The candidate that occurs most often in the
 * first line wins; on a tie the earlier candidate in that order is chosen, so the result no
 * longer depends on hash-map iteration order. An empty file is treated like an empty header
 * line and therefore yields a comma.
 *
 * @param path path to a plain {@code .csv} file or a gzip-compressed file ending in {@code .gz}
 * @return the detected delimiter, never {@code null}
 * @throws IllegalArgumentException if the path ends with neither {@code .gz} nor {@code .csv}
 * @throws IOException if the file cannot be opened or read
 */
private Character detectDelimiter(String path) throws IOException {
	boolean gzipped = path.endsWith(".gz");

	// Fail fast instead of running into a NullPointerException on a reader that was never opened.
	if (!gzipped && !path.endsWith(".csv")) {
		log.error("Unsupported file format.");
		throw new IllegalArgumentException("Unsupported file format: " + path);
	}

	String firstLine;
	// Both resources are closed in reverse order, even if wrapping the file stream fails.
	try (FileInputStream file = new FileInputStream(path);
		 BufferedReader reader = new BufferedReader(new InputStreamReader(gzipped ? new GZIPInputStream(file) : file))) {
		firstLine = reader.readLine();
	}

	// readLine() returns null for an empty file; treat that as a header without any delimiter.
	if (firstLine == null) {
		firstLine = "";
	}

	// Candidates in order of preference; a strict '>' below keeps the earliest one on ties.
	char[] candidates = {',', ';', '\t'};
	char delimiter = candidates[0];
	int highest = -1;

	for (char candidate : candidates) {
		int occurrences = 0;
		for (int i = 0; i < firstLine.length(); i++) {
			if (firstLine.charAt(i) == candidate) {
				occurrences++;
			}
		}

		if (occurrences > highest) {
			highest = occurrences;
			delimiter = candidate;
		}
	}

	return delimiter;
}

private void writeModeShare(Table trips, List<String> labels) {

Table aggr = trips.summarize("trip_id", count).by("dist_group", "main_mode");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang.StringUtils;
import org.matsim.core.utils.io.IOUtils;
import picocli.CommandLine;

import java.io.IOException;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
Expand Down Expand Up @@ -74,4 +75,24 @@ public CSVPrinter createPrinter(Path path) throws IOException {
return new CSVPrinter(IOUtils.getBufferedWriter(path.toUri().toURL(), csvCharset, false), getFormat());
}

/**
 * Detects the delimiter of a CSV file by reading only its first line.
 *
 * <p>Counts the occurrences of comma, semicolon and tab in the first line and returns the most
 * frequent one. On a tie the preference is comma, then semicolon, then tab. If the file is
 * empty all counts are zero and a comma is returned.
 *
 * @param path location of the CSV file, passed as-is to {@link IOUtils#getBufferedReader}
 * @return the detected delimiter, never {@code null}
 * @throws IOException if reading the first line or closing the reader fails
 */
public Character detectDelimiter(String path) throws IOException {
	String firstLine;
	// The reader is only needed for the header line; close it right away instead of leaking it.
	try (BufferedReader reader = IOUtils.getBufferedReader(path)) {
		firstLine = reader.readLine();
	}

	// countMatches is null-safe (returns 0), so an empty file falls through to the comma default.
	int comma = StringUtils.countMatches(firstLine, ",");
	int semicolon = StringUtils.countMatches(firstLine, ";");
	int tab = StringUtils.countMatches(firstLine, "\t");

	int highest = Math.max(comma, Math.max(semicolon, tab));

	if (comma == highest) {
		return ',';
	}
	if (semicolon == highest) {
		return ';';
	}
	// The maximum always equals one of the three counts, so tab is the only remaining option.
	return '\t';
}

}

0 comments on commit 636c7bf

Please sign in to comment.