From 1f22c863a87086a89cb36010a7abdbf519f06ed3 Mon Sep 17 00:00:00 2001 From: Evgenii Grigorev Date: Fri, 20 Dec 2024 13:25:31 +0100 Subject: [PATCH] [#228] Refactored the code, implemented suggested changes --- .../cz/cvut/spipes/modules/TabularModule.java | 44 +++------ .../modules/util/CSVFileReaderAdapter.java | 52 ---------- .../modules/util/CSVStreamReaderAdapter.java | 97 +++++++++++++++++++ ...pter.java => HTMLStreamReaderAdapter.java} | 19 ++-- ...rAdapter.java => StreamReaderAdapter.java} | 13 ++- ...apter.java => XLSStreamReaderAdapter.java} | 49 +++++++--- 6 files changed, 166 insertions(+), 108 deletions(-) delete mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java create mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java rename s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/{HTMLFileReaderAdapter.java => HTMLStreamReaderAdapter.java} (87%) rename s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/{FileReaderAdapter.java => StreamReaderAdapter.java} (54%) rename s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/{XLSFileReaderAdapter.java => XLSStreamReaderAdapter.java} (66%) diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java index a8fe18fd..4f61bd29 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java @@ -13,7 +13,6 @@ import cz.cvut.spipes.exception.ResourceNotUniqueException; import cz.cvut.spipes.exception.SPipesException; import cz.cvut.spipes.modules.annotations.SPipesModule; -import cz.cvut.spipes.modules.exception.SheetDoesntExistsException; import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException; import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException; import cz.cvut.spipes.modules.model.*; @@ -189,7 +188,7 @@ ExecutionContext executeSelf() { StreamResource originalSourceResource = sourceResource; TSVConvertor tsvConvertor = null; - FileReaderAdapter fileReaderAdapter = new XLSFileReaderAdapter(); + StreamReaderAdapter streamReaderAdapter = new XLSStreamReaderAdapter(); CsvPreference csvPreference = null; switch (sourceResourceFormat) { @@ -200,7 +199,7 @@ ExecutionContext executeSelf() { if (processTableAtIndex != 1) { throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet."); } - fileReaderAdapter = new HTMLFileReaderAdapter(); + streamReaderAdapter = new HTMLStreamReaderAdapter(); break; case XLS: case XLSM: @@ -208,14 +207,14 @@ ExecutionContext executeSelf() { if (processTableAtIndex == 0) { throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing."); } - fileReaderAdapter = new XLSFileReaderAdapter(); + streamReaderAdapter = new XLSStreamReaderAdapter(); break; default: csvPreference = new CsvPreference.Builder( quoteCharacter, delimiter, System.lineSeparator()).build(); - fileReaderAdapter = new CSVFileReaderAdapter(csvPreference); + streamReaderAdapter = new CSVStreamReaderAdapter(csvPreference); break; } @@ -231,12 +230,13 @@ ExecutionContext executeSelf() { List rowStatements = new ArrayList<>(); try { - fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex); - String[] header = fileReaderAdapter.getHeader(); + streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), + sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource); + String[] header = streamReaderAdapter.getHeader(skipHeader); Set columnNames = new HashSet<>(); - if (fileReaderAdapter.getLabel() != null) - table.setLabel(fileReaderAdapter.getLabel()); + if (streamReaderAdapter.getSheetLabel() != null) + table.setLabel(streamReaderAdapter.getSheetLabel()); if (header == null) { LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri()); @@ -248,7 +248,7 @@ ExecutionContext executeSelf() { if (skipHeader) { header = getHeaderFromSchema(inputModel, header, hasInputSchema); - fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex); + //streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex); } else if (hasInputSchema) { header = getHeaderFromSchema(inputModel, header, true); } @@ -276,12 +276,7 @@ ExecutionContext executeSelf() { int rowNumber = 0; List row; - switch (sourceResourceFormat){ - case XLS, XLSM, XLSX: - row = fileReaderAdapter.getNextRow(); //skip header for xls files - break; - } - while ((row = fileReaderAdapter.getNextRow()) != null) { + while ((row = streamReaderAdapter.getNextRow()) != null) { rowNumber++; // 4.6.1 and 4.6.3 Row r = new Row(); @@ -327,7 +322,7 @@ ExecutionContext executeSelf() { em.persist(tableGroup); em.merge(tableSchema); - List regions = fileReaderAdapter.getMergedRegions(); + List regions = streamReaderAdapter.getMergedRegions(); int cellsNum = 1; for (Region region : regions) { @@ -345,6 +340,7 @@ ExecutionContext executeSelf() { } } } + streamReaderAdapter.close(); } catch (IOException e) { LOG.error("Error while reading file from resource uri {}", sourceResource, e); } @@ -386,16 +382,6 @@ private String getValueFromRow(List row, int index, int expectedRowLengt } } - private ICsvListReader getCsvListReader(CsvPreference csvPreference) { - if (acceptInvalidQuoting) { - if (getQuote() == '\0') { - return null; - } else - return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference); - } - return new CsvListReader(getReader(), csvPreference); - } - private Statement createRowResource(String cellValue, int rowNumber, Column column) { Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber)); @@ -574,10 +560,6 @@ private String normalize(String label) { return label.trim().replaceAll("[^\\w]", "_"); } - private Reader getReader() { - return new StringReader(new String(sourceResource.getContent(), inputCharset)); - } - @NotNull private StreamResource getResourceByUri(@NotNull String resourceUri) { diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java deleted file mode 100644 index 6eb9ad14..00000000 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java +++ /dev/null @@ -1,52 +0,0 @@ -package cz.cvut.spipes.modules.util; - -import cz.cvut.spipes.modules.ResourceFormat; -import cz.cvut.spipes.modules.model.Region; -import org.supercsv.io.CsvListReader; -import org.supercsv.io.ICsvListReader; -import org.supercsv.prefs.CsvPreference; - -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.List; - -public class CSVFileReaderAdapter implements FileReaderAdapter { - private ICsvListReader listReader; - private CsvPreference csvPreference; - - public CSVFileReaderAdapter(CsvPreference csvPreference) { - this.csvPreference = csvPreference; - } - - @Override - public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException { - listReader = new CsvListReader(new InputStreamReader(inputStream), csvPreference); - } - - @Override - public String[] getHeader() throws IOException { - return listReader.getHeader(true); - } - - @Override - public boolean hasNext() throws IOException { - return listReader.read() != null; - } - - @Override - public List getNextRow() throws IOException { - return listReader.read(); - } - - @Override - public List getMergedRegions() { - return new ArrayList<>(); - } - - @Override - public String getLabel(){ - return null; - } -} diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java new file mode 100644 index 00000000..6935ffcf --- /dev/null +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java @@ -0,0 +1,97 @@ +package cz.cvut.spipes.modules.util; + +import cz.cvut.spipes.InvalidQuotingTokenizer; +import cz.cvut.spipes.modules.ResourceFormat; +import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; +import org.supercsv.io.CsvListReader; +import org.supercsv.io.ICsvListReader; +import org.supercsv.prefs.CsvPreference; + +import java.io.*; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.io.StringReader; + +public class CSVStreamReaderAdapter implements StreamReaderAdapter { + private ICsvListReader listReader; + private CsvPreference csvPreference; + String [] header = null; + String [] firstRow = null; + boolean acceptInvalidQuoting; + Charset inputCharset; + StreamResource sourceResource; + + public CSVStreamReaderAdapter(CsvPreference csvPreference) { + this.csvPreference = csvPreference; + } + + @Override + public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, + boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { + //listReader = new CsvListReader(new InputStreamReader(inputStream), csvPreference); + this.acceptInvalidQuoting = acceptInvalidQuoting; + this.inputCharset = inputCharset; + this.sourceResource = sourceResource; + listReader = getCsvListReader(csvPreference); + } + + @Override + public String[] getHeader(Boolean skipHeader) throws IOException { + header = listReader.getHeader(true); + if (skipHeader) { + firstRow = header; + } + return header; + } + + @Override + public boolean hasNextRow() throws IOException { + return ((listReader.read() != null) || (firstRow != null)); + } + + @Override + public List getNextRow() throws IOException { + if (firstRow != null) { + List row = Arrays.asList(firstRow); + firstRow = null; + return row; + } + return listReader.read(); + } + + @Override + public List getMergedRegions() { + return new ArrayList<>(); + } + + @Override + public String getSheetLabel(){ + return null; + } + + @Override + public void close() throws IOException{ + listReader.close(); + } + + private ICsvListReader getCsvListReader(CsvPreference csvPreference) { + if (acceptInvalidQuoting) { + if (getQuote() == '\0') { + return null; + } else + return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference); + } + return new CsvListReader(getReader(), csvPreference); + } + + private Reader getReader() { + return new StringReader(new String(sourceResource.getContent(), inputCharset)); + } + + public char getQuote() { + return csvPreference.getQuoteChar(); + } +} diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLFileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java similarity index 87% rename from s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLFileReaderAdapter.java rename to s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java index 4e6e9b88..857731b1 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLFileReaderAdapter.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java @@ -2,15 +2,17 @@ import cz.cvut.spipes.modules.ResourceFormat; import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.io.*; +import java.nio.charset.Charset; import java.util.*; -public class HTMLFileReaderAdapter implements FileReaderAdapter { +public class HTMLStreamReaderAdapter implements StreamReaderAdapter { private Elements rows; private int currentIndex; private Element table; @@ -20,7 +22,8 @@ public class HTMLFileReaderAdapter implements FileReaderAdapter { private Map> mergedCells; @Override - public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException { + public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, + int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { Document doc = Jsoup.parse(inputStream, "UTF-8", ""); Element table = doc.select("table").first(); rows = table.select("tr"); @@ -33,7 +36,7 @@ public void initialise(InputStream inputStream, ResourceFormat sourceResourceFor @Override - public String[] getHeader() throws IOException { + public String[] getHeader(Boolean skipHeader) throws IOException { Elements headerCells = rows.get(0).select("th, td"); return headerCells.stream() .map(Element::text) @@ -41,13 +44,13 @@ public String[] getHeader() throws IOException { } @Override - public boolean hasNext() { + public boolean hasNextRow() { return currentIndex < rows.size() - 1; // Skip header row } @Override public List getNextRow() { - if (!hasNext()) { + if (!hasNextRow()) { return null; } @@ -118,7 +121,11 @@ private List extractMergedRegions(Element table) { } @Override - public String getLabel(){ + public String getSheetLabel(){ return label; } + + @Override + public void close() { + } } diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java similarity index 54% rename from s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java rename to s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java index af377420..a8148354 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java @@ -3,19 +3,22 @@ import cz.cvut.spipes.modules.ResourceFormat; import cz.cvut.spipes.modules.TabularModule; import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.util.List; -public interface FileReaderAdapter { +public interface StreamReaderAdapter { static final Logger LOG = LoggerFactory.getLogger(TabularModule.class); - void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException; - String[] getHeader() throws IOException; - boolean hasNext() throws IOException; + void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException; + String[] getHeader(Boolean skipHeader) throws IOException; + boolean hasNextRow() throws IOException; List getNextRow() throws IOException; List getMergedRegions(); - String getLabel() throws IOException; + String getSheetLabel() throws IOException; + void close() throws IOException; } diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java similarity index 66% rename from s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java rename to s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java index 1644b09f..0dc84cb4 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java @@ -3,28 +3,32 @@ import cz.cvut.spipes.modules.ResourceFormat; import cz.cvut.spipes.modules.exception.SheetDoesntExistsException; import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.util.CellRangeAddress; import org.apache.poi.xssf.usermodel.XSSFWorkbook; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.stream.Collectors; import java.util.stream.StreamSupport; -public class XLSFileReaderAdapter implements FileReaderAdapter { +public class XLSStreamReaderAdapter implements StreamReaderAdapter { private Sheet sheet; private Iterator rowIterator; + Boolean skipHeader; @Override - public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException { + public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, + boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { Workbook workbook; if (sourceResourceFormat == ResourceFormat.XLS) { workbook = new HSSFWorkbook(inputStream); @@ -43,15 +47,21 @@ public void initialise(InputStream inputStream, ResourceFormat sourceResourceFor } @Override - public String[] getHeader() throws IOException { - org.apache.poi.ss.usermodel.Row headerRow = sheet.getRow(0); - return StreamSupport.stream(headerRow.spliterator(), false) - .map(cell -> cell.getStringCellValue()) - .toArray(String[]::new); + public String[] getHeader(Boolean skipHeader) throws IOException { + Row headerRow = sheet.getRow(0); + if (skipHeader) { + return null; + } + else { + rowIterator.next(); // move iterator to 2nd row + return StreamSupport.stream(headerRow.spliterator(), false) + .map(cell -> cell.getStringCellValue()) + .toArray(String[]::new); + } } @Override - public boolean hasNext() { + public boolean hasNextRow() { return rowIterator.hasNext(); } @@ -59,14 +69,12 @@ public boolean hasNext() { public List getNextRow() { if (!rowIterator.hasNext()) return null; - org.apache.poi.ss.usermodel.Row currentRow = rowIterator.next(); + Row currentRow = rowIterator.next(); DataFormatter formatter = new DataFormatter(); List row = StreamSupport.stream(currentRow.spliterator(), false) .map(cell -> { String cellValue = formatter.formatCellValue(cell); - if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) { - cellValue = cellValue.replace(",", "."); - } + cellValue = fixNumberFormat(cellValue); return cellValue.isEmpty() ? null : cellValue; }) .collect(Collectors.toList()); @@ -88,8 +96,21 @@ public List getMergedRegions() { return regions; } - public String getLabel(){ + @Override + public String getSheetLabel(){ return sheet.getSheetName(); } + + public String fixNumberFormat (String cellValue){ + //xls uses ',' as decimal separator, so we should convert it to '.' + if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) { + cellValue = cellValue.replace(",", "."); + } + return cellValue; + } + + @Override + public void close() { + } }