From 1c849564534ff8fdb31577bbcd1c11701b69a5cf Mon Sep 17 00:00:00 2001
From: Miroslav Blasko <blcham@gmail.com>
Date: Mon, 6 Jan 2025 18:18:38 +0100
Subject: [PATCH 1/5] Reformat

---
 .../cz/cvut/spipes/modules/TabularModule.java | 101 ++++++++++--------
 1 file changed, 54 insertions(+), 47 deletions(-)
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
index 0316249a..ab238de3 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -113,12 +113,12 @@ public class TabularModule extends AnnotatedAbstractModule {
     private final Property P_PROCESS_TABLE_AT_INDEX = getSpecificParameter("process-table-at-index");
 
     @Parameter(iri = SML.replace, comment = "Specifies whether a module should overwrite triples" +
-        " from its predecessors. When set to true (default is false), it prevents" +
-        " passing through triples from the predecessors.")
+            " from its predecessors. When set to true (default is false), it prevents" +
+            " passing through triples from the predecessors.")
     private boolean isReplace = false;
 
     @Parameter(iri = PARAM_URL_PREFIX + "source-resource-uri", comment = "URI of resource" +
-        " that represent tabular data (e.g. resource representing CSV file).")
+            " that represent tabular data (e.g. resource representing CSV file).")
     private StreamResource sourceResource;
 
     @Parameter(iri = PARAM_URL_PREFIX + "delimiter", comment = "Column delimiter. Default value is comma ','.")
@@ -281,11 +281,13 @@ ExecutionContext executeSelf() {
 
                 tableSchema.setAboutUrl(schemaColumn, sourceResource.getUri());
                 schemaColumn.setProperty(
-                    dataPrefix,
-                    sourceResource.getUri(),
-                    hasInputSchema ? tableSchema.getColumn(columnName) : null);
+                        dataPrefix,
+                        sourceResource.getUri(),
+                        hasInputSchema ? tableSchema.getColumn(columnName) : null);
                 schemaColumn.setTitle(columnTitle);
-                if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName);
+                if (isDuplicate) {
+                    throwNotUniqueException(schemaColumn, columnTitle, columnName);
+                }
             }
 
             List<String> row;
@@ -348,13 +350,14 @@ ExecutionContext executeSelf() {
             int cellsNum = 1;
             for (Region region : regions) {
                 int firstCellInRegionNum = cellsNum;
-                for(int i = region.getFirstRow();i <= region.getLastRow();i++){
-                    for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) {
-                        Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum));
+                for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
+                    for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
+                        Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
                         cell.setRow(tableSchema.createAboutUrl(i));
                         cell.setColumn(outputColumns.get(j).getUri().toString());
-                        if(cellsNum != firstCellInRegionNum)
-                            cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum));
+                        if (cellsNum != firstCellInRegionNum) {
+                            cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
+                        }
                         em.merge(cell);
                         cellsNum++;
                     }
@@ -381,19 +384,19 @@ private String getValueFromRow(List<String> row, int index, int expectedRowLengt
             StringBuilder record = new StringBuilder(recordDelimiter);
             for (int i = 0; i < row.size(); i++) {
                 record
-                    .append(i)
-                    .append(":")
-                    .append(row.get(i))
-                    .append(recordDelimiter);
+                        .append(i)
+                        .append(":")
+                        .append(row.get(i))
+                        .append(recordDelimiter);
             }
             LOG.error("Reading input file failed when reading record #{} (may not reflect the line #).\n" +
-                    " It was expected that the current record contains {} values" +
-                    ", but {}. element was not retrieved before whole record was processed.\n" +
-                    "The problematic record: {}",
-                currentRecordNumber,
-                expectedRowLength,
-                index+1,
-                record
+                            " It was expected that the current record contains {} values" +
+                            ", but {}. element was not retrieved before whole record was processed.\n" +
+                            "The problematic record: {}",
+                    currentRecordNumber,
+                    expectedRowLength,
+                    index+1,
+                    record
             );
             throw new SPipesException("Reading input file failed.", e);
         }
@@ -413,9 +416,9 @@ private Statement createRowResource(String cellValue, int rowNumber, Column colu
         Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));
 
         return ResourceFactory.createStatement(
-            rowResource,
-            ResourceFactory.createProperty(column.getPropertyUrl()),
-            ResourceFactory.createPlainLiteral(cellValue));
+                rowResource,
+                ResourceFactory.createProperty(column.getPropertyUrl()),
+                ResourceFactory.createPlainLiteral(cellValue));
     }
 
     private boolean hasInputSchema(TableSchema inputTableSchema) {
@@ -429,11 +432,11 @@ private boolean hasInputSchema(TableSchema inputTableSchema) {
 
     private TableSchema getTableSchema(EntityManager em) {
         TypedQuery<TableSchema> query = em.createNativeQuery(
-            "PREFIX csvw: <http://www.w3.org/ns/csvw#>\n" +
-                "SELECT ?t WHERE { \n" +
-                "?t a csvw:TableSchema. \n" +
-                "}",
-            TableSchema.class
+                "PREFIX csvw: <http://www.w3.org/ns/csvw#>\n" +
+                        "SELECT ?t WHERE { \n" +
+                        "?t a csvw:TableSchema. \n" +
+                        "}",
+                TableSchema.class
         );
 
         int tableSchemaCount = query.getResultList().size();
@@ -452,14 +455,14 @@ private TableSchema getTableSchema(EntityManager em) {
 
     private void throwNotUniqueException(Column column, String columnTitle, String columnName) {
         throw new ResourceNotUniqueException(
-            String.format("Unable to create value of property %s due to collision. " +
-                    "Both column titles '%s' and '%s' are normalized to '%s' " +
-                    "and thus would refer to the same property url <%s>.",
-                CSVW.propertyUrl,
-                columnTitle,
-                column.getTitle(),
-                columnName,
-                column.getPropertyUrl()));
+                String.format("Unable to create value of property %s due to collision. " +
+                                "Both column titles '%s' and '%s' are normalized to '%s' " +
+                                "and thus would refer to the same property url <%s>.",
+                        CSVW.propertyUrl,
+                        columnTitle,
+                        column.getTitle(),
+                        columnName,
+                        column.getPropertyUrl()));
     }
 
     private ExecutionContext getExecutionContext(Model inputModel, Model outputModel) {
@@ -473,12 +476,12 @@ private ExecutionContext getExecutionContext(Model inputModel, Model outputModel
     @Override
     public void loadManualConfiguration() {
         sourceResourceFormat = ResourceFormat.fromString(
-            getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue())
+                getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue())
         );
         delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier(sourceResourceFormat));
         quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(sourceResourceFormat));
         outputMode = Mode.fromResource(
-            getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
+                getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
         );
         setInputCharset(delimiter);
     }
@@ -514,7 +517,7 @@ private Supplier<Character> getDefaultQuoteCharacterSupplier(ResourceFormat sour
         if (sourceResourceFormat == ResourceFormat.CSV) {
             return () -> {
                 LOG.debug("Quote character not specified, using double-quote as default value" +
-                    " to be compliant with RFC 4180 (CSV)");
+                        " to be compliant with RFC 4180 (CSV)");
                 return '"';
             };
         }
@@ -524,8 +527,8 @@ private Supplier<Character> getDefaultQuoteCharacterSupplier(ResourceFormat sour
     private char getPropertyValue(Property property,
                                   Supplier<Character> defaultValueSupplier) {
         return Optional.ofNullable(getPropertyValue(property))
-            .map(n -> n.asLiteral().getChar())
-            .orElseGet(defaultValueSupplier);
+                .map(n -> n.asLiteral().getChar())
+                .orElseGet(defaultValueSupplier);
     }
 
     @Override
@@ -624,7 +627,7 @@ public int getDelimiter() {
 
     public void setDelimiter(int delimiter) {
         if ((sourceResourceFormat == ResourceFormat.CSV && delimiter != ',') ||
-            (sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) {
+                (sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) {
             throw new SpecificationNonComplianceException(sourceResourceFormat, delimiter);
         }
         this.delimiter = delimiter;
@@ -680,7 +683,9 @@ private String[] getHeaderFromSchema(Model inputModel, String[] header, boolean
                 tableSchema.setOrderList(orderList);
                 header = createHeaders(header.length, tableSchema.sortColumns(orderList));
 
-            } else LOG.info("Order of columns was not provided in the schema.");
+            } else {
+                LOG.info("Order of columns was not provided in the schema.");
+            }
         } else {
             header = createHeaders(header.length, new ArrayList<>());
         }
@@ -693,7 +698,9 @@ private String[] createHeaders(int size, List<Column> columns) {
         for (int i = 0; i < size; i++) {
             if (!columns.isEmpty()) {
                 headers[i] = columns.get(i).getName();
-            } else headers[i] = "column_" + (i + 1);
+            } else {
+                headers[i] = "column_" + (i + 1);
+            }
         }
         return headers;
     }

From a1b4b52d74a8da6694a7166263d93bfed5890586 Mon Sep 17 00:00:00 2001
From: Miroslav Blasko <blcham@gmail.com>
Date: Tue, 7 Jan 2025 17:43:48 +0100
Subject: [PATCH 2/5] [#228] Refactor using CSVStreamReaderAdapter

---
 .../cz/cvut/spipes/modules/TabularModule.java |  78 ++++---------
 .../exception/MissingArgumentException.java   |   7 ++
 .../modules/util/CSVStreamReaderAdapter.java  | 106 ++++++++++++++++++
 .../modules/util/StreamReaderAdapter.java     |  19 ++++
 4 files changed, 155 insertions(+), 55 deletions(-)
 create mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/exception/MissingArgumentException.java
 create mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java
 create mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java

diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
index ab238de3..cb9d4136 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -13,6 +13,7 @@
 import cz.cvut.spipes.exception.ResourceNotUniqueException;
 import cz.cvut.spipes.exception.SPipesException;
 import cz.cvut.spipes.modules.annotations.SPipesModule;
+import cz.cvut.spipes.modules.exception.MissingArgumentException;
 import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
 import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
 import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
@@ -22,18 +23,14 @@
 import cz.cvut.spipes.registry.StreamResource;
 import cz.cvut.spipes.registry.StreamResourceRegistry;
 import cz.cvut.spipes.util.JenaUtils;
-import org.apache.commons.cli.MissingArgumentException;
 import org.apache.jena.rdf.model.*;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.supercsv.io.CsvListReader;
-import org.supercsv.io.ICsvListReader;
 import org.supercsv.prefs.CsvPreference;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
 import java.net.URI;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
@@ -189,6 +186,8 @@ ExecutionContext executeSelf() {
 
         StreamResource originalSourceResource = sourceResource;
         TSVConvertor tsvConvertor = null;
+        StreamReaderAdapter streamReaderAdapter;
+        CsvPreference csvPreference = null;
 
         switch (sourceResourceFormat) {
             case HTML:
@@ -201,7 +200,7 @@ ExecutionContext executeSelf() {
                 tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
                 table.setLabel(tsvConvertor.getTableName(sourceResource));
                 setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                setDelimiter('\t');
+                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
                 break;
             case XLS:
             case XLSM:
@@ -221,7 +220,10 @@ ExecutionContext executeSelf() {
                     throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
                 }
                 setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                setDelimiter('\t');
+                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
+                break;
+            default:
+                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, delimiter);
                 break;
         }
 
@@ -236,33 +238,21 @@ ExecutionContext executeSelf() {
         List<Column> outputColumns = new ArrayList<>();
         List<Statement> rowStatements = new ArrayList<>();
 
-        CsvPreference csvPreference = new CsvPreference.Builder(
-            quoteCharacter,
-            delimiter,
-            System.lineSeparator()).build();
-
         try {
-            ICsvListReader listReader = getCsvListReader(csvPreference);
-
-            if (listReader == null) {
-                logMissingQuoteError();
-                return getExecutionContext(inputModel, outputModel);
-            }
-
-            String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
+            streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()),
+                sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource);
+            String[] header = streamReaderAdapter.getHeader(skipHeader);;
+            Set<String> columnNames = new HashSet<>();
 
-            if (header == null) {
-                LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
-                return getExecutionContext(inputModel, outputModel);
+            if (streamReaderAdapter.getSheetLabel() != null) {
+                table.setLabel(streamReaderAdapter.getSheetLabel());
             }
-            Set<String> columnNames = new HashSet<>();
 
             TableSchema inputTableSchema = getTableSchema(em);
             hasInputSchema = hasInputSchema(inputTableSchema);
 
             if (skipHeader) {
                 header = getHeaderFromSchema(inputModel, header, hasInputSchema);
-                listReader = new CsvListReader(getReader(), csvPreference);
             } else if (hasInputSchema) {
                 header = getHeaderFromSchema(inputModel, header, true);
             }
@@ -290,10 +280,9 @@ ExecutionContext executeSelf() {
                 }
             }
 
-            List<String> row;
             int rowNumber = 0;
-            //for each row
-            while ((row = listReader.read()) != null) {
+            List<String> row;
+            while ((row = streamReaderAdapter.getNextRow()) != null) {
                 rowNumber++;
                 // 4.6.1 and 4.6.3
                 Row r = new Row();
@@ -331,8 +320,12 @@ ExecutionContext executeSelf() {
                     // 4.6.8.7 - else, if cellValue is not null
                 }
             }
-            listReader.close();
-        } catch (IOException | MissingArgumentException e) {
+            streamReaderAdapter.close();
+        } catch (MissingArgumentException e) {
+            if (ExecutionConfig.isExitOnError()) {
+                return getExecutionContext(inputModel, outputModel);
+            }
+        } catch (IOException e) {
             LOG.error("Error while reading file from resource uri {}", sourceResource, e);
         }
 
@@ -402,16 +395,6 @@ private String getValueFromRow(List<String> row, int index, int expectedRowLengt
         }
     }
 
-    private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
-        if (acceptInvalidQuoting) {
-            if (getQuote() == '\0') {
-                return null;
-            } else
-                return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
-        }
-        return new CsvListReader(getReader(), csvPreference);
-    }
-
     private Statement createRowResource(String cellValue, int rowNumber, Column column) {
         Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));
 
@@ -590,10 +573,6 @@ private String normalize(String label) {
         return label.trim().replaceAll("[^\\w]", "_");
     }
 
-    private Reader getReader() {
-        return new StringReader(new String(sourceResource.getContent(), inputCharset));
-    }
-
     @NotNull
     private StreamResource getResourceByUri(@NotNull String resourceUri) {
 
@@ -633,10 +612,6 @@ public void setDelimiter(int delimiter) {
         this.delimiter = delimiter;
     }
 
-    public char getQuote() {
-        return quoteCharacter;
-    }
-
     public void setQuoteCharacter(char quoteCharacter) {
         this.quoteCharacter = quoteCharacter;
     }
@@ -704,11 +679,4 @@ private String[] createHeaders(int size, List<Column> columns) {
         }
         return headers;
     }
-
-    private void logMissingQuoteError() throws MissingArgumentException {
-        String message = "Quote character must be specified when using custom tokenizer.";
-        if (ExecutionConfig.isExitOnError()) {
-            throw new MissingArgumentException(message);
-        } else LOG.error(message);
-    }
 }
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/exception/MissingArgumentException.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/exception/MissingArgumentException.java
new file mode 100644
index 00000000..4614d352
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/exception/MissingArgumentException.java
@@ -0,0 +1,13 @@
+package cz.cvut.spipes.modules.exception;
+
+/**
+ * Unchecked exception reporting that a required argument was not supplied,
+ * e.g. the quote character needed when the custom CSV tokenizer is used.
+ */
+public class MissingArgumentException extends RuntimeException {
+    /**
+     * @param message description of the missing argument; available through {@link #getMessage()}
+     */
+    public MissingArgumentException(String message) {
+        super(message);
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java
new file mode 100644
index 00000000..765fd3de
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java
@@ -0,0 +1,137 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.InvalidQuotingTokenizer;
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.exception.MissingArgumentException;
+import cz.cvut.spipes.modules.model.Region;
+import cz.cvut.spipes.registry.StreamResource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.supercsv.io.CsvListReader;
+import org.supercsv.io.ICsvListReader;
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * {@link StreamReaderAdapter} that parses delimiter-separated text with Super CSV's {@link CsvListReader}.
+ * <p>
+ * Rows are read from the content of the {@link StreamResource} given to {@code initialise};
+ * the {@code InputStream} passed to that method is not consumed by this adapter.
+ */
+public class CSVStreamReaderAdapter implements StreamReaderAdapter {
+    private static final Logger log = LoggerFactory.getLogger(CSVStreamReaderAdapter.class);
+
+    private final CsvPreference csvPreference;
+    private ICsvListReader listReader;
+    String[] header;
+    // First line of the input, queued by getHeader(true) so that getNextRow() returns it as data.
+    String[] firstRow;
+    boolean acceptInvalidQuoting;
+    Charset inputCharset;
+    StreamResource sourceResource;
+
+    /**
+     * @param quoteCharacter quote character of the Super CSV preference used for reading
+     * @param delimiter      column delimiter of the Super CSV preference used for reading
+     */
+    public CSVStreamReaderAdapter(char quoteCharacter, int delimiter) {
+        this.csvPreference = new CsvPreference.Builder(quoteCharacter, delimiter, System.lineSeparator()).build();
+    }
+
+    /**
+     * Stores the reading options and opens the list reader over the content of {@code sourceResource}.
+     * {@code inputStream}, {@code sourceResourceFormat} and {@code tableIndex} are not used by this adapter.
+     *
+     * @throws MissingArgumentException if invalid quoting is accepted but the quote character is {@code '\0'}
+     */
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
+                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+        this.acceptInvalidQuoting = acceptInvalidQuoting;
+        this.inputCharset = inputCharset;
+        this.sourceResource = sourceResource;
+        if (acceptInvalidQuoting && getQuote() == '\0') {
+            throwMissingQuoteError();
+        }
+        listReader = getCsvListReader(csvPreference);
+    }
+
+    /**
+     * Reads the first line of the input.
+     *
+     * @param skipHeader if {@code true}, the line is also queued so that {@link #getNextRow()}
+     *                   hands it out as the first row
+     * @return cells of the first line as returned by {@code listReader.getHeader(true)}
+     */
+    @Override
+    public String[] getHeader(boolean skipHeader) throws IOException {
+        header = listReader.getHeader(true);
+        if (skipHeader) {
+            firstRow = header;
+        }
+        return header;
+    }
+
+    /**
+     * @return the queued first line if one is pending, otherwise the result of reading the next line
+     */
+    @Override
+    public List<String> getNextRow() throws IOException {
+        if (firstRow == null) {
+            return listReader.read();
+        }
+        List<String> queuedRow = Arrays.asList(firstRow);
+        firstRow = null;
+        return queuedRow;
+    }
+
+    /** This adapter reports no merged regions. */
+    @Override
+    public List<Region> getMergedRegions() {
+        return new ArrayList<>();
+    }
+
+    /** This adapter provides no sheet label. */
+    @Override
+    public String getSheetLabel() {
+        return null;
+    }
+
+    /** Closes the list reader; does nothing when {@code initialise} has not opened one. */
+    @Override
+    public void close() throws IOException {
+        if (listReader != null) {
+            listReader.close();
+        }
+    }
+
+    // Builds the list reader, plugging in InvalidQuotingTokenizer when invalid quoting is accepted.
+    private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
+        if (acceptInvalidQuoting) {
+            return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
+        }
+        return new CsvListReader(getReader(), csvPreference);
+    }
+
+    private Reader getReader() {
+        return new StringReader(new String(sourceResource.getContent(), inputCharset));
+    }
+
+    public char getQuote() {
+        return csvPreference.getQuoteChar();
+    }
+
+    private void throwMissingQuoteError() throws MissingArgumentException {
+        String message = "Quote character must be specified when using custom tokenizer.";
+        log.error(message);
+        throw new MissingArgumentException(message);
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java
new file mode 100644
index 00000000..a69b6160
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java
@@ -0,0 +1,40 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.model.Region;
+import cz.cvut.spipes.registry.StreamResource;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.List;
+
+/**
+ * Row-by-row access to a tabular source, independent of the underlying format.
+ * <p>
+ * Extends {@link AutoCloseable} so that an adapter can be managed by try-with-resources.
+ */
+public interface StreamReaderAdapter extends AutoCloseable {
+    /** Prepares the adapter for reading the given source with the given options. */
+    void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
+                    boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException;
+
+    /**
+     * @param skipHeader whether the caller treats the source as having no header row; the effect is adapter specific
+     * @return cells of the first row of the table
+     */
+    String[] getHeader(boolean skipHeader) throws IOException;
+
+    /** @return cells of the next row, or {@code null} when no row is left */
+    List<String> getNextRow() throws IOException;
+
+    /** @return merged cell regions of the table; empty if the adapter has none to report */
+    List<Region> getMergedRegions();
+
+    /** @return label of the processed table or sheet, or {@code null} if the adapter provides none */
+    String getSheetLabel() throws IOException;
+
+    /** Releases the underlying reader; narrows {@link AutoCloseable#close()} to {@link IOException}. */
+    @Override
+    void close() throws IOException;
+}

From 03a92728d4c5299ce08e7ceb9ab38245b4a41385 Mon Sep 17 00:00:00 2001
From: Miroslav Blasko <blcham@gmail.com>
Date: Tue, 7 Jan 2025 18:10:49 +0100
Subject: [PATCH 3/5] [#228] Tabular Module now uses adapters

---
 .../cz/cvut/spipes/modules/TabularModule.java |  66 ++++-----
 .../modules/util/HTMLStreamReaderAdapter.java | 130 ++++++++++++++++++
 .../modules/util/XLSStreamReaderAdapter.java  | 114 +++++++++++++++
 3 files changed, 268 insertions(+), 42 deletions(-)
 create mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
 create mode 100644 s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java

diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
index cb9d4136..91972aac 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -14,7 +14,6 @@
 import cz.cvut.spipes.exception.SPipesException;
 import cz.cvut.spipes.modules.annotations.SPipesModule;
 import cz.cvut.spipes.modules.exception.MissingArgumentException;
-import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
 import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
 import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
 import cz.cvut.spipes.modules.handlers.ModeHandler;
@@ -197,10 +196,7 @@ ExecutionContext executeSelf() {
                 if (processTableAtIndex != 1) {
                     throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
                 }
-                tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
-                table.setLabel(tsvConvertor.getTableName(sourceResource));
-                setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
+                streamReaderAdapter = new HTMLStreamReaderAdapter();
                 break;
             case XLS:
             case XLSM:
@@ -208,19 +204,7 @@ ExecutionContext executeSelf() {
                 if (processTableAtIndex == 0) {
                     throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
                 }
-                tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
-                int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
-                table.setLabel(tsvConvertor.getTableName(sourceResource));
-                LOG.debug("Number of sheets: {}", numberOfSheets);
-                if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
-                    LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
-                        numberOfSheets,
-                            processTableAtIndex
-                    );
-                    throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
-                }
-                setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
+                streamReaderAdapter = new XLSStreamReaderAdapter();
                 break;
             default:
                 streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, delimiter);
@@ -320,14 +304,6 @@ ExecutionContext executeSelf() {
                     // 4.6.8.7 - else, if cellValue is not null
                 }
             }
-            streamReaderAdapter.close();
-        } catch (MissingArgumentException e) {
-            if (ExecutionConfig.isExitOnError()) {
-                return getExecutionContext(inputModel, outputModel);
-            }
-        } catch (IOException e) {
-            LOG.error("Error while reading file from resource uri {}", sourceResource, e);
-        }
 
         tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
         tableSchema.setColumnsSet(new HashSet<>(outputColumns));
@@ -337,26 +313,32 @@ ExecutionContext executeSelf() {
         em.persist(tableGroup);
         em.merge(tableSchema);
 
-        if (tsvConvertor != null) {
-            List<Region> regions =  tsvConvertor.getMergedRegions(originalSourceResource);
-
-            int cellsNum = 1;
-            for (Region region : regions) {
-                int firstCellInRegionNum = cellsNum;
-                for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
-                    for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
-                        Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
-                        cell.setRow(tableSchema.createAboutUrl(i));
-                        cell.setColumn(outputColumns.get(j).getUri().toString());
-                        if (cellsNum != firstCellInRegionNum) {
-                            cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
-                        }
-                        em.merge(cell);
-                        cellsNum++;
+        List<Region> regions = streamReaderAdapter.getMergedRegions();
+
+        int cellsNum = 1;
+        for (Region region : regions) {
+            int firstCellInRegionNum = cellsNum;
+            for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
+                for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
+                    Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
+                    cell.setRow(tableSchema.createAboutUrl(i));
+                    cell.setColumn(outputColumns.get(j).getUri().toString());
+                    if (cellsNum != firstCellInRegionNum) {
+                        cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
                     }
+                    em.merge(cell);
+                    cellsNum++;
                 }
             }
         }
+        streamReaderAdapter.close();
+        } catch (MissingArgumentException e) {
+                if (ExecutionConfig.isExitOnError()) {
+                    return getExecutionContext(inputModel, outputModel);
+                }
+        } catch (IOException e) {
+            LOG.error("Error while reading file from resource uri {}", sourceResource, e);
+        }
 
         em.getTransaction().commit();
         Model persistedModel = JopaPersistenceUtils.getDataset(em).getDefaultModel();
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
new file mode 100644
index 00000000..44ddb5e5
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
@@ -0,0 +1,130 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.model.Region;
+import cz.cvut.spipes.registry.StreamResource;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.util.*;
+
+public class HTMLStreamReaderAdapter implements StreamReaderAdapter {
+    private Elements rows;
+    private int currentIndex;
+    private Element table;
+    private String label;
+
+    private List<Region> mergedRegions;
+    private Map<Integer, Map<Integer, String>> mergedCells;
+
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat,
+                           int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+        Document doc = Jsoup.parse(inputStream, "UTF-8", "");
+        Element table = doc.select("table").first();
+        rows = table.select("tr");
+        currentIndex = 0;
+        this.table = table;
+        mergedRegions = extractMergedRegions(table);
+        mergedCells = new HashMap<>();
+        label = table.attr("data-name");
+    }
+
+
+    @Override
+    public String[] getHeader(boolean skipHeader) throws IOException {
+        Elements headerCells = rows.get(0).select("th, td");
+        return headerCells.stream()
+                .map(Element::text)
+                .toArray(String[]::new);
+    }
+
+    private boolean hasNextRow() {
+        return currentIndex < rows.size() - 1; // Skip header row
+    }
+
+    @Override
+    public List<String> getNextRow() {
+        if (!hasNextRow()) {
+            return null;
+        }
+
+        currentIndex++;
+        Elements cells = rows.get(currentIndex).select("td, th");
+        List<String> row = new ArrayList<>();
+        int cellIndex = 0;
+
+        for (Element cell : cells) {
+            int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan"));
+            int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan"));
+            String cellValue = cell.text();
+
+            if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) {
+                cellValue = cellValue.replace(",", ".");
+            }
+
+            while (row.size() < cellIndex) {
+                row.add(null);
+            }
+
+            row.add(cellValue);
+
+            for (int i = 1; i < colspan; i++) {
+                row.add(null);
+            }
+
+            if (rowspan > 1) {
+                for (int i = 1; i < rowspan; i++) {
+                    mergedCells.computeIfAbsent(currentIndex + i, k -> new HashMap<>()).put(cellIndex, cellValue);
+                }
+            }
+
+            cellIndex += colspan;
+        }
+
+        if (mergedCells.containsKey(currentIndex)) {
+            Map<Integer, String> rowMergedCells = mergedCells.get(currentIndex);
+            for (Map.Entry<Integer, String> entry : rowMergedCells.entrySet()) {
+                row.add(entry.getKey(), null);
+            }
+            mergedCells.remove(currentIndex);
+        }
+
+        return row;
+    }
+
+    @Override
+    public List<Region> getMergedRegions() {
+        return mergedRegions;
+    }
+
+    private List<Region> extractMergedRegions(Element table) {
+        List<Region> regions = new ArrayList<>();
+        Elements rows = table.select("tr");
+        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
+            Elements cells = rows.get(rowIndex).select("td, th");
+            for (int colIndex = 0; colIndex < cells.size(); colIndex++) {
+                Element cell = cells.get(colIndex);
+                int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan"));
+                int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan"));
+                if (colspan > 1 || rowspan > 1) {
+                    regions.add(new Region(rowIndex, colIndex, rowIndex + rowspan - 1, colIndex + colspan - 1));
+                }
+            }
+        }
+        return regions;
+    }
+
+    @Override
+    public String getSheetLabel(){
+        return label;
+    }
+
+    @Override
+    public void close() {
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
new file mode 100644
index 00000000..6e29a321
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
@@ -0,0 +1,114 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
+import cz.cvut.spipes.modules.model.Region;
+import cz.cvut.spipes.registry.StreamResource;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.util.CellRangeAddress;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+public class XLSStreamReaderAdapter implements StreamReaderAdapter {
+    private Sheet sheet;
+    private Iterator<org.apache.poi.ss.usermodel.Row> rowIterator;
+    Boolean skipHeader;
+    private static final Logger LOG = LoggerFactory.getLogger(XLSStreamReaderAdapter.class);
+
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
+                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+        Workbook workbook;
+        if (sourceResourceFormat == ResourceFormat.XLS) {
+            workbook = new HSSFWorkbook(inputStream);
+        } else {
+            workbook = new XSSFWorkbook(inputStream);
+        }
+        if ((tableIndex > workbook.getNumberOfSheets()) || (tableIndex < 1)) {
+                LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
+                        workbook.getNumberOfSheets(),
+                        tableIndex
+                );
+                    throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
+                }
+        sheet = workbook.getSheetAt(tableIndex - 1);
+        rowIterator = sheet.iterator();
+    }
+
+    @Override
+    public String[] getHeader(boolean skipHeader) throws IOException {
+        Row headerRow = sheet.getRow(0);
+        if (skipHeader) {
+            return null;
+        }
+        else {
+            rowIterator.next(); // move iterator to 2nd row
+            return StreamSupport.stream(headerRow.spliterator(), false)
+                    .map(cell -> cell.getStringCellValue())
+                    .toArray(String[]::new);
+        }
+    }
+
+    @Override
+    public List<String> getNextRow() {
+        if (!rowIterator.hasNext())
+            return null;
+        Row currentRow = rowIterator.next();
+        DataFormatter formatter = new DataFormatter();
+        List<String> row = StreamSupport.stream(currentRow.spliterator(), false)
+                .map(cell -> {
+                    String cellValue = formatter.formatCellValue(cell);
+                    cellValue = fixNumberFormat(cellValue);
+                    return cellValue.isEmpty() ? null : cellValue;
+                })
+                .collect(Collectors.toList());
+        return row;
+    }
+
+    @Override
+    public List<Region> getMergedRegions() {
+        List<Region> regions = new ArrayList<>();
+        for (int i = 0; i < sheet.getNumMergedRegions(); i++) {
+            CellRangeAddress region = sheet.getMergedRegion(i);
+            regions.add(new Region(
+                    region.getFirstRow(),
+                    region.getFirstColumn(),
+                    region.getLastRow(),
+                    region.getLastColumn()
+            ));
+        }
+        return regions;
+    }
+
+    @Override
+    public String getSheetLabel(){
+        return sheet.getSheetName();
+    }
+
+    public String fixNumberFormat (String cellValue){
+        //xls uses ',' as decimal separator, so we should convert it to '.'
+        if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) {
+            cellValue = cellValue.replace(",", ".");
+        }
+        return cellValue;
+    }
+
+    @Override
+    public void close() {
+    }
+}
+

From 18a893b3e8aff559051c3a2fa6b758e192f03ac2 Mon Sep 17 00:00:00 2001
From: Evgenii Grigorev <grigoevg@fel.cvut.cz>
Date: Wed, 8 Jan 2025 11:16:02 +0100
Subject: [PATCH 4/5] [#228] Implemented suggested changes

---
 .../cz/cvut/spipes/modules/TabularModule.java | 24 +++++++++++++++++--
 .../modules/util/CSVStreamReaderAdapter.java  |  9 ++++---
 .../modules/util/HTMLStreamReaderAdapter.java |  3 +--
 .../modules/util/StreamReaderAdapter.java     |  5 ++--
 .../modules/util/XLSStreamReaderAdapter.java  |  5 ++--
 5 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
index 91972aac..22b94eeb 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -91,6 +91,26 @@
  * Does not support custom table URIs. <br/>
  * Does not support processing of multiple files.<br/>
  * Does not support the <i>suppress output</i> annotation.
+ *
+ * The header processing uses this logic:
+ *
+ * If we have a schema, and we should skip the header:
+ * - not calling getHeader()
+ * - assume that data looks like in schema
+ *
+ * If we have a schema,and we should not skip the header:
+ * - calling getHeader()
+ * - adapt schema to match header of the file
+ *     - if ordering is not specified use ordering or the header
+ *     - reuse column IRIs from Schema
+ *
+ * If we don't have a schema, and we should skip the header:
+ * - not calling getHeader()
+ * - create column names column_1, column_2, etc.
+ *
+ * If we don't have a schema, and we should not skip the header:
+ * - calling getHeader()
+ * - create schema entirely based on the header
  */
 @SPipesModule(label = "Tabular module", comment = "Module for converting tabular data (e.g. CSV or TSV) to RDF")
 public class TabularModule extends AnnotatedAbstractModule {
@@ -207,7 +227,7 @@ ExecutionContext executeSelf() {
                 streamReaderAdapter = new XLSStreamReaderAdapter();
                 break;
             default:
-                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, delimiter);
+                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, delimiter, acceptInvalidQuoting, inputCharset);
                 break;
         }
 
@@ -224,7 +244,7 @@ ExecutionContext executeSelf() {
 
         try {
             streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()),
-                sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource);
+                sourceResourceFormat, processTableAtIndex, sourceResource);
             String[] header = streamReaderAdapter.getHeader(skipHeader);;
             Set<String> columnNames = new HashSet<>();
 
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java
index 765fd3de..03faafd3 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java
@@ -30,15 +30,14 @@ public class CSVStreamReaderAdapter implements StreamReaderAdapter {
     StreamResource sourceResource;
     private final static Logger log = LoggerFactory.getLogger(CSVStreamReaderAdapter.class);
 
-    public CSVStreamReaderAdapter(char quoteCharacter, int delimiter) {
+    public CSVStreamReaderAdapter(char quoteCharacter, int delimiter, boolean acceptInvalidQuoting, Charset inputCharset) {
         this.csvPreference = new CsvPreference.Builder(quoteCharacter, delimiter, System.lineSeparator()).build();
+        this.acceptInvalidQuoting = acceptInvalidQuoting;
+        this.inputCharset = inputCharset;
     }
 
     @Override
-    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
-                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
-        this.acceptInvalidQuoting = acceptInvalidQuoting;
-        this.inputCharset = inputCharset;
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, StreamResource sourceResource) throws IOException {
         this.sourceResource = sourceResource;
         listReader = getCsvListReader(csvPreference);
         if (listReader == null) {
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
index 44ddb5e5..d239b2c7 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
@@ -9,7 +9,6 @@
 import org.jsoup.select.Elements;
 
 import java.io.*;
-import java.nio.charset.Charset;
 import java.util.*;
 
 public class HTMLStreamReaderAdapter implements StreamReaderAdapter {
@@ -23,7 +22,7 @@ public class HTMLStreamReaderAdapter implements StreamReaderAdapter {
 
     @Override
     public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat,
-                           int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+                           int tableIndex, StreamResource sourceResource) throws IOException {
         Document doc = Jsoup.parse(inputStream, "UTF-8", "");
         Element table = doc.select("table").first();
         rows = table.select("tr");
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java
index a69b6160..41da79b4 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java
@@ -6,14 +6,13 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.Charset;
 import java.util.List;
 
 public interface StreamReaderAdapter {
-    void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException;
+    void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, StreamResource sourceResource) throws IOException;
     String[] getHeader(boolean skipHeader) throws IOException;
     List<String> getNextRow() throws IOException;
     List<Region> getMergedRegions();
-    String getSheetLabel() throws IOException;
+    String getSheetLabel();
     void close() throws IOException;
 }
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
index 6e29a321..88d1b56c 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
@@ -16,7 +16,6 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -30,14 +29,14 @@ public class XLSStreamReaderAdapter implements StreamReaderAdapter {
     private static final Logger LOG = LoggerFactory.getLogger(XLSStreamReaderAdapter.class);
 
     @Override
-    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
-                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, StreamResource sourceResource) throws IOException {
         Workbook workbook;
         if (sourceResourceFormat == ResourceFormat.XLS) {
             workbook = new HSSFWorkbook(inputStream);
         } else {
             workbook = new XSSFWorkbook(inputStream);
         }
+        LOG.debug("Number of sheets: {}", workbook.getNumberOfSheets());
         if ((tableIndex > workbook.getNumberOfSheets()) || (tableIndex < 1)) {
                 LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
                         workbook.getNumberOfSheets(),

From 0119833389a9e1d2d7e2eb90337371b7aaa7a4ac Mon Sep 17 00:00:00 2001
From: Miroslav Blasko <blcham@gmail.com>
Date: Wed, 8 Jan 2025 13:42:07 +0100
Subject: [PATCH 5/5] [#228] Improve documentation about data schema

---
 .../cz/cvut/spipes/modules/TabularModule.java | 35 ++++++++-----------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
index 22b94eeb..43c51df0 100644
--- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -57,7 +57,7 @@
  * <p>The implementation loosely follows the W3C Recommendation described here:
  * <a href="https://www.w3.org/TR/csv2rdf/">Generating RDF from Tabular Data on the Web</a></p>
  * <p>
- * Within the recommendation, it is possible to define schema
+ * Within the recommendation, it is possible to specify schema
  * defining the shape of the output RDF data
  * (i.e. the input metadata values used for the conversion)
  * using csvw:tableSchema.<br/>
@@ -78,6 +78,19 @@
  *         ]
  * ]
  * </code></pre>
+ * The table schema can be provided in the input RDF data ("input schema") and is also included in the output RDF
+ * data ("output schema") of this module. If an input schema is provided, the output schema should consistently
+ * extend it. The following situations can occur:
+ * 1) there is no input schema in the input RDF data of this module
+ *   a) {@link TabularModule#skipHeader} is false -- the output schema is created based on the header of the input file
+ *   b) {@link TabularModule#skipHeader} is true -- the output schema is created based on the number of columns,
+ *      using the column names "column_1", "column_2", etc.
+ * 2) there is an input schema in the input RDF data of this module
+ *   a) {@link TabularModule#skipHeader} is false -- the output schema consistently extends the input schema. This
+ *      is typically used when the input schema does not define the order of columns; the output schema is then
+ *      extended with the column order taken from the header of the input file.
+ *   b) {@link TabularModule#skipHeader} is true -- the output schema is reused from the input RDF data
+ *
  * <p>
  * This module can also be used to process HTML tables, see option {@link TabularModule#sourceResourceFormat}.
  * First, the HTML table is converted to TSV while replacing "\t" with two spaces
@@ -91,26 +104,6 @@
  * Does not support custom table URIs. <br/>
  * Does not support processing of multiple files.<br/>
  * Does not support the <i>suppress output</i> annotation.
- *
- * The header processing uses this logic:
- *
- * If we have a schema, and we should skip the header:
- * - not calling getHeader()
- * - assume that data looks like in schema
- *
- * If we have a schema,and we should not skip the header:
- * - calling getHeader()
- * - adapt schema to match header of the file
- *     - if ordering is not specified use ordering or the header
- *     - reuse column IRIs from Schema
- *
- * If we don't have a schema, and we should skip the header:
- * - not calling getHeader()
- * - create column names column_1, column_2, etc.
- *
- * If we don't have a schema, and we should not skip the header:
- * - calling getHeader()
- * - create schema entirely based on the header
  */
 @SPipesModule(label = "Tabular module", comment = "Module for converting tabular data (e.g. CSV or TSV) to RDF")
 public class TabularModule extends AnnotatedAbstractModule {