Skip to content

Commit

Permalink
[#228] HTML and XLS files are now being processed directly
Browse files Browse the repository at this point in the history
  • Loading branch information
Evgenii Grigorev committed Dec 22, 2024
1 parent d53c74c commit 17cbbf8
Show file tree
Hide file tree
Showing 5 changed files with 444 additions and 106 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import cz.cvut.spipes.exception.ResourceNotUniqueException;
import cz.cvut.spipes.exception.SPipesException;
import cz.cvut.spipes.modules.annotations.SPipesModule;
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
import cz.cvut.spipes.modules.model.*;
Expand All @@ -30,6 +29,7 @@
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
Expand Down Expand Up @@ -112,12 +112,12 @@ public class TabularModule extends AnnotatedAbstractModule {
private final Property P_PROCESS_TABLE_AT_INDEX = getSpecificParameter("process-table-at-index");

@Parameter(iri = SML.replace, comment = "Specifies whether a module should overwrite triples" +
" from its predecessors. When set to true (default is false), it prevents" +
" passing through triples from the predecessors.")
" from its predecessors. When set to true (default is false), it prevents" +
" passing through triples from the predecessors.")
private boolean isReplace = false;

@Parameter(iri = PARAM_URL_PREFIX + "source-resource-uri", comment = "URI of resource" +
" that represent tabular data (e.g. resource representing CSV file).")
" that represent tabular data (e.g. resource representing CSV file).")
private StreamResource sourceResource;

@Parameter(iri = PARAM_URL_PREFIX + "delimiter", comment = "Column delimiter. Default value is comma ','.")
Expand Down Expand Up @@ -188,6 +188,8 @@ ExecutionContext executeSelf() {

StreamResource originalSourceResource = sourceResource;
TSVConvertor tsvConvertor = null;
StreamReaderAdapter streamReaderAdapter = new XLSStreamReaderAdapter();
CsvPreference csvPreference = null;

switch (sourceResourceFormat) {
case HTML:
Expand All @@ -197,30 +199,22 @@ ExecutionContext executeSelf() {
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
}
tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
table.setLabel(tsvConvertor.getTableName(sourceResource));
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
streamReaderAdapter = new HTMLStreamReaderAdapter();
break;
case XLS:
case XLSM:
case XLSX:
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}
tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
table.setLabel(tsvConvertor.getTableName(sourceResource));
LOG.debug("Number of sheets: {}", numberOfSheets);
if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
numberOfSheets,
processTableAtIndex
);
throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
}
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
streamReaderAdapter = new XLSStreamReaderAdapter();
break;
default:
csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();
streamReaderAdapter = new CSVStreamReaderAdapter(csvPreference);
break;
}

Expand All @@ -235,33 +229,26 @@ ExecutionContext executeSelf() {
List<Column> outputColumns = new ArrayList<>();
List<Statement> rowStatements = new ArrayList<>();

CsvPreference csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();

try {
ICsvListReader listReader = getCsvListReader(csvPreference);

if (listReader == null) {
logMissingQuoteError();
return getExecutionContext(inputModel, outputModel);
}
streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()),
sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource);
String[] header = streamReaderAdapter.getHeader(skipHeader);
Set<String> columnNames = new HashSet<>();

String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
if (streamReaderAdapter.getSheetLabel() != null)
table.setLabel(streamReaderAdapter.getSheetLabel());

if (header == null) {
LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
return getExecutionContext(inputModel, outputModel);
}
Set<String> columnNames = new HashSet<>();

TableSchema inputTableSchema = getTableSchema(em);
hasInputSchema = hasInputSchema(inputTableSchema);

if (skipHeader) {
header = getHeaderFromSchema(inputModel, header, hasInputSchema);
listReader = new CsvListReader(getReader(), csvPreference);
//streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
} else if (hasInputSchema) {
header = getHeaderFromSchema(inputModel, header, true);
}
Expand All @@ -280,17 +267,16 @@ ExecutionContext executeSelf() {

tableSchema.setAboutUrl(schemaColumn, sourceResource.getUri());
schemaColumn.setProperty(
dataPrefix,
sourceResource.getUri(),
hasInputSchema ? tableSchema.getColumn(columnName) : null);
dataPrefix,
sourceResource.getUri(),
hasInputSchema ? tableSchema.getColumn(columnName) : null);
schemaColumn.setTitle(columnTitle);
if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName);
}

List<String> row;
int rowNumber = 0;
//for each row
while ((row = listReader.read()) != null) {
List<String> row;
while ((row = streamReaderAdapter.getNextRow()) != null) {
rowNumber++;
// 4.6.1 and 4.6.3
Row r = new Row();
Expand Down Expand Up @@ -328,37 +314,35 @@ ExecutionContext executeSelf() {
// 4.6.8.7 - else, if cellValue is not null
}
}
listReader.close();
} catch (IOException | MissingArgumentException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));
tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));

em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);
em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);

if (tsvConvertor != null) {
List<Region> regions = tsvConvertor.getMergedRegions(originalSourceResource);
List<Region> regions = streamReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
int firstCellInRegionNum = cellsNum;
for(int i = region.getFirstRow();i <= region.getLastRow();i++){
for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) {
Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum));
for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
cell.setRow(tableSchema.createAboutUrl(i));
cell.setColumn(outputColumns.get(j).getUri().toString());
if(cellsNum != firstCellInRegionNum)
cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum));
if (cellsNum != firstCellInRegionNum) {
cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
}
em.merge(cell);
cellsNum++;
}
}
}
streamReaderAdapter.close();
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

em.getTransaction().commit();
Expand All @@ -380,41 +364,31 @@ private String getValueFromRow(List<String> row, int index, int expectedRowLengt
StringBuilder record = new StringBuilder(recordDelimiter);
for (int i = 0; i < row.size(); i++) {
record
.append(i)
.append(":")
.append(row.get(i))
.append(recordDelimiter);
.append(i)
.append(":")
.append(row.get(i))
.append(recordDelimiter);
}
LOG.error("Reading input file failed when reading record #{} (may not reflect the line #).\n" +
" It was expected that the current record contains {} values" +
", but {}. element was not retrieved before whole record was processed.\n" +
"The problematic record: {}",
currentRecordNumber,
expectedRowLength,
index+1,
record
" It was expected that the current record contains {} values" +
", but {}. element was not retrieved before whole record was processed.\n" +
"The problematic record: {}",
currentRecordNumber,
expectedRowLength,
index+1,
record
);
throw new SPipesException("Reading input file failed.", e);
}
}

/**
 * Builds a CSV list reader over the source resource using the given preferences.
 * <p>
 * When {@code acceptInvalidQuoting} is enabled, a lenient tokenizer
 * ({@code InvalidQuotingTokenizer}) is used so that malformed quoting in the input
 * does not abort parsing. In that mode a quote character must be configured:
 * if the configured quote is {@code '\0'} (i.e. none), {@code null} is returned
 * and the caller is expected to report the missing-quote error.
 *
 * @param csvPreference parsing preferences (delimiter, quote character, line separator)
 * @return a reader over the tabular content, or {@code null} when invalid quoting is
 *         accepted but no quote character is set
 */
private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
    if (acceptInvalidQuoting) {
        if (getQuote() == '\0') {
            // No quote character configured -- lenient tokenizing cannot work without one.
            return null;
        }
        return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
    }
    return new CsvListReader(getReader(), csvPreference);
}

private Statement createRowResource(String cellValue, int rowNumber, Column column) {
Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));

return ResourceFactory.createStatement(
rowResource,
ResourceFactory.createProperty(column.getPropertyUrl()),
ResourceFactory.createPlainLiteral(cellValue));
rowResource,
ResourceFactory.createProperty(column.getPropertyUrl()),
ResourceFactory.createPlainLiteral(cellValue));
}

private boolean hasInputSchema(TableSchema inputTableSchema) {
Expand All @@ -428,11 +402,11 @@ private boolean hasInputSchema(TableSchema inputTableSchema) {

private TableSchema getTableSchema(EntityManager em) {
TypedQuery<TableSchema> query = em.createNativeQuery(
"PREFIX csvw: <http://www.w3.org/ns/csvw#>\n" +
"SELECT ?t WHERE { \n" +
"?t a csvw:TableSchema. \n" +
"}",
TableSchema.class
"PREFIX csvw: <http://www.w3.org/ns/csvw#>\n" +
"SELECT ?t WHERE { \n" +
"?t a csvw:TableSchema. \n" +
"}",
TableSchema.class
);

int tableSchemaCount = query.getResultList().size();
Expand All @@ -451,14 +425,14 @@ private TableSchema getTableSchema(EntityManager em) {

private void throwNotUniqueException(Column column, String columnTitle, String columnName) {
throw new ResourceNotUniqueException(
String.format("Unable to create value of property %s due to collision. " +
"Both column titles '%s' and '%s' are normalized to '%s' " +
"and thus would refer to the same property url <%s>.",
CSVW.propertyUrl,
columnTitle,
column.getTitle(),
columnName,
column.getPropertyUrl()));
String.format("Unable to create value of property %s due to collision. " +
"Both column titles '%s' and '%s' are normalized to '%s' " +
"and thus would refer to the same property url <%s>.",
CSVW.propertyUrl,
columnTitle,
column.getTitle(),
columnName,
column.getPropertyUrl()));
}

private ExecutionContext getExecutionContext(Model inputModel, Model outputModel) {
Expand All @@ -472,12 +446,12 @@ private ExecutionContext getExecutionContext(Model inputModel, Model outputModel
@Override
public void loadManualConfiguration() {
sourceResourceFormat = ResourceFormat.fromString(
getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue())
getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue())
);
delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier(sourceResourceFormat));
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(sourceResourceFormat));
outputMode = Mode.fromResource(
getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
);
setInputCharset(delimiter);
}
Expand Down Expand Up @@ -513,7 +487,7 @@ private Supplier<Character> getDefaultQuoteCharacterSupplier(ResourceFormat sour
if (sourceResourceFormat == ResourceFormat.CSV) {
return () -> {
LOG.debug("Quote character not specified, using double-quote as default value" +
" to be compliant with RFC 4180 (CSV)");
" to be compliant with RFC 4180 (CSV)");
return '"';
};
}
Expand All @@ -523,8 +497,8 @@ private Supplier<Character> getDefaultQuoteCharacterSupplier(ResourceFormat sour
private char getPropertyValue(Property property,
Supplier<Character> defaultValueSupplier) {
return Optional.ofNullable(getPropertyValue(property))
.map(n -> n.asLiteral().getChar())
.orElseGet(defaultValueSupplier);
.map(n -> n.asLiteral().getChar())
.orElseGet(defaultValueSupplier);
}

@Override
Expand Down Expand Up @@ -586,10 +560,6 @@ private String normalize(String label) {
return label.trim().replaceAll("[^\\w]", "_");
}

/**
 * Opens a character reader over the source resource's bytes, decoded with the
 * configured input charset.
 */
private Reader getReader() {
    String decodedContent = new String(sourceResource.getContent(), inputCharset);
    return new StringReader(decodedContent);
}

@NotNull
private StreamResource getResourceByUri(@NotNull String resourceUri) {

Expand Down Expand Up @@ -623,7 +593,7 @@ public int getDelimiter() {

public void setDelimiter(int delimiter) {
if ((sourceResourceFormat == ResourceFormat.CSV && delimiter != ',') ||
(sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) {
(sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) {
throw new SpecificationNonComplianceException(sourceResourceFormat, delimiter);
}
this.delimiter = delimiter;
Expand Down
Loading

0 comments on commit 17cbbf8

Please sign in to comment.