diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java index f8695901..74fc420f 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java @@ -87,7 +87,7 @@ * and then processed as usual. * Take a look at the option {@link TabularModule#sourceResourceFormat} and class {@link HTML2TSVConvertor} for more details. * Also, in a similar way this module can process XLS tables. Note, that processing multiple sheets isn't supported, - * so {@link TabularModule#processSpecificSheetInXLSFile} parameter is required (range 1...number of sheets). + * so {@link TabularModule#processTableAtIndex} parameter is required (range 1...number of sheets). *

* Important notes (differences from the recommendation):
* Does not support custom table group URIs.
@@ -109,7 +109,7 @@ public class TabularModule extends AbstractModule { private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri"); private final Property P_SKIP_HEADER = getSpecificParameter("skip-header"); private final Property P_SOURCE_RESOURCE_FORMAT = getSpecificParameter("source-resource-format"); - private final Property P_PROCESS_SPECIFIC_SHEET_IN_XLS_FILE = getSpecificParameter("process-specific-sheet-in-xls-file"); + private final Property P_PROCESS_TABLE_AT_INDEX = getSpecificParameter("process-table-at-index"); //sml:replace @Parameter(urlPrefix = SML.uri, name = "replace", comment = "Specifies whether a module should overwrite triples" + @@ -138,11 +138,11 @@ public class TabularModule extends AbstractModule { @Parameter(urlPrefix = PARAM_URL_PREFIX, name = "skip-header", comment = "Skip header. Default is false.") private boolean skipHeader; - //:process-specific-sheet-in-xls-file + //:process-table-at-index /** - * Required parameter that indicates that only specific single sheet should be converted + * Required parameter for HTML and EXCEL files that indicates that only specific single table should be processed */ - private int processSpecificSheetInXLSFile; + private int processTableAtIndex; //:output-mode // TODO - revise comment @@ -198,7 +198,13 @@ ExecutionContext executeSelf() { switch (sourceResourceFormat) { case HTML: - tsvConvertor = new HTML2TSVConvertor(); + if (processTableAtIndex == 0) { + throw new SheetIsNotSpecifiedException("Source resource format is set to HTML file but no specific table is set for processing."); + } + if (processTableAtIndex != 1) { + throw new UnsupportedOperationException("Not implemented yet."); + } + tsvConvertor = new HTML2TSVConvertor(processTableAtIndex); table.setLabel(tsvConvertor.getTableName(sourceResource)); setSourceResource(tsvConvertor.convertToTSV(sourceResource)); setDelimiter('\t'); @@ -206,17 +212,17 @@ ExecutionContext executeSelf() { case XLS: case XLSM: case XLSX: - if (processSpecificSheetInXLSFile == 0) { - throw new SheetIsNotSpecifiedException("Source resource format is set to XLS file but no specific sheet is set for processing."); + if (processTableAtIndex == 0) { + throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing."); } - tsvConvertor = new XLS2TSVConvertor(processSpecificSheetInXLSFile, sourceResourceFormat); - int numberOfSheets = tsvConvertor.getNumberTables(sourceResource); + tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat); + int numberOfSheets = tsvConvertor.getTablesCount(sourceResource); table.setLabel(tsvConvertor.getTableName(sourceResource)); LOG.debug("Number of sheets: {}", numberOfSheets); - if ((processSpecificSheetInXLSFile > numberOfSheets) || (processSpecificSheetInXLSFile < 1)) { + if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) { LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}", numberOfSheets, - processSpecificSheetInXLSFile + processTableAtIndex ); throw new SheetDoesntExistsException("Requested sheet doesn't exists."); } @@ -478,7 +484,7 @@ public void loadConfiguration() { delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier(sourceResourceFormat)); isReplace = getPropertyValue(SML.replace, false); skipHeader = getPropertyValue(P_SKIP_HEADER, false); - processSpecificSheetInXLSFile = getPropertyValue(P_PROCESS_SPECIFIC_SHEET_IN_XLS_FILE, 0); + processTableAtIndex = getPropertyValue(P_PROCESS_TABLE_AT_INDEX, 0); acceptInvalidQuoting = getPropertyValue(P_ACCEPT_INVALID_QUOTING, false); quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(sourceResourceFormat)); dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString(); @@ -668,8 +674,8 @@ public void setSourceResourceFormat(ResourceFormat sourceResourceFormat) { this.sourceResourceFormat = sourceResourceFormat; } - public void setProcessSpecificSheetInXLSFile(int sheetNumber) { - this.processSpecificSheetInXLSFile = sheetNumber; + public void processTableAtIndex(int sheetNumber) { + this.processTableAtIndex = sheetNumber; } private String[] getHeaderFromSchema(Model inputModel, String[] header, boolean hasInputSchema) { diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2TSVConvertor.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2TSVConvertor.java index 410a33ca..82eecfb6 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2TSVConvertor.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTML2TSVConvertor.java @@ -33,9 +33,13 @@ * > cellColSpan = new ArrayList<>(); + public HTML2TSVConvertor(int sheetNumber) { + this.tableNumber = sheetNumber; + } + @Override public StringStreamResource convertToTSV(StreamResource streamResource) { StringBuilder tsvStringBuilder = new StringBuilder(); @@ -131,7 +135,7 @@ public List getMergedRegions(StreamResource streamResource){ } @Override - public int getNumberTables(StreamResource streamResource) { + public int getTablesCount(StreamResource streamResource) { Document doc = Jsoup.parseBodyFragment(new String(streamResource.getContent())); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); Elements tables = doc.getElementsByTag(HTML.TABLE); diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/TSVConvertor.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/TSVConvertor.java index 5c711bdd..a6780834 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/TSVConvertor.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/TSVConvertor.java @@ -12,7 +12,7 @@ public interface TSVConvertor { List getMergedRegions(StreamResource streamResource); - int getNumberTables(StreamResource streamResource); + int getTablesCount(StreamResource streamResource); String getTableName(StreamResource streamResource); } diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLS2TSVConvertor.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLS2TSVConvertor.java index 6fb4f7b1..68e8fb97 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLS2TSVConvertor.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLS2TSVConvertor.java @@ -23,8 +23,6 @@ * Module for converting tabular data from XLS to TSV. Converts specific sheet of the xls file. */ public class XLS2TSVConvertor implements TSVConvertor { - - private static final Logger LOG = LoggerFactory.getLogger(XLS2TSVConvertor.class); private int sheetNumber; private ResourceFormat format; @@ -86,7 +84,7 @@ public List getMergedRegions(StreamResource streamResource){ } @Override - public int getNumberTables(StreamResource streamResource){ + public int getTablesCount(StreamResource streamResource){ try { if(format == ResourceFormat.XLS)return new HSSFWorkbook(new ByteArrayInputStream(streamResource.getContent())).getNumberOfSheets(); else return new XSSFWorkbook(new ByteArrayInputStream(streamResource.getContent())).getNumberOfSheets(); diff --git a/s-pipes-modules/module-tabular/src/test/java/cz/cvut/spipes/modules/TabularModuleTest.java b/s-pipes-modules/module-tabular/src/test/java/cz/cvut/spipes/modules/TabularModuleTest.java index fa809d98..cc59b2b8 100644 --- a/s-pipes-modules/module-tabular/src/test/java/cz/cvut/spipes/modules/TabularModuleTest.java +++ b/s-pipes-modules/module-tabular/src/test/java/cz/cvut/spipes/modules/TabularModuleTest.java @@ -72,7 +72,7 @@ void executeWithSimpleTransformationXls() throws URISyntaxException, IOException getFilePath("examples/countries/input.xls")) ); module.setSourceResourceFormat(ResourceFormat.XLS); - module.setProcessSpecificSheetInXLSFile(1); + module.processTableAtIndex(1); ExecutionContext outputContext = module.executeSelf(); @@ -89,7 +89,7 @@ void executeWithSimpleTransformationXlsm() throws URISyntaxException, IOExceptio getFilePath("examples/countries/input.xlsm")) ); module.setSourceResourceFormat(ResourceFormat.XLSM); - module.setProcessSpecificSheetInXLSFile(1); + module.processTableAtIndex(1); ExecutionContext outputContext = module.executeSelf(); @@ -106,7 +106,7 @@ void executeWithSimpleTransformationMergedXls() throws URISyntaxException, IOExc getFilePath("examples/mergedCells/input.xls")) ); module.setSourceResourceFormat(ResourceFormat.XLS); - module.setProcessSpecificSheetInXLSFile(1); + module.processTableAtIndex(1); ExecutionContext outputContext = module.executeSelf(); @@ -123,7 +123,7 @@ void executeWithSimpleTransformationMergedXlsx() throws URISyntaxException, IOEx getFilePath("examples/mergedCells/input.xlsx")) ); module.setSourceResourceFormat(ResourceFormat.XLSX); - module.setProcessSpecificSheetInXLSFile(1); + module.processTableAtIndex(1); ExecutionContext outputContext = module.executeSelf(); @@ -140,6 +140,7 @@ void executeWithSimpleTransformationMergedHTML() throws URISyntaxException, IOEx getFilePath("examples/mergedCells/input.html")) ); module.setSourceResourceFormat(ResourceFormat.HTML); + module.processTableAtIndex(1); ExecutionContext outputContext = module.executeSelf(); @@ -303,10 +304,10 @@ void executeSelfWithBNodesInSchema() throws IOException, URISyntaxException { @Test void executeSelfWithHTMLFileInput() throws URISyntaxException, IOException { module.setSourceResourceFormat(ResourceFormat.HTML); + module.processTableAtIndex(1); module.setSourceResource( StreamResourceUtils.getStreamResource(DATA_PREFIX, getFilePath("examples/htmlFile/input.html")) ); - ExecutionContext outputContext = module.executeSelf(); Model actualModel = outputContext.getDefaultModel();