diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java index f4398cf4..f8fd4c4d 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java @@ -197,6 +197,7 @@ ExecutionContext executeSelf() { StreamResource originalSourceResource = sourceResource; TSVConvertor tsvConvertor = null; + FileReaderAdapter fileReaderAdapter = new XLSFileReaderAdapter(); switch (sourceResourceFormat) { case HTML: if (processTableAtIndex == 0) { @@ -216,23 +217,15 @@ ExecutionContext executeSelf() { if (processTableAtIndex == 0) { throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing."); } - - return processXLSFile(sourceResource, processTableAtIndex); -// tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat); -// int numberOfSheets = tsvConvertor.getTablesCount(sourceResource); -// table.setLabel(tsvConvertor.getTableName(sourceResource)); -// LOG.debug("Number of sheets: {}", numberOfSheets); -// if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) { -// LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}", -// numberOfSheets, -// processTableAtIndex -// ); -// throw new SheetDoesntExistsException("Requested sheet doesn't exists."); -// } -// setSourceResource(tsvConvertor.convertToTSV(sourceResource)); -// setDelimiter('\t'); - - //break; + fileReaderAdapter = new XLSFileReaderAdapter(); + break; + default: + CsvPreference csvPreference = new CsvPreference.Builder( + quoteCharacter, + delimiter, + System.lineSeparator()).build(); + fileReaderAdapter = new CSVFileReaderAdapter(csvPreference); + break; } BNodesTransformer bNodesTransformer = new 
BNodesTransformer(); @@ -246,36 +239,29 @@ ExecutionContext executeSelf() { List outputColumns = new ArrayList<>(); List rowStatements = new ArrayList<>(); - CsvPreference csvPreference = new CsvPreference.Builder( - quoteCharacter, - delimiter, - System.lineSeparator()).build(); - try { - ICsvListReader listReader = getCsvListReader(csvPreference); - - if (listReader == null) { - logMissingQuoteError(); - return getExecutionContext(inputModel, outputModel); - } + fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex); + String[] header = fileReaderAdapter.getHeader(); + Set columnNames = new HashSet<>(); - String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader) + if (fileReaderAdapter.getLabel() != null) + table.setLabel(fileReaderAdapter.getLabel()); if (header == null) { LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri()); return getExecutionContext(inputModel, outputModel); } - Set columnNames = new HashSet<>(); TableSchema inputTableSchema = getTableSchema(em); hasInputSchema = hasInputSchema(inputTableSchema); if (skipHeader) { header = getHeaderFromSchema(inputModel, header, hasInputSchema); - listReader = new CsvListReader(getReader(), csvPreference); + fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex); } else if (hasInputSchema) { header = getHeaderFromSchema(inputModel, header, true); } + em.getTransaction().commit(); em.close(); em.getEntityManagerFactory().close(); @@ -298,10 +284,15 @@ ExecutionContext executeSelf() { if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName); } - List row; int rowNumber = 0; - //for each row - while ((row = listReader.read()) != null) { + List row; + switch (sourceResourceFormat){ + case XLS, XLSM, XLSX: + row = fileReaderAdapter.getNextRow(); //skip header 
for xls files + break; + } + while ((row = fileReaderAdapter.getNextRow()) != null) { + //row = fileReaderAdapter.getNextRow(); rowNumber++; // 4.6.1 and 4.6.3 Row r = new Row(); @@ -339,37 +330,34 @@ ExecutionContext executeSelf() { // 4.6.8.7 - else, if cellValue is not null } } - listReader.close(); - } catch (IOException | MissingArgumentException e) { - LOG.error("Error while reading file from resource uri {}", sourceResource, e); - } - - tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri()); - tableSchema.setColumnsSet(new HashSet<>(outputColumns)); + tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri()); + tableSchema.setColumnsSet(new HashSet<>(outputColumns)); - em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel); - em.getTransaction().begin(); - em.persist(tableGroup); - em.merge(tableSchema); + em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel); + em.getTransaction().begin(); + em.persist(tableGroup); + em.merge(tableSchema); - if (tsvConvertor != null) { - List regions = tsvConvertor.getMergedRegions(originalSourceResource); + List regions = fileReaderAdapter.getMergedRegions(); int cellsNum = 1; for (Region region : regions) { int firstCellInRegionNum = cellsNum; - for(int i = region.getFirstRow();i <= region.getLastRow();i++){ - for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) { - Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum)); + for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) { + for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) { + Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum); cell.setRow(tableSchema.createAboutUrl(i)); cell.setColumn(outputColumns.get(j).getUri().toString()); - if(cellsNum != firstCellInRegionNum) - cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum)); + if (cellsNum != 
firstCellInRegionNum) { + cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum); + } em.merge(cell); cellsNum++; } } } + } catch (IOException e) { + LOG.error("Error while reading file from resource uri {}", sourceResource, e); } em.getTransaction().commit(); @@ -380,7 +368,7 @@ ExecutionContext executeSelf() { outputModel.add(rowStatements); outputModel.add(bNodesTransformer.transferJOPAEntitiesToBNodes(persistedModel)); - return getExecutionContext(inputModel,outputModel); + return getExecutionContext(inputModel, outputModel); } private String getValueFromRow(List row, int index, int expectedRowLength, int currentRecordNumber) { @@ -714,173 +702,4 @@ private void logMissingQuoteError() throws MissingArgumentException { throw new MissingArgumentException(message); } else LOG.error(message); } - - private ExecutionContext processXLSFile(StreamResource sourceResource, int tableIndex) { - BNodesTransformer bNodesTransformer = new BNodesTransformer(); - Model inputModel = bNodesTransformer.convertBNodesToNonBNodes(executionContext.getDefaultModel()); - boolean hasInputSchema = false; - - Model outputModel = ModelFactory.createDefaultModel(); - EntityManager em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", inputModel); - em.getTransaction().begin(); - - List outputColumns = new ArrayList<>(); - List rowStatements = new ArrayList<>(); - - Workbook workbook; - try { - if (sourceResourceFormat == ResourceFormat.XLS) { - workbook = new HSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent())); - } else { - workbook = new XSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent())); - } - table.setLabel(workbook.getSheetAt(tableIndex - 1).getSheetName()); - - Sheet sheet = workbook.getSheetAt(tableIndex - 1); - Iterator rowIterator = sheet.iterator(); - - org.apache.poi.ss.usermodel.Row headerRow = rowIterator.next(); - String[] header = StreamSupport.stream(headerRow.spliterator(), false) - .map(cell -> 
cell.getStringCellValue()) - .toArray(String[]::new); - - if (header == null) { - LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri()); - return getExecutionContext(inputModel, outputModel); - } - Set columnNames = new HashSet<>(); - - TableSchema inputTableSchema = getTableSchema(em); - hasInputSchema = hasInputSchema(inputTableSchema); - - if (skipHeader) { - header = getHeaderFromSchema(inputModel, header, hasInputSchema); - } else if (hasInputSchema) { - header = getHeaderFromSchema(inputModel, header, true); - } - em.getTransaction().commit(); - em.close(); - em.getEntityManagerFactory().close(); - - outputColumns = new ArrayList<>(header.length); - - for (String columnTitle : header) { - String columnName = normalize(columnTitle); - boolean isDuplicate = !columnNames.add(columnName); - - Column schemaColumn = new Column(columnName, columnTitle); - outputColumns.add(schemaColumn); - - tableSchema.setAboutUrl(schemaColumn, sourceResource.getUri()); - schemaColumn.setProperty( - dataPrefix, - sourceResource.getUri(), - hasInputSchema ? tableSchema.getColumn(columnName) : null); - schemaColumn.setTitle(columnTitle); - if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName); - } - - int rowNumber = 0; - while (rowIterator.hasNext()) { - org.apache.poi.ss.usermodel.Row currentRow = rowIterator.next(); - rowNumber++; - // 4.6.1 and 4.6.3 - Row r = new Row(); - - if (outputMode == Mode.STANDARD) { - // 4.6.2 - table.getRows().add(r); - // 4.6.4 - r.setRownum(rowNumber); - // 4.6.5 - r.setUrl(sourceResource.getUri() + "#row=" + (rowNumber + 1)); - } - - // 4.6.6 - Add titles. - // We do not support titles. - - // 4.6.7 - // In standard mode only, emit the triples generated by running - // the algorithm specified in section 6. 
JSON-LD to RDF over any - // non-core annotations specified for the row, with node R as - // an initial subject, the non-core annotation as property, and the - // value of the non-core annotation as value. - DataFormatter formatter = new DataFormatter(); - ArrayList row = new ArrayList<>(); - - for (org.apache.poi.ss.usermodel.Cell cell : currentRow) { - String cellValue = formatter.formatCellValue(cell); - if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) //xls uses ',' as Decimal separator - { cellValue = cellValue.replace(",", "."); } // so we should replace it with '.' - if (cellValue.isEmpty()) { - row.add(null); - } else { - row.add(cellValue); - } - } - - for (int i = 0; i < header.length; i++) { - // 4.6.8.1 - Column column = outputColumns.get(i); - String cellValue = getValueFromRow(row, i, header.length, rowNumber); - if (cellValue != null) rowStatements.add(createRowResource(cellValue, rowNumber, column)); - // 4.6.8.2 - r.setDescribes(tableSchema.createAboutUrl(rowNumber)); - //TODO: URITemplate - - // 4.6.8.5 - else, if value is list and cellOrdering == true - // 4.6.8.6 - else, if value is list - // 4.6.8.7 - else, if cellValue is not null - } - } - tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri()); - tableSchema.setColumnsSet(new HashSet<>(outputColumns)); - - em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel); - em.getTransaction().begin(); - em.persist(tableGroup); - em.merge(tableSchema); - - sheet = workbook.getSheetAt(tableIndex - 1); - List regions = new ArrayList<>(); - - for(int i = 0;i < sheet.getNumMergedRegions();i++){ - CellRangeAddress region = sheet.getMergedRegion(i); - regions.add(new Region( - region.getFirstRow(), - region.getFirstColumn(), - region.getLastRow(), - region.getLastColumn()) - ); - } - - int cellsNum = 1; - for (Region region : regions) { - int firstCellInRegionNum = cellsNum; - for(int i = region.getFirstRow();i <= 
region.getLastRow();i++){ - for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) { - Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum)); - cell.setRow(tableSchema.createAboutUrl(i)); - cell.setColumn(outputColumns.get(j).getUri().toString()); - if(cellsNum != firstCellInRegionNum) - cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum)); - em.merge(cell); - cellsNum++; - } - } - } - } catch (IOException e) { - LOG.error("Error while reading file from resource uri {}", sourceResource, e); - } - - em.getTransaction().commit(); - Model persistedModel = JopaPersistenceUtils.getDataset(em).getDefaultModel(); - em.getEntityManagerFactory().close(); - - tableSchema.addColumnsList(persistedModel, outputColumns); - outputModel.add(rowStatements); - outputModel.add(bNodesTransformer.transferJOPAEntitiesToBNodes(persistedModel)); - - return getExecutionContext(inputModel,outputModel); - } }
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java
new file mode 100644
index 00000000..b70a401e
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java
@@ -0,0 +1,81 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.model.Region;
+import org.supercsv.io.CsvListReader;
+import org.supercsv.io.ICsvListReader;
+import org.supercsv.prefs.CsvPreference;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * {@link FileReaderAdapter} for delimiter-separated text, read through a Super CSV
+ * {@link CsvListReader} configured with the supplied {@link CsvPreference}.
+ */
+public class CSVFileReaderAdapter implements FileReaderAdapter {
+    private final CsvPreference csvPreference;
+    private ICsvListReader listReader;
+    /** Row read ahead by {@link #hasNext()} and not yet returned by {@link #getNextRow()}. */
+    private List<String> lookahead;
+
+    public CSVFileReaderAdapter(CsvPreference csvPreference) {
+        this.csvPreference = csvPreference;
+    }
+
+    /**
+     * Opens a reader over {@code inputStream} and drops any row buffered from an earlier
+     * stream. {@code sourceResourceFormat} and {@code tableIndex} are not used here.
+     */
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
+        // NOTE(review): no charset is passed, so the platform default decodes the bytes - confirm.
+        listReader = new CsvListReader(new InputStreamReader(inputStream), csvPreference);
+        lookahead = null;
+    }
+
+    /** Reads the first line of the input as the header. */
+    @Override
+    public String[] getHeader() throws IOException {
+        return listReader.getHeader(true);
+    }
+
+    /**
+     * Reports whether another row exists. The row has to be read to find out, so it is
+     * kept and handed out by the next {@link #getNextRow()} instead of being lost.
+     */
+    @Override
+    public boolean hasNext() throws IOException {
+        if (lookahead == null) {
+            lookahead = listReader.read();
+        }
+        return lookahead != null;
+    }
+
+    /** Returns the next row, or {@code null} when the reader has no more rows. */
+    @Override
+    public List<String> getNextRow() throws IOException {
+        if (lookahead == null) {
+            return listReader.read();
+        }
+        List<String> row = lookahead;
+        lookahead = null;
+        return row;
+    }
+
+    /** Always empty: this adapter reports no merged regions. */
+    @Override
+    public List<Region> getMergedRegions() {
+        return new ArrayList<>();
+    }
+
+    /** Always {@code null}: this adapter provides no table label. */
+    @Override
+    public String getLabel() {
+        return null;
+    }
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java
new file mode 100644
index 00000000..ed7a0469
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java
@@ -0,0 +1,31 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.model.Region;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+/**
+ * Uniform row-by-row access to a tabular source (CSV-like text or a spreadsheet sheet).
+ */
+public interface FileReaderAdapter {
+    /** Binds the adapter to {@code inputStream}; must be called before any other method. */
+    void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException;
+
+    /** Returns the column titles taken from the first row of the source. */
+    String[] getHeader() throws IOException;
+
+    /** Returns {@code true} while {@link #getNextRow()} has another row to return. */
+    boolean hasNext() throws IOException;
+
+    /** Returns the next row as one string per cell, or {@code null} when no rows remain. */
+    List<String> getNextRow() throws IOException;
+
+    /** Returns the merged cell regions of the source; empty when there are none. */
+    List<Region> getMergedRegions();
+
+    /** Returns a label for the table being read, or {@code null} when the source has none. */
+    String getLabel() throws IOException;
+}
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java
new file mode 100644
index 00000000..f55dac81
--- /dev/null
+++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java
@@ -0,0 +1,120 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.model.Region;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.util.CellRangeAddress;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+/**
+ * {@link FileReaderAdapter} for Excel workbooks, reading one sheet through Apache POI.
+ */
+public class XLSFileReaderAdapter implements FileReaderAdapter {
+    /** Optional sign, digits, and an optional single {@code ','} before the last digit group. */
+    private static final Pattern COMMA_DECIMAL = Pattern.compile("[-+]?[0-9]*\\,?[0-9]+");
+
+    private final DataFormatter formatter = new DataFormatter();
+    private Sheet sheet;
+    private Iterator<Row> rowIterator;
+
+    /**
+     * Loads the workbook from {@code inputStream} and selects the sheet at the 1-based
+     * {@code tableIndex}. {@link ResourceFormat#XLS} is opened with {@link HSSFWorkbook},
+     * every other format with {@link XSSFWorkbook}.
+     */
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
+        Workbook workbook;
+        if (sourceResourceFormat == ResourceFormat.XLS) {
+            workbook = new HSSFWorkbook(inputStream);
+        } else {
+            workbook = new XSSFWorkbook(inputStream);
+        }
+        sheet = workbook.getSheetAt(tableIndex - 1);
+        rowIterator = sheet.iterator();
+    }
+
+    /**
+     * Returns the string values of the cells in row 0 without moving the row cursor, or
+     * {@code null} when the sheet has no row 0.
+     */
+    @Override
+    public String[] getHeader() throws IOException {
+        Row headerRow = sheet.getRow(0);
+        if (headerRow == null) {
+            return null;
+        }
+        return StreamSupport.stream(headerRow.spliterator(), false)
+                .map(Cell::getStringCellValue)
+                .toArray(String[]::new);
+    }
+
+    @Override
+    public boolean hasNext() {
+        return rowIterator.hasNext();
+    }
+
+    /**
+     * Returns the next row with one entry per column index up to the row's last cell, so a
+     * missing cell becomes {@code null} in place rather than shifting later values left.
+     * Returns {@code null} when no rows remain.
+     */
+    @Override
+    public List<String> getNextRow() {
+        if (!rowIterator.hasNext()) {
+            return null;
+        }
+        Row currentRow = rowIterator.next();
+        // getLastCellNum() is -1 for a row without cells.
+        int cellCount = Math.max(currentRow.getLastCellNum(), 0);
+        List<String> row = new ArrayList<>(cellCount);
+        for (int column = 0; column < cellCount; column++) {
+            row.add(toCellText(formatter.formatCellValue(currentRow.getCell(column))));
+        }
+        return row;
+    }
+
+    /** Maps an empty value to {@code null} and turns a decimal comma into a decimal point. */
+    private static String toCellText(String formatted) {
+        if (formatted.isEmpty()) {
+            return null;
+        }
+        return COMMA_DECIMAL.matcher(formatted).matches() ? formatted.replace(",", ".") : formatted;
+    }
+
+    @Override
+    public List<Region> getMergedRegions() {
+        List<Region> regions = new ArrayList<>();
+        for (int i = 0; i < sheet.getNumMergedRegions(); i++) {
+            CellRangeAddress region = sheet.getMergedRegion(i);
+            regions.add(new Region(
+                    region.getFirstRow(),
+                    region.getFirstColumn(),
+                    region.getLastRow(),
+                    region.getLastColumn()
+            ));
+        }
+        return regions;
+    }
+
+    /** Returns the name of the selected sheet. */
+    @Override
+    public String getLabel() {
+        return sheet.getSheetName();
+    }
+}