Skip to content

Commit

Permalink
[#228] Streamlined processing of xls and csv files using adapter pattern
Browse files Browse the repository at this point in the history
  • Loading branch information
Evgenii Grigorev committed Dec 15, 2024
1 parent d5b03b5 commit 125d65e
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 222 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ ExecutionContext executeSelf() {

StreamResource originalSourceResource = sourceResource;
TSVConvertor tsvConvertor = null;
FileReaderAdapter fileReaderAdapter = new XLSFileReaderAdapter();
switch (sourceResourceFormat) {
case HTML:
if (processTableAtIndex == 0) {
Expand All @@ -216,23 +217,15 @@ ExecutionContext executeSelf() {
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}

return processXLSFile(sourceResource, processTableAtIndex);
// tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
// int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
// table.setLabel(tsvConvertor.getTableName(sourceResource));
// LOG.debug("Number of sheets: {}", numberOfSheets);
// if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
// LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
// numberOfSheets,
// processTableAtIndex
// );
// throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
// }
// setSourceResource(tsvConvertor.convertToTSV(sourceResource));
// setDelimiter('\t');

//break;
fileReaderAdapter = new XLSFileReaderAdapter();
break;
default:
CsvPreference csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();
fileReaderAdapter = new CSVFileReaderAdapter(csvPreference);
break;
}

BNodesTransformer bNodesTransformer = new BNodesTransformer();
Expand All @@ -246,36 +239,29 @@ ExecutionContext executeSelf() {
List<Column> outputColumns = new ArrayList<>();
List<Statement> rowStatements = new ArrayList<>();

CsvPreference csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();

try {
ICsvListReader listReader = getCsvListReader(csvPreference);

if (listReader == null) {
logMissingQuoteError();
return getExecutionContext(inputModel, outputModel);
}
fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
String[] header = fileReaderAdapter.getHeader();
Set<String> columnNames = new HashSet<>();

String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
if (fileReaderAdapter.getLabel() != null)
table.setLabel(fileReaderAdapter.getLabel());

if (header == null) {
LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
return getExecutionContext(inputModel, outputModel);
}
Set<String> columnNames = new HashSet<>();

TableSchema inputTableSchema = getTableSchema(em);
hasInputSchema = hasInputSchema(inputTableSchema);

if (skipHeader) {
header = getHeaderFromSchema(inputModel, header, hasInputSchema);
listReader = new CsvListReader(getReader(), csvPreference);
fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
} else if (hasInputSchema) {
header = getHeaderFromSchema(inputModel, header, true);
}

em.getTransaction().commit();
em.close();
em.getEntityManagerFactory().close();
Expand All @@ -298,10 +284,15 @@ ExecutionContext executeSelf() {
if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName);
}

List<String> row;
int rowNumber = 0;
//for each row
while ((row = listReader.read()) != null) {
List<String> row;
switch (sourceResourceFormat){
case XLS, XLSM, XLSX:
row = fileReaderAdapter.getNextRow(); //skip header for xls files
break;
}
while ((row = fileReaderAdapter.getNextRow()) != null) {
//row = fileReaderAdapter.getNextRow();
rowNumber++;
// 4.6.1 and 4.6.3
Row r = new Row();
Expand Down Expand Up @@ -339,37 +330,34 @@ ExecutionContext executeSelf() {
// 4.6.8.7 - else, if cellValue is not null
}
}
listReader.close();
} catch (IOException | MissingArgumentException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));
tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));

em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);
em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);

if (tsvConvertor != null) {
List<Region> regions = tsvConvertor.getMergedRegions(originalSourceResource);
List<Region> regions = fileReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
int firstCellInRegionNum = cellsNum;
for(int i = region.getFirstRow();i <= region.getLastRow();i++){
for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) {
Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum));
for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
cell.setRow(tableSchema.createAboutUrl(i));
cell.setColumn(outputColumns.get(j).getUri().toString());
if(cellsNum != firstCellInRegionNum)
cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum));
if (cellsNum != firstCellInRegionNum) {
cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
}
em.merge(cell);
cellsNum++;
}
}
}
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

em.getTransaction().commit();
Expand All @@ -380,7 +368,7 @@ ExecutionContext executeSelf() {
outputModel.add(rowStatements);
outputModel.add(bNodesTransformer.transferJOPAEntitiesToBNodes(persistedModel));

return getExecutionContext(inputModel,outputModel);
return getExecutionContext(inputModel, outputModel);
}

private String getValueFromRow(List<String> row, int index, int expectedRowLength, int currentRecordNumber) {
Expand Down Expand Up @@ -714,173 +702,4 @@ private void logMissingQuoteError() throws MissingArgumentException {
throw new MissingArgumentException(message);
} else LOG.error(message);
}

/**
 * Converts one sheet of an XLS/XLSX/XLSM resource into RDF: builds the table schema,
 * emits one row resource per data row, and links cells of merged regions together.
 *
 * @param sourceResource the spreadsheet resource whose raw bytes are parsed by POI
 * @param tableIndex     1-based index of the sheet to process (POI is 0-based, hence
 *                       the {@code tableIndex - 1} below)
 * @return execution context combining the input model and the generated output model
 */
private ExecutionContext processXLSFile(StreamResource sourceResource, int tableIndex) {
    BNodesTransformer bNodesTransformer = new BNodesTransformer();
    Model inputModel = bNodesTransformer.convertBNodesToNonBNodes(executionContext.getDefaultModel());
    boolean hasInputSchema = false;

    Model outputModel = ModelFactory.createDefaultModel();
    EntityManager em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", inputModel);
    em.getTransaction().begin();

    List<Column> outputColumns = new ArrayList<>();
    List<Statement> rowStatements = new ArrayList<>();

    Workbook workbook;
    try {
        // Choose the POI implementation by format: HSSF for legacy binary .xls,
        // XSSF for the OOXML-based formats (.xlsx/.xlsm).
        if (sourceResourceFormat == ResourceFormat.XLS) {
            workbook = new HSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent()));
        } else {
            workbook = new XSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent()));
        }
        // The processed sheet's name becomes the table label.
        table.setLabel(workbook.getSheetAt(tableIndex - 1).getSheetName());

        Sheet sheet = workbook.getSheetAt(tableIndex - 1);
        Iterator<org.apache.poi.ss.usermodel.Row> rowIterator = sheet.iterator();

        // The first sheet row is treated as the header.
        org.apache.poi.ss.usermodel.Row headerRow = rowIterator.next();
        String[] header = StreamSupport.stream(headerRow.spliterator(), false)
                .map(cell -> cell.getStringCellValue())
                .toArray(String[]::new);

        // NOTE(review): toArray(String[]::new) never returns null, so this guard is
        // dead code; a truly empty sheet would instead fail earlier on
        // rowIterator.next(). TODO confirm intended empty-sheet handling.
        if (header == null) {
            LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
            return getExecutionContext(inputModel, outputModel);
        }
        // Tracks normalized column names to detect duplicates.
        Set<String> columnNames = new HashSet<>();

        TableSchema inputTableSchema = getTableSchema(em);
        hasInputSchema = hasInputSchema(inputTableSchema);

        // When the header row should be skipped (or a schema was supplied),
        // column names come from the input schema rather than the sheet.
        if (skipHeader) {
            header = getHeaderFromSchema(inputModel, header, hasInputSchema);
        } else if (hasInputSchema) {
            header = getHeaderFromSchema(inputModel, header, true);
        }
        // Input-model persistence context is finished; a fresh one is opened
        // later for the output model.
        em.getTransaction().commit();
        em.close();
        em.getEntityManagerFactory().close();

        outputColumns = new ArrayList<>(header.length);

        // Build one schema Column per header entry; duplicate normalized names abort.
        for (String columnTitle : header) {
            String columnName = normalize(columnTitle);
            boolean isDuplicate = !columnNames.add(columnName);

            Column schemaColumn = new Column(columnName, columnTitle);
            outputColumns.add(schemaColumn);

            tableSchema.setAboutUrl(schemaColumn, sourceResource.getUri());
            schemaColumn.setProperty(
                dataPrefix,
                sourceResource.getUri(),
                hasInputSchema ? tableSchema.getColumn(columnName) : null);
            schemaColumn.setTitle(columnTitle);
            if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName);
        }

        int rowNumber = 0;
        // Iterate the remaining (data) rows; the header row was consumed above.
        while (rowIterator.hasNext()) {
            org.apache.poi.ss.usermodel.Row currentRow = rowIterator.next();
            rowNumber++;
            // 4.6.1 and 4.6.3
            Row r = new Row();

            if (outputMode == Mode.STANDARD) {
                // 4.6.2
                table.getRows().add(r);
                // 4.6.4
                r.setRownum(rowNumber);
                // 4.6.5
                r.setUrl(sourceResource.getUri() + "#row=" + (rowNumber + 1));
            }

            // 4.6.6 - Add titles.
            // We do not support titles.

            // 4.6.7
            // In standard mode only, emit the triples generated by running
            // the algorithm specified in section 6. JSON-LD to RDF over any
            // non-core annotations specified for the row, with node R as
            // an initial subject, the non-core annotation as property, and the
            // value of the non-core annotation as value.
            DataFormatter formatter = new DataFormatter();
            ArrayList<String> row = new ArrayList<>();

            // Render each cell to text; empty cells are recorded as null.
            for (org.apache.poi.ss.usermodel.Cell cell : currentRow) {
                String cellValue = formatter.formatCellValue(cell);
                if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) //xls uses ',' as Decimal separator
                { cellValue = cellValue.replace(",", "."); } // so we should replace it with '.'
                if (cellValue.isEmpty()) {
                    row.add(null);
                } else {
                    row.add(cellValue);
                }
            }

            for (int i = 0; i < header.length; i++) {
                // 4.6.8.1
                Column column = outputColumns.get(i);
                String cellValue = getValueFromRow(row, i, header.length, rowNumber);
                if (cellValue != null) rowStatements.add(createRowResource(cellValue, rowNumber, column));
                // 4.6.8.2
                r.setDescribes(tableSchema.createAboutUrl(rowNumber));
                //TODO: URITemplate

                // 4.6.8.5 - else, if value is list and cellOrdering == true
                // 4.6.8.6 - else, if value is list
                // 4.6.8.7 - else, if cellValue is not null
            }
        }
        tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
        tableSchema.setColumnsSet(new HashSet<>(outputColumns));

        // Second persistence context: persist the generated schema into the output model.
        em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
        em.getTransaction().begin();
        em.persist(tableGroup);
        em.merge(tableSchema);

        // Collect merged cell regions of the processed sheet.
        sheet = workbook.getSheetAt(tableIndex - 1);
        List<Region> regions = new ArrayList<>();

        for(int i = 0;i < sheet.getNumMergedRegions();i++){
            CellRangeAddress region = sheet.getMergedRegion(i);
            regions.add(new Region(
                region.getFirstRow(),
                region.getFirstColumn(),
                region.getLastRow(),
                region.getLastColumn())
            );
        }

        // Emit one Cell per position in each merged region; every cell except the
        // first links to the region's first cell via sameValueAsCell.
        int cellsNum = 1;
        for (Region region : regions) {
            int firstCellInRegionNum = cellsNum;
            for(int i = region.getFirstRow();i <= region.getLastRow();i++){
                for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) {
                    Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum));
                    // NOTE(review): i is the 0-based POI sheet row (header included),
                    // while row URLs above use the 1-based data rowNumber — TODO
                    // confirm the indices line up.
                    cell.setRow(tableSchema.createAboutUrl(i));
                    cell.setColumn(outputColumns.get(j).getUri().toString());
                    if(cellsNum != firstCellInRegionNum)
                        cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum));
                    em.merge(cell);
                    cellsNum++;
                }
            }
        }
    } catch (IOException e) {
        // Read failures are logged only; processing falls through and commits
        // whatever was persisted so far.
        LOG.error("Error while reading file from resource uri {}", sourceResource, e);
    }

    em.getTransaction().commit();
    Model persistedModel = JopaPersistenceUtils.getDataset(em).getDefaultModel();
    em.getEntityManagerFactory().close();

    tableSchema.addColumnsList(persistedModel, outputColumns);
    outputModel.add(rowStatements);
    outputModel.add(bNodesTransformer.transferJOPAEntitiesToBNodes(persistedModel));

    return getExecutionContext(inputModel,outputModel);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.model.Region;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * {@link FileReaderAdapter} implementation that reads tabular data from a CSV stream
 * using Super CSV's {@link CsvListReader}.
 * <p>
 * CSV resources have no sheets or merged regions, so {@link #getMergedRegions()}
 * always returns an empty list and {@link #getLabel()} returns {@code null}.
 * <p>
 * Not thread-safe: instances hold mutable reader state.
 */
public class CSVFileReaderAdapter implements FileReaderAdapter {
    private ICsvListReader listReader;
    private final CsvPreference csvPreference;

    // Row read ahead by hasNext(); served (and cleared) by the next getNextRow() call.
    // Fixes the original hasNext(), which called listReader.read() and silently
    // discarded one data row per invocation.
    private List<String> bufferedRow;

    public CSVFileReaderAdapter(CsvPreference csvPreference) {
        this.csvPreference = csvPreference;
    }

    /**
     * (Re)initialises the reader over the given stream. The format and table index
     * are ignored — a CSV resource always contains exactly one table.
     * The stream is decoded as UTF-8 so parsing does not depend on the platform
     * default charset.
     */
    @Override
    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
        listReader = new CsvListReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8), csvPreference);
        bufferedRow = null; // drop any row buffered from a previously read stream
    }

    /**
     * Reads and returns the header row, advancing the reader past it.
     *
     * @return the header columns, or {@code null} for an empty input
     */
    @Override
    public String[] getHeader() throws IOException {
        return listReader.getHeader(true);
    }

    /**
     * Returns whether another data row is available. The probed row is buffered,
     * so calling this method does not lose data.
     */
    @Override
    public boolean hasNext() throws IOException {
        if (bufferedRow == null) {
            bufferedRow = listReader.read();
        }
        return bufferedRow != null;
    }

    /**
     * Returns the next data row, or {@code null} when the input is exhausted.
     * Serves the row buffered by a preceding {@link #hasNext()} call first.
     */
    @Override
    public List<String> getNextRow() throws IOException {
        if (bufferedRow != null) {
            List<String> row = bufferedRow;
            bufferedRow = null;
            return row;
        }
        return listReader.read();
    }

    /** CSV has no merged regions; always returns a fresh empty (mutable) list. */
    @Override
    public List<Region> getMergedRegions() {
        return new ArrayList<>();
    }

    /** CSV resources carry no sheet name, hence no label. */
    @Override
    public String getLabel() {
        return null;
    }
}
Loading

0 comments on commit 125d65e

Please sign in to comment.