Skip to content

Commit

Permalink
[#228] Refactored the code, implemented suggested changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Evgenii Grigorev committed Dec 20, 2024
1 parent 339d972 commit 1f22c86
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 108 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import cz.cvut.spipes.exception.ResourceNotUniqueException;
import cz.cvut.spipes.exception.SPipesException;
import cz.cvut.spipes.modules.annotations.SPipesModule;
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
import cz.cvut.spipes.modules.model.*;
Expand Down Expand Up @@ -189,7 +188,7 @@ ExecutionContext executeSelf() {

StreamResource originalSourceResource = sourceResource;
TSVConvertor tsvConvertor = null;
FileReaderAdapter fileReaderAdapter = new XLSFileReaderAdapter();
StreamReaderAdapter streamReaderAdapter = new XLSStreamReaderAdapter();
CsvPreference csvPreference = null;

switch (sourceResourceFormat) {
Expand All @@ -200,22 +199,22 @@ ExecutionContext executeSelf() {
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
}
fileReaderAdapter = new HTMLFileReaderAdapter();
streamReaderAdapter = new HTMLStreamReaderAdapter();
break;
case XLS:
case XLSM:
case XLSX:
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}
fileReaderAdapter = new XLSFileReaderAdapter();
streamReaderAdapter = new XLSStreamReaderAdapter();
break;
default:
csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();
fileReaderAdapter = new CSVFileReaderAdapter(csvPreference);
streamReaderAdapter = new CSVStreamReaderAdapter(csvPreference);
break;
}

Expand All @@ -231,12 +230,13 @@ ExecutionContext executeSelf() {
List<Statement> rowStatements = new ArrayList<>();

try {
fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
String[] header = fileReaderAdapter.getHeader();
streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()),
sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource);
String[] header = streamReaderAdapter.getHeader(skipHeader);
Set<String> columnNames = new HashSet<>();

if (fileReaderAdapter.getLabel() != null)
table.setLabel(fileReaderAdapter.getLabel());
if (streamReaderAdapter.getSheetLabel() != null)
table.setLabel(streamReaderAdapter.getSheetLabel());

if (header == null) {
LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
Expand All @@ -248,7 +248,7 @@ ExecutionContext executeSelf() {

if (skipHeader) {
header = getHeaderFromSchema(inputModel, header, hasInputSchema);
fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
//streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
} else if (hasInputSchema) {
header = getHeaderFromSchema(inputModel, header, true);
}
Expand Down Expand Up @@ -276,12 +276,7 @@ ExecutionContext executeSelf() {

int rowNumber = 0;
List<String> row;
switch (sourceResourceFormat){
case XLS, XLSM, XLSX:
row = fileReaderAdapter.getNextRow(); //skip header for xls files
break;
}
while ((row = fileReaderAdapter.getNextRow()) != null) {
while ((row = streamReaderAdapter.getNextRow()) != null) {
rowNumber++;
// 4.6.1 and 4.6.3
Row r = new Row();
Expand Down Expand Up @@ -327,7 +322,7 @@ ExecutionContext executeSelf() {
em.persist(tableGroup);
em.merge(tableSchema);

List<Region> regions = fileReaderAdapter.getMergedRegions();
List<Region> regions = streamReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
Expand All @@ -345,6 +340,7 @@ ExecutionContext executeSelf() {
}
}
}
streamReaderAdapter.close();
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}
Expand Down Expand Up @@ -386,16 +382,6 @@ private String getValueFromRow(List<String> row, int index, int expectedRowLengt
}
}

private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
if (acceptInvalidQuoting) {
if (getQuote() == '\0') {
return null;
} else
return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
}
return new CsvListReader(getReader(), csvPreference);
}

private Statement createRowResource(String cellValue, int rowNumber, Column column) {
Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));

Expand Down Expand Up @@ -574,10 +560,6 @@ private String normalize(String label) {
return label.trim().replaceAll("[^\\w]", "_");
}

private Reader getReader() {
return new StringReader(new String(sourceResource.getContent(), inputCharset));
}

@NotNull
private StreamResource getResourceByUri(@NotNull String resourceUri) {

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.InvalidQuotingTokenizer;
import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.io.StringReader;

/**
 * {@link StreamReaderAdapter} that reads delimiter-separated text through a Super CSV
 * {@link ICsvListReader}.
 *
 * <p>The content is taken from the {@link StreamResource} passed to
 * {@link #initialise}, decoded with the supplied charset; the {@code InputStream}
 * argument of {@code initialise} is not used by this adapter.
 *
 * <p>When {@link #getHeader(Boolean)} is called with {@code skipHeader == true}, the first
 * line of the input is treated as data as well: it is returned as the header and then
 * replayed as the first row by {@link #getNextRow()}.
 */
public class CSVStreamReaderAdapter implements StreamReaderAdapter {
    private ICsvListReader listReader;
    private final CsvPreference csvPreference;
    String[] header = null;
    String[] firstRow = null;
    boolean acceptInvalidQuoting;
    Charset inputCharset;
    StreamResource sourceResource;

    /** Row fetched ahead of time by {@link #hasNextRow()}; only meaningful while {@code rowPeeked} is true. */
    private List<String> peekedRow;
    /** True when {@link #hasNextRow()} has already pulled the next row from the reader. */
    private boolean rowPeeked;

    /**
     * @param csvPreference quote character, delimiter and line terminator used for parsing
     */
    public CSVStreamReaderAdapter(CsvPreference csvPreference) {
        this.csvPreference = csvPreference;
    }

    /**
     * Prepares the adapter for reading {@code sourceResource}.
     *
     * @param inputStream          not used by this adapter
     * @param sourceResourceFormat not used by this adapter
     * @param tableIndex           not used by this adapter
     * @param acceptInvalidQuoting whether to parse with {@link InvalidQuotingTokenizer}
     * @param inputCharset         charset used to decode the resource content
     * @param sourceResource       resource whose content is parsed
     */
    @Override
    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
        this.acceptInvalidQuoting = acceptInvalidQuoting;
        this.inputCharset = inputCharset;
        this.sourceResource = sourceResource;
        // Drop any state left over from a previous initialisation so rows are not replayed twice.
        this.header = null;
        this.firstRow = null;
        this.peekedRow = null;
        this.rowPeeked = false;
        listReader = getCsvListReader(csvPreference);
    }

    /**
     * Reads the first line of the input.
     *
     * @param skipHeader when {@code true}, the first line is additionally kept so that
     *                   {@link #getNextRow()} returns it as the first data row;
     *                   {@code null} is treated as {@code false}
     * @return the cells of the first line, or {@code null} if the reader returns none
     * @throws IllegalStateException if no reader is available (see {@link #requireReader()})
     */
    @Override
    public String[] getHeader(Boolean skipHeader) throws IOException {
        header = requireReader().getHeader(true);
        // Boolean.TRUE.equals avoids an unboxing NullPointerException on a null argument.
        if (Boolean.TRUE.equals(skipHeader)) {
            firstRow = header;
        }
        return header;
    }

    /**
     * Tells whether {@link #getNextRow()} will return a row, without losing that row.
     *
     * <p>The next row is read ahead at most once and buffered until {@link #getNextRow()}
     * hands it out.
     */
    @Override
    public boolean hasNextRow() throws IOException {
        if (firstRow != null) {
            return true;
        }
        if (!rowPeeked) {
            peekedRow = requireReader().read();
            rowPeeked = true;
        }
        return peekedRow != null;
    }

    /**
     * @return the next row, or {@code null} when the input is exhausted
     */
    @Override
    public List<String> getNextRow() throws IOException {
        if (firstRow != null) {
            List<String> row = Arrays.asList(firstRow);
            firstRow = null;
            return row;
        }
        if (rowPeeked) {
            // Hand out the row that hasNextRow() already pulled from the reader.
            List<String> row = peekedRow;
            peekedRow = null;
            rowPeeked = false;
            return row;
        }
        return requireReader().read();
    }

    /** CSV input has no merged cells, so the result is always an empty list. */
    @Override
    public List<Region> getMergedRegions() {
        return new ArrayList<>();
    }

    /** CSV input has no sheet label, so the result is always {@code null}. */
    @Override
    public String getSheetLabel() {
        return null;
    }

    /** Closes the underlying reader, if one was created. */
    @Override
    public void close() throws IOException {
        if (listReader != null) {
            listReader.close();
        }
    }

    /**
     * Returns the active reader or fails with a descriptive message instead of a bare
     * {@link NullPointerException}.
     *
     * @throws IllegalStateException if {@link #initialise} has not been called, or it
     *                               produced no reader (invalid quoting accepted while the
     *                               quote character is {@code '\0'})
     */
    private ICsvListReader requireReader() {
        if (listReader == null) {
            throw new IllegalStateException(
                "CSV reader is not available: initialise() was not called, or invalid quoting is accepted "
                    + "while the quote character is '\\0'.");
        }
        return listReader;
    }

    /**
     * Builds the Super CSV reader over the decoded resource content.
     *
     * @return a reader using {@link InvalidQuotingTokenizer} when invalid quoting is accepted,
     *         a plain {@link CsvListReader} otherwise, or {@code null} when invalid quoting is
     *         accepted but the quote character is {@code '\0'}
     */
    private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
        if (!acceptInvalidQuoting) {
            return new CsvListReader(getReader(), csvPreference);
        }
        if (getQuote() == '\0') {
            return null;
        }
        return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
    }

    /** Decodes the whole resource content with {@code inputCharset} and wraps it in a reader. */
    private Reader getReader() {
        return new StringReader(new String(sourceResource.getContent(), inputCharset));
    }

    /** @return the quote character of the configured {@link CsvPreference} */
    public char getQuote() {
        return csvPreference.getQuoteChar();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,17 @@

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;

public class HTMLFileReaderAdapter implements FileReaderAdapter {
public class HTMLStreamReaderAdapter implements StreamReaderAdapter {
private Elements rows;
private int currentIndex;
private Element table;
Expand All @@ -20,7 +22,8 @@ public class HTMLFileReaderAdapter implements FileReaderAdapter {
private Map<Integer, Map<Integer, String>> mergedCells;

@Override
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat,
int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
Document doc = Jsoup.parse(inputStream, "UTF-8", "");
Element table = doc.select("table").first();
rows = table.select("tr");
Expand All @@ -33,21 +36,21 @@ public void initialise(InputStream inputStream, ResourceFormat sourceResourceFor


@Override
public String[] getHeader() throws IOException {
public String[] getHeader(Boolean skipHeader) throws IOException {
Elements headerCells = rows.get(0).select("th, td");
return headerCells.stream()
.map(Element::text)
.toArray(String[]::new);
}

@Override
public boolean hasNext() {
public boolean hasNextRow() {
return currentIndex < rows.size() - 1; // Skip header row
}

@Override
public List<String> getNextRow() {
if (!hasNext()) {
if (!hasNextRow()) {
return null;
}

Expand Down Expand Up @@ -118,7 +121,11 @@ private List<Region> extractMergedRegions(Element table) {
}

@Override
public String getLabel(){
public String getSheetLabel(){
return label;
}

@Override
public void close() {
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,22 @@
import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.TabularModule;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;

public interface FileReaderAdapter {
public interface StreamReaderAdapter {
static final Logger LOG = LoggerFactory.getLogger(TabularModule.class);
void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException;
String[] getHeader() throws IOException;
boolean hasNext() throws IOException;
void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException;
String[] getHeader(Boolean skipHeader) throws IOException;
boolean hasNextRow() throws IOException;
List<String> getNextRow() throws IOException;
List<Region> getMergedRegions();
String getLabel() throws IOException;
String getSheetLabel() throws IOException;
void close() throws IOException;
}
Loading

0 comments on commit 1f22c86

Please sign in to comment.