Skip to content

Commit

Permalink
[Upd] Rename methods in convertor, make process-table-at-index parame…
Browse files Browse the repository at this point in the history
…ter required to be equal to 1 for processing HTML files.
  • Loading branch information
rodionnv committed Nov 2, 2023
1 parent 126fb98 commit 077d446
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 26 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
* and then processed as usual.
* Take a look at the option {@link TabularModule#sourceResourceFormat} and class {@link HTML2TSVConvertor} for more details.
* Also, in a similar way this module can process XLS tables. Note that processing multiple sheets isn't supported,
* so {@link TabularModule#processSpecificSheetInXLSFile} parameter is required (range 1...number of sheets).
* so {@link TabularModule#processTableAtIndex} parameter is required (range 1...number of sheets).
* <p>
* <b>Important notes (differences from the recommendation):</b><br/>
* Does not support custom table group URIs.<br/>
Expand All @@ -109,7 +109,7 @@ public class TabularModule extends AbstractModule {
private final Property P_SOURCE_RESOURCE_URI = getSpecificParameter("source-resource-uri");
private final Property P_SKIP_HEADER = getSpecificParameter("skip-header");
private final Property P_SOURCE_RESOURCE_FORMAT = getSpecificParameter("source-resource-format");
private final Property P_PROCESS_SPECIFIC_SHEET_IN_XLS_FILE = getSpecificParameter("process-specific-sheet-in-xls-file");
private final Property P_PROCESS_TABLE_AT_INDEX = getSpecificParameter("process-table-at-index");

//sml:replace
@Parameter(urlPrefix = SML.uri, name = "replace", comment = "Specifies whether a module should overwrite triples" +
Expand Down Expand Up @@ -138,11 +138,11 @@ public class TabularModule extends AbstractModule {
@Parameter(urlPrefix = PARAM_URL_PREFIX, name = "skip-header", comment = "Skip header. Default is false.")
private boolean skipHeader;

//:process-specific-sheet-in-xls-file
//:process-table-at-index
/**
* Required parameter that indicates that only specific single sheet should be converted
* Required parameter for HTML and EXCEL files that indicates that only a specific single table should be processed
*/
private int processSpecificSheetInXLSFile;
private int processTableAtIndex;

//:output-mode
// TODO - revise comment
Expand Down Expand Up @@ -198,25 +198,31 @@ ExecutionContext executeSelf() {

switch (sourceResourceFormat) {
case HTML:
tsvConvertor = new HTML2TSVConvertor();
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to HTML file but no specific table is set for processing.");
}
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Not implemented yet.");
}
tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
table.setLabel(tsvConvertor.getTableName(sourceResource));
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
break;
case XLS:
case XLSM:
case XLSX:
if (processSpecificSheetInXLSFile == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS file but no specific sheet is set for processing.");
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}
tsvConvertor = new XLS2TSVConvertor(processSpecificSheetInXLSFile, sourceResourceFormat);
int numberOfSheets = tsvConvertor.getNumberTables(sourceResource);
tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
table.setLabel(tsvConvertor.getTableName(sourceResource));
LOG.debug("Number of sheets: {}", numberOfSheets);
if ((processSpecificSheetInXLSFile > numberOfSheets) || (processSpecificSheetInXLSFile < 1)) {
if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
numberOfSheets,
processSpecificSheetInXLSFile
processTableAtIndex
);
throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
}
Expand Down Expand Up @@ -478,7 +484,7 @@ public void loadConfiguration() {
delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier(sourceResourceFormat));
isReplace = getPropertyValue(SML.replace, false);
skipHeader = getPropertyValue(P_SKIP_HEADER, false);
processSpecificSheetInXLSFile = getPropertyValue(P_PROCESS_SPECIFIC_SHEET_IN_XLS_FILE, 0);
processTableAtIndex = getPropertyValue(P_PROCESS_TABLE_AT_INDEX, 0);
acceptInvalidQuoting = getPropertyValue(P_ACCEPT_INVALID_QUOTING, false);
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(sourceResourceFormat));
dataPrefix = getEffectiveValue(P_DATE_PREFIX).asLiteral().toString();
Expand Down Expand Up @@ -668,8 +674,8 @@ public void setSourceResourceFormat(ResourceFormat sourceResourceFormat) {
this.sourceResourceFormat = sourceResourceFormat;
}

public void setProcessSpecificSheetInXLSFile(int sheetNumber) {
this.processSpecificSheetInXLSFile = sheetNumber;
public void processTableAtIndex(int sheetNumber) {
this.processTableAtIndex = sheetNumber;
}

private String[] getHeaderFromSchema(Model inputModel, String[] header, boolean hasInputSchema) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,13 @@
* </table>
*/
public class HTML2TSVConvertor implements TSVConvertor {

private int tableNumber;
private final List<Pair<Integer, Integer> > cellColSpan = new ArrayList<>();

/**
 * Creates a convertor bound to a single table of the HTML source.
 *
 * NOTE(review): how {@code tableNumber} is consumed by the conversion is not visible
 * in this view; the visible caller in TabularModule only passes the value 1 and
 * rejects any other value before constructing this class - confirm before relying
 * on other indexes.
 *
 * @param sheetNumber number of the table to process; stored in {@code tableNumber}
 */
public HTML2TSVConvertor(int sheetNumber) {
this.tableNumber = sheetNumber;
}

@Override
public StringStreamResource convertToTSV(StreamResource streamResource) {
StringBuilder tsvStringBuilder = new StringBuilder();
Expand Down Expand Up @@ -131,7 +135,7 @@ public List<Region> getMergedRegions(StreamResource streamResource){
}

@Override
public int getNumberTables(StreamResource streamResource) {
public int getTablesCount(StreamResource streamResource) {
Document doc = Jsoup.parseBodyFragment(new String(streamResource.getContent()));
doc.outputSettings(new Document.OutputSettings().prettyPrint(false));
Elements tables = doc.getElementsByTag(HTML.TABLE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public interface TSVConvertor {

List<Region> getMergedRegions(StreamResource streamResource);

int getNumberTables(StreamResource streamResource);
int getTablesCount(StreamResource streamResource);

String getTableName(StreamResource streamResource);
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
* Module for converting tabular data from XLS to TSV. Converts a specific sheet of the XLS file.
*/
public class XLS2TSVConvertor implements TSVConvertor {

private static final Logger LOG = LoggerFactory.getLogger(XLS2TSVConvertor.class);
private int sheetNumber;
private ResourceFormat format;

Expand Down Expand Up @@ -86,7 +84,7 @@ public List<Region> getMergedRegions(StreamResource streamResource){
}

@Override
public int getNumberTables(StreamResource streamResource){
public int getTablesCount(StreamResource streamResource){
try {
if(format == ResourceFormat.XLS)return new HSSFWorkbook(new ByteArrayInputStream(streamResource.getContent())).getNumberOfSheets();
else return new XSSFWorkbook(new ByteArrayInputStream(streamResource.getContent())).getNumberOfSheets();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void executeWithSimpleTransformationXls() throws URISyntaxException, IOException
getFilePath("examples/countries/input.xls"))
);
module.setSourceResourceFormat(ResourceFormat.XLS);
module.setProcessSpecificSheetInXLSFile(1);
module.processTableAtIndex(1);

ExecutionContext outputContext = module.executeSelf();

Expand All @@ -89,7 +89,7 @@ void executeWithSimpleTransformationXlsm() throws URISyntaxException, IOExceptio
getFilePath("examples/countries/input.xlsm"))
);
module.setSourceResourceFormat(ResourceFormat.XLSM);
module.setProcessSpecificSheetInXLSFile(1);
module.processTableAtIndex(1);

ExecutionContext outputContext = module.executeSelf();

Expand All @@ -106,7 +106,7 @@ void executeWithSimpleTransformationMergedXls() throws URISyntaxException, IOExc
getFilePath("examples/mergedCells/input.xls"))
);
module.setSourceResourceFormat(ResourceFormat.XLS);
module.setProcessSpecificSheetInXLSFile(1);
module.processTableAtIndex(1);

ExecutionContext outputContext = module.executeSelf();

Expand All @@ -123,7 +123,7 @@ void executeWithSimpleTransformationMergedXlsx() throws URISyntaxException, IOEx
getFilePath("examples/mergedCells/input.xlsx"))
);
module.setSourceResourceFormat(ResourceFormat.XLSX);
module.setProcessSpecificSheetInXLSFile(1);
module.processTableAtIndex(1);

ExecutionContext outputContext = module.executeSelf();

Expand All @@ -140,6 +140,7 @@ void executeWithSimpleTransformationMergedHTML() throws URISyntaxException, IOEx
getFilePath("examples/mergedCells/input.html"))
);
module.setSourceResourceFormat(ResourceFormat.HTML);
module.processTableAtIndex(1);

ExecutionContext outputContext = module.executeSelf();

Expand Down Expand Up @@ -303,10 +304,10 @@ void executeSelfWithBNodesInSchema() throws IOException, URISyntaxException {
@Test
void executeSelfWithHTMLFileInput() throws URISyntaxException, IOException {
module.setSourceResourceFormat(ResourceFormat.HTML);
module.processTableAtIndex(1);
module.setSourceResource(
StreamResourceUtils.getStreamResource(DATA_PREFIX, getFilePath("examples/htmlFile/input.html"))
);

ExecutionContext outputContext = module.executeSelf();
Model actualModel = outputContext.getDefaultModel();

Expand Down

0 comments on commit 077d446

Please sign in to comment.