From 071214e1acda6f45840b4691e5982fada148587c Mon Sep 17 00:00:00 2001 From: vikasrathee-cs Date: Wed, 27 Mar 2024 21:40:17 +0530 Subject: [PATCH] Added auto detection of number of rows and columns to read from the sheet Added auto detection of the number of rows and columns to read from the sheet. When it is enabled, the last data row is taken from each sheet's own row count, and the last data column and last data row settings become optional. --- docs/GoogleSheets-batchsource.md | 3 ++ .../source/GoogleSheetsRecordReader.java | 36 +++++++++++-------- .../source/GoogleSheetsSourceConfig.java | 36 +++++++++++++++---- widgets/GoogleSheets-batchsource.json | 32 +++++++++++++++++ 4 files changed, 87 insertions(+), 20 deletions(-) diff --git a/docs/GoogleSheets-batchsource.md b/docs/GoogleSheets-batchsource.md index 9685c70..fe10937 100644 --- a/docs/GoogleSheets-batchsource.md +++ b/docs/GoogleSheets-batchsource.md @@ -140,6 +140,9 @@ _Treat first row as column names_ - the plugin uses first row for schema definin **Column Names Row Number:** Number of the row to be treated as a header. Only shown when the 'Column Names Selection' field is set to 'Custom row as column names' header. +**Auto Detect Number of Rows and Columns:** Whether to automatically detect the number of rows and columns to read +from the sheet. When enabled, the 'Number of Columns to Read' and last data row fields are hidden. + **Number of Columns to Read:** Last column plugin will read as data. It will be ignored if the Column Names Row contains less number of columns. 
diff --git a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsRecordReader.java b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsRecordReader.java index 00bcf20..87dc3d0 100644 --- a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsRecordReader.java +++ b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsRecordReader.java @@ -17,6 +17,7 @@ package io.cdap.plugin.google.sheets.source; import com.github.rholder.retry.RetryException; +import com.google.api.services.sheets.v4.model.Sheet; import com.google.gson.reflect.TypeToken; import io.cdap.cdap.api.data.format.StructuredRecord; import io.cdap.cdap.api.data.schema.Schema; @@ -89,48 +90,55 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont } private void populateBufferedTasks() { - int firstDataRow = config.getActualFirstDataRow(); - int lastDataRow = config.getActualLastDataRow(); - List sheetTitles; + List sheetList; try { - sheetTitles = getSheetTitles(); + sheetList = getSheetList(); } catch (ExecutionException | RetryException e) { throw new RuntimeException("Exception during sheet titles retrieving.", e); } - sheetTitles.forEach(t -> { + sheetList.forEach(t -> { + int firstDataRow = config.getActualFirstDataRow(); + // Each sheet can have different number of records so last row can be different sheet wise + // and in case auto-detect for rows is enabled, It will fetch all records from the sheet + int lastDataRow = config.getActualLastDataRow(t.getProperties().getGridProperties().getRowCount()); int rowsNumber = lastDataRow - firstDataRow + 1; overallRowsNumber += rowsNumber; int counter = 0; + String title = t.getProperties().getTitle(); while (rowsNumber > bufferSize) { - rowTaskQueue.add(new GroupedRowTask(t, counter * bufferSize + firstDataRow, bufferSize)); + rowTaskQueue.add( + new GroupedRowTask(title, counter * bufferSize + firstDataRow, bufferSize)); counter++; rowsNumber -= bufferSize; } - rowTaskQueue.add(new 
GroupedRowTask(t, counter * bufferSize + firstDataRow, + rowTaskQueue.add(new GroupedRowTask(title, counter * bufferSize + firstDataRow, rowsNumber)); }); currentRowIndex = -1; currentGroupedRowTask = null; } - private List getSheetTitles() throws ExecutionException, RetryException { - List sheetTitles = new ArrayList<>(); + private List getSheetList() throws ExecutionException, RetryException { + List sheetList = new ArrayList<>(); switch (config.getSheetsToPull()) { case ALL: - sheetTitles = googleSheetsSourceClient.getSheetsTitles(fileId); + sheetList = googleSheetsSourceClient.getSheets(fileId); break; case NUMBERS: List sheetIndexes = config.getSheetsIdentifiers().stream() .map(s -> Integer.parseInt(s)).collect(Collectors.toList()); - sheetTitles = googleSheetsSourceClient.getSheets(fileId).stream() + sheetList = googleSheetsSourceClient.getSheets(fileId).stream() .filter(s -> sheetIndexes.contains(s.getProperties().getIndex())) - .map(s -> s.getProperties().getTitle()).collect(Collectors.toList()); + .collect(Collectors.toList()); break; case TITLES: - sheetTitles = config.getSheetsIdentifiers(); + List sheetTitles = config.getSheetsIdentifiers(); + sheetList = googleSheetsSourceClient.getSheets(fileId).stream() + .filter(s -> sheetTitles.contains(s.getProperties().getTitle())) + .collect(Collectors.toList()); break; } - return sheetTitles; + return sheetList; } @Override diff --git a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java index 9298c2c..8fb53f1 100644 --- a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java +++ b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java @@ -82,6 +82,7 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig { public static final String LAST_HEADER_ROW = "lastHeaderRow"; public static final String FIRST_FOOTER_ROW = "firstFooterRow"; 
public static final String LAST_FOOTER_ROW = "lastFooterRow"; + public static final String AUTO_DETECT_ROWS_AND_COLUMNS = "autoDetectRowsAndColumns"; public static final String LAST_DATA_COLUMN = "lastDataColumn"; public static final String LAST_DATA_ROW = "lastDataRow"; public static final String METADATA_CELLS = "metadataCells"; @@ -184,13 +185,20 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig { @Macro private Integer lastFooterRow; + @Nullable + @Name(AUTO_DETECT_ROWS_AND_COLUMNS) + @Description("Field to enable automatic detection of the number of rows and columns to read from the sheet.") + private Boolean autoDetectRowsAndColumns; + @Name(LAST_DATA_COLUMN) + @Nullable @Description("Last column plugin will read as data. It will be ignored if the Column Names " + "Row contain less number of columns.") @Macro private String lastDataColumn; @Name(LAST_DATA_ROW) + @Nullable @Description("Last row plugin will read as data.") @Macro private String lastDataRow; @@ -301,7 +309,9 @@ public ValidationResult validate(FailureCollector collector) { dataSchemaInfo = new LinkedHashMap<>(); validateColumnNamesRow(collector); - validateLastDataColumnIndexAndLastRowIndex(collector); + if (!getAutoDetectRowsAndColumns()) { + validateLastDataColumnIndexAndLastRowIndex(collector); + } validateSpreadsheetAndSheetFieldNames(collector); if (collector.getValidationFailures().isEmpty() && validationResult.isDirectoryOrFileAccessible()) { @@ -547,7 +557,7 @@ private void getAndValidateSheetSchema(FailureCollector collector, GoogleSheetsS if (columnMerges.isEmpty()) { dataRow = subColumnsRow; } - + lastDataColumn = lastDataColumn == 0 ? 
columnsRow.size() : lastDataColumn; resultHeaderTitles = processColumns(columnsRow, subColumnsRow, dataRow, columnMerges, lastDataColumn, collector); if (collector.getValidationFailures().isEmpty()) { @@ -561,6 +571,7 @@ private void getAndValidateSheetSchema(FailureCollector collector, GoogleSheetsS MergesForNumeredRows firstRowData = sheetsSourceClient.getSingleRows(firstFileTitles.getKey(), firstFileTitles.getValue().get(0), Collections.singleton(firstDataRow)); List dataCells = firstRowData.getNumeredRows().get(firstDataRow); + lastDataColumn = lastDataColumn == 0 ? dataCells.size() : lastDataColumn; if (CollectionUtils.isEmpty(dataCells)) { dataSchemaInfo = defaultGeneratedHeaders(lastDataColumn); } else { @@ -696,8 +707,8 @@ public int getActualFirstDataRow() { * Returns the int. * @return The int */ - public int getActualLastDataRow() { - int lastDataRow = getLastDataRow(); + public int getActualLastDataRow(int recordsInSheet) { + int lastDataRow = getAutoDetectRowsAndColumns() ? recordsInSheet : getLastDataRow(); if (isExtractMetadata() && getFirstFooterRow() > 0) { lastDataRow = Math.min(lastDataRow, getFirstFooterRow() - 1); } @@ -913,12 +924,17 @@ public int getLastFooterRow() { return lastFooterRow == null ? 0 : lastFooterRow; } + @Nullable + public boolean getAutoDetectRowsAndColumns() { + return Boolean.TRUE.equals(autoDetectRowsAndColumns); + } + public Integer getLastDataColumn() { - return Integer.parseInt(lastDataColumn); + return lastDataColumn == null ? 0 : Integer.parseInt(lastDataColumn); } public Integer getLastDataRow() { - return Integer.parseInt(lastDataRow); + return lastDataRow == null ? 
0 : Integer.parseInt(lastDataRow); } public String getMetadataCells() { @@ -1058,6 +1074,10 @@ public void setLastFooterRow(Integer lastFooterRow) { this.lastFooterRow = lastFooterRow; } + public void setAutoDetectRowsAndColumns(boolean autoDetectRowsAndColumns) { + this.autoDetectRowsAndColumns = autoDetectRowsAndColumns; + } + public void setLastDataColumn(String lastDataColumn) { this.lastDataColumn = lastDataColumn; } @@ -1280,6 +1300,10 @@ public static GoogleSheetsSourceConfig of(JsonObject properties) throws IOExcept googleSheetsSourceConfig.setIdentifierType( properties.get(GoogleSheetsSourceConfig.IDENTIFIER_TYPE).getAsString()); } + if (properties.has(GoogleSheetsSourceConfig.AUTO_DETECT_ROWS_AND_COLUMNS)) { + googleSheetsSourceConfig.setAutoDetectRowsAndColumns( + properties.get(GoogleSheetsSourceConfig.AUTO_DETECT_ROWS_AND_COLUMNS).getAsBoolean()); + } return googleSheetsSourceConfig; } diff --git a/widgets/GoogleSheets-batchsource.json b/widgets/GoogleSheets-batchsource.json index f744656..5d87a1e 100644 --- a/widgets/GoogleSheets-batchsource.json +++ b/widgets/GoogleSheets-batchsource.json @@ -387,6 +387,22 @@ "min": "1" } }, + { + "widget-type": "toggle", + "label": "Auto Detect Number of Rows and Columns", + "name": "autoDetectRowsAndColumns", + "widget-attributes": { + "on": { + "value": "true", + "label": "Yes" + }, + "off": { + "value": "false", + "label": "No" + }, + "default": "true" + } + }, { "widget-type": "number", "label": "Number of Columns to Read", @@ -632,6 +648,22 @@ "type": "property" } ] + }, + { + "name": "Auto Detect Rows and Columns", + "condition": { + "expression": "autoDetectRowsAndColumns != true" + }, + "show": [ + { + "name": "lastDataColumn", + "type": "property" + }, + { + "name": "lastDataRow", + "type": "property" + } + ] } ] }