Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PLUGIN-1766] Added auto detection of number of rows and columns to read from. #45

Merged
merged 1 commit into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/GoogleSheets-batchsource.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ _Treat first row as column names_ - the plugin uses first row for schema definin
**Column Names Row Number:** Number of the row to be treated as a header.
Only shown when the 'Column Names Selection' field is set to 'Custom row as column names' header.

**Auto Detect Number of Rows and Columns:** Field to enable automatic detection of the number of rows and columns to
read from the sheet.

**Number of Columns to Read:** Last column the plugin will read as data. It will be ignored if the Column
Names Row contains fewer columns.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package io.cdap.plugin.google.sheets.source;

import com.github.rholder.retry.RetryException;
import com.google.api.services.sheets.v4.model.Sheet;
import com.google.gson.reflect.TypeToken;
import io.cdap.cdap.api.data.format.StructuredRecord;
import io.cdap.cdap.api.data.schema.Schema;
Expand Down Expand Up @@ -89,48 +90,55 @@ public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptCont
}

private void populateBufferedTasks() {
int firstDataRow = config.getActualFirstDataRow();
int lastDataRow = config.getActualLastDataRow();
List<String> sheetTitles;
List<Sheet> sheetList;
try {
sheetTitles = getSheetTitles();
sheetList = getSheetList();
} catch (ExecutionException | RetryException e) {
throw new RuntimeException("Exception during sheet titles retrieving.", e);
}
sheetTitles.forEach(t -> {
sheetList.forEach(t -> {
int firstDataRow = config.getActualFirstDataRow();
// Each sheet can have a different number of records, so the last row can differ from sheet to sheet;
// when auto-detect for rows is enabled, all records are fetched from the sheet.
int lastDataRow = config.getActualLastDataRow(t.getProperties().getGridProperties().getRowCount());
int rowsNumber = lastDataRow - firstDataRow + 1;
overallRowsNumber += rowsNumber;
int counter = 0;
String title = t.getProperties().getTitle();
while (rowsNumber > bufferSize) {
rowTaskQueue.add(new GroupedRowTask(t, counter * bufferSize + firstDataRow, bufferSize));
rowTaskQueue.add(
new GroupedRowTask(title, counter * bufferSize + firstDataRow, bufferSize));
counter++;
rowsNumber -= bufferSize;
}
rowTaskQueue.add(new GroupedRowTask(t, counter * bufferSize + firstDataRow,
rowTaskQueue.add(new GroupedRowTask(title, counter * bufferSize + firstDataRow,
rowsNumber));
});
currentRowIndex = -1;
currentGroupedRowTask = null;
}

private List<String> getSheetTitles() throws ExecutionException, RetryException {
List<String> sheetTitles = new ArrayList<>();
private List<Sheet> getSheetList() throws ExecutionException, RetryException {
List<Sheet> sheetList = new ArrayList<>();
switch (config.getSheetsToPull()) {
case ALL:
sheetTitles = googleSheetsSourceClient.getSheetsTitles(fileId);
sheetList = googleSheetsSourceClient.getSheets(fileId);
break;
case NUMBERS:
List<Integer> sheetIndexes = config.getSheetsIdentifiers().stream()
.map(s -> Integer.parseInt(s)).collect(Collectors.toList());
sheetTitles = googleSheetsSourceClient.getSheets(fileId).stream()
sheetList = googleSheetsSourceClient.getSheets(fileId).stream()
.filter(s -> sheetIndexes.contains(s.getProperties().getIndex()))
.map(s -> s.getProperties().getTitle()).collect(Collectors.toList());
.collect(Collectors.toList());
break;
case TITLES:
sheetTitles = config.getSheetsIdentifiers();
List<String> sheetTitles = config.getSheetsIdentifiers();
sheetList = googleSheetsSourceClient.getSheets(fileId).stream()
.filter(s -> sheetTitles.contains(s.getProperties().getTitle()))
.collect(Collectors.toList());
break;
}
return sheetTitles;
return sheetList;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig {
public static final String LAST_HEADER_ROW = "lastHeaderRow";
public static final String FIRST_FOOTER_ROW = "firstFooterRow";
public static final String LAST_FOOTER_ROW = "lastFooterRow";
public static final String AUTO_DETECT_ROWS_AND_COLUMNS = "autoDetectRowsAndColumns";
public static final String LAST_DATA_COLUMN = "lastDataColumn";
public static final String LAST_DATA_ROW = "lastDataRow";
public static final String METADATA_CELLS = "metadataCells";
Expand Down Expand Up @@ -184,13 +185,20 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig {
@Macro
private Integer lastFooterRow;

@Nullable
@Name(AUTO_DETECT_ROWS_AND_COLUMNS)
@Description("Field to enable automatic detection of the number of rows and columns to read from the sheet.")
private Boolean autoDetectRowsAndColumns;

@Name(LAST_DATA_COLUMN)
@Nullable
@Description("Last column plugin will read as data. It will be ignored if the Column Names " +
"Row contain less number of columns.")
@Macro
private String lastDataColumn;

@Name(LAST_DATA_ROW)
@Nullable
@Description("Last row plugin will read as data.")
@Macro
private String lastDataRow;
Expand Down Expand Up @@ -301,7 +309,9 @@ public ValidationResult validate(FailureCollector collector) {
dataSchemaInfo = new LinkedHashMap<>();

validateColumnNamesRow(collector);
validateLastDataColumnIndexAndLastRowIndex(collector);
if (!getAutoDetectRowsAndColumns()) {
validateLastDataColumnIndexAndLastRowIndex(collector);
}
validateSpreadsheetAndSheetFieldNames(collector);

if (collector.getValidationFailures().isEmpty() && validationResult.isDirectoryOrFileAccessible()) {
Expand Down Expand Up @@ -547,7 +557,7 @@ private void getAndValidateSheetSchema(FailureCollector collector, GoogleSheetsS
if (columnMerges.isEmpty()) {
dataRow = subColumnsRow;
}

lastDataColumn = lastDataColumn == 0 ? columnsRow.size() : lastDataColumn;
resultHeaderTitles = processColumns(columnsRow, subColumnsRow, dataRow, columnMerges,
lastDataColumn, collector);
if (collector.getValidationFailures().isEmpty()) {
Expand All @@ -561,6 +571,7 @@ private void getAndValidateSheetSchema(FailureCollector collector, GoogleSheetsS
MergesForNumeredRows firstRowData = sheetsSourceClient.getSingleRows(firstFileTitles.getKey(),
firstFileTitles.getValue().get(0), Collections.singleton(firstDataRow));
List<CellData> dataCells = firstRowData.getNumeredRows().get(firstDataRow);
lastDataColumn = lastDataColumn == 0 ? dataCells.size() : lastDataColumn;
if (CollectionUtils.isEmpty(dataCells)) {
dataSchemaInfo = defaultGeneratedHeaders(lastDataColumn);
} else {
Expand Down Expand Up @@ -696,8 +707,8 @@ public int getActualFirstDataRow() {
* Returns the int.
* @return The int
*/
public int getActualLastDataRow() {
int lastDataRow = getLastDataRow();
public int getActualLastDataRow(int recordsInSheet) {
int lastDataRow = getAutoDetectRowsAndColumns() ? recordsInSheet : getLastDataRow();
if (isExtractMetadata() && getFirstFooterRow() > 0) {
lastDataRow = Math.min(lastDataRow, getFirstFooterRow() - 1);
}
Expand Down Expand Up @@ -913,12 +924,17 @@ public int getLastFooterRow() {
return lastFooterRow == null ? 0 : lastFooterRow;
}

/**
 * Reports whether automatic detection of the number of rows and columns is enabled.
 * The backing property is optional; an unset (null) value is treated as disabled.
 *
 * @return {@code true} only when the property is explicitly set to true, {@code false} otherwise
 */
public boolean getAutoDetectRowsAndColumns() {
  // The return type is a primitive and can never be null, so no @Nullable annotation belongs here.
  return Boolean.TRUE.equals(autoDetectRowsAndColumns);
}

public Integer getLastDataColumn() {
return Integer.parseInt(lastDataColumn);
return lastDataColumn == null ? 0 : Integer.parseInt(lastDataColumn);
}

public Integer getLastDataRow() {
return Integer.parseInt(lastDataRow);
return lastDataRow == null ? 0 : Integer.parseInt(lastDataRow);
}

public String getMetadataCells() {
Expand Down Expand Up @@ -1058,6 +1074,10 @@ public void setLastFooterRow(Integer lastFooterRow) {
this.lastFooterRow = lastFooterRow;
}

/**
 * Enables or disables automatic detection of the number of rows and columns to read.
 *
 * @param autoDetectRowsAndColumns the new value of the auto-detect flag
 */
public void setAutoDetectRowsAndColumns(boolean autoDetectRowsAndColumns) {
  // Box explicitly: the backing field is a nullable Boolean.
  this.autoDetectRowsAndColumns = Boolean.valueOf(autoDetectRowsAndColumns);
}

/**
 * Sets the raw string value of the last data column property.
 *
 * @param lastDataColumn the last column to read as data, in its string form; may be null
 */
public void setLastDataColumn(String lastDataColumn) {
this.lastDataColumn = lastDataColumn;
}
Expand Down Expand Up @@ -1280,6 +1300,10 @@ public static GoogleSheetsSourceConfig of(JsonObject properties) throws IOExcept
googleSheetsSourceConfig.setIdentifierType(
properties.get(GoogleSheetsSourceConfig.IDENTIFIER_TYPE).getAsString());
}
if (properties.has(GoogleSheetsSourceConfig.AUTO_DETECT_ROWS_AND_COLUMNS)) {
googleSheetsSourceConfig.setAutoDetectRowsAndColumns(
properties.get(GoogleSheetsSourceConfig.AUTO_DETECT_ROWS_AND_COLUMNS).getAsBoolean());
}

return googleSheetsSourceConfig;
}
Expand Down
32 changes: 32 additions & 0 deletions widgets/GoogleSheets-batchsource.json
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,22 @@
"min": "1"
}
},
{
"widget-type": "toggle",
"label": "Auto Detect Number of Rows and Columns",
"name": "autoDetectRowsAndColumns",
"widget-attributes": {
"on": {
"value": "true",
"label": "Yes"
},
"off": {
"value": "false",
"label": "No"
},
"default": "true"
}
},
{
"widget-type": "number",
"label": "Number of Columns to Read",
Expand Down Expand Up @@ -632,6 +648,22 @@
"type": "property"
}
]
},
{
"name": "Auto Detect Rows and Columns",
"condition": {
"expression": "autoDetectRowsAndColumns != true"
},
"show": [
{
"name": "lastDataColumn",
"type": "property"
},
{
"name": "lastDataRow",
"type": "property"
}
]
}
]
}