From 3f6fe9010885628f30792c616ce9b52e42de9e4e Mon Sep 17 00:00:00 2001 From: vikasrathee-cs Date: Tue, 30 Apr 2024 11:39:51 +0530 Subject: [PATCH 1/2] column name cleansing done as per other file plugins. --- .../source/GoogleSheetsSourceConfig.java | 42 ++++++++++++++----- .../source/GoogleSheetsSourceConfigTest.java | 4 +- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java index 8fb53f1..9bec16d 100644 --- a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java +++ b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java @@ -98,7 +98,7 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig { public static final String CONFIGURATION_PARSE_PROPERTY_NAME = "properties"; private static final Logger LOG = LoggerFactory.getLogger(GoogleSheetsSourceConfig.class); private static final Pattern CELL_ADDRESS = Pattern.compile("^([A-Z]+)([0-9]+)$"); - private static final Pattern COLUMN_NAME = Pattern.compile("^[A-Za-z_][A-Za-z0-9_-]*$"); + private static final Pattern NOT_VALID_PATTERN = Pattern.compile("[^A-Za-z0-9_]+"); private static LinkedHashMap dataSchemaInfo = new LinkedHashMap<>(); @Name(SHEETS_TO_PULL) @@ -593,7 +593,7 @@ private LinkedHashMap processColumns(List columnHeaders = new LinkedHashMap<>(); - + final Map seenFieldNames = new HashMap<>(); List headerTitles = new ArrayList<>(); for (int i = 0; i < Math.min(columnsRow.size(), lastDataColumn); i++) { CellData columnHeaderCell = columnsRow.get(i); @@ -609,7 +609,7 @@ private LinkedHashMap processColumns(List processColumns(List processSubHeaders(int startIndex, int length, List subColumnsRow, List dataRow, FailureCollector collector) { List subHeaders = new ArrayList<>(); + final Map seenFieldNames = new HashMap<>(); List titles = new ArrayList<>(); for (int i = startIndex; i < startIndex + length; i++) { String subHeaderTitle; @@ -642,7 +643,7 @@ private List processSubHeaders(int startIndex, int leng if (StringUtils.isEmpty(subHeaderTitle)) { subHeaderTitle = ColumnAddressConverter.getColumnName(i + 1); } - subHeaderTitle = checkTitleFormat(subHeaderTitle, i); + subHeaderTitle = checkTitleFormat(subHeaderTitle, seenFieldNames); } else { subHeaderTitle = ColumnAddressConverter.getColumnName(i + 1); } @@ -661,14 +662,33 @@ private List processSubHeaders(int startIndex, int leng return subHeaders; } - private String checkTitleFormat(String title, int columnIndex) { - if (!COLUMN_NAME.matcher(title).matches()) { - String defaultColumnName = ColumnAddressConverter.getColumnName(columnIndex + 1); - LOG.warn(String.format("Original column name '%s' doesn't satisfy column name requirements '%s', " + - "the default column name '%s' will be used.", title, COLUMN_NAME.pattern(), defaultColumnName)); - return defaultColumnName; + private String checkTitleFormat(String title, Map seenFieldNames) { + final String replacementChar = "_"; + + StringBuilder cleanFieldNameBuilder = new StringBuilder(); + + // Remove any spaces at the end of the strings + title = title.trim(); + + // If it's an empty string replace it with BLANK + if (title.isEmpty()) { + cleanFieldNameBuilder.append("BLANK"); + } else if (Character.isDigit(title.charAt(0))) { + // Prepend a col_ if the first character is a number + cleanFieldNameBuilder.append("col_"); + } + + // Replace all invalid characters with the replacement char + cleanFieldNameBuilder.append(NOT_VALID_PATTERN.matcher(title).replaceAll(replacementChar)); + + String cleanFieldName = cleanFieldNameBuilder.toString(); + int count = seenFieldNames.getOrDefault(cleanFieldName, 0) + 1; + seenFieldNames.put(cleanFieldName, count); + // In case column already exists in seenFieldNames map, append the count with column name. + if (count > 1) { + cleanFieldNameBuilder.append(replacementChar).append(count); } - return title; + return cleanFieldNameBuilder.toString(); } private Schema getDataCellSchema(List dataRow, int index, String headerName) { diff --git a/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java b/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java index 55969a9..1077342 100644 --- a/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java +++ b/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java @@ -359,13 +359,13 @@ public void testProcessColumnsInvalidTitles() Assert.assertTrue(columns.get(0).getSubColumns().isEmpty()); // check complex columns, top header should have column name as name - Assert.assertEquals("B", columns.get(1).getHeaderTitle()); + Assert.assertEquals("title_with_space", columns.get(1).getHeaderTitle()); List subColumns = columns.get(1).getSubColumns(); Assert.assertFalse(subColumns.isEmpty()); // check sub-columns Assert.assertEquals(2, subColumns.size()); - Assert.assertEquals("B", subColumns.get(0).getHeaderTitle()); + Assert.assertEquals("col_9titleWithFirstNumber", subColumns.get(0).getHeaderTitle()); Assert.assertTrue(subColumns.get(0).getSubColumns().isEmpty()); Assert.assertEquals("d", subColumns.get(1).getHeaderTitle()); Assert.assertTrue(subColumns.get(0).getSubColumns().isEmpty()); From 1f59380cdf98aeca1c88c0354a71471876faa8a5 Mon Sep 17 00:00:00 2001 From: vikasrathee-cs Date: Tue, 21 May 2024 17:16:23 +0530 Subject: [PATCH 2/2] added lowercase support --- .../source/GoogleSheetsSourceConfig.java | 5 ++- .../source/GoogleSheetsSourceConfigTest.java | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java index 9bec16d..53c816b 100644 --- a/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java +++ b/src/main/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfig.java @@ -682,8 +682,9 @@ private String checkTitleFormat(String title, Map seenFieldName cleanFieldNameBuilder.append(NOT_VALID_PATTERN.matcher(title).replaceAll(replacementChar)); String cleanFieldName = cleanFieldNameBuilder.toString(); - int count = seenFieldNames.getOrDefault(cleanFieldName, 0) + 1; - seenFieldNames.put(cleanFieldName, count); + String lowerCaseCleanFieldName = cleanFieldName.toLowerCase(); + int count = seenFieldNames.getOrDefault(lowerCaseCleanFieldName, 0) + 1; + seenFieldNames.put(lowerCaseCleanFieldName, count); // In case column already exists in seenFieldNames map, append the count with column name. if (count > 1) { cleanFieldNameBuilder.append(replacementChar).append(count); diff --git a/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java b/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java index 1077342..d227aa8 100644 --- a/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java +++ b/src/test/java/io/cdap/plugin/google/sheets/source/GoogleSheetsSourceConfigTest.java @@ -376,4 +376,41 @@ private void setFieldValue(String fieldName, Object fieldValue) throws NoSuchFie metadataKeyCellsField.setAccessible(true); metadataKeyCellsField.set(config, fieldValue); } + + @Test + public void testProcessColumnsSameCaseSensitiveTitles() + throws NoSuchMethodException, InvocationTargetException, IllegalAccessException { + Method processColumnsMethod = config.getClass().getDeclaredMethod("processColumns", List.class, + List.class, List.class, List.class, int.class, + FailureCollector.class); + processColumnsMethod.setAccessible(true); + + List columnsRow = new ArrayList<>(); + columnsRow.add(new CellData().setFormattedValue("title with space")); + columnsRow.add(new CellData().setFormattedValue("Title with space")); + columnsRow.add(new CellData().setFormattedValue("Title%with%space")); + + List dataRow = new ArrayList<>(); + dataRow.add(new CellData().setUserEnteredValue(new ExtendedValue().setStringValue("aa"))); + dataRow.add(new CellData().setUserEnteredValue(new ExtendedValue().setNumberValue(13d))); + dataRow.add(new CellData().setUserEnteredValue(new ExtendedValue().setBoolValue(true))); + + List columnMerges = new ArrayList<>(); + + FailureCollector collector = new DefaultFailureCollector("", Collections.EMPTY_MAP); + + int lastDataColumn = 3; + + LinkedHashMap columns = + (LinkedHashMap) processColumnsMethod.invoke(config, columnsRow, + null, dataRow, columnMerges, + lastDataColumn, collector); + + Assert.assertEquals(3, columns.size()); + Assert.assertTrue(columns.keySet().containsAll(Arrays.asList(0, 1, 2))); + + Assert.assertEquals("title_with_space", columns.get(0).getHeaderTitle()); + Assert.assertEquals("Title_with_space_2", columns.get(1).getHeaderTitle()); + Assert.assertEquals("Title_with_space_3", columns.get(2).getHeaderTitle()); + } }