Skip to content

Commit

Permalink
column name cleansing done as per other file plugins.
Browse files Browse the repository at this point in the history
column name cleansing done as per other file plugins.
  • Loading branch information
vikasrathee-cs committed May 22, 2024
1 parent b923f45 commit 8f990d9
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig {
public static final String CONFIGURATION_PARSE_PROPERTY_NAME = "properties";
private static final Logger LOG = LoggerFactory.getLogger(GoogleSheetsSourceConfig.class);
private static final Pattern CELL_ADDRESS = Pattern.compile("^([A-Z]+)([0-9]+)$");
private static final Pattern COLUMN_NAME = Pattern.compile("^[A-Za-z_][A-Za-z0-9_-]*$");
private static final Pattern NOT_VALID_PATTERN = Pattern.compile("[^A-Za-z0-9_]+");
private static LinkedHashMap<Integer, ColumnComplexSchemaInfo> dataSchemaInfo = new LinkedHashMap<>();

@Name(SHEETS_TO_PULL)
Expand Down Expand Up @@ -593,7 +593,7 @@ private LinkedHashMap<Integer, ColumnComplexSchemaInfo> processColumns(List<Cell
int lastDataColumn,
FailureCollector collector) {
LinkedHashMap<Integer, ColumnComplexSchemaInfo> columnHeaders = new LinkedHashMap<>();

final Map<String, Integer> seenFieldNames = new HashMap<>();
List<String> headerTitles = new ArrayList<>();
for (int i = 0; i < Math.min(columnsRow.size(), lastDataColumn); i++) {
CellData columnHeaderCell = columnsRow.get(i);
Expand All @@ -609,7 +609,7 @@ private LinkedHashMap<Integer, ColumnComplexSchemaInfo> processColumns(List<Cell
}
String title = columnHeaderCell.getFormattedValue();
if (StringUtils.isNotEmpty(title)) {
title = checkTitleFormat(title, i);
title = checkTitleFormat(title, seenFieldNames);

// for merge we should analyse sub headers for data schemas
if (isMergeHead) {
Expand All @@ -634,6 +634,7 @@ private LinkedHashMap<Integer, ColumnComplexSchemaInfo> processColumns(List<Cell
private List<ColumnComplexSchemaInfo> processSubHeaders(int startIndex, int length, List<CellData> subColumnsRow,
List<CellData> dataRow, FailureCollector collector) {
List<ColumnComplexSchemaInfo> subHeaders = new ArrayList<>();
final Map<String, Integer> seenFieldNames = new HashMap<>();
List<String> titles = new ArrayList<>();
for (int i = startIndex; i < startIndex + length; i++) {
String subHeaderTitle;
Expand All @@ -642,7 +643,7 @@ private List<ColumnComplexSchemaInfo> processSubHeaders(int startIndex, int leng
if (StringUtils.isEmpty(subHeaderTitle)) {
subHeaderTitle = ColumnAddressConverter.getColumnName(i + 1);
}
subHeaderTitle = checkTitleFormat(subHeaderTitle, i);
subHeaderTitle = checkTitleFormat(subHeaderTitle, seenFieldNames);
} else {
subHeaderTitle = ColumnAddressConverter.getColumnName(i + 1);
}
Expand All @@ -661,14 +662,34 @@ private List<ColumnComplexSchemaInfo> processSubHeaders(int startIndex, int leng
return subHeaders;
}

private String checkTitleFormat(String title, int columnIndex) {
if (!COLUMN_NAME.matcher(title).matches()) {
String defaultColumnName = ColumnAddressConverter.getColumnName(columnIndex + 1);
LOG.warn(String.format("Original column name '%s' doesn't satisfy column name requirements '%s', " +
"the default column name '%s' will be used.", title, COLUMN_NAME.pattern(), defaultColumnName));
return defaultColumnName;
private String checkTitleFormat(String title, Map<String, Integer> seenFieldNames) {
final String replacementChar = "_";

StringBuilder cleanFieldNameBuilder = new StringBuilder();

// Remove any spaces at the end of the strings
title = title.trim();

// If it's an empty string replace it with BLANK
if (title.isEmpty()) {
cleanFieldNameBuilder.append("BLANK");
} else if (Character.isDigit(title.charAt(0))) {
// Prepend a col_ if the first character is a number
cleanFieldNameBuilder.append("col_");
}

// Replace all invalid characters with the replacement char
cleanFieldNameBuilder.append(NOT_VALID_PATTERN.matcher(title).replaceAll(replacementChar));

String cleanFieldName = cleanFieldNameBuilder.toString();
String lowerCaseCleanFieldName = cleanFieldName.toLowerCase();
int count = seenFieldNames.getOrDefault(lowerCaseCleanFieldName, 0) + 1;
seenFieldNames.put(lowerCaseCleanFieldName, count);
// In case column already exists in seenFieldNames map, append the count with column name.
if (count > 1) {
cleanFieldNameBuilder.append(replacementChar).append(count);
}
return title;
return cleanFieldNameBuilder.toString();
}

private Schema getDataCellSchema(List<CellData> dataRow, int index, String headerName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,13 +359,13 @@ public void testProcessColumnsInvalidTitles()
Assert.assertTrue(columns.get(0).getSubColumns().isEmpty());

// check complex columns, top header should have column name as name
Assert.assertEquals("B", columns.get(1).getHeaderTitle());
Assert.assertEquals("title_with_space", columns.get(1).getHeaderTitle());
List<ColumnComplexSchemaInfo> subColumns = columns.get(1).getSubColumns();
Assert.assertFalse(subColumns.isEmpty());

// check sub-columns
Assert.assertEquals(2, subColumns.size());
Assert.assertEquals("B", subColumns.get(0).getHeaderTitle());
Assert.assertEquals("col_9titleWithFirstNumber", subColumns.get(0).getHeaderTitle());
Assert.assertTrue(subColumns.get(0).getSubColumns().isEmpty());
Assert.assertEquals("d", subColumns.get(1).getHeaderTitle());
Assert.assertTrue(subColumns.get(0).getSubColumns().isEmpty());
Expand All @@ -376,4 +376,41 @@ private void setFieldValue(String fieldName, Object fieldValue) throws NoSuchFie
metadataKeyCellsField.setAccessible(true);
metadataKeyCellsField.set(config, fieldValue);
}

/**
 * Checks that header titles which cleanse to the same name, ignoring case, are made unique:
 * the first keeps the plain cleansed name and the following ones receive the suffixes _2 and _3.
 */
@Test
public void testProcessColumnsSameCaseSensitiveTitles()
  throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
  // processColumns is private, so it is looked up and invoked through reflection.
  Method processColumns = config.getClass().getDeclaredMethod(
    "processColumns", List.class, List.class, List.class, List.class, int.class, FailureCollector.class);
  processColumns.setAccessible(true);

  // Three headers that differ only in letter case or in the separator character.
  List<CellData> headerCells = new ArrayList<>(Arrays.asList(
    new CellData().setFormattedValue("title with space"),
    new CellData().setFormattedValue("Title with space"),
    new CellData().setFormattedValue("Title%with%space")));

  // One data cell per header: a string, a number and a boolean.
  List<CellData> valueCells = new ArrayList<>(Arrays.asList(
    new CellData().setUserEnteredValue(new ExtendedValue().setStringValue("aa")),
    new CellData().setUserEnteredValue(new ExtendedValue().setNumberValue(13d)),
    new CellData().setUserEnteredValue(new ExtendedValue().setBoolValue(true))));

  List<GridRange> noMerges = new ArrayList<>();
  FailureCollector failures = new DefaultFailureCollector("", Collections.EMPTY_MAP);
  int columnLimit = 3;

  Object result = processColumns.invoke(config, headerCells, null, valueCells, noMerges, columnLimit, failures);
  LinkedHashMap<Integer, ColumnComplexSchemaInfo> parsedColumns =
    (LinkedHashMap<Integer, ColumnComplexSchemaInfo>) result;

  Assert.assertEquals(3, parsedColumns.size());
  Assert.assertTrue(parsedColumns.keySet().containsAll(Arrays.asList(0, 1, 2)));

  String[] expectedTitles = {"title_with_space", "Title_with_space_2", "Title_with_space_3"};
  for (int position = 0; position < expectedTitles.length; position++) {
    Assert.assertEquals(expectedTitles[position], parsedColumns.get(position).getHeaderTitle());
  }
}
}

0 comments on commit 8f990d9

Please sign in to comment.