Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "[PLUGIN-1785]Column name cleansing done as per other file plugins." #49

Merged
merged 1 commit into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public class GoogleSheetsSourceConfig extends GoogleFilteringSourceConfig {
public static final String CONFIGURATION_PARSE_PROPERTY_NAME = "properties";
private static final Logger LOG = LoggerFactory.getLogger(GoogleSheetsSourceConfig.class);
private static final Pattern CELL_ADDRESS = Pattern.compile("^([A-Z]+)([0-9]+)$");
private static final Pattern NOT_VALID_PATTERN = Pattern.compile("[^A-Za-z0-9_]+");
private static final Pattern COLUMN_NAME = Pattern.compile("^[A-Za-z_][A-Za-z0-9_-]*$");
private static LinkedHashMap<Integer, ColumnComplexSchemaInfo> dataSchemaInfo = new LinkedHashMap<>();

@Name(SHEETS_TO_PULL)
Expand Down Expand Up @@ -593,7 +593,7 @@ private LinkedHashMap<Integer, ColumnComplexSchemaInfo> processColumns(List<Cell
int lastDataColumn,
FailureCollector collector) {
LinkedHashMap<Integer, ColumnComplexSchemaInfo> columnHeaders = new LinkedHashMap<>();
final Map<String, Integer> seenFieldNames = new HashMap<>();

List<String> headerTitles = new ArrayList<>();
for (int i = 0; i < Math.min(columnsRow.size(), lastDataColumn); i++) {
CellData columnHeaderCell = columnsRow.get(i);
Expand All @@ -609,7 +609,7 @@ private LinkedHashMap<Integer, ColumnComplexSchemaInfo> processColumns(List<Cell
}
String title = columnHeaderCell.getFormattedValue();
if (StringUtils.isNotEmpty(title)) {
title = checkTitleFormat(title, seenFieldNames);
title = checkTitleFormat(title, i);

// for merge we should analyse sub headers for data schemas
if (isMergeHead) {
Expand All @@ -634,7 +634,6 @@ private LinkedHashMap<Integer, ColumnComplexSchemaInfo> processColumns(List<Cell
private List<ColumnComplexSchemaInfo> processSubHeaders(int startIndex, int length, List<CellData> subColumnsRow,
List<CellData> dataRow, FailureCollector collector) {
List<ColumnComplexSchemaInfo> subHeaders = new ArrayList<>();
final Map<String, Integer> seenFieldNames = new HashMap<>();
List<String> titles = new ArrayList<>();
for (int i = startIndex; i < startIndex + length; i++) {
String subHeaderTitle;
Expand All @@ -643,7 +642,7 @@ private List<ColumnComplexSchemaInfo> processSubHeaders(int startIndex, int leng
if (StringUtils.isEmpty(subHeaderTitle)) {
subHeaderTitle = ColumnAddressConverter.getColumnName(i + 1);
}
subHeaderTitle = checkTitleFormat(subHeaderTitle, seenFieldNames);
subHeaderTitle = checkTitleFormat(subHeaderTitle, i);
} else {
subHeaderTitle = ColumnAddressConverter.getColumnName(i + 1);
}
Expand All @@ -662,34 +661,14 @@ private List<ColumnComplexSchemaInfo> processSubHeaders(int startIndex, int leng
return subHeaders;
}

private String checkTitleFormat(String title, Map<String, Integer> seenFieldNames) {
final String replacementChar = "_";

StringBuilder cleanFieldNameBuilder = new StringBuilder();

// Remove any spaces at the end of the strings
title = title.trim();

// If it's an empty string replace it with BLANK
if (title.isEmpty()) {
cleanFieldNameBuilder.append("BLANK");
} else if (Character.isDigit(title.charAt(0))) {
// Prepend a col_ if the first character is a number
cleanFieldNameBuilder.append("col_");
}

// Replace all invalid characters with the replacement char
cleanFieldNameBuilder.append(NOT_VALID_PATTERN.matcher(title).replaceAll(replacementChar));

String cleanFieldName = cleanFieldNameBuilder.toString();
String lowerCaseCleanFieldName = cleanFieldName.toLowerCase();
int count = seenFieldNames.getOrDefault(lowerCaseCleanFieldName, 0) + 1;
seenFieldNames.put(lowerCaseCleanFieldName, count);
// In case column already exists in seenFieldNames map, append the count with column name.
if (count > 1) {
cleanFieldNameBuilder.append(replacementChar).append(count);
private String checkTitleFormat(String title, int columnIndex) {
if (!COLUMN_NAME.matcher(title).matches()) {
String defaultColumnName = ColumnAddressConverter.getColumnName(columnIndex + 1);
LOG.warn(String.format("Original column name '%s' doesn't satisfy column name requirements '%s', " +
"the default column name '%s' will be used.", title, COLUMN_NAME.pattern(), defaultColumnName));
return defaultColumnName;
}
return cleanFieldNameBuilder.toString();
return title;
}

private Schema getDataCellSchema(List<CellData> dataRow, int index, String headerName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -359,13 +359,13 @@ public void testProcessColumnsInvalidTitles()
Assert.assertTrue(columns.get(0).getSubColumns().isEmpty());

// check complex columns, top header should have column name as name
Assert.assertEquals("title_with_space", columns.get(1).getHeaderTitle());
Assert.assertEquals("B", columns.get(1).getHeaderTitle());
List<ColumnComplexSchemaInfo> subColumns = columns.get(1).getSubColumns();
Assert.assertFalse(subColumns.isEmpty());

// check sub-columns
Assert.assertEquals(2, subColumns.size());
Assert.assertEquals("col_9titleWithFirstNumber", subColumns.get(0).getHeaderTitle());
Assert.assertEquals("B", subColumns.get(0).getHeaderTitle());
Assert.assertTrue(subColumns.get(0).getSubColumns().isEmpty());
Assert.assertEquals("d", subColumns.get(1).getHeaderTitle());
Assert.assertTrue(subColumns.get(0).getSubColumns().isEmpty());
Expand All @@ -376,41 +376,4 @@ private void setFieldValue(String fieldName, Object fieldValue) throws NoSuchFie
metadataKeyCellsField.setAccessible(true);
metadataKeyCellsField.set(config, fieldValue);
}

/**
 * Checks the header titles produced by {@code processColumns} when several headers differ only
 * in letter case or in the separator character.
 *
 * <p>The three input titles are "title with space", "Title with space" and "Title%with%space".
 * The assertions expect them to come back as {@code title_with_space},
 * {@code Title_with_space_2} and {@code Title_with_space_3} respectively.
 *
 * <p>{@code processColumns} is private, so it is looked up and invoked reflectively.
 *
 * @throws NoSuchMethodException if {@code processColumns} with the expected signature is missing
 * @throws InvocationTargetException if {@code processColumns} itself throws
 * @throws IllegalAccessException if the reflective call is not permitted
 */
@Test
public void testProcessColumnsSameCaseSensitiveTitles()
    throws NoSuchMethodException, InvocationTargetException, IllegalAccessException {
  Method processColumnsMethod = config.getClass().getDeclaredMethod("processColumns", List.class,
                                                                    List.class, List.class, List.class, int.class,
                                                                    FailureCollector.class);
  processColumnsMethod.setAccessible(true);

  // Header row: the titles differ only by case or by the character between the words.
  List<CellData> columnsRow = new ArrayList<>();
  columnsRow.add(new CellData().setFormattedValue("title with space"));
  columnsRow.add(new CellData().setFormattedValue("Title with space"));
  columnsRow.add(new CellData().setFormattedValue("Title%with%space"));

  // One data cell per header, each carrying a different value kind (string, number, boolean).
  List<CellData> dataRow = new ArrayList<>();
  dataRow.add(new CellData().setUserEnteredValue(new ExtendedValue().setStringValue("aa")));
  dataRow.add(new CellData().setUserEnteredValue(new ExtendedValue().setNumberValue(13d)));
  dataRow.add(new CellData().setUserEnteredValue(new ExtendedValue().setBoolValue(true)));

  // No merged header ranges are supplied.
  List<GridRange> columnMerges = new ArrayList<>();

  // Collections.emptyMap() is the type-safe equivalent of the raw Collections.EMPTY_MAP constant.
  FailureCollector collector = new DefaultFailureCollector("", Collections.emptyMap());

  int lastDataColumn = 3;

  // Method.invoke returns Object, so the cast back to the declared return type of processColumns
  // is necessarily unchecked; the suppression is limited to this one declaration.
  // NOTE(review): the second argument is passed as null - presumably the sub-header row, which
  // should be unused when there are no merges; confirm against processColumns.
  @SuppressWarnings("unchecked")
  LinkedHashMap<Integer, ColumnComplexSchemaInfo> columns =
      (LinkedHashMap<Integer, ColumnComplexSchemaInfo>) processColumnsMethod.invoke(config, columnsRow,
                                                                                   null, dataRow, columnMerges,
                                                                                   lastDataColumn, collector);

  Assert.assertEquals(3, columns.size());
  Assert.assertTrue(columns.keySet().containsAll(Arrays.asList(0, 1, 2)));

  // The first occurrence keeps the plain cleansed name; later collisions get a numeric suffix.
  Assert.assertEquals("title_with_space", columns.get(0).getHeaderTitle());
  Assert.assertEquals("Title_with_space_2", columns.get(1).getHeaderTitle());
  Assert.assertEquals("Title_with_space_3", columns.get(2).getHeaderTitle());
}
}