diff --git a/django/api/services/spreadsheet_uploader.py b/django/api/services/spreadsheet_uploader.py index 95aefec3..cbbd32ef 100644 --- a/django/api/services/spreadsheet_uploader.py +++ b/django/api/services/spreadsheet_uploader.py @@ -154,13 +154,20 @@ def transform_data( errors_and_warnings[column] = {} for issue, details in issues.items(): if issue not in errors_and_warnings[column]: - errors_and_warnings[column][issue] = { - "Expected Type": details.get("Expected Type", "Unknown"), - "Rows": details.get("Rows", []), - "Severity": details.get("Severity", "Error") - } + if(details.get("Severity", "Error") == 'Warning'): + errors_and_warnings[column][issue] = { + "Expected Type": details.get("Expected Type", "Unknown"), + "Groups": details.get("Groups", []), + "Severity": details.get("Severity", "Error") + } + else: + errors_and_warnings[column][issue] = { + "Expected Type": details.get("Expected Type", "Unknown"), + "Rows": details.get("Rows", []), + "Severity": details.get("Severity", "Error") + } else: - errors_and_warnings[column][issue]["Rows"].extend(details.get("Rows", [])) + errors_and_warnings[column][issue]["Groups"].extend(details.get("Groups", [])) column_mapping = {col.name: col.value for col in column_mapping_enum} inverse_column_mapping = {v: k for k, v in column_mapping.items()} diff --git a/django/api/services/spreadsheet_uploader_prep.py b/django/api/services/spreadsheet_uploader_prep.py index 877d133a..391a41b9 100644 --- a/django/api/services/spreadsheet_uploader_prep.py +++ b/django/api/services/spreadsheet_uploader_prep.py @@ -204,119 +204,157 @@ def adjust_ger_manufacturer_names(df): def typo_checker(df, *columns, **kwargs): result = {} + for column in columns: - indices = [] series = df[column] unique_vals = set(series) - + map_of_values_to_indices = get_map_of_values_to_indices(series, kwargs.get("indices_offset", 0)) + + typo_groups = [] + processed_values = set() + for value in unique_vals: - singleton = set() - singleton.add(value) - matches = dl.get_close_matches( - value, - unique_vals.difference(singleton), - cutoff=kwargs["cutoff"] - ) + if value in processed_values: + continue + + matches = dl.get_close_matches(value, unique_vals.difference({value}), cutoff=kwargs.get("cutoff", 0.8)) + if matches: - value_indices = map_of_values_to_indices[value] - indices.extend(value_indices) - # it appears that difflib's "is similar" predicate S is not symmetric (i.e. aSb does not imply bSa) - # so we have to do: + current_group = { + "Typo Group": [value] + matches, + "Rows": [] + } + + current_group["Rows"].extend(map_of_values_to_indices[value]) + for match in matches: - match_indices = map_of_values_to_indices[match] - indices.extend(match_indices) - if indices: + current_group["Rows"].extend(map_of_values_to_indices[match]) + + processed_values.add(value) + processed_values.update(matches) + + typo_groups.append(current_group) + + if typo_groups: result[column] = { "Similar Values Detected": { "Expected Type": "We detected applicant names that sound very similar. If these names refer to the same person/entity, please replace the applicant names in your dataset to the preferred spelling to ensure consistency", - "Rows": sorted(list(set(indices))), + "Groups": typo_groups, "Severity": "Warning" } } + return result def validate_phone_numbers(df, *columns, **kwargs): result = {} for column in columns: - indices = [] series = df[column] - for index, phone_number in series.items(): + map_of_values_to_indices = get_map_of_values_to_indices(series, kwargs.get("indices_offset", 0)) + invalid_groups = [] + + for phone_number, indices in map_of_values_to_indices.items(): formatted_number = str(phone_number).strip().replace('-', '') if len(formatted_number) != 10 or int(formatted_number[:3]) not in AREA_CODES: if pd.isna(formatted_number) or formatted_number == '': continue - indices.append(index + kwargs.get("indices_offset", 0)) - if indices: + invalid_groups.append({ + "Invalid Phone Number": phone_number, + "Rows": indices + }) + + if invalid_groups: result[column] = { "Phone Number Appears Incorrect": { "Expected Type": "Ensure phone numbers match the Canadian format (XXX-XXX-XXXX)", - "Rows": indices, + "Groups": invalid_groups, "Severity": "Warning" } } return result + def location_checker(df, *columns, columns_to_features_map={}, **kwargs): result = {} + for column in columns: - indices = [] series = df[column] - map_of_values_to_indices = get_map_of_values_to_indices(series, kwargs.get("indices_offset", 0)) unique_values = set(series) - unique_values_list = list(unique_values) - + map_of_values_to_indices = get_map_of_values_to_indices(series, kwargs.get("indices_offset", 0)) + communities = set() - features_map = columns_to_features_map[column] + features_map = columns_to_features_map.get(column, {}) + for category_code, feature_types in features_map.items(): - get_placename_matches(unique_values_list, category_code, feature_types, 200, 1, communities) - - # Find names that don't have a match in the locations_set + get_placename_matches( + list(unique_values), category_code, feature_types, + 200, 1, communities + ) + names_without_match = unique_values.difference(communities) + unrecognized_groups = [] + for name in names_without_match: - indices_to_add = map_of_values_to_indices[name] - indices.extend(indices_to_add) - if indices: - indices.sort() + group = { + "Unrecognized Name": name, + "Rows": map_of_values_to_indices[name] + } + unrecognized_groups.append(group) + + if unrecognized_groups: result[column] = { "Unrecognized City Names": { - "Expected Type": "The following city names are not in the list of geographic names. Please double check that these places exist or have correct spelling and adjust your dataset accordingly.", - "Rows": sorted(list(set(indices))), + "Expected Type": ( + "The following city names are not in the list of geographic names. " + "Please double-check that these places exist or have correct spelling " + "and adjust your dataset accordingly." + ), + "Groups": unrecognized_groups, "Severity": "Warning" } } + return result + def email_validator(df, *columns, **kwargs): - resolver = None - get_resolver = kwargs.get("get_resolver") - if get_resolver is not None: - resolver = get_resolver() + resolver = kwargs.get("get_resolver", None) + if resolver: + resolver = resolver() + result = {} for column in columns: - indices = [] series = df[column] - for index, value in series.items(): + map_of_values_to_indices = get_map_of_values_to_indices(series, kwargs.get("indices_offset", 0)) + invalid_groups = [] + + for email, indices in map_of_values_to_indices.items(): try: - validate_email(value, dns_resolver=resolver) + validate_email(email, dns_resolver=resolver) except EmailNotValidError: - if pd.isna(value) or value == '': + if pd.isna(email) or email == '': continue - indices.append(index + kwargs.get("indices_offset", 0)) - if indices: + invalid_groups.append({ + "Invalid Email": email, + "Rows": indices + }) + + if invalid_groups: result[column] = { "Possible Errors in Email Addresses": { "Expected Type": "Verify email addresses are valid", - "Rows": indices, + "Groups": invalid_groups, "Severity": "Warning" } } return result + def validate_field_values(df, *columns, **kwargs): allowed_values = kwargs.get("fields_and_values") invalid_values = [] diff --git a/frontend/src/uploads/UploadContainer.js b/frontend/src/uploads/UploadContainer.js index a30cc902..53446c2e 100644 --- a/frontend/src/uploads/UploadContainer.js +++ b/frontend/src/uploads/UploadContainer.js @@ -64,44 +64,51 @@ const UploadContainer = () => { errors: 0, warnings: 0, }; - + issueArray.forEach((issue) => { - Object.keys(issue).forEach((column) => { const errorDetails = issue[column]; Object.keys(errorDetails).forEach((errorType) => { const severity = errorDetails[errorType].Severity; const expectedType = errorDetails[errorType]["Expected Type"]; - const rows = errorDetails[errorType].Rows; - const rowCount = rows.length; + const groups = errorDetails[errorType].Groups || []; if (severity === "Error") { + const rows = errorDetails[errorType].Rows; + const rowCount = rows.length; totalIssueCount.errors += rowCount; + if (!groupedErrors[column]) { groupedErrors[column] = {}; } if (!groupedErrors[column][errorType]) { groupedErrors[column][errorType] = { ExpectedType: expectedType, - Rows: rows, + Rows: [...rows], }; } else { groupedErrors[column][errorType].Rows.push(...rows); } } else if (severity === "Warning") { - totalIssueCount.warnings += rowCount; + let warningRowCount = 0; + if (!groupedWarnings[column]) { groupedWarnings[column] = {}; } if (!groupedWarnings[column][errorType]) { groupedWarnings[column][errorType] = { ExpectedType: expectedType, - Rows: rows, + Groups: [], }; - } else { - groupedWarnings[column][errorType].Rows.push(...rows); } + + groups.forEach((group) => { + groupedWarnings[column][errorType].Groups.push(group); + warningRowCount += group.Rows.length; + }); + + totalIssueCount.warnings += warningRowCount; } }); }); @@ -109,8 +116,6 @@ const UploadContainer = () => { return { groupedErrors, groupedWarnings, totalIssueCount }; }; - - const showError = (error) => { const { response: errorResponse } = error; diff --git a/frontend/src/uploads/components/UploadIssuesDetail.js b/frontend/src/uploads/components/UploadIssuesDetail.js index 6c504e0d..c4275c37 100644 --- a/frontend/src/uploads/components/UploadIssuesDetail.js +++ b/frontend/src/uploads/components/UploadIssuesDetail.js @@ -1,20 +1,57 @@ import PropTypes from "prop-types"; import React, { useState } from "react"; import { Box, Button } from "@mui/material"; -import ErrorOutlineIcon from '@mui/icons-material/ErrorOutline'; +import ErrorOutlineIcon from "@mui/icons-material/ErrorOutline"; import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; const UploadIssuesDetail = ({ type, issues, totalIssueCount, msg }) => { - const [showAllRowsMap, setShowAllRowsMap] = useState({}); // State to toggle showing all rows for each issue + const [showAllRowsMap, setShowAllRowsMap] = useState({}); const classname = type === "error" ? "error" : "warning"; - const toggleShowAllRows = (column, errorType) => { - const key = `${column}_${errorType}`; + const toggleShowAllRows = (column, errorType, groupIndex) => { + const key = `${column}_${errorType}_${groupIndex}`; setShowAllRowsMap((prevState) => ({ ...prevState, [key]: !prevState[key], })); }; + const renderWarning = (group) => ( + <> +