Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Strip White Space from Cell Values #215

Closed
Closed
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
0d0dbea
feat(logic): strip white space;
JVickery-TBS May 7, 2024
616c948
fix(logic): strip white space for load table;
JVickery-TBS May 7, 2024
860ca9e
fix(logic): strip white space for load table;
JVickery-TBS May 7, 2024
88f96a8
fix(logic): strip white space;
JVickery-TBS May 8, 2024
341ec1e
Merge branch 'master' into feature/strip-white-space
JVickery-TBS May 8, 2024
21a1ece
fix(tests): new code for tests;
JVickery-TBS May 8, 2024
a6ab0a0
fix(logic): load csv white space;
JVickery-TBS May 12, 2024
54f87e0
feat(logic): added `strip_extra_white` info field;
JVickery-TBS May 14, 2024
50080ea
feat(logic): added `strip_extra_white` field;
JVickery-TBS May 14, 2024
116c29f
fix(logic): minor logic fixes;
JVickery-TBS May 14, 2024
43b9f94
feat(tests,i18n): updated tests;
JVickery-TBS Jul 15, 2024
06ee48a
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Jul 15, 2024
c00fb5a
fix(tests,logic): misc fixes;
JVickery-TBS Jul 15, 2024
669930e
fix(tests,logic): new output and parody;
JVickery-TBS Jul 15, 2024
3263bab
fix(logic): ckan versioning;
JVickery-TBS Jul 16, 2024
7cb6a84
fix(logic): ckan versioning;
JVickery-TBS Jul 16, 2024
bf2e939
feat(templates,logic): pre-datadictionary implement;
JVickery-TBS Jul 16, 2024
d6de1b1
feat(tests): add coverage;
JVickery-TBS Jul 22, 2024
8564cdd
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Jul 23, 2024
9b65844
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Dec 9, 2024
bf7efc6
fix(tests): ds keys;
JVickery-TBS Dec 9, 2024
4a35fc8
fix(tests): datadictionary;
JVickery-TBS Dec 9, 2024
4886902
debug tests...
JVickery-TBS Dec 9, 2024
6587421
debug tests...
JVickery-TBS Dec 9, 2024
20a33f6
fix(logic): loader;
JVickery-TBS Dec 9, 2024
e4aac5d
fix(logic): datadict versions;
JVickery-TBS Dec 9, 2024
dd137f3
fix(templates): selected value;
JVickery-TBS Dec 9, 2024
c7fb399
feat(misc): readme, changelog;
JVickery-TBS Dec 9, 2024
ec8ef04
Merge branch 'master' into feature/strip-white-space
JVickery-TBS Dec 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 60 additions & 15 deletions ckanext/xloader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,28 +171,23 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
csv_filepath = f_write.name

# datastore db connection
engine = get_write_engine()

# get column info from existing table
existing = datastore_resource_exists(resource_id)
existing_info = {}
if existing:
existing_fields = existing.get('fields', [])
if p.toolkit.check_ckan_version(min_version='2.10'):
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
else:
existing_fields = existing.get('fields', [])
existing_info = dict((f['id'], f['info'])
for f in existing_fields
if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Column types are either set (overridden) in the Data Dictionary page
# or default to text type (which is robust)
Expand All @@ -207,6 +202,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
for f in fields:
if f['id'] in existing_info:
f['info'] = existing_info[f['id']]
f['strip_extra_white'] = existing_info[f['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[f['id']] \
else existing_fields_by_headers[f['id']].get('strip_extra_white', True)

'''
Delete or truncate existing datastore table before proceeding,
Expand All @@ -223,11 +220,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
else:
fields = [
{'id': header_name,
'type': 'text'}
'type': 'text',
'strip_extra_white': True,}
for header_name in headers]

logger.info('Fields: %s', fields)

save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
super_iter = stream.iter
def strip_white_space_iter():
for row in super_iter():
if len(row) == len(fields):
for _index, _cell in enumerate(row):
# only strip white space if strip_extra_white is True
if fields[_index].get('strip_extra_white', True) and isinstance(_cell, str):
row[_index] = _cell.strip()
yield row
stream.iter = strip_white_space_iter
stream.save(**save_args)
csv_filepath = f_write.name

# Create table
from ckan import model
context = {'model': model, 'ignore_auth': True}
Expand Down Expand Up @@ -371,10 +400,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
existing = datastore_resource_exists(resource_id)
existing_info = None
if existing:
existing_fields = existing.get('fields', [])
ds_info = p.toolkit.get_action('datastore_info')({'ignore_auth': True}, {'id': resource_id})
existing_fields = ds_info.get('fields', [])
existing_info = dict(
(f['id'], f['info'])
for f in existing_fields if 'info' in f)
existing_fields_by_headers = dict((f['id'], f)
for f in existing_fields)

# Some headers might have been converted from strings to floats and such.
headers = encode_headers(headers)
Expand All @@ -388,6 +420,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
strict_guessing = p.toolkit.asbool(
config.get('ckanext.xloader.strict_type_guessing', True))
types = type_guess(stream.sample[1:], types=TYPES, strict=strict_guessing)
fields = []

# override with types user requested
if existing_info:
Expand All @@ -398,9 +431,15 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
'timestamp': datetime.datetime,
}.get(existing_info.get(h, {}).get('type_override'), t)
for t, h in zip(types, headers)]
for h in headers:
fields.append(existing_fields_by_headers.get(h, {}))
else:
# default strip_extra_white
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
for h in headers:
fields.append({'strip_extra_white': True})

headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)
type_converter = TypeConverter(types=types, fields=fields)

with UnknownEncodingStream(table_filepath, file_format, decoding_result,
skip_rows=skip_rows,
Expand All @@ -421,10 +460,16 @@ def row_iterator():
for h in headers_dicts:
if h['id'] in existing_info:
h['info'] = existing_info[h['id']]
h['strip_extra_white'] = existing_info[h['id']].get('strip_extra_white') if 'strip_extra_white' in existing_info[h['id']] \
else existing_fields_by_headers[h['id']].get('strip_extra_white', True)
# create columns with types user requested
type_override = existing_info[h['id']].get('type_override')
if type_override in list(_TYPE_MAPPING.values()):
h['type'] = type_override
else:
# default strip_extra_white
for h in headers_dicts:
h['strip_extra_white'] = True

logger.info('Determined headers and types: %s', headers_dicts)

Expand Down
12 changes: 11 additions & 1 deletion ckanext/xloader/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@ class TypeConverter:
as desired.
"""

def __init__(self, types=None):
def __init__(self, types=None, fields=None):
self.types = types
self.fields = fields

def convert_types(self, extended_rows):
""" Try converting cells to numbers or timestamps if applicable.
Expand All @@ -31,7 +32,16 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
if self.fields:
# only strip white space if strip_extra_white is True
if self.fields[cell_index].get('strip_extra_white', True) and isinstance(cell_value, six.text_type):
cell_value = cell_value.strip()
row[cell_index] = cell_value.strip()
if not cell_value:
# load_csv parody: empty of string type should be None
if self.types and self.types[cell_index] == six.text_type:
cell_value = None
row[cell_index] = None
continue
cell_type = self.types[cell_index] if self.types else None
if cell_type in [Decimal, None]:
Expand Down
25 changes: 25 additions & 0 deletions ckanext/xloader/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@
def config_declarations(cls):
return cls

if toolkit.check_ckan_version(min_version='2.11'):
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
from ckanext.datastore.interfaces import IDataDictionaryForm
has_idata_dictionary_form = True
else:
has_idata_dictionary_form = False

log = logging.getLogger(__name__)


Expand All @@ -34,6 +40,8 @@ class xloaderPlugin(plugins.SingletonPlugin):
plugins.implements(plugins.IResourceController, inherit=True)
plugins.implements(plugins.IClick)
plugins.implements(plugins.IBlueprint)
if has_idata_dictionary_form:
plugins.implements(IDataDictionaryForm, inherit=True)

# IClick
def get_commands(self):
Expand Down Expand Up @@ -208,6 +216,23 @@ def get_helpers(self):
"is_resource_supported_by_xloader": xloader_helpers.is_resource_supported_by_xloader,
}

# IDataDictionaryForm

def update_datastore_create_schema(self, schema):
default = toolkit.get_validator('default')
boolean_validator = toolkit.get_validator('boolean_validator')
to_datastore_plugin_data = toolkit.get_validator('to_datastore_plugin_data')
schema['fields']['strip_extra_white'] = [default(True), boolean_validator, to_datastore_plugin_data('xloader')]
return schema

def update_datastore_info_field(self, field, plugin_data):
# expose all our non-secret plugin data in the field
field.update(plugin_data.get('xloader', {}))
# CKAN version parody
if '_info' in plugin_data:
field.update({'info': plugin_data['_info']})
return field


def _should_remove_unsupported_resource_from_datastore(res_dict):
if not toolkit.asbool(toolkit.config.get('ckanext.xloader.clean_datastore_tables', False)):
Expand Down
17 changes: 17 additions & 0 deletions ckanext/xloader/templates/datastore/snippets/dictionary_form.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{% ckan_extends %}
{% import 'macros/form.html' as form %}

{% block additional_fields %}
{{ super() }}
{% if h.check_ckan_version(min_version='2.11') %}
{% set field_prefix = 'fields__' %}
{% else %}
{% set field_prefix = 'info__' %}
{% endif %}
{% set is_selected = field.get('info', {}).get('strip_extra_white', field.get('strip_extra_white')) != 'False' %}
{{ form.select(field_prefix ~ position ~ '__strip_extra_white',
label=_('Strip Extra Leading and Trailing White Space'), options=[
{'text': _('Yes'), 'value': true},
{'text': _('No'), 'value': false},
], selected=is_selected) }}
{% endblock %}
8 changes: 4 additions & 4 deletions ckanext/xloader/tests/samples/boston_311_sample.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
CASE_ENQUIRY_ID,open_dt,target_dt,closed_dt,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,TYPE,QUEUE,Department,SubmittedPhoto,ClosedPhoto,Location,Fire_district,pwd_district,city_council_district,police_district,neighborhood,neighborhood_services_district,ward,precinct,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Latitude,Longitude,Source
101002153891,2017-07-06 23:38:43,2017-07-21 08:30:00,,ONTIME,Open, ,Street Light Outages,Public Works Department ,Street Lights,Street Light Outages,PWDx_Street Light Outages,PWDx,,,480 Harvard St Dorchester MA 02124,8,07,4,B3,Greater Mattapan,9,Ward 14,1411,480 Harvard St,02124,42.288,-71.0927,Citizens Connect App
101002153890,2017-07-06 23:29:13,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595f0000048560f46d94b9fa/report.jpg,,522 Saratoga St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0110,522 Saratoga St,02128,42.3807,-71.0259,Citizens Connect App
101002153889,2017-07-06 23:24:20,2017-09-11 08:30:00,,ONTIME,Open, ,Graffiti Removal,Property Management,Graffiti,Graffiti Removal,PROP_GRAF_GraffitiRemoval,PROP, https://mayors24.cityofboston.gov/media/boston/report/photos/595efedb048560f46d94b9ef/report.jpg,,965 Bennington St East Boston MA 02128,1,09,1,A7,East Boston,1,Ward 1,0112,965 Bennington St,02128,42.386,-71.008,Citizens Connect App
2 changes: 1 addition & 1 deletion ckanext/xloader/tests/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def test_xloader_data_into_datastore(self, cli, data):
with mock.patch("ckanext.xloader.jobs.get_response", get_response):
stdout = cli.invoke(ckan, ["jobs", "worker", "--burst"]).output
assert "File hash: d44fa65eda3675e11710682fdb5f1648" in stdout
assert "Fields: [{'id': 'x', 'type': 'text'}, {'id': 'y', 'type': 'text'}]" in stdout
assert "Fields: [{'id': 'x', 'type': 'text', 'strip_extra_white': True}, {'id': 'y', 'type': 'text', 'strip_extra_white': True}]" in stdout
JVickery-TBS marked this conversation as resolved.
Show resolved Hide resolved
assert "Copying to database..." in stdout
assert "Creating search index..." in stdout
assert "Express Load completed" in stdout
Expand Down
Loading
Loading