-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve compatibility across Data Package spec versions (#14)
- Map date fields to datetime dtype (pandas.DataFrame)
- Convert defunct gyear and gyearmonth types to current year and yearmonth
- Degrade gracefully in case of lack of support for year, yearmonth and duration types
- Improve messaging around schema errors
- Loading branch information
Showing
9 changed files
with
787 additions
and
670 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
from collections import OrderedDict | ||
|
||
|
||
def patch_jsontableschema_pandas(mappers):
    """Monkey patch jsontableschema_pandas module

    Up to version 0.2.0 jsontableschema_pandas mapped date fields
    to object dtype
    https://github.com/frictionlessdata/
    jsontableschema-pandas-py/pull/23

    The ``jtstype_to_dtype`` function of ``mappers`` is replaced by a
    wrapper that:

    - always maps ``'date'`` to ``'datetime64[ns]'``;
    - delegates every other type to the original function;
    - falls back to a built-in dtype for ``'year'``, ``'yearmonth'`` and
      ``'duration'`` when the original function raises ``TypeError``.

    The patch is idempotent: calling this function again on an already
    patched module leaves the installed wrapper untouched instead of
    stacking another wrapper on top of it.

    :param mappers: module (or any object) that may expose a
        ``jtstype_to_dtype`` callable; objects without that attribute
        are left unchanged
    """
    if not hasattr(mappers, 'jtstype_to_dtype'):
        return

    mapper = mappers.jtstype_to_dtype
    if getattr(mapper, '_date_dtype_patch', False):
        # Already wrapped by a previous call; wrapping again would only
        # deepen the call chain on every invocation.
        return

    new_mappings = {
        'date': 'datetime64[ns]',
        'year': 'int64',
        'yearmonth': 'int64',
        'duration': 'object',
    }

    def mapper_wrapper(jtstype):
        # 'date' is overridden unconditionally: the original mapper
        # accepts it but returns a dtype we do not want.
        if jtstype == 'date':
            return new_mappings[jtstype]

        try:
            return mapper(jtstype)
        except TypeError:
            if jtstype in new_mappings:
                return new_mappings[jtstype]
            # Bare raise keeps the original traceback intact.
            raise

    mapper_wrapper._date_dtype_patch = True
    mappers.jtstype_to_dtype = mapper_wrapper
|
||
|
||
def sanitize_table_schema(r):
    """Sanitize table schema for increased compatibility

    Up to version 0.9.0 jsontableschema did not support
    year, yearmonth and duration field types
    https://github.com/frictionlessdata/jsontableschema-py/pull/152

    Field descriptors found under ``r.descriptor['schema']['fields']``
    are rewritten in place:

    - when the installed jsontableschema cannot import the year,
      yearmonth and duration types, those field types (and the legacy
      gyear / gyearmonth spellings) are downgraded to ``integer`` or
      ``string``;
    - otherwise legacy ``gyear`` / ``gyearmonth`` are renamed to
      ``year`` / ``yearmonth``.

    :param r: object exposing a dict-like ``descriptor`` attribute
    :returns: the same ``r`` object that was passed in
    """
    try:
        from jsontableschema import YearType, YearMonthType, DurationType
    except ImportError:
        needs_downgrade = True
    else:
        needs_downgrade = False

    if 'schema' not in r.descriptor:
        return r

    # NOTE(review): yearmonth falls back to integer here; confirm that
    # works for the values actually stored in such columns.
    integer_fallbacks = ('gyear', 'year', 'gyearmonth', 'yearmonth')
    string_fallbacks = ('duration',)
    legacy_names = ('gyear', 'gyearmonth')

    for field in r.descriptor['schema'].get('fields', []):
        if needs_downgrade:
            # Convert unsupported types to integer and string
            # as appropriate
            if field.get('type') in integer_fallbacks:
                field['type'] = 'integer'
            if field.get('type') in string_fallbacks:
                field['type'] = 'string'

        # Datapackage specs were changed along the way
        # Convert gyear and gyearmonth to year and yearmonth
        # https://github.com/frictionlessdata/specs/pull/370
        if field.get('type') in legacy_names:
            field['type'] = field['type'][1:]

    return r
|
||
|
||
def align_table_fields(fields, unordered_row):
    """Ensure columns appear in the same order for every row in table

    :param fields: iterable of field names in the desired column order
    :param unordered_row: mapping of field name to value for one row
    :returns: an ``OrderedDict`` holding the row's items sorted by the
        position of each key in ``fields``
    :raises KeyError: if the row has a key that is not in ``fields``
    """
    position_of = dict((name, index) for index, name in enumerate(fields))

    def column_position(item):
        name, _value = item
        return position_of[name]

    row_items = list(unordered_row.items())
    row_items.sort(key=column_position)
    return OrderedDict(row_items)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,10 @@ def find_version(*paths): | |
author_email='[email protected]', | ||
license='Apache 2.0', | ||
packages=find_packages(), | ||
keywords=['data.world', 'dataset'], | ||
keywords=[ | ||
'data.world', | ||
'dataset', | ||
], | ||
classifiers=[ | ||
'Development Status :: 4 - Beta', | ||
'Intended Audience :: Developers', | ||
|
@@ -64,23 +67,39 @@ def find_version(*paths): | |
'Programming Language :: Python :: 3.6', | ||
'Topic :: Database :: Database Engines/Servers', | ||
'Topic :: Scientific/Engineering :: Information Analysis', | ||
'Topic :: Software Development :: Libraries :: Python Modules' | ||
'Topic :: Software Development :: Libraries :: Python Modules', | ||
], | ||
install_requires=[ | ||
'certifi', 'click', 'configparser', 'datapackage', 'python-dateutil', | ||
'requests', 'six', 'tabulator', 'urllib3' | ||
'certifi', | ||
'click', | ||
'configparser', | ||
'datapackage', | ||
'python-dateutil', | ||
'requests', | ||
'six', | ||
'tabulator', | ||
'urllib3', | ||
], | ||
setup_requires=[ | ||
'pytest-runner' | ||
'pytest-runner', | ||
], | ||
tests_require=[ | ||
'doublex', 'pyhamcrest', 'responses', 'pytest', | ||
'jsontableschema_pandas', 'pandas<0.19a' | ||
'doublex', | ||
'pyhamcrest', | ||
'responses', | ||
'pytest', | ||
'jsontableschema_pandas', | ||
'pandas<0.19a', | ||
], | ||
extras_require={ | ||
'PANDAS': ['jsontableschema_pandas', 'pandas<0.19a'] | ||
'PANDAS': [ | ||
'jsontableschema_pandas', | ||
'pandas<0.19a', | ||
], | ||
}, | ||
entry_points={ | ||
'console_scripts': ['dw=datadotworld.cli:cli'], | ||
'console_scripts': [ | ||
'dw=datadotworld.cli:cli', | ||
], | ||
}, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
tests/fixtures/the-simpsons-by-the-data-bad-schema/data/simpsons_episodes.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
"fields": [ | ||
{ | ||
"name": "id", | ||
"type": "integer", | ||
"type": "BAD_TYPE", | ||
"title": "id" | ||
}, | ||
{ | ||
|
Oops, something went wrong.