Skip to content

Commit

Permalink
Improve compatibility across Data Package spec versions (#14)
Browse files Browse the repository at this point in the history
- Map date fields to datetime dtype (pandas.DataFrame)
- Convert defunct gyear and gyearmonth types to current year and yearmonth
- Degrade gracefully when the year, yearmonth and duration types are unsupported
- Improve messaging around schema errors
  • Loading branch information
rflprr authored Mar 24, 2017
1 parent 3e6bcdc commit 64741e2
Show file tree
Hide file tree
Showing 9 changed files with 787 additions and 670 deletions.
2 changes: 1 addition & 1 deletion datadotworld/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@

from datadotworld.datadotworld import load_dataset, query, api_client

__version__ = '1.0.0-beta.3'
__version__ = '1.0.0-beta.4'
41 changes: 20 additions & 21 deletions datadotworld/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@

import datapackage
import six
from datadotworld.models.util import (sanitize_table_schema,
align_table_fields,
patch_jsontableschema_pandas)
from datadotworld.util import LazyLoadedDict, memoized
from datapackage.resource import TabularResource
from jsontableschema.exceptions import SchemaValidationError
from tabulator import Stream

from datadotworld.util import LazyLoadedDict, memoized


class LocalDataset(object):
"""Dataset saved in the local file system
Expand Down Expand Up @@ -63,14 +65,16 @@ class LocalDataset(object):
def __init__(self, descriptor_file):

self._datapackage = datapackage.DataPackage(descriptor_file)

self.__descriptor_file = descriptor_file
self.__base_path = os.path.dirname(
os.path.abspath(self.__descriptor_file))

# Index resources by name
self.__resources = {r.descriptor['name']: r for r in
self._datapackage.resources}
self.__tabular_resources = {k: r for (k, r) in self.__resources.items()
self.__resources = {r.descriptor['name']: r
for r in self._datapackage.resources}
self.__tabular_resources = {k: sanitize_table_schema(r)
for (k, r) in self.__resources.items()
if type(r) is TabularResource}
self.__invalid_schemas = [] # Resource names with invalid schemas

Expand Down Expand Up @@ -138,12 +142,13 @@ def _load_table(self, resource_name):
elif len(tabular_resource.data) > 0:
fields = tabular_resource.data[0].keys()

return [self.__align_fields(fields, row) for row in
return [align_table_fields(fields, row) for row in
tabular_resource.data]
except (SchemaValidationError, ValueError):
except (SchemaValidationError, ValueError, TypeError) as e:
warnings.warn(
'Unable to apply datapackage table schema for {}. '
'Reverting to resource defaults...'.format(resource_name))
'Unable to set column types automatically using {} schema. '
'Data types may need to be adjusted manually. '
'Error: {}'.format(resource_name, e))
self.__invalid_schemas.append(resource_name)
file_format = tabular_resource.descriptor['format']
with Stream(six.BytesIO(self.raw_data[resource_name]),
Expand All @@ -158,7 +163,7 @@ def _load_dataframe(self, resource_name):

rows = self.tables[resource_name]
if (resource_name in self.__storage.buckets and
resource_name not in self.__invalid_schemas):
resource_name not in self.__invalid_schemas):
if self.__storage[resource_name].size == 0:
row_values = [row.values() for row in rows]
self.__storage.write(resource_name, row_values)
Expand All @@ -168,16 +173,16 @@ def _load_dataframe(self, resource_name):
import pandas
except ImportError:
raise RuntimeError('To enable dataframe support, '
'please install the pandas package first.')
'run \'pip install datadotworld[PANDAS]\'')
return pandas.DataFrame(rows)

def __initialize_storage(self):
try:
from jsontableschema_pandas import Storage
from jsontableschema_pandas import Storage, mappers
patch_jsontableschema_pandas(mappers)
except ImportError:
raise RuntimeError('To enable dataframe support for datapackages, '
'please install the jsontableschema_pandas '
'package first.')
raise RuntimeError('To enable dataframe support, '
'run \'pip install datadotworld[PANDAS]\'')

# Initialize storage if needed
if not hasattr(self, '__storage'):
Expand All @@ -189,12 +194,6 @@ def __initialize_storage(self):
except SchemaValidationError:
self.__invalid_schemas.append(r.descriptor['schema'])

@staticmethod
def __align_fields(fields, unordered_row):
fields_idx = {f: pos for pos, f in enumerate(fields)}
return OrderedDict(sorted(unordered_row.items(),
key=lambda i: fields_idx[i[0]]))

def __repr__(self):
return '{}({})'.format(self.__class__.__name__,
repr(self.__descriptor_file))
Expand Down
80 changes: 80 additions & 0 deletions datadotworld/models/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from collections import OrderedDict


def patch_jsontableschema_pandas(mappers):
    """Monkey patch the ``jsontableschema_pandas`` mappers module.

    Up to version 0.2.0 jsontableschema_pandas mapped date fields to the
    ``object`` dtype (https://github.com/frictionlessdata/
    jsontableschema-pandas-py/pull/23) and had no mapping at all for the
    year, yearmonth and duration types. This wraps
    ``mappers.jtstype_to_dtype`` so that:

    * ``date`` always maps to ``datetime64[ns]``, overriding whatever
      the installed version would return;
    * ``year``, ``yearmonth`` and ``duration`` are used as fallbacks
      only when the installed mapper cannot handle them (signalled by
      a ``TypeError``).

    If ``mappers`` has no ``jtstype_to_dtype`` attribute, it is left
    untouched.

    :param mappers: the ``jsontableschema_pandas.mappers`` module (or a
        compatible object); patched in place
    """
    if not hasattr(mappers, 'jtstype_to_dtype'):
        return

    original_mapper = mappers.jtstype_to_dtype
    fallback_dtypes = {
        'date': 'datetime64[ns]',
        'year': 'int64',
        'yearmonth': 'int64',
        'duration': 'object',
    }

    def mapper_wrapper(jtstype):
        # 'date' is always overridden; affected versions mapped it to
        # the lossy 'object' dtype.
        if jtstype == 'date':
            return fallback_dtypes[jtstype]
        try:
            return original_mapper(jtstype)
        except TypeError:
            if jtstype in fallback_dtypes:
                return fallback_dtypes[jtstype]
            raise  # bare raise preserves the original traceback

    mappers.jtstype_to_dtype = mapper_wrapper


def sanitize_table_schema(r):
    """Sanitize a resource's table schema for increased compatibility.

    Two adjustments are made, in place, to ``r.descriptor['schema']``:

    * Up to version 0.9.0 jsontableschema did not support the year,
      yearmonth and duration field types
      (https://github.com/frictionlessdata/jsontableschema-py/pull/152).
      When those types are unavailable, such fields (including the
      defunct gyear/gyearmonth spellings) are downgraded to integer or
      string as appropriate.
    * Data Package specs renamed gyear and gyearmonth to year and
      yearmonth (https://github.com/frictionlessdata/specs/pull/370);
      remaining g-prefixed types are renamed accordingly.

    :param r: a datapackage resource whose descriptor may contain a
        table schema
    :returns: the same resource object, mutated in place
    """
    try:
        # Probe imports only: their presence tells us the installed
        # jsontableschema handles these types natively.
        from jsontableschema import YearType, YearMonthType, DurationType  # noqa
        missing_type_support = False
    except ImportError:
        missing_type_support = True

    # Loop-invariant mapping: unsupported/legacy type -> safe fallback
    fallback_types = {
        'gyear': 'integer',
        'year': 'integer',
        'gyearmonth': 'integer',
        'yearmonth': 'integer',
        'duration': 'string',
    }

    for field in r.descriptor.get('schema', {}).get('fields', []):
        field_type = field.get('type')
        if missing_type_support and field_type in fallback_types:
            # Convert unsupported types to integer/string as appropriate
            field['type'] = fallback_types[field_type]
        elif field_type in ('gyear', 'gyearmonth'):
            # Specs were changed along the way; drop the 'g' prefix
            field['type'] = field_type[1:]

    return r


def align_table_fields(fields, unordered_row):
    """Reorder a row's columns to match the schema's field order.

    :param fields: iterable of field names in their canonical order
    :param unordered_row: mapping of field name to value
    :returns: ``OrderedDict`` of the row's items, sorted by field position
    """
    position = {name: index for index, name in enumerate(fields)}
    return OrderedDict(
        (name, unordered_row[name])
        for name in sorted(unordered_row, key=position.__getitem__))
37 changes: 28 additions & 9 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def find_version(*paths):
author_email='[email protected]',
license='Apache 2.0',
packages=find_packages(),
keywords=['data.world', 'dataset'],
keywords=[
'data.world',
'dataset',
],
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
Expand All @@ -64,23 +67,39 @@ def find_version(*paths):
'Programming Language :: Python :: 3.6',
'Topic :: Database :: Database Engines/Servers',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Software Development :: Libraries :: Python Modules'
'Topic :: Software Development :: Libraries :: Python Modules',
],
install_requires=[
'certifi', 'click', 'configparser', 'datapackage', 'python-dateutil',
'requests', 'six', 'tabulator', 'urllib3'
'certifi',
'click',
'configparser',
'datapackage',
'python-dateutil',
'requests',
'six',
'tabulator',
'urllib3',
],
setup_requires=[
'pytest-runner'
'pytest-runner',
],
tests_require=[
'doublex', 'pyhamcrest', 'responses', 'pytest',
'jsontableschema_pandas', 'pandas<0.19a'
'doublex',
'pyhamcrest',
'responses',
'pytest',
'jsontableschema_pandas',
'pandas<0.19a',
],
extras_require={
'PANDAS': ['jsontableschema_pandas', 'pandas<0.19a']
'PANDAS': [
'jsontableschema_pandas',
'pandas<0.19a',
],
},
entry_points={
'console_scripts': ['dw=datadotworld.cli:cli'],
'console_scripts': [
'dw=datadotworld.cli:cli',
],
},
)
25 changes: 19 additions & 6 deletions tests/datadotworld/models/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
from os import path

import pytest
from datadotworld.models.dataset import LocalDataset
from datadotworld.models.util import sanitize_table_schema
from datapackage import DataPackage, Resource
from doublex import assert_that, is_
from hamcrest import equal_to, contains, calling, not_, raises

from datadotworld.models.dataset import LocalDataset
from hamcrest import equal_to, contains, calling, not_, raises, not_none


class TestLocalDataset:
Expand All @@ -39,7 +39,10 @@ def simpsons_descriptor_path(self, test_files_path):

@pytest.fixture()
def simpsons_datapackage(self, simpsons_descriptor_path):
return DataPackage(descriptor=simpsons_descriptor_path)
datapackage = DataPackage(descriptor=simpsons_descriptor_path)
for r in datapackage.resources:
sanitize_table_schema(r)
return datapackage

@pytest.fixture()
def simpsons_dataset(self, simpsons_descriptor_path):
Expand Down Expand Up @@ -100,7 +103,9 @@ def test_tables(self, simpsons_dataset, simpsons_datapackage):

def test_tables_broken_schema(self, simpsons_broken_dataset):
assert_that(calling(simpsons_broken_dataset.tables.get).with_args(
'simpsons_episodes'), not_(raises(ValueError)))
'simpsons_episodes'), not_(raises(Exception)))
assert_that(simpsons_broken_dataset.tables.get('simpsons_episodes'),
not_none())

def test_dataframes(self, simpsons_dataset):
for k, t in simpsons_dataset.tables.items():
Expand All @@ -113,7 +118,15 @@ def test_dataframe_types(self, simpsons_dataset):
df = simpsons_dataset.dataframes['simpsons_episodes']
assert_that(df['id'].dtype, equal_to('int64'))
assert_that(df['title'].dtype, equal_to('object'))
# TODO test different datapackages and more dtypes
assert_that(df['original_air_date'].dtype, equal_to('datetime64[ns]'))
assert_that(df['original_air_year'].dtype, equal_to('int64'))
assert_that(df['imdb_rating'].dtype, equal_to('float64'))

def test_dataframe_broken_schema(self, simpsons_broken_dataset):
assert_that(calling(simpsons_broken_dataset.dataframes.get).with_args(
'simpsons_episodes'), not_(raises(Exception)))
assert_that(simpsons_broken_dataset.dataframes.get(
'simpsons_episodes'), not_none())

def test_repr(self, simpsons_dataset):
# noinspection PyUnresolvedReferences
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
id,title,original_air_date,production_code,season,number_in_season,number_in_series,us_viewers_in_millions,views,imdb_rating,imdb_votes,image_url,video_url
10,Homer's Night Out,1990-03-25,7G10,1,10,10,30.3,50816,7.4,1511,http://static-media.fxx.com/img/FX_Networks_-_FXX/305/815/Simpsons_01_10.jpg,http://www.simpsonsworld.com/video/275197507879
10,Homer's Night Out,WRONG_TYPE,7G10,1,10,10,30.3,50816,7.4,1511,http://static-media.fxx.com/img/FX_Networks_-_FXX/305/815/Simpsons_01_10.jpg,http://www.simpsonsworld.com/video/275197507879
12,Krusty Gets Busted,1990-04-29,7G12,1,12,12,30.4,62561,8.3,1716,http://static-media.fxx.com/img/FX_Networks_-_FXX/245/843/Simpsons_01_12.jpg,http://www.simpsonsworld.com/video/288019523914
14,"Bart Gets an ""F""",1990-10-11,7F03,2,1,14,33.6,59575,8.2,1638,http://static-media.fxx.com/img/FX_Networks_-_FXX/662/811/bart_gets_F.jpg,http://www.simpsonsworld.com/video/260539459671
17,Two Cars in Every Garage and Three Eyes on Every Fish,1990-11-01,7F01,2,4,17,26.1,64959,8.1,1457,http://static-media.fxx.com/img/FX_Networks_-_FXX/660/859/Simpsons_02_01.jpg,http://www.simpsonsworld.com/video/260537411822
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"fields": [
{
"name": "id",
"type": "integer",
"type": "BAD_TYPE",
"title": "id"
},
{
Expand Down
Loading

0 comments on commit 64741e2

Please sign in to comment.