Skip to content

Commit

Permalink
Improve compatibility across Data Package spec versions (#14)
Browse files Browse the repository at this point in the history
- Map date fields to datetime dtype (pandas.DataFrame)
- Convert defunct gyear and gyearmonth types to current year and yearmonth
- Degrade gracefully when the year, yearmonth and duration types are unsupported
- Improve messaging around schema errors
  • Loading branch information
rflprr authored Mar 24, 2017
1 parent 3e6bcdc commit 64741e2
Show file tree
Hide file tree
Showing 9 changed files with 787 additions and 670 deletions.
2 changes: 1 addition & 1 deletion datadotworld/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@

from datadotworld.datadotworld import load_dataset, query, api_client

__version__ = '1.0.0-beta.3'
__version__ = '1.0.0-beta.4'
41 changes: 20 additions & 21 deletions datadotworld/models/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@

import datapackage
import six
from datadotworld.models.util import (sanitize_table_schema,
align_table_fields,
patch_jsontableschema_pandas)
from datadotworld.util import LazyLoadedDict, memoized
from datapackage.resource import TabularResource
from jsontableschema.exceptions import SchemaValidationError
from tabulator import Stream

from datadotworld.util import LazyLoadedDict, memoized


class LocalDataset(object):
"""Dataset saved in the local file system
Expand Down Expand Up @@ -63,14 +65,16 @@ class LocalDataset(object):
def __init__(self, descriptor_file):

self._datapackage = datapackage.DataPackage(descriptor_file)

self.__descriptor_file = descriptor_file
self.__base_path = os.path.dirname(
os.path.abspath(self.__descriptor_file))

# Index resources by name
self.__resources = {r.descriptor['name']: r for r in
self._datapackage.resources}
self.__tabular_resources = {k: r for (k, r) in self.__resources.items()
self.__resources = {r.descriptor['name']: r
for r in self._datapackage.resources}
self.__tabular_resources = {k: sanitize_table_schema(r)
for (k, r) in self.__resources.items()
if type(r) is TabularResource}
self.__invalid_schemas = [] # Resource names with invalid schemas

Expand Down Expand Up @@ -138,12 +142,13 @@ def _load_table(self, resource_name):
elif len(tabular_resource.data) > 0:
fields = tabular_resource.data[0].keys()

return [self.__align_fields(fields, row) for row in
return [align_table_fields(fields, row) for row in
tabular_resource.data]
except (SchemaValidationError, ValueError):
except (SchemaValidationError, ValueError, TypeError) as e:
warnings.warn(
'Unable to apply datapackage table schema for {}. '
'Reverting to resource defaults...'.format(resource_name))
'Unable to set column types automatically using {} schema. '
'Data types may need to be adjusted manually. '
'Error: {}'.format(resource_name, e))
self.__invalid_schemas.append(resource_name)
file_format = tabular_resource.descriptor['format']
with Stream(six.BytesIO(self.raw_data[resource_name]),
Expand All @@ -158,7 +163,7 @@ def _load_dataframe(self, resource_name):

rows = self.tables[resource_name]
if (resource_name in self.__storage.buckets and
resource_name not in self.__invalid_schemas):
resource_name not in self.__invalid_schemas):
if self.__storage[resource_name].size == 0:
row_values = [row.values() for row in rows]
self.__storage.write(resource_name, row_values)
Expand All @@ -168,16 +173,16 @@ def _load_dataframe(self, resource_name):
import pandas
except ImportError:
raise RuntimeError('To enable dataframe support, '
'please install the pandas package first.')
'run \'pip install datadotworld[PANDAS]\'')
return pandas.DataFrame(rows)

def __initialize_storage(self):
try:
from jsontableschema_pandas import Storage
from jsontableschema_pandas import Storage, mappers
patch_jsontableschema_pandas(mappers)
except ImportError:
raise RuntimeError('To enable dataframe support for datapackages, '
'please install the jsontableschema_pandas '
'package first.')
raise RuntimeError('To enable dataframe support, '
'run \'pip install datadotworld[PANDAS]\'')

# Initialize storage if needed
if not hasattr(self, '__storage'):
Expand All @@ -189,12 +194,6 @@ def __initialize_storage(self):
except SchemaValidationError:
self.__invalid_schemas.append(r.descriptor['schema'])

@staticmethod
def __align_fields(fields, unordered_row):
fields_idx = {f: pos for pos, f in enumerate(fields)}
return OrderedDict(sorted(unordered_row.items(),
key=lambda i: fields_idx[i[0]]))

def __repr__(self):
return '{}({})'.format(self.__class__.__name__,
repr(self.__descriptor_file))
Expand Down
80 changes: 80 additions & 0 deletions datadotworld/models/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from collections import OrderedDict


def patch_jsontableschema_pandas(mappers):
    """Monkey patch the ``jsontableschema_pandas`` mappers module.

    Up to version 0.2.0 jsontableschema_pandas mapped date fields to the
    ``object`` dtype (https://github.com/frictionlessdata/
    jsontableschema-pandas-py/pull/23) and had no mapping at all for the
    year, yearmonth and duration types. This wraps
    ``mappers.jtstype_to_dtype`` so that:

    * ``date`` always maps to ``datetime64[ns]``, overriding whatever
      the installed version would return;
    * ``year``, ``yearmonth`` and ``duration`` are used as fallbacks
      only when the installed mapper cannot handle them (signalled by
      a ``TypeError``).

    If ``mappers`` has no ``jtstype_to_dtype`` attribute, it is left
    untouched.

    :param mappers: the ``jsontableschema_pandas.mappers`` module (or a
        compatible object); patched in place
    """
    if not hasattr(mappers, 'jtstype_to_dtype'):
        return

    original_mapper = mappers.jtstype_to_dtype
    fallback_dtypes = {
        'date': 'datetime64[ns]',
        'year': 'int64',
        'yearmonth': 'int64',
        'duration': 'object',
    }

    def mapper_wrapper(jtstype):
        # 'date' is always overridden; affected versions mapped it to
        # the lossy 'object' dtype.
        if jtstype == 'date':
            return fallback_dtypes[jtstype]
        try:
            return original_mapper(jtstype)
        except TypeError:
            if jtstype in fallback_dtypes:
                return fallback_dtypes[jtstype]
            raise  # bare raise preserves the original traceback

    mappers.jtstype_to_dtype = mapper_wrapper


def sanitize_table_schema(r):
    """Sanitize a resource's table schema for increased compatibility.

    Two adjustments are made, in place, to ``r.descriptor['schema']``:

    * Up to version 0.9.0 jsontableschema did not support the year,
      yearmonth and duration field types
      (https://github.com/frictionlessdata/jsontableschema-py/pull/152).
      When those types are unavailable, such fields (including the
      defunct gyear/gyearmonth spellings) are downgraded to integer or
      string as appropriate.
    * Data Package specs renamed gyear and gyearmonth to year and
      yearmonth (https://github.com/frictionlessdata/specs/pull/370);
      remaining g-prefixed types are renamed accordingly.

    :param r: a datapackage resource whose descriptor may contain a
        table schema
    :returns: the same resource object, mutated in place
    """
    try:
        # Probe imports only: their presence tells us the installed
        # jsontableschema handles these types natively.
        from jsontableschema import YearType, YearMonthType, DurationType  # noqa
        missing_type_support = False
    except ImportError:
        missing_type_support = True

    # Loop-invariant mapping: unsupported/legacy type -> safe fallback
    fallback_types = {
        'gyear': 'integer',
        'year': 'integer',
        'gyearmonth': 'integer',
        'yearmonth': 'integer',
        'duration': 'string',
    }

    for field in r.descriptor.get('schema', {}).get('fields', []):
        field_type = field.get('type')
        if missing_type_support and field_type in fallback_types:
            # Convert unsupported types to integer/string as appropriate
            field['type'] = fallback_types[field_type]
        elif field_type in ('gyear', 'gyearmonth'):
            # Specs were changed along the way; drop the 'g' prefix
            field['type'] = field_type[1:]

    return r


def align_table_fields(fields, unordered_row):
    """Reorder a row's columns to match the schema's field order.

    :param fields: iterable of field names in their canonical order
    :param unordered_row: mapping of field name to value
    :returns: ``OrderedDict`` of the row's items, sorted by field position
    """
    position = {name: index for index, name in enumerate(fields)}
    return OrderedDict(
        (name, unordered_row[name])
        for name in sorted(unordered_row, key=position.__getitem__))
37 changes: 28 additions & 9 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ def find_version(*paths):
author_email='[email protected]',
license='Apache 2.0',
packages=find_packages(),
keywords=['data.world', 'dataset'],
keywords=[
'data.world',
'dataset',
],
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Developers',
Expand All @@ -64,23 +67,39 @@ def find_version(*paths):
'Programming Language :: Python :: 3.6',
'Topic :: Database :: Database Engines/Servers',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Software Development :: Libraries :: Python Modules'
'Topic :: Software Development :: Libraries :: Python Modules',
],
install_requires=[
'certifi', 'click', 'configparser', 'datapackage', 'python-dateutil',
'requests', 'six', 'tabulator', 'urllib3'
'certifi',
'click',
'configparser',
'datapackage',
'python-dateutil',
'requests',
'six',
'tabulator',
'urllib3',
],
setup_requires=[
'pytest-runner'
'pytest-runner',
],
tests_require=[
'doublex', 'pyhamcrest', 'responses', 'pytest',
'jsontableschema_pandas', 'pandas<0.19a'
'doublex',
'pyhamcrest',
'responses',
'pytest',
'jsontableschema_pandas',
'pandas<0.19a',
],
extras_require={
'PANDAS': ['jsontableschema_pandas', 'pandas<0.19a']
'PANDAS': [
'jsontableschema_pandas',
'pandas<0.19a',
],
},
entry_points={
'console_scripts': ['dw=datadotworld.cli:cli'],
'console_scripts': [
'dw=datadotworld.cli:cli',
],
},
)
25 changes: 19 additions & 6 deletions tests/datadotworld/models/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
from os import path

import pytest
from datadotworld.models.dataset import LocalDataset
from datadotworld.models.util import sanitize_table_schema
from datapackage import DataPackage, Resource
from doublex import assert_that, is_
from hamcrest import equal_to, contains, calling, not_, raises

from datadotworld.models.dataset import LocalDataset
from hamcrest import equal_to, contains, calling, not_, raises, not_none


class TestLocalDataset:
Expand All @@ -39,7 +39,10 @@ def simpsons_descriptor_path(self, test_files_path):

@pytest.fixture()
def simpsons_datapackage(self, simpsons_descriptor_path):
return DataPackage(descriptor=simpsons_descriptor_path)
datapackage = DataPackage(descriptor=simpsons_descriptor_path)
for r in datapackage.resources:
sanitize_table_schema(r)
return datapackage

@pytest.fixture()
def simpsons_dataset(self, simpsons_descriptor_path):
Expand Down Expand Up @@ -100,7 +103,9 @@ def test_tables(self, simpsons_dataset, simpsons_datapackage):

def test_tables_broken_schema(self, simpsons_broken_dataset):
assert_that(calling(simpsons_broken_dataset.tables.get).with_args(
'simpsons_episodes'), not_(raises(ValueError)))
'simpsons_episodes'), not_(raises(Exception)))
assert_that(simpsons_broken_dataset.tables.get('simpsons_episodes'),
not_none())

def test_dataframes(self, simpsons_dataset):
for k, t in simpsons_dataset.tables.items():
Expand All @@ -113,7 +118,15 @@ def test_dataframe_types(self, simpsons_dataset):
df = simpsons_dataset.dataframes['simpsons_episodes']
assert_that(df['id'].dtype, equal_to('int64'))
assert_that(df['title'].dtype, equal_to('object'))
# TODO test different datapackages and more dtypes
assert_that(df['original_air_date'].dtype, equal_to('datetime64[ns]'))
assert_that(df['original_air_year'].dtype, equal_to('int64'))
assert_that(df['imdb_rating'].dtype, equal_to('float64'))

def test_dataframe_broken_schema(self, simpsons_broken_dataset):
assert_that(calling(simpsons_broken_dataset.dataframes.get).with_args(
'simpsons_episodes'), not_(raises(Exception)))
assert_that(simpsons_broken_dataset.dataframes.get(
'simpsons_episodes'), not_none())

def test_repr(self, simpsons_dataset):
# noinspection PyUnresolvedReferences
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
id,title,original_air_date,production_code,season,number_in_season,number_in_series,us_viewers_in_millions,views,imdb_rating,imdb_votes,image_url,video_url
10,Homer's Night Out,1990-03-25,7G10,1,10,10,30.3,50816,7.4,1511,http://static-media.fxx.com/img/FX_Networks_-_FXX/305/815/Simpsons_01_10.jpg,http://www.simpsonsworld.com/video/275197507879
10,Homer's Night Out,WRONG_TYPE,7G10,1,10,10,30.3,50816,7.4,1511,http://static-media.fxx.com/img/FX_Networks_-_FXX/305/815/Simpsons_01_10.jpg,http://www.simpsonsworld.com/video/275197507879
12,Krusty Gets Busted,1990-04-29,7G12,1,12,12,30.4,62561,8.3,1716,http://static-media.fxx.com/img/FX_Networks_-_FXX/245/843/Simpsons_01_12.jpg,http://www.simpsonsworld.com/video/288019523914
14,"Bart Gets an ""F""",1990-10-11,7F03,2,1,14,33.6,59575,8.2,1638,http://static-media.fxx.com/img/FX_Networks_-_FXX/662/811/bart_gets_F.jpg,http://www.simpsonsworld.com/video/260539459671
17,Two Cars in Every Garage and Three Eyes on Every Fish,1990-11-01,7F01,2,4,17,26.1,64959,8.1,1457,http://static-media.fxx.com/img/FX_Networks_-_FXX/660/859/Simpsons_02_01.jpg,http://www.simpsonsworld.com/video/260537411822
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"fields": [
{
"name": "id",
"type": "integer",
"type": "BAD_TYPE",
"title": "id"
},
{
Expand Down
Loading

0 comments on commit 64741e2

Please sign in to comment.