Skip to content

Commit

Permalink
fix: prevent auto-converting 0, 0.0, '', etc to null. also fix dateti…
Browse files Browse the repository at this point in the history
…me check (#14)

fix: prevent auto-converting 0, 0.0, '', etc to null.

also fixes datetime check
  • Loading branch information
daigotanaka authored Jun 4, 2021
1 parent dd7717a commit 75b860d
Show file tree
Hide file tree
Showing 3 changed files with 183 additions and 34 deletions.
54 changes: 32 additions & 22 deletions getschema/impl.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import argparse, csv, datetime, dateutil, logging, os, re, sys
import argparse, csv, datetime, logging, os, re, sys
from dateutil import parser as dateutil_parser
from dateutil.tz import tzoffset
import jsonpath_ng as jsonpath
import simplejson as json
Expand Down Expand Up @@ -30,6 +31,17 @@ def _get_jsonpath(raw, path):
return record


def _is_datetime(obj):
    """Return True when ``obj`` looks like a date or date-time value.

    ``obj`` qualifies when it is either

    * a ``datetime.date`` instance (``datetime.datetime`` is a subclass of
      ``datetime.date``, so it is covered as well), or
    * a string that *begins* with a ``YYYY-MM-DD``-like prefix whose year
      starts with 19 or 20, whose month is 01-12 and whose day is 1-9 or
      01-31.  Anything following that prefix (a time part, for example) is
      not inspected.

    Any other object yields False.
    """
    # TODO: This is a very loose regex for date-time.
    if isinstance(obj, datetime.date):
        return True
    if not isinstance(obj, str):
        return False
    # Raw string: "\d" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # re.match anchors at the start of the string only, hence "begins with".
    return re.match(
        r"(19|20)\d\d-(0[1-9]|1[012])-([1-9]|0[1-9]|[12][0-9]|3[01])",
        obj) is not None


def _do_infer_schema(obj, record_level=None, lower=False,
replace_special=False, snake_case=False):
schema = dict()
Expand Down Expand Up @@ -66,12 +78,7 @@ def _do_infer_schema(obj, record_level=None, lower=False,
float(obj)
except ValueError:
schema["type"] = ["null", "string"]
# TODO: This is a very loose regex for date-time.
if (type(obj) is datetime.datetime or
type(obj) is datetime.date or
(type(obj) is str and
re.match("(19|20)\d\d-(0[1-9]|1[012])-([1-9]|0[1-9]|[12][0-9]|3[01])",
obj) is not None)):
if _is_datetime(obj):
schema["format"] = "date-time"
else:
if type(obj) == bool:
Expand Down Expand Up @@ -175,7 +182,7 @@ def _nested_get(input_dict, nested_key):


def _parse_datetime_tz(datetime_str, default_tz_offset=0):
d = dateutil.parser.parse(datetime_str)
d = dateutil_parser.parse(datetime_str)
if not d.tzinfo:
d = d.replace(tzinfo=tzoffset(None, default_tz_offset))
return d
Expand Down Expand Up @@ -325,8 +332,7 @@ def fix_type(obj, schema, dict_path=[], on_invalid_property="raise",
for key in keys:
ret = fix_type(obj[key], schema, dict_path + ["properties", key],
on_invalid_property)
if ret is not None:
cleaned[key] = ret
cleaned[key] = ret
new_key = _convert_key(key, lower, replace_special, snake_case)
if key != new_key:
cleaned[new_key] = cleaned.pop(key)
Expand All @@ -340,18 +346,22 @@ def fix_type(obj, schema, dict_path=[], on_invalid_property="raise",
cleaned.append(ret)
else:
if obj_type == "string":
cleaned = str(obj)
if obj_format == "date-time":
try:
cleaned = _parse_datetime_tz(
obj, default_tz_offset=0).isoformat()
except Exception as e:
cleaned = _on_invalid_property(on_invalid_property,
dict_path, obj_type, obj,
err_msg=str(e))
if obj is None:
cleaned = None
else:
cleaned = str(obj)
if obj_format == "date-time":
# Just test parsing for now. Not converting to Python's
# datetime as re-JSONifying datetime is not straightforward
if not _is_datetime(cleaned):
cleaned = _on_invalid_property(
on_invalid_property,
dict_path, obj_type, cleaned,
err_msg="Not in a valid datetime format",
)
elif obj_type == "number":
if obj is None:
cleaned = 0.0
cleaned = None
else:
try:
cleaned = float(obj)
Expand All @@ -361,7 +371,7 @@ def fix_type(obj, schema, dict_path=[], on_invalid_property="raise",
err_msg=str(e))
elif obj_type == "integer":
if obj is None:
cleaned = 0
cleaned = None
else:
try:
cleaned = int(obj)
Expand All @@ -371,7 +381,7 @@ def fix_type(obj, schema, dict_path=[], on_invalid_property="raise",
err_msg=str(e))
elif obj_type == "boolean":
if obj is None:
cleaned = False
cleaned = None
elif str(obj).lower() == "true":
cleaned = True
elif str(obj).lower() == "false":
Expand Down
158 changes: 146 additions & 12 deletions tests/test_fix_type.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
import logging
import getschema
import json
Expand All @@ -16,6 +17,9 @@
},
"boolean_field": True,
"another_boolean_field": True,
"number_field": 1,
"string_field": "a",
"datetime_field": "2021-06-04",
},
{
"index": 1,
Expand All @@ -27,6 +31,9 @@
},
"boolean_field": False,
"another_boolean_field": True,
"number_field": 0.5,
"string_field": "b",
"datetime_field": "2021-06-04T09:00",
},
]

Expand All @@ -38,6 +45,7 @@
"nested_field": {
"some_prop": -1,
},
"datetime_field": "2021-06-01 09:00:00"
}
valid_after_fix = {
"index": "0",
Expand All @@ -57,27 +65,61 @@
"some_prop": "1",
},
}
null_index = {
null_entries = {
"index": None,
"array": [
"1",
"1.5",
None,
],
"nested_field": {
"some_prop": "3",
},
"boolean_field": None,
"another_boolean_field": True,
"number_field": None,
"string_field": None,
}
invalid_datetime_record = {
"index": 2,
"array": [
1000,
],
"nested_field": {
"some_prop": -1,
},
"datetime_field": "20"
}
empty_string_record = {
"index": 2,
"array": [
1000,
],
"nested_field": {
"some_prop": -1,
},
"string_field": ""
}


def test_unsupported_schema():
    """fix_type must refuse a property that declares three type entries.

    The inferred schema is altered so that ``index`` lists
    ``["null", "integer", "string"]``; fixing a record against it is
    expected to raise with the "multiple types" message.
    """
    schema = getschema.infer_schema(records)
    schema["properties"]["index"]["type"] = ["null", "integer", "string"]
    try:
        getschema.fix_type(valid_record, schema)
    except Exception as e:
        assert str(e).startswith(
            "Sorry, getschema does not support multiple types")
    else:
        # Without this branch the test would pass silently when no
        # exception is raised at all.
        assert False, "It should raise an exception"


def test_int_zero():
schema = getschema.infer_schema(records)

# This should pass
getschema.fix_type(valid_record, schema)

fixed_record = getschema.fix_type(valid_after_fix, schema)
assert(isinstance(fixed_record["index"], int))
assert(isinstance(fixed_record["array"][0], float))
assert(isinstance(fixed_record["nested_field"]["some_prop"], int))

try:
fixed_record = getschema.fix_type(invalid_after_fix, schema)
except Exception as e:
Expand All @@ -86,19 +128,111 @@ def test_int_zero():
assert False, "It should raise an exception"


def test_invalid_obj_type():
def test_datetime():
schema = getschema.infer_schema(records)
schema["properties"]["index"]["type"] = ["null", "integer", "string"]
assert(schema["properties"]["datetime_field"]["type"] ==
["null", "string"])
assert(schema["properties"]["datetime_field"]["format"] == "date-time")
fixed_record = getschema.fix_type(valid_record, schema)
assert(isinstance(fixed_record["datetime_field"], str))
try:
getschema.fix_type(valid_record, schema)
fixed_record = getschema.fix_type(invalid_datetime_record, schema)
except Exception as e:
assert(str(e).startswith("Sorry, getschema does not support multiple types"))
schema["properties"]["index"]["type"] = ["null", "integer"]
fixed_record = getschema.fix_type(null_index, schema)
assert(fixed_record["boolean_field"] is False)
assert(fixed_record["another_boolean_field"] is True)
assert(str(e).startswith("Not in a valid datetime format"))
else:
assert False, "It should raise an exception"


def test_empty_string():
    """An empty string must come back from fix_type as an empty string.

    Checked twice: against the inferred nullable string schema, and again
    after the field's type is narrowed to a non-nullable ``["string"]``.
    """
    schema = getschema.infer_schema(records)
    assert schema["properties"]["string_field"]["type"] == ["null", "string"]
    fixed_record = getschema.fix_type(empty_string_record, schema)
    assert fixed_record["string_field"] == ""

    # This must be an assignment: the previous `==` was a discarded
    # comparison, so the non-nullable case was never actually exercised.
    schema["properties"]["string_field"]["type"] = ["string"]
    fixed_record = getschema.fix_type(empty_string_record, schema)
    assert fixed_record["string_field"] == ""


def test_preserve_nulls_boolean():
    """A None in a nullable boolean field stays None after fix_type."""
    inferred = getschema.infer_schema(records)
    field_type = inferred["properties"]["boolean_field"]["type"]
    assert field_type == ["null", "boolean"]

    result = getschema.fix_type(null_entries, inferred)
    assert result["boolean_field"] is None


def test_preserve_nulls_integer():
    """A None in a nullable integer field stays None after fix_type."""
    inferred = getschema.infer_schema(records)
    field_type = inferred["properties"]["index"]["type"]
    assert field_type == ["null", "integer"]

    result = getschema.fix_type(null_entries, inferred)
    assert result["index"] is None


def test_preserve_nulls_number():
    """A None in a nullable number field stays None after fix_type."""
    inferred = getschema.infer_schema(records)
    field_type = inferred["properties"]["number_field"]["type"]
    assert field_type == ["null", "number"]

    result = getschema.fix_type(null_entries, inferred)
    assert result["number_field"] is None


def test_preserve_nulls_string():
    """A None in a nullable string field stays None after fix_type."""
    inferred = getschema.infer_schema(records)
    field_type = inferred["properties"]["string_field"]["type"]
    assert field_type == ["null", "string"]

    result = getschema.fix_type(null_entries, inferred)
    assert result["string_field"] is None


def test_reject_null_boolean():
    """A None boolean is rejected once the field's type drops "null"."""
    inferred = getschema.infer_schema(records)
    # With the inferred schema the null-laden record goes through.
    getschema.fix_type(null_entries, inferred)

    inferred["properties"]["boolean_field"]["type"] = ["boolean"]
    try:
        getschema.fix_type(null_entries, inferred)
    except Exception as err:
        message = str(err)
        assert message.startswith("Null object given at")
    else:
        raise Exception("Supposed to fail with null value")


def test_reject_null_integer():
schema = getschema.infer_schema(records)
# This will pass
_ = getschema.fix_type(null_entries, schema)
schema["properties"]["index"]["type"] = ["integer"]
try:
fixed_record = getschema.fix_type(null_index, schema)
_ = getschema.fix_type(null_entries, schema)
except Exception as e:
assert(str(e).startswith("Null object given at"))
else:
raise Exception("Supposed to fail with null value")


def test_reject_null_number():
    """A None number is rejected once the field's type drops "null"."""
    inferred = getschema.infer_schema(records)
    # With the inferred schema the null-laden record goes through.
    getschema.fix_type(null_entries, inferred)

    inferred["properties"]["number_field"]["type"] = ["number"]
    try:
        getschema.fix_type(null_entries, inferred)
    except Exception as err:
        message = str(err)
        assert message.startswith("Null object given at")
    else:
        raise Exception("Supposed to fail with null value")


def test_reject_null_string():
    """A None string is rejected once the field's type drops "null"."""
    inferred = getschema.infer_schema(records)
    # With the inferred schema the null-laden record goes through.
    getschema.fix_type(null_entries, inferred)

    inferred["properties"]["string_field"]["type"] = ["string"]
    try:
        getschema.fix_type(null_entries, inferred)
    except Exception as err:
        message = str(err)
        assert message.startswith("Null object given at")
    else:
        raise Exception("Supposed to fail with null value")
5 changes: 5 additions & 0 deletions tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def test_null_records():
],
"nested_field": {
"some_date": "2021-05-25",
"number": 1,
"null_subfield": None,
},
},
Expand All @@ -26,6 +27,8 @@ def test_null_records():
],
"nested_field": {
"some_date": "2021-05-25",
"integer": 1,
"number": 1.5,
"null_subfield": None,
},
},
Expand All @@ -35,5 +38,7 @@ def test_null_records():
assert(schema["properties"]["null_field"]["type"] == ["null", "string"])
assert(schema["properties"]["nested_field"]["properties"]["some_date"]["type"] == ["null", "string"])
assert(schema["properties"]["nested_field"]["properties"]["some_date"]["format"] == "date-time")
assert(schema["properties"]["nested_field"]["properties"]["integer"]["type"] == ["null", "integer"])
assert(schema["properties"]["nested_field"]["properties"]["number"]["type"] == ["null", "number"])
assert(schema["properties"]["nested_field"]["properties"]["null_subfield"]["type"] == ["null", "string"])

0 comments on commit 75b860d

Please sign in to comment.