From d8a1e2e92198c83f250deb0f9d88270343af8cfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20PERIN?=
Date: Thu, 21 Dec 2023 21:40:44 +0100
Subject: [PATCH] feat: Add composite fields

---
 README.md                     |  92 ++++++++++++-
 magicparse/__init__.py        |   7 +
 magicparse/builders.py        | 110 ++++++++++++++++
 magicparse/fields.py          |  20 +++
 magicparse/schema.py          |  17 ++-
 tests/test_builders.py        | 235 ++++++++++++++++++++++++++++++++++
 tests/test_computed_fields.py |  51 ++++++++
 7 files changed, 525 insertions(+), 7 deletions(-)
 create mode 100644 magicparse/builders.py
 create mode 100644 tests/test_builders.py
 create mode 100644 tests/test_computed_fields.py

diff --git a/README.md b/README.md
index e55d5d1..bf5625f 100644
--- a/README.md
+++ b/README.md
@@ -15,14 +15,84 @@ schema = {
     "has_header": False,
     "delimiter": ";",
     "fields": [
-        {"key": "ean", "column-number": 2, "type": "str", "validators": [{"name": "regex-matches", "parameters": {"pattern": "^\\d{13}$"}}]},
+        {
+            "key": "ean",
+            "column-number": 2,
+            "type": "str",
+            "validators": [
+                {
+                    "name": "regex-matches",
+                    "parameters": {"pattern": "^\\d{13}$"},
+                }
+            ],
+        },
         {"key": "label", "column-number": 3, "type": "str"},
         {"key": "family-code", "column-number": 8, "type": "str"},
-        {"key": "vat", "column-number": 10, "type": "decimal", "optional": False},
-        {"key": "initial-price", "column-number": 11, "type": "decimal", "post-processors": {"name": "divide", "parameters": {"denominator": 100}}},
-        {"key": "unit-of-measurement", "column-number": 12, "type": "int", "pre-processors": [{"name": "map", "parameters": {"values": {"K": 0, "A": 1, "L": 2}}}]},
-        {"key": "volume", "column-number": 13, "type": "decimal", "post-processors": {"name": "round", "parameters": {"precision": 3}}},
-    ]
+        {
+            "key": "vat",
+            "column-number": 10,
+            "type": "decimal",
+            "optional": False,
+        },
+        {
+            "key": "initial-price",
+            "column-number": 11,
+            "type": "decimal",
+            "post-processors": [
+                {
+                    "name": "divide",
+                    "parameters": {"denominator": 100},
+                },
+                {
+                    "name": "round",
+                    "parameters": {"precision": 3},
+                }
+            ]
+        },
+        {
+            "key": "unit-of-measurement",
+            "column-number": 12,
+            "type": "int",
+            "pre-processors": [
+                {
+                    "name": "map",
+                    "parameters": {"values": {"K": 0, "A": 1, "L": 2}},
+                }
+            ],
+        }
+    ],
+    "computed-fields": [
+        {
+            "key": "code",
+            "type": "str",
+            "builder": {
+                "name": "concat",
+                "parameters": {"fields": ["code_1", "code_2"]},
+            }
+        },
+        {
+            "key": "price_by_unit",
+            "type": "decimal",
+            "builder": {
+                "name": "multiply",
+                "parameters": {
+                    "x_factor": "price",
+                    "y_factor": "unit",
+                },
+            }
+        },
+        {
+            "key": "volume",
+            "type": "decimal",
+            "builder": {
+                "name": "divide",
+                "parameters": {
+                    "numerator": "price",
+                    "denominator": "price_by_unit",
+                },
+            }
+        }
+    ],
 }
 
 
@@ -122,3 +192,13 @@ assert rows == [{"name": "Joe"}, {"name": "William"}, {"name": "Jack"}, {"name":
 
 - divide
 - round
+
+### Computed Fields
+
+Types, pre-processors, post-processors and validators are the same as for regular fields.
+
+#### Builders
+
+- concat
+- divide
+- multiply
diff --git a/magicparse/__init__.py b/magicparse/__init__.py
index cc43a53..72d91d5 100644
--- a/magicparse/__init__.py
+++ b/magicparse/__init__.py
@@ -3,6 +3,10 @@
 from .schema import Schema, builtins as builtins_schemas
 from .post_processors import PostProcessor, builtins as builtins_post_processors
 from .pre_processors import PreProcessor, builtins as builtins_pre_processors
+from .builders import (
+    Builder,
+    builtins as builtins_composite_processors,
+)
 from .transform import Transform
 from .type_converters import TypeConverter, builtins as builtins_type_converters
 from typing import Any, Dict, List, Tuple, Union
@@ -44,6 +48,8 @@ def register(items: Union[Registrable, List[Registrable]]) -> None:
             PreProcessor.register(item)
         elif issubclass(item, Validator):
             Validator.register(item)
+        elif issubclass(item, Builder):
+            Builder.register(item)
         else:
             raise ValueError(
                 "transforms must be a subclass of Transform (or a list of it)"
@@ -55,3 +61,4 @@ def register(items: Union[Registrable, List[Registrable]]) -> None:
 register(builtins_type_converters)
 register(builtins_validators)
 register(builtins_post_processors)
+register(builtins_composite_processors)
diff --git a/magicparse/builders.py b/magicparse/builders.py
new file mode 100644
index 0000000..12a8357
--- /dev/null
+++ b/magicparse/builders.py
@@ -0,0 +1,110 @@
+from abc import ABC
+from decimal import Decimal
+
+from .transform import Transform
+
+
+class Builder(Transform, ABC):
+    """Base class for transforms that compute a value from an already parsed row."""
+
+    @classmethod
+    def build(cls, options: dict) -> "Builder":
+        """Instantiate the registered builder described by ``options``.
+
+        ``options`` must provide a ``name`` key and may provide a ``parameters``
+        dict, which is passed to the builder constructor as keyword arguments.
+
+        Raises ValueError if ``name`` is missing or is not a registered builder.
+        """
+        # Catch lookup failures only: a bare ``except`` would also swallow
+        # KeyboardInterrupt and SystemExit.
+        try:
+            name = options["name"]
+        except (KeyError, TypeError) as exc:
+            raise ValueError("builder must have a 'name' key") from exc
+
+        try:
+            builder = cls.registry[name]
+        except (KeyError, TypeError) as exc:
+            raise ValueError(f"invalid builder '{name}'") from exc
+
+        if "parameters" in options:
+            return builder(**options["parameters"])
+        else:
+            return builder()
+
+
+class Concat(Builder):
+    """Concatenate the str values of several fields of a row."""
+
+    def __init__(self, fields: list[str]) -> None:
+        if (
+            not isinstance(fields, list)
+            or len(fields) < 2
+            or not all(isinstance(field, str) for field in fields)
+        ):
+            raise ValueError(
+                "builder 'concat': "
+                "'fields' parameter must be a list[str] with at least two elements"
+            )
+
+        self.fields = fields
+
+    def apply(self, row: dict) -> str:
+        """Return the values of the configured fields joined without separator."""
+        return "".join(row[field] for field in self.fields)
+
+    @staticmethod
+    def key() -> str:
+        return "concat"
+
+
+class Divide(Builder):
+    """Divide the value of one field of a row by the value of another one."""
+
+    def __init__(self, numerator: str, denominator: str) -> None:
+        if not numerator or not isinstance(numerator, str):
+            raise ValueError(
+                "builder 'divide': 'numerator' parameter must be a non null str"
+            )
+        if not denominator or not isinstance(denominator, str):
+            raise ValueError(
+                "builder 'divide': 'denominator' parameter must be a non null str"
+            )
+        self.numerator = numerator
+        self.denominator = denominator
+
+    def apply(self, row: dict) -> Decimal:
+        """Return ``row[numerator] / row[denominator]``."""
+        return row[self.numerator] / row[self.denominator]
+
+    @staticmethod
+    def key() -> str:
+        return "divide"
+
+
+class Multiply(Builder):
+    """Multiply the values of two fields of a row."""
+
+    def __init__(self, x_factor: str, y_factor: str) -> None:
+        if not x_factor or not isinstance(x_factor, str):
+            raise ValueError(
+                "builder 'multiply': 'x_factor' parameter must be a non null str"
+            )
+        if not y_factor or not isinstance(y_factor, str):
+            raise ValueError(
+                "builder 'multiply': 'y_factor' parameter must be a non null str"
+            )
+        self.x_factor = x_factor
+        self.y_factor = y_factor
+
+    def apply(self, row: dict) -> Decimal:
+        """Return ``row[x_factor] * row[y_factor]``."""
+        return row[self.x_factor] * row[self.y_factor]
+
+    @staticmethod
+    def key() -> str:
+        return "multiply"
+
+
+builtins = [Concat, Divide, Multiply]
diff --git a/magicparse/fields.py b/magicparse/fields.py
index da131da..63cbcb8 100644
--- a/magicparse/fields.py
+++ b/magicparse/fields.py
@@ -1,5 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import List
+
+from .builders import Builder
 from .type_converters import TypeConverter
 from .post_processors import PostProcessor
 from .pre_processors import PreProcessor
@@ -96,3 +98,21 @@ def error(self, exception: Exception) -> dict:
             "field-key": self.key,
             "error": exception.args[0],
         }
+
+
+class ComputedField(Field):
+    """Field whose raw value is computed from the row by a builder."""
+
+    def __init__(self, options: dict) -> None:
+        super().__init__(options)
+        self.builder = Builder.build(options["builder"])
+
+    def _read_raw_value(self, row) -> str:
+        """Return the value the builder computes from ``row``."""
+        return self.builder.apply(row)
+
+    def error(self, exception: Exception) -> dict:
+        return {
+            "field-key": self.key,
+            "error": exception.args[0],
+        }
diff --git a/magicparse/schema.py b/magicparse/schema.py
index c4aefb9..aa0c6da 100644
--- a/magicparse/schema.py
+++ b/magicparse/schema.py
@@ -1,7 +1,7 @@
 import codecs
 from abc import ABC, abstractmethod
 import csv
-from .fields import Field
+from .fields import Field, ComputedField
 from io import BytesIO
 from typing import Any, Dict, List, Tuple, Union, Iterable
 
@@ -13,6 +13,9 @@ class Schema(ABC):
 
     def __init__(self, options: Dict[str, Any]) -> None:
         self.fields = [Field.build(item) for item in options["fields"]]
+        self.computed_fields = [
+            ComputedField.build(item) for item in options.get("computed-fields", [])
+        ]
 
         self.has_header = options.get("has_header", False)
         self.encoding = options.get("encoding", "utf-8")
@@ -70,6 +73,18 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
 
                 item[field.key] = value
 
+            for computed_field in self.computed_fields:
+                try:
+                    value = computed_field.read_value(item)
+                except Exception as exc:
+                    errors.append(
+                        {"row-number": row_number, **computed_field.error(exc)}
+                    )
+                    row_is_valid = False
+                    continue
+
+                item[computed_field.key] = value
+
             if row_is_valid:
                 result.append(item)
 
diff --git a/tests/test_builders.py b/tests/test_builders.py
new file mode 100644
index 0000000..869f614
--- /dev/null
+++ b/tests/test_builders.py
@@ -0,0 +1,235 @@
+from decimal import Decimal
+
+import pytest
+from unittest import TestCase
+
+from magicparse import Builder
+
+
+class TestBuild(TestCase):
+    class WithoutParamBuilder(Builder):
+        @staticmethod
+        def key() -> str:
+            return "without-param"
+
+        def apply(self, value):
+            pass
+
+    class WithParamBuilder(Builder):
+        def __init__(self, setting: str) -> None:
+            self.setting = setting
+
+        @staticmethod
+        def key() -> str:
+            return "with-param"
+
+        def apply(self, value):
+            pass
+
+    def test_without_parameter(self):
+        Builder.register(self.WithoutParamBuilder)
+
+        builder = Builder.build({"name": "without-param"})
+        assert isinstance(builder, self.WithoutParamBuilder)
+
+    def test_with_parameter(self):
+        Builder.register(self.WithParamBuilder)
+
+        builder = Builder.build(
+            {"name": "with-param", "parameters": {"setting": "value"}}
+        )
+        assert isinstance(builder, self.WithParamBuilder)
+        assert builder.setting == "value"
+
+    def test_unknown(self):
+        with pytest.raises(ValueError, match="invalid builder 'anything'"):
+            Builder.build({"name": "anything"})
+
+    def test_no_name_provided(self):
+        with pytest.raises(ValueError, match="builder must have a 'name' key"):
+            Builder.build({})
+
+
+class TestConcat(TestCase):
+    def test_no_params(self):
+        with pytest.raises(TypeError):
+            Builder.build({"name": "concat"})
+
+    def test_empty_params(self):
+        with pytest.raises(TypeError):
+            Builder.build({"name": "concat", "parameters": ""})
+
+    def test_fields_params_empty(self):
+        with pytest.raises(ValueError):
+            Builder.build({"name": "concat", "parameters": {"fields": ""}})
+
+    def test_fields_params_not_a_list_of_str(self):
+        with pytest.raises(ValueError):
+            Builder.build({"name": "concat", "parameters": {"fields": "xxx"}})
+
+    def test_fields_params_has_less_than_two_field(self):
+        with pytest.raises(ValueError):
+            Builder.build({"name": "concat", "parameters": {"fields": ["code"]}})
+
+    def test_field_not_present(self):
+        builder = Builder.build(
+            {"name": "concat", "parameters": {"fields": ["code_1", "code_2"]}}
+        )
+        with pytest.raises(KeyError):
+            builder.apply({})
+
+    def test_concat_two_fields(self):
+        builder = Builder.build(
+            {"name": "concat", "parameters": {"fields": ["code_1", "code_2"]}}
+        )
+
+        result = builder.apply({"code_1": "X", "code_2": "Y"})
+
+        assert result == "XY"
+
+    def test_concat_three_fields(self):
+        builder = Builder.build(
+            {"name": "concat", "parameters": {"fields": ["code_1", "code_2", "code_3"]}}
+        )
+
+        result = builder.apply({"code_1": "X", "code_2": "Y", "code_3": "Z"})
+
+        assert result == "XYZ"
+
+    def test_concat_integer(self):
+        builder = Builder.build(
+            {"name": "concat", "parameters": {"fields": ["code_1", "code_2"]}}
+        )
+
+        with pytest.raises(TypeError):
+            builder.apply({"code_1": 1, "code_2": 2})
+
+
+class TestDivide(TestCase):
+    def test_no_params(self):
+        with pytest.raises(TypeError):
+            Builder.build({"name": "divide"})
+
+    def test_empty_params(self):
+        with pytest.raises(TypeError):
+            Builder.build({"name": "divide", "parameters": ""})
+
+    def test_numerator_param_empty(self):
+        with pytest.raises(ValueError):
+            Builder.build(
+                {
+                    "name": "divide",
+                    "parameters": {"numerator": "", "denominator": "price_by_unit"},
+                }
+            )
+
+    def test_denominator_param_empty(self):
+        with pytest.raises(ValueError):
+            Builder.build(
+                {
+                    "name": "divide",
+                    "parameters": {"numerator": "price", "denominator": ""},
+                }
+            )
+
+    def test_field_not_present(self):
+        builder = Builder.build(
+            {
+                "name": "divide",
+                "parameters": {"numerator": "price", "denominator": "price_by_unit"},
+            }
+        )
+        with pytest.raises(KeyError):
+            builder.apply({})
+
+    def test_numerator_not_valid(self):
+        builder = Builder.build(
+            {
+                "name": "divide",
+                "parameters": {"numerator": "price", "denominator": "price_by_unit"},
+            }
+        )
+        with pytest.raises(TypeError):
+            builder.apply({"price": "e", "price_by_unit": 1})
+
+    def test_denominator_not_valid(self):
+        builder = Builder.build(
+            {
+                "name": "divide",
+                "parameters": {"numerator": "price", "denominator": "price_by_unit"},
+            }
+        )
+        with pytest.raises(TypeError):
+            builder.apply({"price": 1, "price_by_unit": "ee"})
+
+    def test_valid_param(self):
+        builder = Builder.build(
+            {
+                "name": "divide",
+                "parameters": {"numerator": "price", "denominator": "price_by_unit"},
+            }
+        )
+
+        result = builder.apply({"price": 1, "price_by_unit": 2})
+
+        assert result == Decimal("0.5")
+
+
+class TestMultiply(TestCase):
+    def test_no_params(self):
+        with pytest.raises(TypeError):
+            Builder.build({"name": "multiply"})
+
+    def test_empty_params(self):
+        with pytest.raises(TypeError):
+            Builder.build({"name": "multiply", "parameters": ""})
+
+    def test_x_factor_param_empty(self):
+        with pytest.raises(ValueError):
+            Builder.build(
+                {
+                    "name": "multiply",
+                    "parameters": {"x_factor": "", "y_factor": "unit"},
+                }
+            )
+
+    def test_y_factor_param_empty(self):
+        with pytest.raises(ValueError):
+            Builder.build(
+                {
+                    "name": "multiply",
+                    "parameters": {"x_factor": "price", "y_factor": ""},
+                }
+            )
+
+    def test_field_not_present(self):
+        builder = Builder.build(
+            {
+                "name": "multiply",
+                "parameters": {"x_factor": "price", "y_factor": "unit"},
+            }
+        )
+        with pytest.raises(KeyError):
+            builder.apply({})
+
+    def test_x_y_factor_not_valid(self):
+        builder = Builder.build(
+            {
+                "name": "multiply",
+                "parameters": {"x_factor": "price", "y_factor": "unit"},
+            }
+        )
+        with pytest.raises(TypeError):
+            builder.apply({"price": "e", "unit": "e"})
+
+    def test_valid_param(self):
+        builder = Builder.build(
+            {
+                "name": "multiply",
+                "parameters": {"x_factor": "price", "y_factor": "unit"},
+            }
+        )
+
+        result = builder.apply({"price": 1.5, "unit": 2})
+
+        assert result == 3
diff --git a/tests/test_computed_fields.py b/tests/test_computed_fields.py
new file mode 100644
index 0000000..a02fd7c
--- /dev/null
+++ b/tests/test_computed_fields.py
@@ -0,0 +1,51 @@
+import pytest
+
+from magicparse.fields import ComputedField
+from unittest import TestCase
+
+
+class TestBuild(TestCase):
+    def test_without_builder(self):
+        with self.assertRaises(KeyError):
+            ComputedField({"key": "output", "type": "str"})
+
+    def test_not_iterable_value_for_builder(self):
+        with self.assertRaises(ValueError):
+            ComputedField({"key": "output", "type": "str", "builder": 1})
+
+    def test_bad_value_for_builder(self):
+        with self.assertRaises(ValueError):
+            ComputedField({"key": "output", "type": "str", "builder": "really"})
+
+    def test_with_valid_builder(self):
+        field = ComputedField(
+            {
+                "key": "output",
+                "type": "str",
+                "builder": {
+                    "name": "concat",
+                    "parameters": {"fields": ["code_1", "code_2"]},
+                },
+            }
+        )
+
+        computed = field.read_value({"code_1": "01", "code_2": "02"})
+
+        assert computed == "0102"
+
+    def test_error_format(self):
+        field = ComputedField(
+            {
+                "key": "output",
+                "type": "str",
+                "builder": {
+                    "name": "concat",
+                    "parameters": {"fields": ["code_1", "code_2"]},
+                },
+            }
+        )
+
+        with pytest.raises(KeyError) as error:
+            field.read_value({})
+
+        assert field.error(error.value) == {"error": "code_1", "field-key": "output"}