From e11a82d72a813f43248465f116a7a62f39fb50e6 Mon Sep 17 00:00:00 2001
From: antoine-b-smartway
Date: Wed, 17 Apr 2024 10:42:17 +0200
Subject: [PATCH] Add stream parsing in Schema class + temporary tests to verify the consumption of memory

---
 magicparse/schema.py | 25 ++++++++++++++++---------
 pyproject.toml       |  2 +-
 tests/test_schema.py | 27 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/magicparse/schema.py b/magicparse/schema.py
index aa0c6da..10b9980 100644
--- a/magicparse/schema.py
+++ b/magicparse/schema.py
@@ -45,6 +45,20 @@ def register(cls, schema: "Schema") -> None:
         cls.registry[schema.key()] = schema
 
     def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
+        items = []
+        errors = []
+
+        for item, row_errors in self.stream_parse(data):
+            if row_errors:
+                errors.extend(row_errors)
+            else:
+                items.append(item)
+
+        return items, errors
+
+    def stream_parse(
+        self, data: Union[bytes, BytesIO]
+    ) -> Iterable[Tuple[dict, list[dict]]]:
         if isinstance(data, bytes):
             stream = BytesIO(data)
         else:
@@ -57,18 +71,15 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
             next(reader)
             row_number += 1
 
-        result = []
-        errors = []
         for row in reader:
+            errors = []
             row_number += 1
-            row_is_valid = True
             item = {}
             for field in self.fields:
                 try:
                     value = field.read_value(row)
                 except Exception as exc:
                     errors.append({"row-number": row_number, **field.error(exc)})
-                    row_is_valid = False
                     continue
 
                 item[field.key] = value
@@ -80,15 +91,11 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
                     errors.append(
                         {"row-number": row_number, **computed_field.error(exc)}
                     )
-                    row_is_valid = False
                     continue
 
                 item[computed_field.key] = value
 
-            if row_is_valid:
-                result.append(item)
-
-        return result, errors
+            yield item, errors
 
 
 class CsvSchema(Schema):
diff --git a/pyproject.toml b/pyproject.toml
index b69cac2..199e8bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,4 +32,4 @@ extend-ignore = ["E203", "E722"]
 exclude = [".git/", ".pytest_cache/", ".venv"]
 
 [tool.pytest.ini_options]
-python_files = ["tests/*"]
+python_files = ["tests/*"]
\ No newline at end of file
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 7df6b47..ecfae8d 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -1,4 +1,5 @@
 from decimal import Decimal
+
 from magicparse import Schema
 from magicparse.schema import ColumnarSchema, CsvSchema
 from magicparse.fields import ColumnarField, CsvField
@@ -287,3 +288,29 @@ def test_register(self):
             }
         )
         assert isinstance(schema, self.PipedSchema)
+
+
+class TestSteamParse(TestCase):
+    def test_stream_parse_errors_do_not_halt_parsing(self):
+        schema = Schema.build(
+            {
+                "file_type": "csv",
+                "fields": [{"key": "age", "type": "int", "column-number": 1}],
+            }
+        )
+        rows = list(schema.stream_parse(b"1\na\n2"))
+        assert rows == [
+            ({"age": 1}, []),
+            (
+                {},
+                [
+                    {
+                        "row-number": 2,
+                        "column-number": 1,
+                        "field-key": "age",
+                        "error": "value 'a' is not a valid integer",
+                    }
+                ],
+            ),
+            ({"age": 2}, []),
+        ]