From 3fc92543648301d1afe48cce160cd0f0dd173666 Mon Sep 17 00:00:00 2001 From: antoine-b-smartway Date: Wed, 17 Apr 2024 10:42:17 +0200 Subject: [PATCH] Add stream parsing in Schema class + temporary tests to verify the consumption of memory --- magicparse/schema.py | 25 +++++++++++++++---------- pyproject.toml | 2 +- tests/test_schema.py | 23 ++++++++++++++++++++++- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/magicparse/schema.py b/magicparse/schema.py index aa0c6da..84017fc 100644 --- a/magicparse/schema.py +++ b/magicparse/schema.py @@ -44,7 +44,20 @@ def register(cls, schema: "Schema") -> None: cls.registry[schema.key()] = schema + def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]: + items = [] + errors = [] + + for item, row_errors in self.stream_parse(data): + if(row_errors): + errors.extend(row_errors) + else: + items.append(item) + + return items, errors + + def stream_parse(self, data: Union[bytes, BytesIO]) -> Iterable[Tuple[dict, list[dict]]]: if isinstance(data, bytes): stream = BytesIO(data) else: @@ -57,18 +70,15 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]: next(reader) row_number += 1 - result = [] - errors = [] for row in reader: + errors = [] row_number += 1 - row_is_valid = True item = {} for field in self.fields: try: value = field.read_value(row) except Exception as exc: errors.append({"row-number": row_number, **field.error(exc)}) - row_is_valid = False continue item[field.key] = value @@ -80,16 +90,11 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]: errors.append( {"row-number": row_number, **computed_field.error(exc)} ) - row_is_valid = False continue item[computed_field.key] = value - if row_is_valid: - result.append(item) - - return result, errors - + yield item, errors class CsvSchema(Schema): def __init__(self, options: Dict[str, Any]) -> None: diff --git a/pyproject.toml b/pyproject.toml index b69cac2..199e8bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,4 +32,4 @@ extend-ignore = ["E203", "E722"] exclude = [".git/", ".pytest_cache/", ".venv"] [tool.pytest.ini_options] -python_files = ["tests/*"] +python_files = ["tests/*"] \ No newline at end of file diff --git a/tests/test_schema.py b/tests/test_schema.py index 7df6b47..909d21a 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -1,11 +1,11 @@ from decimal import Decimal + from magicparse import Schema from magicparse.schema import ColumnarSchema, CsvSchema from magicparse.fields import ColumnarField, CsvField import pytest from unittest import TestCase - class TestBuild(TestCase): def test_default_csv_schema(self): schema = Schema.build( @@ -287,3 +287,24 @@ def test_register(self): } ) assert isinstance(schema, self.PipedSchema) + +class TestSteamParse(TestCase): + + def test_stream_parse_errors_do_not_halt_parsing(self): + schema = Schema.build( + { + "file_type": "csv", + "fields": [{"key": "age", "type": "int", "column-number": 1}], + } + ) + rows = list(schema.stream_parse(b"1\na\n2")) + assert rows == [ + ({"age": 1}, []), + ({}, [{ + "row-number": 2, + "column-number": 1, + "field-key": "age", + "error": "value 'a' is not a valid integer", + }]), + ({"age": 2}, []) + ] \ No newline at end of file