From 3fc92543648301d1afe48cce160cd0f0dd173666 Mon Sep 17 00:00:00 2001
From: antoine-b-smartway <a.bous@smartway.ai>
Date: Wed, 17 Apr 2024 10:42:17 +0200
Subject: [PATCH] Add stream parsing in Schema class + temporary tests to
 verify the consumption of memory

---
 magicparse/schema.py | 25 +++++++++++++++----------
 pyproject.toml       |  2 +-
 tests/test_schema.py | 23 ++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/magicparse/schema.py b/magicparse/schema.py
index aa0c6da..84017fc 100644
--- a/magicparse/schema.py
+++ b/magicparse/schema.py
@@ -44,7 +44,20 @@ def register(cls, schema: "Schema") -> None:
 
         cls.registry[schema.key()] = schema
 
+
     def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
+        items = []
+        errors = []
+
+        for item, row_errors in self.stream_parse(data):
+            if(row_errors):
+                errors.extend(row_errors)
+            else:
+                items.append(item)
+
+        return items, errors
+
+    def stream_parse(self, data: Union[bytes, BytesIO]) -> Iterable[Tuple[dict, list[dict]]]:
         if isinstance(data, bytes):
             stream = BytesIO(data)
         else:
@@ -57,18 +70,15 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
             next(reader)
             row_number += 1
 
-        result = []
-        errors = []
         for row in reader:
+            errors = []
             row_number += 1
-            row_is_valid = True
             item = {}
             for field in self.fields:
                 try:
                     value = field.read_value(row)
                 except Exception as exc:
                     errors.append({"row-number": row_number, **field.error(exc)})
-                    row_is_valid = False
                     continue
 
                 item[field.key] = value
@@ -80,16 +90,11 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
                     errors.append(
                         {"row-number": row_number, **computed_field.error(exc)}
                     )
-                    row_is_valid = False
                     continue
 
                 item[computed_field.key] = value
 
-            if row_is_valid:
-                result.append(item)
-
-        return result, errors
-
+            yield item, errors
 
 class CsvSchema(Schema):
     def __init__(self, options: Dict[str, Any]) -> None:
diff --git a/pyproject.toml b/pyproject.toml
index b69cac2..199e8bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,4 +32,4 @@ extend-ignore = ["E203", "E722"]
 exclude = [".git/", ".pytest_cache/", ".venv"]
 
 [tool.pytest.ini_options]
-python_files = ["tests/*"]
+python_files = ["tests/*"]
\ No newline at end of file
diff --git a/tests/test_schema.py b/tests/test_schema.py
index 7df6b47..909d21a 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -1,11 +1,11 @@
 from decimal import Decimal
+
 from magicparse import Schema
 from magicparse.schema import ColumnarSchema, CsvSchema
 from magicparse.fields import ColumnarField, CsvField
 import pytest
 from unittest import TestCase
 
-
 class TestBuild(TestCase):
     def test_default_csv_schema(self):
         schema = Schema.build(
@@ -287,3 +287,24 @@ def test_register(self):
             }
         )
         assert isinstance(schema, self.PipedSchema)
+
+class TestSteamParse(TestCase):
+
+    def test_stream_parse_errors_do_not_halt_parsing(self):
+        schema = Schema.build(
+            {
+                "file_type": "csv",
+                "fields": [{"key": "age", "type": "int", "column-number": 1}],
+            }
+        )
+        rows = list(schema.stream_parse(b"1\na\n2"))
+        assert rows == [
+            ({"age": 1}, []), 
+            ({}, [{
+                "row-number": 2,
+                "column-number": 1,
+                "field-key": "age",
+                "error": "value 'a' is not a valid integer",
+            }]), 
+            ({"age": 2}, [])
+            ]
\ No newline at end of file