ZeroGachis · antoine-b-smartway · May 17, 2024 · May 17, 2024 · May 17, 2024
@@ -1,6 +1,6 @@
 from io import BytesIO
 
-from .schema import Schema, builtins as builtins_schemas
+from .schema import ParsedRow, Schema, builtins as builtins_schemas
 from .post_processors import PostProcessor, builtins as builtins_post_processors
 from .pre_processors import PreProcessor, builtins as builtins_pre_processors
 from .builders import (
@@ -9,13 +9,14 @@
 )
 from .transform import Transform
 from .type_converters import TypeConverter, builtins as builtins_type_converters
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any, Dict, Iterable, List, Tuple, Union
 from .validators import Validator, builtins as builtins_validators
 
 
 __all__ = [
     "TypeConverter",
     "parse",
+    "stream_parse",
     "PostProcessor",
     "PreProcessor",
     "Schema",
@@ -30,6 +31,13 @@ def parse(
     return schema_definition.parse(data)
 
 
+def stream_parse(  # row-by-row counterpart of parse(): builds the schema, then streams rows
+    data: Union[bytes, BytesIO], schema_options: Dict[str, Any]
+) -> Iterable[ParsedRow]:  # NOTE(review): confirm ParsedRow is also exported via __all__
+    schema_definition = Schema.build(schema_options)  # schema built from the options dict
+    return schema_definition.stream_parse(data)  # returned as-is; each row carries its errors
+
+
 Registrable = Union[Schema, Transform]
 
 

@@ -1,11 +1,19 @@
 import codecs
 from abc import ABC, abstractmethod
 import csv
+from dataclasses import dataclass
 from .fields import Field, ComputedField
 from io import BytesIO
 from typing import Any, Dict, List, Tuple, Union, Iterable
 
 
+@dataclass(frozen=True, slots=True)  # NOTE: slots=True needs Python 3.10+; fields stay mutable
+class ParsedRow:  # outcome of parsing one input row, as yielded by Schema.stream_parse
+    row_number: int  # 1-based; tests expect 2 for the first data row when a header is present
+    values: dict  # parsed values keyed by field key, e.g. {"age": 1}
+    errors: list[dict]  # error dicts for this row; empty list when the row parsed cleanly
+
+
 class Schema(ABC):
     fields: List[Field]
     encoding: str
@@ -48,17 +56,15 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
         items = []
         errors = []
 
-        for item, row_errors in self.stream_parse(data):
-            if row_errors:
-                errors.extend(row_errors)
+        for parsed_row in self.stream_parse(data):
+            if parsed_row.errors:
+                errors.extend(parsed_row.errors)
             else:
-                items.append(item)
+                items.append(parsed_row.values)
 
         return items, errors
 
-    def stream_parse(
-        self, data: Union[bytes, BytesIO]
-    ) -> Iterable[Tuple[dict, list[dict]]]:
+    def stream_parse(self, data: Union[bytes, BytesIO]) -> Iterable[ParsedRow]:
         if isinstance(data, bytes):
             stream = BytesIO(data)
         else:
@@ -98,7 +104,7 @@ def stream_parse(
 
                 item[computed_field.key] = value
 
-            yield item, errors
+            yield ParsedRow(row_number, item, errors)
 
 
 class CsvSchema(Schema):

@@ -336,18 +336,46 @@ def test_stream_parse_errors_do_not_halt_parsing(self):
             }
         )
         rows = list(schema.stream_parse(b"1\na\n2"))
-        assert rows == [
-            ({"age": 1}, []),
-            (
-                {},
-                [
-                    {
-                        "row-number": 2,
-                        "column-number": 1,
-                        "field-key": "age",
-                        "error": "value 'a' is not a valid integer",
-                    }
-                ],
-            ),
-            ({"age": 2}, []),
+        assert len(rows) == 3
+        assert rows[0].row_number == 1
+        assert rows[0].values == {"age": 1}
+        assert rows[0].errors == []
+
+        assert rows[1].row_number == 2
+        assert rows[1].values == {}
+        assert rows[1].errors == [
+            {
+                "row-number": 2,
+                "column-number": 1,
+                "field-key": "age",
+                "error": "value 'a' is not a valid integer",
+            }
         ]
+
+        assert rows[2].row_number == 3
+        assert rows[2].values == {"age": 2}
+        assert rows[2].errors == []
+
+    def test_stream_parse_with_header_first_row_number_is_2(self):
+        schema = Schema.build(
+            {
+                "has_header": True,  # first input line is a header, not data
+                "file_type": "csv",
+                "fields": [{"key": "age", "type": "int", "column-number": 1}],
+            }
+        )
+        rows = list(schema.stream_parse(b"My age\n1"))  # header line + one data row
+        assert len(rows) == 1  # the header itself must not be yielded as a row
+        assert rows[0].row_number == 2  # numbering counts the header line as row 1
+
+    def test_stream_parse_without_header_first_row_number_is_1(self):
+        schema = Schema.build(
+            {
+                "has_header": False,  # every input line is data
+                "file_type": "csv",
+                "fields": [{"key": "age", "type": "int", "column-number": 1}],
+            }
+        )
+        rows = list(schema.stream_parse(b"1"))  # single data row, no trailing newline
+        assert len(rows) == 1
+        assert rows[0].row_number == 1  # without a header, numbering starts at 1