Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

int-676 Introduce a ParsedRow class for parsing result (including the row number) + expose the stream_parse method #28

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions magicparse/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from io import BytesIO

from .schema import Schema, builtins as builtins_schemas
from .schema import ParsedRow, Schema, builtins as builtins_schemas
from .post_processors import PostProcessor, builtins as builtins_post_processors
from .pre_processors import PreProcessor, builtins as builtins_pre_processors
from .builders import (
Expand All @@ -9,13 +9,14 @@
)
from .transform import Transform
from .type_converters import TypeConverter, builtins as builtins_type_converters
from typing import Any, Dict, List, Tuple, Union
from typing import Any, Dict, Iterable, List, Tuple, Union
from .validators import Validator, builtins as builtins_validators


__all__ = [
"TypeConverter",
"parse",
"stream_parse",
"PostProcessor",
"PreProcessor",
"Schema",
antoine-b-smartway marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -30,6 +31,13 @@ def parse(
return schema_definition.parse(data)


def stream_parse(
    data: Union[bytes, BytesIO], schema_options: Dict[str, Any]
) -> Iterable[ParsedRow]:
    """Parse *data* row by row with a schema built from *schema_options*.

    Args:
        data: Raw content to parse, either as bytes or as a ``BytesIO``.
        schema_options: Options handed to ``Schema.build`` to create the
            schema used for parsing.

    Returns:
        The iterable of ``ParsedRow`` produced by the schema's own
        ``stream_parse`` method.
    """
    # Build the schema and delegate straight to it; nothing is consumed here.
    return Schema.build(schema_options).stream_parse(data)


Registrable = Union[Schema, Transform]


Expand Down
22 changes: 14 additions & 8 deletions magicparse/schema.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import codecs
from abc import ABC, abstractmethod
import csv
from dataclasses import dataclass
from .fields import Field, ComputedField
from io import BytesIO
from typing import Any, Dict, List, Tuple, Union, Iterable


@dataclass(frozen=True, slots=True)
class ParsedRow:
row_number: int
values: dict
errors: list[dict]


class Schema(ABC):
fields: List[Field]
encoding: str
Expand Down Expand Up @@ -48,17 +56,15 @@ def parse(self, data: Union[bytes, BytesIO]) -> Tuple[List[dict], List[dict]]:
items = []
errors = []

for item, row_errors in self.stream_parse(data):
if row_errors:
errors.extend(row_errors)
for parsed_row in self.stream_parse(data):
if parsed_row.errors:
errors.extend(parsed_row.errors)
else:
items.append(item)
items.append(parsed_row.values)

return items, errors

def stream_parse(
self, data: Union[bytes, BytesIO]
) -> Iterable[Tuple[dict, list[dict]]]:
def stream_parse(self, data: Union[bytes, BytesIO]) -> Iterable[ParsedRow]:
if isinstance(data, bytes):
stream = BytesIO(data)
else:
Expand Down Expand Up @@ -98,7 +104,7 @@ def stream_parse(

item[computed_field.key] = value

yield item, errors
yield ParsedRow(row_number, item, errors)


class CsvSchema(Schema):
Expand Down
56 changes: 42 additions & 14 deletions tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,18 +336,46 @@ def test_stream_parse_errors_do_not_halt_parsing(self):
}
)
rows = list(schema.stream_parse(b"1\na\n2"))
assert rows == [
({"age": 1}, []),
(
{},
[
{
"row-number": 2,
"column-number": 1,
"field-key": "age",
"error": "value 'a' is not a valid integer",
}
],
),
({"age": 2}, []),
assert len(rows) == 3
assert rows[0].row_number == 1
assert rows[0].values == {"age": 1}
assert rows[0].errors == []

assert rows[1].row_number == 2
assert rows[1].values == {}
assert rows[1].errors == [
{
"row-number": 2,
"column-number": 1,
"field-key": "age",
"error": "value 'a' is not a valid integer",
}
]

assert rows[2].row_number == 3
assert rows[2].values == {"age": 2}
assert rows[2].errors == []
antoine-b-smartway marked this conversation as resolved.
Show resolved Hide resolved

def test_stream_parse_with_header_first_row_number_is_2(self):
    """With a header line, the single data row is reported as row 2."""
    options = {
        "has_header": True,
        "file_type": "csv",
        "fields": [{"key": "age", "type": "int", "column-number": 1}],
    }
    # One header line followed by one data line.
    parsed = list(Schema.build(options).stream_parse(b"My age\n1"))
    assert len(parsed) == 1
    assert parsed[0].row_number == 2

def test_stream_parse_without_header_first_row_number_is_1(self):
    """Without a header line, the single data row is reported as row 1."""
    options = {
        "has_header": False,
        "file_type": "csv",
        "fields": [{"key": "age", "type": "int", "column-number": 1}],
    }
    # The input consists of exactly one data line and no header.
    parsed = list(Schema.build(options).stream_parse(b"1"))
    assert len(parsed) == 1
    assert parsed[0].row_number == 1
Loading