From da4d895aa671e76fd73c349d8e5aa05bacf43912 Mon Sep 17 00:00:00 2001 From: antoine-b-smartway Date: Tue, 23 Apr 2024 17:53:35 +0200 Subject: [PATCH 1/2] int-679 Skip empty lines from the parsing --- magicparse/schema.py | 48 +++++++++++++++++++++++++------------------- tests/test_schema.py | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/magicparse/schema.py b/magicparse/schema.py index 10b9980..81a4bea 100644 --- a/magicparse/schema.py +++ b/magicparse/schema.py @@ -72,30 +72,31 @@ def stream_parse( row_number += 1 for row in reader: - errors = [] row_number += 1 - item = {} - for field in self.fields: - try: - value = field.read_value(row) - except Exception as exc: - errors.append({"row-number": row_number, **field.error(exc)}) - continue + if any(row): + errors = [] + item = {} + for field in self.fields: + try: + value = field.read_value(row) + except Exception as exc: + errors.append({"row-number": row_number, **field.error(exc)}) + continue - item[field.key] = value + item[field.key] = value - for computed_field in self.computed_fields: - try: - value = computed_field.read_value(item) - except Exception as exc: - errors.append( - {"row-number": row_number, **computed_field.error(exc)} - ) - continue + for computed_field in self.computed_fields: + try: + value = computed_field.read_value(item) + except Exception as exc: + errors.append( + {"row-number": row_number, **computed_field.error(exc)} + ) + continue - item[computed_field.key] = value + item[computed_field.key] = value - yield item, errors + yield item, errors class CsvSchema(Schema): @@ -124,8 +125,13 @@ def key() -> str: class ColumnarSchema(Schema): def get_reader(self, stream: BytesIO) -> Iterable[str]: - stream_reader = codecs.getreader(self.encoding) - return stream_reader(stream) + stream_reader_factory = codecs.getreader(self.encoding) + stream_reader = stream_reader_factory(stream) + while True: + line = stream_reader.readline(None, False) + if not line: + break + yield line @staticmethod def key() -> str: diff --git a/tests/test_schema.py b/tests/test_schema.py index ecfae8d..40e7ef3 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -130,6 +130,21 @@ def test_errors_do_not_halt_parsing(self): } ] + def test_parse_should_skip_empty_lines(self): + schema = Schema.build( + { + "file_type": "csv", + "fields": [{"key": "name", "type": "str", "column-number": 1}], + } + ) + rows, errors = schema.parse( + b"""1 + +""" + ) + assert rows == [{"name": "1"}] + assert not errors + class TestColumnarParse(TestCase): def test_with_no_data(self): @@ -216,6 +231,28 @@ def test_errors_do_not_halt_parsing(self): } ] + def test_parse_should_skip_empty_lines(self): + schema = Schema.build( + { + "file_type": "columnar", + "fields": [ + { + "key": "name", + "type": "str", + "column-start": 0, + "column-length": 8, + } + ], + } + ) + rows, errors = schema.parse( + b"""8013109C + +""" + ) + assert rows == [{"name": "8013109C"}] + assert not errors + class TestQuotingSetting(TestCase): def test_no_quote(self): From 1aed1e935808927347936f80512e632f0a49c142 Mon Sep 17 00:00:00 2001 From: antoine-b-smartway Date: Wed, 24 Apr 2024 15:03:11 +0200 Subject: [PATCH 2/2] int-679 code review feedbacks --- magicparse/schema.py | 50 +++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/magicparse/schema.py b/magicparse/schema.py index 81a4bea..914d45c 100644 --- a/magicparse/schema.py +++ b/magicparse/schema.py @@ -73,30 +73,32 @@ def stream_parse( for row in reader: row_number += 1 - if any(row): - errors = [] - item = {} - for field in self.fields: - try: - value = field.read_value(row) - except Exception as exc: - errors.append({"row-number": row_number, **field.error(exc)}) - continue - - item[field.key] = value - - for computed_field in self.computed_fields: - try: - value = computed_field.read_value(item) - except Exception as exc: - errors.append( - {"row-number": row_number, **computed_field.error(exc)} - ) - continue - - item[computed_field.key] = value - - yield item, errors + if not any(row): + continue + + errors = [] + item = {} + for field in self.fields: + try: + value = field.read_value(row) + except Exception as exc: + errors.append({"row-number": row_number, **field.error(exc)}) + continue + + item[field.key] = value + + for computed_field in self.computed_fields: + try: + value = computed_field.read_value(item) + except Exception as exc: + errors.append( + {"row-number": row_number, **computed_field.error(exc)} + ) + continue + + item[computed_field.key] = value + + yield item, errors class CsvSchema(Schema):