From b005370fb530da63f4a3e1098f9325d737eb03ba Mon Sep 17 00:00:00 2001
From: antoine-b-smartway <145436576+antoine-b-smartway@users.noreply.github.com>
Date: Wed, 24 Apr 2024 18:05:28 +0200
Subject: [PATCH] int-679 Skip empty lines from the parsing (#27)

int-679 Skip empty lines from the parsing to prevent noise in the logs
---
Review note: this patch is amended relative to the original commit, so the
blob ids on the "index" lines still describe the original post-image.

In ColumnarSchema.get_reader the original loop called
readline(None, False) and stopped on a falsy result. With line endings
stripped, a blank line is "" -- the same value readline() returns at end of
input -- so the reader stopped at the first blank line and every later row
was dropped instead of the blank line being skipped by stream_parse. The
reader now checks for end of input on the unstripped line and strips the
line ending afterwards. The columnar test gained a row after the blank line
so that it covers this case.

 magicparse/schema.py | 14 +++++++++++---
 tests/test_schema.py | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/magicparse/schema.py b/magicparse/schema.py
index 10b9980..914d45c 100644
--- a/magicparse/schema.py
+++ b/magicparse/schema.py
@@ -72,8 +72,11 @@ def stream_parse(
             row_number += 1
 
         for row in reader:
-            errors = []
             row_number += 1
+            if not any(row):
+                continue
+
+            errors = []
             item = {}
             for field in self.fields:
                 try:
@@ -124,8 +127,13 @@ def key() -> str:
 
 class ColumnarSchema(Schema):
     def get_reader(self, stream: BytesIO) -> Iterable[str]:
-        stream_reader = codecs.getreader(self.encoding)
-        return stream_reader(stream)
+        stream_reader = codecs.getreader(self.encoding)(stream)
+        # Iterate with line endings kept: a blank line stripped of its ending
+        # is "", which is also what readline() returns at end of input, so
+        # stripping before the end-of-input check would stop at a blank line.
+        for line in stream_reader:
+            # Blank lines come out as "" and are skipped by stream_parse.
+            yield line.splitlines()[0]
 
     @staticmethod
     def key() -> str:
diff --git a/tests/test_schema.py b/tests/test_schema.py
index ecfae8d..40e7ef3 100644
--- a/tests/test_schema.py
+++ b/tests/test_schema.py
@@ -130,6 +130,21 @@ def test_errors_do_not_halt_parsing(self):
             }
         ]
 
+    def test_parse_should_skip_empty_lines(self):
+        schema = Schema.build(
+            {
+                "file_type": "csv",
+                "fields": [{"key": "name", "type": "str", "column-number": 1}],
+            }
+        )
+        rows, errors = schema.parse(
+            b"""1
+
+"""
+        )
+        assert rows == [{"name": "1"}]
+        assert not errors
+
 
 class TestColumnarParse(TestCase):
     def test_with_no_data(self):
@@ -216,6 +231,28 @@ def test_errors_do_not_halt_parsing(self):
             }
         ]
 
+    def test_parse_should_skip_empty_lines(self):
+        schema = Schema.build(
+            {
+                "file_type": "columnar",
+                "fields": [
+                    {
+                        "key": "name",
+                        "type": "str",
+                        "column-start": 0,
+                        "column-length": 8,
+                    }
+                ],
+            }
+        )
+        rows, errors = schema.parse(
+            b"""8013109C
+
+8013109D"""
+        )
+        assert rows == [{"name": "8013109C"}, {"name": "8013109D"}]
+        assert not errors
+
 
 class TestQuotingSetting(TestCase):
     def test_no_quote(self):