From 70b8f7f85f0a0baf53f8fd731b9fe7400d211ce6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 27 Apr 2024 22:06:53 -0400 Subject: [PATCH] feat(csvclean): Add --header-normalize-space option, closes #1056 --- CHANGELOG.rst | 1 + csvkit/cleanup.py | 4 +++- csvkit/utilities/csvclean.py | 7 +++++-- examples/test_header_newline.csv | 5 +++++ tests/test_utilities/test_csvclean.py | 6 ++++++ 5 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 examples/test_header_newline.csv diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c8c7d0ba2..488e2f64d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,7 @@ Other changes: +* feat: :doc:`/scripts/csvclean` adds a :code:`--header-normalize-space` option to strip leading and trailing whitespace and replace sequences of whitespace characters by a single space in the header. * feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12. * feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12. * fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NONUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers. diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py index 818d268bd..885139f57 100644 --- a/csvkit/cleanup.py +++ b/csvkit/cleanup.py @@ -28,10 +28,12 @@ class RowChecker: Iterate over rows of a CSV producing cleaned rows and storing error rows. 
""" - def __init__(self, reader): + def __init__(self, reader, header_normalize_space=False): self.reader = reader try: self.column_names = next(reader) + if header_normalize_space: + self.column_names = [' '.join(column_name.split()) for column_name in self.column_names] except StopIteration: self.column_names = [] self.errors = [] diff --git a/csvkit/utilities/csvclean.py b/csvkit/utilities/csvclean.py index 2b92bfd77..f4290c8b7 100644 --- a/csvkit/utilities/csvclean.py +++ b/csvkit/utilities/csvclean.py @@ -13,7 +13,10 @@ class CSVClean(CSVKitUtility): override_flags = ['L', 'blanks', 'date-format', 'datetime-format'] def add_arguments(self): - pass + self.argparser.add_argument( + '--header-normalize-space', dest='header_normalize_space', action='store_true', + help='Strip leading and trailing whitespace and replace sequences of whitespace characters by a single ' + 'space in the header.') def main(self): if self.additional_input_expected(): @@ -21,7 +24,7 @@ def main(self): reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs) - checker = RowChecker(reader) + checker = RowChecker(reader, header_normalize_space=self.args.header_normalize_space) output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs) output_writer.writerow(checker.column_names) diff --git a/examples/test_header_newline.csv b/examples/test_header_newline.csv new file mode 100644 index 000000000..b7d271d5d --- /dev/null +++ b/examples/test_header_newline.csv @@ -0,0 +1,5 @@ +" + start + end + ",b,c +d,e,f diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py index 754f75ab5..4d354db84 100644 --- a/tests/test_utilities/test_csvclean.py +++ b/tests/test_utilities/test_csvclean.py @@ -77,6 +77,12 @@ def test_no_header_row(self): ['1', '2', '3'], ], []) + def test_header_normalize_space(self): + self.assertCleaned(['--header-normalize-space', 'examples/test_header_newline.csv'], [ + ['start end', 'b', 'c'], + ['d', 'e', 'f'], + ], []) 
+ def test_removes_optional_quote_characters(self): self.assertCleaned(['examples/optional_quote_characters.csv'], [ ['a', 'b', 'c'],