From 70b8f7f85f0a0baf53f8fd731b9fe7400d211ce6 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Sat, 27 Apr 2024 22:06:53 -0400
Subject: [PATCH] feat(csvclean): Add --header-normalize-space option, closes
 #1056

---
 CHANGELOG.rst                         | 1 +
 csvkit/cleanup.py                     | 4 +++-
 csvkit/utilities/csvclean.py          | 7 +++++--
 examples/test_header_newline.csv      | 5 +++++
 tests/test_utilities/test_csvclean.py | 6 ++++++
 5 files changed, 20 insertions(+), 3 deletions(-)
 create mode 100644 examples/test_header_newline.csv

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index c8c7d0ba2..488e2f64d 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,7 @@
 
 Other changes:
 
+* feat: :doc:`/scripts/csvclean` adds a :code:`--header-normalize-space` option to strip leading and trailing whitespace from each column name in the header and replace each sequence of whitespace characters with a single space.
 * feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
 * feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
 * fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers.
diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py
index 818d268bd..885139f57 100644
--- a/csvkit/cleanup.py
+++ b/csvkit/cleanup.py
@@ -28,10 +28,12 @@ class RowChecker:
     Iterate over rows of a CSV producing cleaned rows and storing error rows.
     """
 
-    def __init__(self, reader):
+    def __init__(self, reader, header_normalize_space=False):
         self.reader = reader
         try:
             self.column_names = next(reader)
+            if header_normalize_space:
+                self.column_names = [' '.join(column_name.split()) for column_name in self.column_names]
         except StopIteration:
             self.column_names = []
         self.errors = []
diff --git a/csvkit/utilities/csvclean.py b/csvkit/utilities/csvclean.py
index 2b92bfd77..f4290c8b7 100644
--- a/csvkit/utilities/csvclean.py
+++ b/csvkit/utilities/csvclean.py
@@ -13,7 +13,10 @@ class CSVClean(CSVKitUtility):
     override_flags = ['L', 'blanks', 'date-format', 'datetime-format']
 
     def add_arguments(self):
-        pass
+        self.argparser.add_argument(
+            '--header-normalize-space', dest='header_normalize_space', action='store_true',
+            help='Strip leading and trailing whitespace and replace sequences of whitespace characters by a single '
+                 'space in the header.')
 
     def main(self):
         if self.additional_input_expected():
@@ -21,7 +24,7 @@ def main(self):
 
         reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
 
-        checker = RowChecker(reader)
+        checker = RowChecker(reader, header_normalize_space=self.args.header_normalize_space)
 
         output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
         output_writer.writerow(checker.column_names)
diff --git a/examples/test_header_newline.csv b/examples/test_header_newline.csv
new file mode 100644
index 000000000..b7d271d5d
--- /dev/null
+++ b/examples/test_header_newline.csv
@@ -0,0 +1,5 @@
+"	
+ start 	
+ end 	
+",b,c
+d,e,f
diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py
index 754f75ab5..4d354db84 100644
--- a/tests/test_utilities/test_csvclean.py
+++ b/tests/test_utilities/test_csvclean.py
@@ -77,6 +77,12 @@ def test_no_header_row(self):
             ['1', '2', '3'],
         ], [])
 
+    def test_header_normalize_space(self):
+        self.assertCleaned(['--header-normalize-space', 'examples/test_header_newline.csv'], [
+            ['start end', 'b', 'c'],
+            ['d', 'e', 'f'],
+        ], [])
+
     def test_removes_optional_quote_characters(self):
         self.assertCleaned(['examples/optional_quote_characters.csv'], [
             ['a', 'b', 'c'],