Skip to content

Commit

Permalink
feat(csvclean): Add --header-normalize-space option, closes #1056
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Apr 28, 2024
1 parent 95dc26d commit 70b8f7f
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

Other changes:

* feat: :doc:`/scripts/csvclean` adds a :code:`--header-normalize-space` option to strip leading and trailing whitespace and replace sequences of whitespace characters by a single space in the header.
* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
* fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONNUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers.
Expand Down
4 changes: 3 additions & 1 deletion csvkit/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ class RowChecker:
Iterate over rows of a CSV producing cleaned rows and storing error rows.
"""

def __init__(self, reader):
def __init__(self, reader, header_normalize_space=False):
self.reader = reader
try:
self.column_names = next(reader)
if header_normalize_space:
self.column_names = [' '.join(column_name.split()) for column_name in self.column_names]
except StopIteration:
self.column_names = []
self.errors = []
Expand Down
7 changes: 5 additions & 2 deletions csvkit/utilities/csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,18 @@ class CSVClean(CSVKitUtility):
override_flags = ['L', 'blanks', 'date-format', 'datetime-format']

def add_arguments(self):
pass
self.argparser.add_argument(
'--header-normalize-space', dest='header_normalize_space', action='store_true',
help='Strip leading and trailing whitespace and replace sequences of whitespace characters by a single '
'space in the header.')

def main(self):
if self.additional_input_expected():
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')

reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)

checker = RowChecker(reader)
checker = RowChecker(reader, header_normalize_space=self.args.header_normalize_space)

output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
output_writer.writerow(checker.column_names)
Expand Down
5 changes: 5 additions & 0 deletions examples/test_header_newline.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"
start
end
",b,c
d,e,f
Expand Down
6 changes: 6 additions & 0 deletions tests/test_utilities/test_csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ def test_no_header_row(self):
['1', '2', '3'],
], [])

def test_header_normalize_space(self):
self.assertCleaned(['--header-normalize-space', 'examples/test_header_newline.csv'], [
['start end', 'b', 'c'],
['d', 'e', 'f'],
], [])

def test_removes_optional_quote_characters(self):
self.assertCleaned(['examples/optional_quote_characters.csv'], [
['a', 'b', 'c'],
Expand Down

0 comments on commit 70b8f7f

Please sign in to comment.