Skip to content

Commit

Permalink
feat(csvclean): Add --empty-columns option, closes #426
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Apr 28, 2024
1 parent d05f414 commit b8d152b
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 45 deletions.
7 changes: 4 additions & 3 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

**BACKWARDS-INCOMPATIBLE CHANGES:**

* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
* :doc:`/scripts/csvclean` no longer fixes errors by default. Opt in using the :code:`--join-short-rows` option.
* :doc:`/scripts/csvclean` joins short rows using a newline by default, instead of a space.
* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` option to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
* :doc:`/scripts/csvclean` no longer fixes errors by default. Opt in to the original behavior using the :code:`--join-short-rows` option.
* :doc:`/scripts/csvclean` joins short rows using a newline by default, instead of a space. Restore the original behavior using the :code:`--separator " "` option.

Other changes:

Expand All @@ -15,6 +15,7 @@ Other changes:
* :code:`--separator`, to change the string with which to join short rows
* :code:`--fill-short-rows`, to fill short rows with the missing cells
* :code:`--fillvalue`, to change the value with which to fill short rows
* :code:`--empty-columns`, to error on empty columns

* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12.
Expand Down
56 changes: 44 additions & 12 deletions csvkit/cleanup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
#!/usr/bin/env python
from dataclasses import dataclass

from csvkit.exceptions import CSVTestException, LengthMismatchError

@dataclass
class Error:
    """
    A record describing one problem found while checking CSV rows.

    Instances are plain data holders: they are collected in a list and reported later, not raised.
    """

    # Line number to report alongside the error.
    line_number: int
    # The cells of the row the error refers to. This is a list of cell values, not an integer.
    row: list
    # Human-readable description of the problem.
    msg: str


def join_rows(rows, separator):
Expand Down Expand Up @@ -36,12 +42,16 @@ def __init__(
separator='\n',
fill_short_rows=False,
fillvalue=None,
empty_columns=False,
zero_based=False,
):
self.reader = reader
self.join_short_rows = join_short_rows
self.separator = separator
self.fill_short_rows = fill_short_rows
self.fillvalue = fillvalue
self.empty_columns = empty_columns
self.zero_based = zero_based

try:
self.column_names = next(reader)
Expand All @@ -56,13 +66,23 @@ def checked_rows(self):
"""
A generator which yields rows which are ready to write to output.
"""
length = len(self.column_names)
len_column_names = len(self.column_names)
joinable_row_errors = []

row_count = 0
empty_counts = [0 for _ in range(len_column_names)]

for row in self.reader:
row_length = len(row)
line_number = self.reader.line_num - 1
row_count += 1
len_row = len(row)

if row_length == length:
if self.empty_columns:
for i, value in enumerate(row):
if value == '':
empty_counts[i] += 1

if len_row == len_column_names:
yield row

if self.join_short_rows:
Expand All @@ -71,32 +91,32 @@ def checked_rows(self):

continue

if self.fill_short_rows and row_length < length:
yield row + [self.fillvalue] * (length - row_length)
if self.fill_short_rows and len_row < len_column_names:
yield row + [self.fillvalue] * (len_column_names - len_row)

continue

length_mismatch_error = LengthMismatchError(self.reader.line_num - 1, row, length)
length_error = Error(line_number, row, f'Expected {len_column_names} columns, found {len_row} columns')

self.errors.append(length_mismatch_error)
self.errors.append(length_error)

if self.join_short_rows:
if row_length > length:
if len_row > len_column_names:
# Don't join with long rows.
joinable_row_errors = []
continue

joinable_row_errors.append(length_mismatch_error)
joinable_row_errors.append(length_error)
if len(joinable_row_errors) == 1:
continue

while joinable_row_errors:
fixed_row = join_rows([error.row for error in joinable_row_errors], separator=self.separator)

if len(fixed_row) < length:
if len(fixed_row) < len_column_names:
break

if len(fixed_row) == length:
if len(fixed_row) == len_column_names:
yield fixed_row

for fixed in joinable_row_errors:
Expand All @@ -107,3 +127,15 @@ def checked_rows(self):

# keep trying in case we're too long because of a straggler
joinable_row_errors = joinable_row_errors[1:]

if row_count:
if empty_columns := [i for i, count in enumerate(empty_counts) if count == row_count]:
offset = 0 if self.zero_based else 1
self.errors.append(
Error(
1,
["" for _ in range(len_column_names)],
f"Empty columns named {', '.join(repr(self.column_names[i]) for i in empty_columns)}! "
f"Try: csvcut -C {','.join(str(i + offset) for i in empty_columns)}",
)
)
26 changes: 0 additions & 26 deletions csvkit/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,32 +23,6 @@ class ColumnIdentifierError(CustomException):
pass


class CSVTestException(CustomException):
    """
    Base class for every exception raised when a row fails a test.

    Each instance carries the line number, the offending row, and an explanatory message.
    """

    def __init__(self, line_number, row, msg):
        # The message is handed to the parent class; the row context is kept on the instance.
        super().__init__(msg)
        self.line_number, self.row = line_number, row


class LengthMismatchError(CSVTestException):
    """
    Describe a row whose number of cells differs from the expected length.
    """

    def __init__(self, line_number, row, expected_length):
        found_length = len(row)
        super().__init__(
            line_number,
            row,
            'Expected %i columns, found %i columns' % (expected_length, found_length),
        )

    @property
    def length(self):
        """The number of cells in the offending row."""
        return len(self.row)


class InvalidValueForTypeException(CustomException):
"""
Exception raised when a value can not be normalized to a specified type.
Expand Down
5 changes: 5 additions & 0 deletions csvkit/utilities/csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ def add_arguments(self):
self.argparser.add_argument(
'--fillvalue', dest='fillvalue',
help='The value with which to fill short rows. Defaults to none.')
self.argparser.add_argument(
'--empty-columns', dest='empty_columns', action='store_true',
help='Report empty columns as errors.')

def main(self):
if self.additional_input_expected():
Expand All @@ -46,6 +49,8 @@ def main(self):
separator=self.args.separator,
fill_short_rows=self.args.fill_short_rows,
fillvalue=self.args.fillvalue,
empty_columns=self.args.empty_columns,
zero_based=self.args.zero_based,
)

output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
Expand Down
29 changes: 27 additions & 2 deletions docs/scripts/csvclean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ Description

Cleans a CSV file of common syntax errors:

- reports rows that have a different number of columns than the header row
- Reports rows that have a different number of columns than the header row.
- Reports columns that are empty, if the :code:`--empty-columns` option is set.
- If a CSV has unquoted cells that contain line breaks, like:

.. code-block:: none
Expand Down Expand Up @@ -103,13 +104,14 @@ All valid rows are written to standard output, and all error rows along with lin
--fillvalue FILLVALUE
The value with which to fill short rows. Defaults to
none.
--empty-columns Report empty columns as errors.
See also: :doc:`../common_arguments`.

Examples
========

Test a file with known bad rows:
Test a file with data rows that are shorter and longer than the header row:

.. code-block:: console
Expand All @@ -125,6 +127,29 @@ Test a file with known bad rows:

If any data rows are longer than the header row, you need to add columns manually: for example, by adding one or more delimiters (``,``) to the end of the header row. :code:`csvclean` can't do this, because it is designed to work with standard input, and correcting an error at the start of the CSV data based on an observation later in the CSV data would require holding all the CSV data in memory – which is not an option for large files.

Test a file with empty columns:

.. code-block:: console
$ csvclean --empty-columns examples/test_empty_columns.csv 2> errors.csv
a,b,c,,
a,,,,
,,c,,
,,,,
$ cat errors.csv
line_number,msg,a,b,c,,
1,"Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5",,,,,
Use :doc:`csvcut` to exclude the empty columns:

.. code-block:: bash
$ csvcut -C 2,4,5 examples/test_empty_columns.csv
a,c
a,
,c
,
To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use:

.. code-block:: bash
Expand Down
4 changes: 4 additions & 0 deletions examples/test_empty_columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
a,b,c,,
a,,,,
,,c,,
,,,,
26 changes: 24 additions & 2 deletions tests/test_utilities/test_csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,13 @@ def test_simple(self):
def test_no_header_row(self):
self.assertCleaned(['examples/no_header_row.csv'], [
['1', '2', '3'],
], [])
])

def test_header_normalize_space(self):
self.assertCleaned(['--header-normalize-space', 'examples/test_header_newline.csv'], [
['start end', 'b', 'c'],
['d', 'e', 'f'],
], [])
])

def test_join_short_rows(self):
self.assertCleaned(['--join-short-rows', 'examples/test_join_short_rows.csv'], [
Expand Down Expand Up @@ -113,6 +113,28 @@ def test_fill_short_rows_separator(self):
['3', 'b', 'c'],
])

def test_empty_columns(self):
    # In the fixture, column 'b' and the two unnamed trailing columns are empty in every data row,
    # so they are reported using 1-based column positions.
    arguments = ['--empty-columns', 'examples/test_empty_columns.csv']
    output_rows = [
        ['a', 'b', 'c', '', ''],
        ['a', '', '', '', ''],
        ['', '', 'c', '', ''],
        ['', '', '', '', ''],
    ]
    error_rows = [
        ['line_number', 'msg', 'a', 'b', 'c', '', ''],
        ['1', "Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5", '', '', '', '', ''],
    ]
    self.assertCleaned(arguments, output_rows, error_rows)

def test_empty_columns_zero(self):
    # Same fixture as the 1-based test; with --zero the suggested csvcut positions are 0-based.
    arguments = ['--empty-columns', '--zero', 'examples/test_empty_columns.csv']
    output_rows = [
        ['a', 'b', 'c', '', ''],
        ['a', '', '', '', ''],
        ['', '', 'c', '', ''],
        ['', '', '', '', ''],
    ]
    error_rows = [
        ['line_number', 'msg', 'a', 'b', 'c', '', ''],
        ['1', "Empty columns named 'b', '', ''! Try: csvcut -C 1,3,4", '', '', '', '', ''],
    ]
    self.assertCleaned(arguments, output_rows, error_rows)

def test_removes_optional_quote_characters(self):
self.assertCleaned(['examples/optional_quote_characters.csv'], [
['a', 'b', 'c'],
Expand Down

0 comments on commit b8d152b

Please sign in to comment.