Skip to content

Commit

Permalink
feat(csvclean): Add --label option
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Apr 30, 2024
1 parent fb1f8c2 commit 44ff089
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 3 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Other changes:
- :code:`--length-mismatch`, to error on data rows that are shorter or longer than the header row
- :code:`--empty-columns`, to error on empty columns
- :code:`--enable-all-checks`, to enable all error reporting.
- :code:`--omit-error-rows`, to omit data rows that contain errors, from standard output.
- :code:`--label`, to add a "label" column to standard error.
- :code:`--header-normalize-space`, to strip leading and trailing whitespace and replace sequences of whitespace characters by a single space in the header
- :code:`--separator`, to change the string with which to join short rows
- :code:`--fill-short-rows`, to fill short rows with the missing cells
Expand Down
23 changes: 21 additions & 2 deletions csvkit/utilities/csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def add_arguments(self):
self.argparser.add_argument(
'--omit-error-rows', dest='omit_error_rows', action='store_true',
help='Omit data rows that contain errors, from standard output.')
self.argparser.add_argument(
'--label', dest='label',
help='Add a "label" column to standard error. Useful in automated workflows. '
'Use "-" to default to the input filename.')
self.argparser.add_argument(
'--header-normalize-space', dest='header_normalize_space', action='store_true',
help='Strip leading and trailing whitespace and replace sequences of whitespace characters by a single '
Expand Down Expand Up @@ -81,16 +85,31 @@ def main(self):
omit_error_rows=self.args.omit_error_rows,
)

label = self.args.label
if label == '-':
if self.input_file == sys.stdin:
label = 'stdin'
else:
label = self.input_file.name

output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
output_writer.writerow(checker.column_names)
for row in checker.checked_rows():
output_writer.writerow(row)

if checker.errors:
error_writer = agate.csv.writer(self.error_file, **self.writer_kwargs)
error_writer.writerow(['line_number', 'msg'] + checker.column_names)

fieldnames = ['line_number', 'msg'] + checker.column_names
if self.args.label:
fieldnames.insert(0, 'label')
error_writer.writerow(fieldnames)

for error in checker.errors:
error_writer.writerow([error.line_number, error.msg] + error.row)
row = [error.line_number, error.msg] + error.row
if self.args.label:
row.insert(0, label)
error_writer.writerow(row)

sys.exit(1)

Expand Down
2 changes: 2 additions & 0 deletions docs/scripts/csvclean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ Usage
Enable all error reporting.
--omit-error-rows Omit data rows that contain errors, from standard
output.
--label LABEL Add a "label" column to standard error. Useful in
automated workflows. Use "-" to default to the input
filename.
--header-normalize-space
Strip leading and trailing whitespace and replace
sequences of whitespace characters by a single space
Expand Down
39 changes: 38 additions & 1 deletion tests/test_utilities/test_csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import agate

from csvkit.utilities.csvclean import CSVClean, launch_new_instance
from tests.utils import CSVKitTestCase, EmptyFileTests
from tests.utils import CSVKitTestCase, EmptyFileTests, stdin_as_string


class TestCSVClean(CSVKitTestCase, EmptyFileTests):
Expand Down Expand Up @@ -180,6 +180,43 @@ def test_enable_all_checks(self):
['1', "Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5", '', '', '', '', ''],
])

def test_label(self):
    """An explicit --label value becomes the first column of the error report."""
    args = ['-a', '--label', 'xyz', 'examples/test_empty_columns.csv']

    # Rows expected on the cleaned output.
    expected_output = [
        ['a', 'b', 'c', '', ''],
        ['a', '', '', '', ''],
        ['', '', 'c', ''],
        ['', '', '', '', ''],
    ]

    # Error rows: a "label" header cell, then the given label on every row.
    expected_errors = [
        ['label', 'line_number', 'msg', 'a', 'b', 'c', '', ''],
        ['xyz', '2', 'Expected 5 columns, found 4 columns', '', '', 'c', ''],
        ['xyz', '1', "Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5", '', '', '', '', ''],
    ]

    self.assertCleaned(args, expected_output, expected_errors)

def test_label_default(self):
    """Passing "-" as the label falls back to the input file's name."""
    path = 'examples/test_empty_columns.csv'
    args = ['-a', '--label', '-', path]

    # Rows expected on the cleaned output.
    expected_output = [
        ['a', 'b', 'c', '', ''],
        ['a', '', '', '', ''],
        ['', '', 'c', ''],
        ['', '', '', '', ''],
    ]

    # Error rows: every row is labelled with the input path.
    expected_errors = [
        ['label', 'line_number', 'msg', 'a', 'b', 'c', '', ''],
        ['examples/test_empty_columns.csv', '2', 'Expected 5 columns, found 4 columns', '', '', 'c', ''],
        ['examples/test_empty_columns.csv', '1', "Empty columns named 'b', '', ''! Try: csvcut -C 2,4,5", '', '', '', '', ''],  # noqa: E501
    ]

    self.assertCleaned(args, expected_output, expected_errors)

def test_label_default_stdin(self):
    """Passing "-" as the label while reading standard input yields the label "stdin"."""
    args = ['-a', '--label', '-']

    # Rows expected on the cleaned output.
    expected_output = [
        ['a', 'b', 'c'],
        ['', ''],
    ]

    # Error rows: with no file name available, every row is labelled "stdin".
    expected_errors = [
        ['label', 'line_number', 'msg', 'a', 'b', 'c'],
        ['stdin', '1', 'Expected 3 columns, found 2 columns', '', ''],
        ['stdin', '1', "Empty columns named 'a', 'b', 'c'! Try: csvcut -C 1,2,3", '', '', ''],
    ]

    with stdin_as_string(io.BytesIO(b'a,b,c\n,\n')):
        self.assertCleaned(args, expected_output, expected_errors)

def test_removes_optional_quote_characters(self):
self.assertCleaned(['--length-mismatch', 'examples/optional_quote_characters.csv'], [
['a', 'b', 'c'],
Expand Down

0 comments on commit 44ff089

Please sign in to comment.