Skip to content

Commit

Permalink
Merge pull request #1241 from wireservice/195-csvclean
Browse files Browse the repository at this point in the history
csvclean: Use standard output and standard error, and use exit code 1 if errors #781 #195
  • Loading branch information
jpmckinney authored Apr 27, 2024
2 parents d00ea20 + 8d700e8 commit b3f68a3
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 145 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
2.0.0 - Unreleased
------------------

**BACKWARDS-INCOMPATIBLE CHANGES**

* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.

1.5.0 - March 28, 2024
----------------------

Expand Down
11 changes: 4 additions & 7 deletions csvkit/cleanup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@

def join_rows(rows, joiner=' '):
"""
Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a
single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg.
Given a series of rows, return them as a single row where the inner edge cells are merged.
:param joiner:
The separator between cells, a single space by default.
"""
rows = list(rows)
fixed_row = rows[0][:]
Expand All @@ -33,8 +35,6 @@ def __init__(self, reader):
except StopIteration:
self.column_names = []
self.errors = []
self.rows_joined = 0
self.joins = 0

def checked_rows(self):
"""
Expand Down Expand Up @@ -69,9 +69,6 @@ def checked_rows(self):
break

if len(fixed_row) == length:
self.rows_joined += len(joinable_row_errors)
self.joins += 1

yield fixed_row

for fixed in joinable_row_errors:
Expand Down
9 changes: 8 additions & 1 deletion csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,26 @@ class CSVKitUtility:
epilog = ''
override_flags = ''

def __init__(self, args=None, output_file=None):
def __init__(self, args=None, output_file=None, error_file=None):
"""
Perform argument processing and other setup for a CSVKitUtility.
"""
self._init_common_parser()
self.add_arguments()
self.args = self.argparser.parse_args(args)

# Output file is only set during testing.
if output_file is None:
self.output_file = sys.stdout
else:
self.output_file = output_file

# Error file is only set during testing.
if error_file is None:
self.error_file = sys.stderr
else:
self.error_file = error_file

self.reader_kwargs = self._extract_csv_reader_kwargs()
self.writer_kwargs = self._extract_csv_writer_kwargs()

Expand Down
72 changes: 12 additions & 60 deletions csvkit/utilities/csvclean.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/usr/bin/env python

import sys
from os.path import splitext

import agate

Expand All @@ -14,75 +13,28 @@ class CSVClean(CSVKitUtility):
override_flags = ['L', 'blanks', 'date-format', 'datetime-format']

def add_arguments(self):
self.argparser.add_argument(
'-n', '--dry-run', dest='dryrun', action='store_true',
help='Do not create output files. Information about what would have been done will be printed to STDERR.')
pass

def main(self):
if self.additional_input_expected():
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')

reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)

if self.args.dryrun:
checker = RowChecker(reader)
checker = RowChecker(reader)

for _row in checker.checked_rows():
pass
output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
output_writer.writerow(checker.column_names)
for row in checker.checked_rows():
output_writer.writerow(row)

if checker.errors:
for e in checker.errors:
self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
else:
self.output_file.write('No errors.\n')
if checker.errors:
error_writer = agate.csv.writer(self.error_file, **self.writer_kwargs)
error_writer.writerow(['line_number', 'msg'] + checker.column_names)
for error in checker.errors:
error_writer.writerow([error.line_number, error.msg] + error.row)

if checker.joins:
self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected '
'internal line breaks.\n' % (checker.rows_joined, checker.joins))
else:
if self.input_file == sys.stdin:
base = 'stdin' # "<stdin>_out.csv" is invalid on Windows
else:
base = splitext(self.input_file.name)[0]

with open(f'{base}_out.csv', 'w') as f:
clean_writer = agate.csv.writer(f, **self.writer_kwargs)

checker = RowChecker(reader)
clean_writer.writerow(checker.column_names)

for row in checker.checked_rows():
clean_writer.writerow(row)

if checker.errors:
error_filename = f'{base}_err.csv'

with open(error_filename, 'w') as f:
error_writer = agate.csv.writer(f, **self.writer_kwargs)

error_header = ['line_number', 'msg']
error_header.extend(checker.column_names)
error_writer.writerow(error_header)

error_count = len(checker.errors)

for e in checker.errors:
error_writer.writerow(self._format_error_row(e))

self.output_file.write('%i error%s logged to %s\n' % (
error_count, '' if error_count == 1 else 's', error_filename))
else:
self.output_file.write('No errors.\n')

if checker.joins:
self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal '
'line breaks.\n' % (checker.rows_joined, checker.joins))

def _format_error_row(self, error):
row = [error.line_number, error.msg]
row.extend(error.row)

return row
sys.exit(1)


def launch_new_instance():
Expand Down
16 changes: 9 additions & 7 deletions docs/scripts/csvclean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ Note that every csvkit tool does the following:
* changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
* changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option

Outputs [basename]_out.csv and [basename]_err.csv, the former containing all valid rows and the latter containing all error rows along with line numbers and descriptions:
All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1::

.. code-block:: none
usage: csvclean [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H]
[-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-n]
[-K SKIP_LINES] [-v] [-l] [--zero] [-V]
[FILE]
Fix common errors in a CSV file.
Expand All @@ -35,8 +35,6 @@ Outputs [basename]_out.csv and [basename]_err.csv, the former containing all val
optional arguments:
-h, --help show this help message and exit
-n, --dry-run Do not create output files. Information about what
would have been done will be printed to STDERR.
See also: :doc:`../common_arguments`.

Expand All @@ -47,9 +45,13 @@ Test a file with known bad rows:

.. code-block:: console
$ csvclean -n examples/bad.csv
Line 1: Expected 3 columns, found 4 columns
Line 2: Expected 3 columns, found 2 columns
$ csvclean examples/bad.csv 2> errors.csv
column_a,column_b,column_c
0,mixed types.... uh oh,17
$ cat errors.csv
line_number,msg,column_a,column_b,column_c
1,"Expected 3 columns, found 4 columns",1,27,,I'm too long!
2,"Expected 3 columns, found 2 columns",,I'm too short!
To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use:

Expand Down
133 changes: 63 additions & 70 deletions tests/test_utilities/test_csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import sys
from unittest.mock import patch

import agate

from csvkit.utilities.csvclean import CSVClean, launch_new_instance
from tests.utils import CSVKitTestCase, EmptyFileTests

Expand All @@ -15,98 +17,89 @@ def tearDown(self):
if os.path.isfile(output_file):
os.remove(output_file)

def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]):
args = [f'examples/{basename}.csv'] + additional_args
def assertCleaned(self, args, output_rows, error_rows=[]):
output_file = io.StringIO()
error_file = io.StringIO()

utility = CSVClean(args, output_file)
utility.run()
utility = CSVClean(args, output_file, error_file)

output_file.close()
if error_rows:
with self.assertRaises(SystemExit) as e:
utility.run()

self.assertEqual(e.exception.code, 1)
else:
utility.run()

output_file.seek(0)
error_file.seek(0)

output_file = f'examples/{basename}_out.csv'
error_file = f'examples/{basename}_err.csv'

self.assertEqual(os.path.exists(output_file), bool(output_lines))
self.assertEqual(os.path.exists(error_file), bool(error_lines))

try:
if output_lines:
with open(output_file) as f:
for line in output_lines:
self.assertEqual(next(f), line)
self.assertRaises(StopIteration, next, f)
if error_lines:
with open(error_file) as f:
for line in error_lines:
self.assertEqual(next(f), line)
self.assertRaises(StopIteration, next, f)
finally:
if output_lines:
os.remove(output_file)
if error_lines:
os.remove(error_file)
if output_rows:
reader = agate.csv.reader(output_file)
for row in output_rows:
self.assertEqual(next(reader), row)
self.assertRaises(StopIteration, next, reader)
if error_rows:
reader = agate.csv.reader(error_file)
for row in error_rows:
self.assertEqual(next(reader), row)
self.assertRaises(StopIteration, next, reader)

output_file.close()
error_file.close()

def test_launch_new_instance(self):
with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/bad.csv']):
with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/dummy.csv']):
launch_new_instance()

def test_skip_lines(self):
self.assertCleaned('bad_skip_lines', [
'column_a,column_b,column_c\n',
'0,mixed types.... uh oh,17\n',
self.assertCleaned(['--skip-lines', '3', 'examples/bad_skip_lines.csv'], [
['column_a', 'column_b', 'column_c'],
['0', 'mixed types.... uh oh', '17'],
], [
'line_number,msg,column_a,column_b,column_c\n',
'1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
'2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
], ['--skip-lines', '3'])
['line_number', 'msg', 'column_a', 'column_b', 'column_c'],
['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"],
['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"],
])

def test_simple(self):
self.assertCleaned('bad', [
'column_a,column_b,column_c\n',
'0,mixed types.... uh oh,17\n',
self.assertCleaned(['examples/bad.csv'], [
['column_a', 'column_b', 'column_c'],
['0', 'mixed types.... uh oh', '17'],
], [
'line_number,msg,column_a,column_b,column_c\n',
'1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
'2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
['line_number', 'msg', 'column_a', 'column_b', 'column_c'],
['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"],
['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"],
])

def test_no_header_row(self):
self.assertCleaned('no_header_row', [
'1,2,3\n',
self.assertCleaned(['examples/no_header_row.csv'], [
['1', '2', '3'],
], [])

def test_removes_optional_quote_characters(self):
self.assertCleaned('optional_quote_characters', [
'a,b,c\n',
'1,2,3\n',
], [])
self.assertCleaned(['examples/optional_quote_characters.csv'], [
['a', 'b', 'c'],
['1', '2', '3'],
])

def test_changes_line_endings(self):
self.assertCleaned('mac_newlines', [
'a,b,c\n',
'1,2,3\n',
'"Once upon\n',
'a time",5,6\n',
], [])
self.assertCleaned(['examples/mac_newlines.csv'], [
['a', 'b', 'c'],
['1', '2', '3'],
['Once upon\na time', '5', '6'],
])

def test_changes_character_encoding(self):
self.assertCleaned('test_latin1', [
'a,b,c\n',
'1,2,3\n',
'4,5,©\n',
], [], ['-e', 'latin1'])
self.assertCleaned(['-e', 'latin1', 'examples/test_latin1.csv'], [
['a', 'b', 'c'],
['1', '2', '3'],
['4', '5', u'©'],
])

def test_removes_bom(self):
self.assertCleaned('test_utf8_bom', [
'foo,bar,baz\n',
'1,2,3\n',
'4,5,ʤ\n',
], [], [])

def test_dry_run(self):
output = self.get_output_as_io(['-n', 'examples/bad.csv'])
self.assertFalse(os.path.exists('examples/bad_err.csv'))
self.assertFalse(os.path.exists('examples/bad_out.csv'))
self.assertEqual(next(output)[:6], 'Line 1')
self.assertEqual(next(output)[:6], 'Line 2')
self.assertCleaned(['examples/test_utf8_bom.csv'], [
['foo', 'bar', 'baz'],
['1', '2', '3'],
['4', '5', 'ʤ'],
])

0 comments on commit b3f68a3

Please sign in to comment.