diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 46d66aaaa..426e8d7d9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,8 @@ Unreleased ---------- +* :doc:`/scripts/csvstat` reports a "Non-null values" statistic (or a :code:`nonnulls` column when :code:`--csv` is set). +* :doc:`/scripts/csvstat` adds a :code:`--non-nulls` option to only output counts of non-null values. * feat: Add a :code:`--null-value` option to commands with the :code:`--blanks` option, to convert additional values to NULL. * Add Python 3.12 support. @@ -166,7 +168,7 @@ This is a minor release which fixes several bugs reported in the :code:`1.0.0` r * :doc:`/scripts/csvstat` no longer crashes when a :code:`Number` column has :code:`None` as a frequent value. (#738) * :doc:`/scripts/csvlook` documents that output tables are Markdown-compatible. (#734) -* :doc:`/scripts/csvstat` accepts a :code:`--csv` flag for tabular output. (#584) +* :doc:`/scripts/csvstat` adds a :code:`--csv` flag for tabular output. (#584) * :doc:`/scripts/csvstat` output is easier to read. (#714) * :doc:`/scripts/csvpy` has a better description when using the :code:`--agate` flag. (#729) * Fix a Python 2.6 bug preventing :doc:`/scripts/csvjson` from parsing utf-8 files. (#732) diff --git a/csvkit/utilities/csvstat.py b/csvkit/utilities/csvstat.py index 74e79ea47..37326d77e 100644 --- a/csvkit/utilities/csvstat.py +++ b/csvkit/utilities/csvstat.py @@ -19,6 +19,10 @@ 'aggregation': agate.HasNulls, 'label': 'Contains null values: ', }), + ('nonnulls', { + 'aggregation': agate.Count, + 'label': 'Non-null values: ', + }), ('unique', { 'aggregation': None, 'label': 'Unique values: ', @@ -79,6 +83,9 @@ def add_arguments(self): self.argparser.add_argument( '--nulls', dest='nulls_only', action='store_true', help='Only output whether columns contains nulls.') + self.argparser.add_argument( + '--non-nulls', dest='nonnulls_only', action='store_true', + help='Only output counts of non-null values.') self.argparser.add_argument( '--unique', dest='unique_only', action='store_true', help='Only output counts of unique values.') @@ -351,6 +358,7 @@ def format_decimal(d, f='%.3f', no_grouping_separator=False): return locale.format_string(f, d, grouping=not no_grouping_separator).rstrip('0').rstrip('.') +# These are accessed via: globals().get(f'get_{op_name}') def get_type(table, column_id, **kwargs): return f'{table.columns[column_id].data_type.__class__.__name__}' diff --git a/tests/test_utilities/test_csvstat.py b/tests/test_utilities/test_csvstat.py index f636cae51..a2636db64 100644 --- a/tests/test_utilities/test_csvstat.py +++ b/tests/test_utilities/test_csvstat.py @@ -76,14 +76,14 @@ def test_csv(self): header = next(reader) self.assertEqual(header[1], 'column_name') - self.assertEqual(header[4], 'unique') + self.assertEqual(header[5], 'unique') row = next(reader) self.assertEqual(row[1], 'state') self.assertEqual(row[2], 'Text') - self.assertEqual(row[5], '') - self.assertEqual(row[11], '2') + self.assertEqual(row[6], '') + self.assertEqual(row[12], '2') def test_csv_columns(self): output = self.get_output_as_io(['--csv', '-c', '4', 'examples/realdata/ks_1033_data.csv']) @@ -93,14 +93,14 @@ def test_csv_columns(self): header = next(reader) self.assertEqual(header[1], 'column_name') - self.assertEqual(header[4], 'unique') + self.assertEqual(header[5], 'unique') row = next(reader) self.assertEqual(row[1], 'nsn') self.assertEqual(row[2], 'Text') - self.assertEqual(row[5], '') - self.assertEqual(row[11], '16') + self.assertEqual(row[6], '') + self.assertEqual(row[12], '16') def test_decimal_format(self): output = self.get_output(['-c', 'TOTAL', '--mean', 'examples/realdata/FY09_EDU_Recipients_by_State.csv'])