Skip to content

Commit

Permalink
csvstat: Add --json and --indent options, closes #1216
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Oct 17, 2023
1 parent 36c78b9 commit 39ca69f
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 40 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
Unreleased
----------

* :doc:`/scripts/csvstat` adds a :code:`--json` option to output results as JSON text.
* :doc:`/scripts/csvstat` adds an :code:`--indent` option to indent the JSON text when :code:`--json` is set.
* :doc:`/scripts/csvstat` reports a "Non-null values" statistic (or a :code:`nonnulls` column when :code:`--csv` is set).
* :doc:`/scripts/csvstat` adds a :code:`--non-nulls` option to only output counts of non-null values.
* feat: Add a :code:`--null-value` option to commands with the :code:`--blanks` option, to convert additional values to NULL.
Expand Down
10 changes: 10 additions & 0 deletions csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import argparse
import bz2
import datetime
import decimal
import gzip
import itertools
import lzma
Expand Down Expand Up @@ -410,6 +412,14 @@ def isatty(f):
return False


def default(obj):
    """
    Convert an object that the :mod:`json` encoder cannot serialize natively into a string.

    Intended to be passed as the ``default`` argument of :func:`json.dump` or :func:`json.dumps`.

    :param obj: The object the JSON encoder could not serialize.
    :returns: The ISO 8601 text for dates, datetimes and times, or the ``str()`` representation
        for decimals and timedeltas.
    :raises TypeError: If ``obj`` is of any other type, as the ``default`` hook is expected to do.
    """
    # datetime.datetime is a subclass of datetime.date; it is listed explicitly for readability.
    if isinstance(obj, (datetime.date, datetime.datetime, datetime.time)):
        return obj.isoformat()
    # str() preserves a Decimal's exact digits (no float rounding). A timedelta has no
    # isoformat(), so its standard string form (e.g. "1 day, 0:00:05") is used instead.
    if isinstance(obj, (decimal.Decimal, datetime.timedelta)):
        return str(obj)
    raise TypeError(f'{obj!r} is not JSON serializable')


def make_default_headers(n):
"""
Make a set of simple, default headers for files that are missing them.
Expand Down
14 changes: 2 additions & 12 deletions csvkit/utilities/csvjson.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
#!/usr/bin/env python

import datetime
import decimal
import json
import sys
from collections import OrderedDict

import agate

from csvkit.cli import CSVKitUtility, match_column_identifier
from csvkit.cli import CSVKitUtility, default, match_column_identifier


class CSVJSON(CSVKitUtility):
Expand Down Expand Up @@ -75,7 +73,6 @@ def main(self):
self.argparser.error('--key is only allowed with --stream when --lat and --lon are also specified.')

self.json_kwargs = {
'ensure_ascii': False,
'indent': self.args.indent,
}

Expand All @@ -98,14 +95,7 @@ def main(self):
self.output_json()

def dump_json(self, data, newline=False):
def default(obj):
if isinstance(obj, (datetime.date, datetime.datetime)):
return obj.isoformat()
if isinstance(obj, decimal.Decimal):
return str(obj)
raise TypeError(f'{repr(obj)} is not JSON serializable')

json.dump(data, self.output_file, default=default, **self.json_kwargs)
json.dump(data, self.output_file, default=default, ensure_ascii=False, **self.json_kwargs)
if newline:
self.output_file.write("\n")

Expand Down
57 changes: 35 additions & 22 deletions csvkit/utilities/csvstat.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
#!/usr/bin/env python

import json
import locale
import warnings
from collections import Counter, OrderedDict
from decimal import Decimal

import agate

from csvkit.cli import CSVKitUtility, parse_column_identifiers
from csvkit.cli import CSVKitUtility, default, parse_column_identifiers

locale.setlocale(locale.LC_ALL, '')
OPERATIONS = OrderedDict([
Expand Down Expand Up @@ -69,7 +70,13 @@ class CSVStat(CSVKitUtility):
def add_arguments(self):
self.argparser.add_argument(
'--csv', dest='csv_output', action='store_true',
help='Output results as a CSV, rather than text.')
help='Output results as a CSV table, rather than plain text.')
self.argparser.add_argument(
'--json', dest='json_output', action='store_true',
help='Output results as JSON text, rather than plain text.')
self.argparser.add_argument(
'-i', '--indent', dest='indent', type=int,
help='Indent the output JSON this many spaces. Disabled by default.')
self.argparser.add_argument(
'-n', '--names', dest='names_only', action='store_true',
help='Display column names and indices from the input CSV and exit.')
Expand Down Expand Up @@ -147,7 +154,9 @@ def main(self):
if operations and self.args.csv_output:
self.argparser.error(
'You may not specify --csv and an operation (--mean, --median, etc) at the same time.')

if operations and self.args.json_output:
self.argparser.error(
'You may not specify --json and an operation (--mean, --median, etc) at the same time.')
if operations and self.args.count_only:
self.argparser.error(
'You may not specify --count and an operation (--mean, --median, etc) at the same time.')
Expand Down Expand Up @@ -194,10 +203,10 @@ def main(self):
for column_id in column_ids:
stats[column_id] = self.calculate_stats(table, column_id, **kwargs)

# Output as CSV
if self.args.csv_output:
self.print_csv(table, column_ids, stats)
# Output all stats
elif self.args.json_output:
self.print_json(table, column_ids, stats)
else:
self.print_stats(table, column_ids, stats)

Expand Down Expand Up @@ -325,33 +334,37 @@ def print_stats(self, table, column_ids, stats):

def print_csv(self, table, column_ids, stats):
"""
Print data for all statistics as a csv table.
Print data for all statistics as a CSV table.
"""
writer = agate.csv.writer(self.output_file)
header = ['column_id', 'column_name'] + list(OPERATIONS)

header = ['column_id', 'column_name'] + list(OPERATIONS.keys())
writer = agate.csv.DictWriter(self.output_file, fieldnames=header)
writer.writeheader()

for row in self._rows(table, column_ids, stats):
if 'freq' in row:
row['freq'] = ', '.join([str(row['value']) for row in row['freq']])
writer.writerow(row)

def print_json(self, table, column_ids, stats):
"""
Print data for all statistics as a JSON text.
"""
data = list(self._rows(table, column_ids, stats))

writer.writerow(header)
json.dump(data, self.output_file, default=default, ensure_ascii=False, indent=self.args.indent)

def _rows(self, table, column_ids, stats):
for column_id in column_ids:
column_name = table.column_names[column_id]
column_stats = stats[column_id]

output_row = [column_id + 1, column_name]

output_row = {'column_id': column_id + 1, 'column_name': column_name}
for op_name, _op_data in OPERATIONS.items():
if column_stats[op_name] is None:
output_row.append(None)
continue

if op_name == 'freq':
value = ', '.join([str(row['value']) for row in column_stats['freq']])
else:
value = column_stats[op_name]

output_row.append(value)
if column_stats[op_name] is not None:
output_row[op_name] = column_stats[op_name]

writer.writerow(output_row)
yield output_row


def format_decimal(d, f='%.3f', no_grouping_separator=False):
Expand Down
17 changes: 11 additions & 6 deletions docs/scripts/csvstat.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@ Prints descriptive statistics for all columns in a CSV file. Will intelligently
usage: csvstat [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
[-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H]
[-K SKIP_LINES] [-v] [-l] [--zero] [-V] [--csv] [-n]
[-c COLUMNS] [--type] [--nulls] [--unique] [--min] [--max]
[--sum] [--mean] [--median] [--stdev] [--len] [--freq]
[--freq-count FREQ_COUNT] [--count] [--decimal-format DECIMAL_FORMAT]
[-G] [-y SNIFF_LIMIT]
[-K SKIP_LINES] [-v] [-l] [--zero] [-V] [--csv] [--json]
[-i INDENT] [-n] [-c COLUMNS] [--type] [--nulls] [--non-nulls]
[--unique] [--min] [--max] [--sum] [--mean] [--median] [--stdev]
[--len] [--freq] [--freq-count FREQ_COUNT] [--count]
[--decimal-format DECIMAL_FORMAT] [-G] [-y SNIFF_LIMIT]
[FILE]
Print descriptive statistics for each column in a CSV file.
Expand All @@ -26,7 +26,11 @@ Prints descriptive statistics for all columns in a CSV file. Will intelligently
optional arguments:
-h, --help show this help message and exit
--csv Output results as a CSV, rather than text.
--csv Output results as a CSV table, rather than plain text.
--json Output results as JSON text, rather than plain text.
-i INDENT, --indent INDENT
Indent the output JSON this many spaces. Disabled by
default.
-n, --names Display column names and indices from the input CSV
and exit.
-c COLUMNS, --columns COLUMNS
Expand All @@ -35,6 +39,7 @@ Prints descriptive statistics for all columns in a CSV file. Will intelligently
all columns.
--type Only output data type.
--nulls Only output whether columns contains nulls.
--non-nulls Only output counts of non-null values.
--unique Only output counts of unique values.
--min Only output smallest values.
--max Only output largest values.
Expand Down
35 changes: 35 additions & 0 deletions tests/test_utilities/test_csvstat.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import sys
from unittest.mock import patch

Expand Down Expand Up @@ -102,6 +103,40 @@ def test_csv_columns(self):
self.assertEqual(row[6], '')
self.assertEqual(row[12], '16')

def test_json(self):
    # Run csvstat with --json over the whole file and inspect the first entry.
    stream = self.get_output_as_io(['--json', 'examples/realdata/ks_1033_data.csv'])
    first_entry = json.load(stream)[0]

    # Key order matters here: the checks below are positional.
    keys = list(first_entry.keys())
    self.assertEqual(keys[1], 'column_name')
    self.assertEqual(keys[5], 'unique')

    values = [first_entry[key] for key in keys]
    self.assertEqual(values[1], 'state')
    self.assertEqual(values[2], 'Text')
    # The 'min' key must be absent from this entry.
    self.assertNotIn('min', first_entry)
    self.assertEqual(values[-2], '2')

def test_json_columns(self):
    # Run csvstat with --json restricted to column 4 and inspect the first entry.
    stream = self.get_output_as_io(['--json', '-c', '4', 'examples/realdata/ks_1033_data.csv'])
    first_entry = json.load(stream)[0]

    # Key order matters here: the checks below are positional.
    keys = list(first_entry.keys())
    self.assertEqual(keys[1], 'column_name')
    self.assertEqual(keys[5], 'unique')

    values = [first_entry[key] for key in keys]
    self.assertEqual(values[1], 'nsn')
    self.assertEqual(values[2], 'Text')
    # The 'min' key must be absent from this entry.
    self.assertNotIn('min', first_entry)
    self.assertEqual(values[-2], '16')

def test_decimal_format(self):
output = self.get_output(['-c', 'TOTAL', '--mean', 'examples/realdata/FY09_EDU_Recipients_by_State.csv'])

Expand Down

0 comments on commit 39ca69f

Please sign in to comment.