From 95dc26dde97ddb2b28947f8b8353d8eace675a71 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 27 Apr 2024 21:44:34 -0400 Subject: [PATCH] fix: csvformat supports --out-quoting 2. --quoting (and --out-quoting) support options from Python 3.12. --- CHANGELOG.rst | 12 +++- csvkit/cli.py | 18 ++--- csvkit/utilities/csvformat.py | 45 ++++++++---- docs/common_arguments.rst | 8 +-- docs/contributing.rst | 2 +- docs/release.rst | 6 -- docs/scripts/csvformat.rst | 12 ++-- tests/test_utilities/test_csvformat.py | 99 ++++++++++++++++++++++++++ 8 files changed, 159 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0cf6ace88..c8c7d0ba2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,10 +1,16 @@ 2.0.0 - Unreleased ------------------ -**BACKWARDS-INCOMPATIBLE CHANGES** +**BACKWARDS-INCOMPATIBLE CHANGES:** * :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``. +Other changes: + +* feat: The :code:`--quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12. +* feat: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option accepts 4 (`csv.QUOTE_STRINGS <https://docs.python.org/3/library/csv.html#csv.QUOTE_STRINGS>`__) and 5 (`csv.QUOTE_NOTNULL <https://docs.python.org/3/library/csv.html#csv.QUOTE_NOTNULL>`__) on Python 3.12. +* fix: :doc:`/scripts/csvformat`: The :code:`--out-quoting` option works with 2 (`csv.QUOTE_NONNUMERIC <https://docs.python.org/3/library/csv.html#csv.QUOTE_NONNUMERIC>`__). Use the :code:`--locale` option to set the locale of any formatted numbers. 
+ 1.5.0 - March 28, 2024 ---------------------- @@ -21,7 +27,7 @@ * :code:`--sniff-limit`` * :code:`--no-inference`` -* feat: :doc:`/scripts/csvpy` removes the ``--linenumbers`` and ``--zero`` output options, which had no effect. +* feat: :doc:`/scripts/csvpy` removes the :code:`--linenumbers` and :code:`--zero` output options, which had no effect. * feat: :doc:`/scripts/in2csv` adds a :code:`--reset-dimensions` option to `recalculate `_ the dimensions of an XLSX file, instead of trusting the file's metadata. csvkit's dependency `agate-excel `_ 0.4.0 automatically recalculates the dimensions if the file's metadata expresses dimensions of "A1:A1" (a single cell). * fix: :doc:`/scripts/csvlook` only reads up to :code:`--max-rows` rows instead of the entire file. * fix: :doc:`/scripts/csvpy` supports the existing input options: @@ -61,7 +67,7 @@ 1.2.0 - October 4, 2023 ----------------------- -* fix: :doc:`/scripts/csvjoin` uses the correct columns when performing a ``--right`` join. +* fix: :doc:`/scripts/csvjoin` uses the correct columns when performing a :code:`--right` join. * Add SQLAlchemy 2 support. * Drop Python 3.7 support (end-of-life was June 5, 2023). diff --git a/csvkit/cli.py b/csvkit/cli.py index 6dabc6bfe..275dbba37 100644 --- a/csvkit/cli.py +++ b/csvkit/cli.py @@ -1,5 +1,4 @@ #!/usr/bin/env python - import argparse import bz2 import csv @@ -22,6 +21,8 @@ except ImportError: zstandard = None +QUOTING_CHOICES = sorted(getattr(csv, name) for name in dir(csv) if name.startswith('QUOTE_')) + class LazyFile: """ @@ -170,9 +171,9 @@ def _init_common_parser(self): help='Character used to quote strings in the input CSV file.') if 'u' not in self.override_flags: self.argparser.add_argument( - '-u', '--quoting', dest='quoting', type=int, choices=[0, 1, 2, 3], - help='Quoting style used in the input CSV file. 
0 = Quote Minimal, 1 = Quote All, ' - '2 = Quote Non-numeric, 3 = Quote None.') + '-u', '--quoting', dest='quoting', type=int, choices=QUOTING_CHOICES, + help='Quoting style used in the input CSV file: 0 quote minimal, 1 quote all, ' + '2 quote non-numeric, 3 quote none.') if 'b' not in self.override_flags: self.argparser.add_argument( '-b', '--no-doublequote', dest='doublequote', action='store_false', @@ -180,7 +181,7 @@ def _init_common_parser(self): if 'p' not in self.override_flags: self.argparser.add_argument( '-p', '--escapechar', dest='escapechar', - help='Character used to escape the delimiter if --quoting 3 ("Quote None") is specified and to escape ' + help='Character used to escape the delimiter if --quoting 3 ("quote none") is specified and to escape ' 'the QUOTECHAR if --no-doublequote is specified.') if 'z' not in self.override_flags: self.argparser.add_argument( @@ -337,12 +338,13 @@ def get_column_types(self): type_kwargs['null_values'].append(null_value) text_type = agate.Text(**type_kwargs) + number_type = agate.Number(locale=self.args.locale, **type_kwargs) - if self.args.no_inference: + if getattr(self.args, 'no_inference', None): types = [text_type] + elif getattr(self.args, 'out_quoting', None) == 2: + types = [number_type, text_type] else: - number_type = agate.Number(locale=self.args.locale, **type_kwargs) - # See the order in the `agate.TypeTester` class. types = [ agate.Boolean(**type_kwargs), diff --git a/csvkit/utilities/csvformat.py b/csvkit/utilities/csvformat.py index 655f5c44b..387db6642 100644 --- a/csvkit/utilities/csvformat.py +++ b/csvkit/utilities/csvformat.py @@ -4,12 +4,12 @@ import agate -from csvkit.cli import CSVKitUtility, make_default_headers +from csvkit.cli import QUOTING_CHOICES, CSVKitUtility, make_default_headers class CSVFormat(CSVKitUtility): description = 'Convert a CSV file to a custom output format.' 
- override_flags = ['L', 'blanks', 'date-format', 'datetime-format'] + override_flags = ['blanks', 'date-format', 'datetime-format'] def add_arguments(self): self.argparser.add_argument( @@ -29,9 +29,9 @@ def add_arguments(self): '-Q', '--out-quotechar', dest='out_quotechar', help='Character used to quote strings in the output file.') self.argparser.add_argument( - '-U', '--out-quoting', dest='out_quoting', type=int, choices=[0, 1, 2, 3], - help='Quoting style used in the output file. 0 = Quote Minimal, 1 = Quote All, ' - '2 = Quote Non-numeric, 3 = Quote None.') + '-U', '--out-quoting', dest='out_quoting', type=int, choices=QUOTING_CHOICES, + help='Quoting style used in the output file: 0 quote minimal, 1 quote all, ' + '2 quote non-numeric, 3 quote none.') self.argparser.add_argument( '-B', '--out-no-doublequote', dest='out_doublequote', action='store_false', help='Whether or not double quotes are doubled in the output file.') @@ -72,18 +72,33 @@ def main(self): if self.additional_input_expected(): sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n') - reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs) writer = agate.csv.writer(self.output_file, **self.writer_kwargs) - if self.args.no_header_row: - # Peek at a row to get the number of columns. - _row = next(reader) - headers = make_default_headers(len(_row)) - reader = itertools.chain([headers, _row], reader) - if self.args.skip_header: - next(reader) - - writer.writerows(reader) + if self.args.out_quoting == 2: + table = agate.Table.from_csv( + self.input_file, + skip_lines=self.args.skip_lines, + column_types=self.get_column_types(), + **self.reader_kwargs, + ) + + # table.to_csv() has no option to omit the column names. 
+ if not self.args.skip_header: + writer.writerow(table.column_names) + + writer.writerows(table.rows) + else: + reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs) + if self.args.no_header_row: + # Peek at a row to get the number of columns. + _row = next(reader) + headers = make_default_headers(len(_row)) + reader = itertools.chain([headers, _row], reader) + + if self.args.skip_header: + next(reader) + + writer.writerows(reader) def launch_new_instance(): diff --git a/docs/common_arguments.rst b/docs/common_arguments.rst index 6487ce046..152bd7ce4 100644 --- a/docs/common_arguments.rst +++ b/docs/common_arguments.rst @@ -13,14 +13,14 @@ csvkit's tools share a set of common command-line arguments. Not every argument -q QUOTECHAR, --quotechar QUOTECHAR Character used to quote strings in the input CSV file. -u {0,1,2,3}, --quoting {0,1,2,3} - Quoting style used in the input CSV file. 0 = Quote - Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 = - Quote None. + Quoting style used in the input CSV file: 0 quote + minimal, 1 quote all, 2 quote non-numeric, 3 quote + none. -b, --no-doublequote Whether or not double quotes are doubled in the input CSV file. -p ESCAPECHAR, --escapechar ESCAPECHAR Character used to escape the delimiter if --quoting 3 - ("Quote None") is specified and to escape the + ("quote none") is specified and to escape the QUOTECHAR if --no-doublequote is specified. 
-z FIELD_SIZE_LIMIT, --maxfieldsize FIELD_SIZE_LIMIT Maximum length of a single field in the input CSV diff --git a/docs/contributing.rst b/docs/contributing.rst index b16825ff9..e1fd6d743 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -70,7 +70,7 @@ Currently, the following tools stream: * :doc:`/scripts/csvclean` * :doc:`/scripts/csvcut` -* :doc:`/scripts/csvformat` +* :doc:`/scripts/csvformat` unless :code:`--out-quoting 2` is set * :doc:`/scripts/csvgrep` * :doc:`/scripts/csvstack` * :doc:`/scripts/sql2csv` diff --git a/docs/release.rst b/docs/release.rst index 5a3db0da6..700468298 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -2,12 +2,6 @@ Release process =============== -.. admonition:: One-time setup - - .. code-block:: bash - - pip install --upgrade build twine - #. All tests pass on continuous integration #. The changelog is up-to-date and dated #. If new options are added, regenerate the usage information in the documentation with, for example: diff --git a/docs/scripts/csvformat.rst b/docs/scripts/csvformat.rst index cbaf956ec..eca45c935 100644 --- a/docs/scripts/csvformat.rst +++ b/docs/scripts/csvformat.rst @@ -10,9 +10,9 @@ Convert a CSV file to a custom output format.: .. code-block:: none usage: csvformat [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b] - [-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H] - [-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-E] - [-D OUT_DELIMITER] [-T] [-A] [-Q OUT_QUOTECHAR] + [-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] + [-L LOCALE] [-S] [-H] [-K SKIP_LINES] [-v] [-l] [--zero] [-V] + [-E] [-D OUT_DELIMITER] [-T] [-A] [-Q OUT_QUOTECHAR] [-U {0,1,2,3}] [-B] [-P OUT_ESCAPECHAR] [-M OUT_LINETERMINATOR] [FILE] @@ -36,9 +36,9 @@ Convert a CSV file to a custom output format.: -Q OUT_QUOTECHAR, --out-quotechar OUT_QUOTECHAR Character used to quote strings in the output file. -U {0,1,2,3}, --out-quoting {0,1,2,3} - Quoting style used in the output file. 
0 = Quote - Minimal, 1 = Quote All, 2 = Quote Non-numeric, 3 = - Quote None. + Quoting style used in the output file: 0 quote + minimal, 1 quote all, 2 quote non-numeric, 3 quote + none. -B, --out-no-doublequote Whether or not double quotes are doubled in the output CSV file. diff --git a/tests/test_utilities/test_csvformat.py b/tests/test_utilities/test_csvformat.py index 7123f2c23..4521f38a8 100644 --- a/tests/test_utilities/test_csvformat.py +++ b/tests/test_utilities/test_csvformat.py @@ -95,3 +95,102 @@ def test_lineterminator(self): self.assertLines(['-M', 'XYZ', 'examples/dummy.csv'], [ 'a,b,cXYZ1,2,3XYZ', ], newline_at_eof=False) + + +class TestCSVFormatQuoteNonNumeric(CSVKitTestCase, EmptyFileTests): + Utility = CSVFormat + + # New test compared to TestCSVFormat. + def test_locale(self): + self.assertLines(['-U', '2', '--locale', 'de_DE', 'examples/test_locale.csv'], [ + '"a","b","c"', + '1.7,200000000,""', + ]) + + + def test_launch_new_instance(self): + with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/dummy.csv']): + launch_new_instance() + + def test_skip_lines(self): + self.assertLines(['-U', '2', '--skip-lines', '3', '-D', '|', 'examples/test_skip_lines.csv'], [ + '"a"|"b"|"c"', + '1|2|3', + ]) + + def test_skip_header(self): + self.assertLines(['-U', '2', '--skip-header', 'examples/dummy.csv'], [ + '1,2,3', + ]) + + def test_skip_header_no_header_row(self): + self.assertLines(['-U', '2', '--no-header-row', '--skip-header', 'examples/no_header_row.csv'], [ + '1,2,3', + ]) + + def test_no_header_row(self): + self.assertLines(['-U', '2', '--no-header-row', 'examples/no_header_row.csv'], [ + '"a","b","c"', + '1,2,3', + ]) + + def test_linenumbers(self): + self.assertLines(['-U', '2', '--linenumbers', 'examples/dummy.csv'], [ + '"line_number","a","b","c"', + '1,1,2,3', + ]) + + def test_delimiter(self): + self.assertLines(['-U', '2', '-D', '|', 'examples/dummy.csv'], [ + '"a"|"b"|"c"', + '1|2|3', + ]) + + def test_tabs(self): + 
self.assertLines(['-U', '2', '-T', 'examples/dummy.csv'], [ + '"a"\t"b"\t"c"', + '1\t2\t3', + ]) + + def test_asv(self): + self.assertLines(['-U', '2', '-A', 'examples/dummy.csv'], [ + '"a"\x1f"b"\x1f"c"\x1e1\x1f2\x1f3\x1e', + ], newline_at_eof=False) + + def test_quotechar(self): + input_file = io.BytesIO(b'a,b,c\n1*2,3,4\n') + + with stdin_as_string(input_file): + self.assertLines(['-U', '2', '-Q', '*'], [ + '*a*,*b*,*c*', + '*1**2*,3,4', + ]) + + input_file.close() + + def test_doublequote(self): + input_file = io.BytesIO(b'a\n"a ""quoted"" string"') + + with stdin_as_string(input_file): + self.assertLines(['-U', '2', '-P', '#', '-B'], [ + '"a"', + '"a #"quoted#" string"', + ]) + + input_file.close() + + def test_escapechar(self): + input_file = io.BytesIO(b'a,b,c\n1"2,3,4\n') + + with stdin_as_string(input_file): + self.assertLines(['-U', '2', '-P', '#', '-U', '3'], [ + 'a,b,c', + '1#"2,3,4', + ]) + + input_file.close() + + def test_lineterminator(self): + self.assertLines(['-U', '2', '-M', 'XYZ', 'examples/dummy.csv'], [ + '"a","b","c"XYZ1,2,3XYZ', + ], newline_at_eof=False)