Skip to content

Commit

Permalink
fix: Reconfigure the encoding of standard input according to the --encoding option, closes #1038
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Oct 18, 2023
1 parent 12be2ff commit e5ef16e
Show file tree
Hide file tree
Showing 14 changed files with 55 additions and 51 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Unreleased
* :doc:`/scripts/csvstat` adds a :code:`--non-nulls` option to only output counts of non-null values.
* :doc:`/scripts/csvstat` adds a :code:`--max-precision` option to only output the greatest number of decimal places.
* feat: Add a :code:`--null-value` option to commands with the :code:`--blanks` option, to convert additional values to NULL.
* fix: Reconfigure the encoding of standard input according to the :code:`--encoding` option, which defaults to ``utf-8-sig``. Affected users no longer need to set the ``PYTHONIOENCODING`` environment variable.
* fix: Prompt the user if additional input is expected (i.e. if no input file or piped data is provided) in :doc:`/scripts/csvjoin`, :doc:`/scripts/csvsql` and :doc:`/scripts/csvstack`.
* fix: No longer errors if a NUL byte occurs in an input file.
* Add Python 3.12 support.
Expand Down
5 changes: 4 additions & 1 deletion csvkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,11 +233,14 @@ def _init_common_parser(self):
'-V', '--version', action='version', version='%(prog)s 1.2.0',
help='Display version information and exit.')

def _open_input_file(self, path):
def _open_input_file(self, path, opened=False):
"""
Open the input file specified on the command line.
"""
if not path or path == '-':
# "UnsupportedOperation: It is not possible to set the encoding or newline of stream after the first read"
if not opened:
sys.stdin.reconfigure(encoding=self.args.encoding)
f = sys.stdin
else:
extension = splitext(path)[1]
Expand Down
2 changes: 1 addition & 1 deletion csvkit/utilities/csvstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def main(self):
output.writerow(headers)

for i, path in enumerate(self.args.input_paths):
f = self._open_input_file(path)
f = self._open_input_file(path, opened=True)
file_is_stdin = path == '-'

if has_groups:
Expand Down
8 changes: 4 additions & 4 deletions tests/test_convert/test_fixed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from io import StringIO
import io

from csvkit.convert import fixed
from csvkit.utilities.in2csv import In2CSV
Expand All @@ -23,7 +23,7 @@ def test_fixed_skip_lines(self):
self.assertEqual(f.read(), output)

def test_fixed_no_inference(self):
input_file = StringIO(' 1 2 3')
input_file = io.BytesIO(b' 1 2 3')

with stdin_as_string(input_file):
self.assertLines(['--no-inference', '-f', 'fixed', '--schema',
Expand All @@ -36,7 +36,7 @@ def test_fixed_no_inference(self):

def test_fixed_streaming(self):
with open('examples/testfixed') as f, open('examples/testfixed_schema.csv') as schema:
output_file = StringIO()
output_file = io.StringIO()
fixed.fixed2csv(f, schema, output=output_file)
output = output_file.getvalue()
output_file.close()
Expand Down Expand Up @@ -91,7 +91,7 @@ def test_schematic_line_parser(self):
bar,6,2
baz,8,5"""

f = StringIO(schema)
f = io.StringIO(schema)
parser = fixed.FixedWidthRowParser(f)
f.close()

Expand Down
4 changes: 2 additions & 2 deletions tests/test_utilities/test_csvclean.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import os
import sys
from io import StringIO
from unittest.mock import patch

from csvkit.utilities.csvclean import CSVClean, launch_new_instance
Expand All @@ -17,7 +17,7 @@ def tearDown(self):

def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]):
args = [f'examples/{basename}.csv'] + additional_args
output_file = StringIO()
output_file = io.StringIO()

utility = CSVClean(args, output_file)
utility.run()
Expand Down
8 changes: 4 additions & 4 deletions tests/test_utilities/test_csvformat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import io
import sys
from io import StringIO
from unittest.mock import patch

from csvkit.utilities.csvformat import CSVFormat, launch_new_instance
Expand Down Expand Up @@ -54,7 +54,7 @@ def test_tab_delimiter(self):
])

def test_quotechar(self):
input_file = StringIO('a,b,c\n1*2,3,4\n')
input_file = io.BytesIO(b'a,b,c\n1*2,3,4\n')

with stdin_as_string(input_file):
self.assertLines(['-Q', '*'], [
Expand All @@ -65,7 +65,7 @@ def test_quotechar(self):
input_file.close()

def test_doublequote(self):
input_file = StringIO('a\n"a ""quoted"" string"')
input_file = io.BytesIO(b'a\n"a ""quoted"" string"')

with stdin_as_string(input_file):
self.assertLines(['-P', '#', '-B'], [
Expand All @@ -76,7 +76,7 @@ def test_doublequote(self):
input_file.close()

def test_escapechar(self):
input_file = StringIO('a,b,c\n1"2,3,4\n')
input_file = io.BytesIO(b'a,b,c\n1"2,3,4\n')

with stdin_as_string(input_file):
self.assertLines(['-P', '#', '-U', '3'], [
Expand Down
4 changes: 2 additions & 2 deletions tests/test_utilities/test_csvjson.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import json
import sys
from io import StringIO
from unittest.mock import patch

from csvkit.utilities.csvjson import CSVJSON, launch_new_instance
Expand Down Expand Up @@ -58,7 +58,7 @@ def test_keying(self):
self.assertDictEqual(js, {'True': {'a': True, 'c': 3.0, 'b': 2.0}})

def test_duplicate_keys(self):
output_file = StringIO()
output_file = io.StringIO()
utility = CSVJSON(['-k', 'a', 'examples/dummy3.csv'], output_file)
self.assertRaisesRegex(ValueError,
'Value True is not unique in the key column.',
Expand Down
4 changes: 2 additions & 2 deletions tests/test_utilities/test_csvlook.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import io
import sys
from io import StringIO
from unittest.mock import patch

from csvkit.utilities.csvlook import CSVLook, launch_new_instance
Expand Down Expand Up @@ -127,7 +127,7 @@ def test_max_column_width(self):
])

def test_stdin(self):
input_file = StringIO('a,b,c\n1,2,3\n4,5,6\n')
input_file = io.BytesIO(b'a,b,c\n1,2,3\n4,5,6\n')

with stdin_as_string(input_file):
self.assertLines([], [
Expand Down
4 changes: 2 additions & 2 deletions tests/test_utilities/test_csvsort.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import io
import sys
from io import StringIO
from unittest.mock import patch

from csvkit.utilities.csvsort import CSVSort, launch_new_instance
Expand Down Expand Up @@ -78,7 +78,7 @@ def test_sort_t_and_nulls(self):
self.assertEqual(test_order, new_order)

def test_stdin(self):
input_file = StringIO('a,b,c\n4,5,6\n1,2,3\n')
input_file = io.BytesIO(b'a,b,c\n4,5,6\n1,2,3\n')

with stdin_as_string(input_file):
self.assertLines([], [
Expand Down
14 changes: 7 additions & 7 deletions tests/test_utilities/test_csvsql.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import os
import sys
from io import StringIO
from textwrap import dedent
from unittest.mock import patch

Expand Down Expand Up @@ -108,7 +108,7 @@ def test_linenumbers(self):
''')) # noqa: W291

def test_stdin(self):
input_file = StringIO('a,b,c\n4,2,3\n')
input_file = io.BytesIO(b'a,b,c\n4,2,3\n')

with stdin_as_string(input_file):
sql = self.get_output(['--tables', 'foo'])
Expand All @@ -124,7 +124,7 @@ def test_stdin(self):
input_file.close()

def test_stdin_and_filename(self):
input_file = StringIO("a,b,c\n1,2,3\n")
input_file = io.BytesIO(b'a,b,c\n1,2,3\n')

with stdin_as_string(input_file):
sql = self.get_output(['-', 'examples/dummy.csv'])
Expand All @@ -135,7 +135,7 @@ def test_stdin_and_filename(self):
input_file.close()

def test_query(self):
input_file = StringIO("a,b,c\n1,2,3\n")
input_file = io.BytesIO(b'a,b,c\n1,2,3\n')

with stdin_as_string(input_file):
sql = self.get_output(['--query', 'SELECT m.usda_id, avg(i.sepal_length) AS mean_sepal_length FROM iris '
Expand All @@ -150,7 +150,7 @@ def test_query(self):
input_file.close()

def test_query_empty(self):
input_file = StringIO()
input_file = io.BytesIO()

with stdin_as_string(input_file):
output = self.get_output(['--query', 'SELECT 1'])
Expand Down Expand Up @@ -185,14 +185,14 @@ def test_before_after_insert(self):
'SELECT 1; CREATE TABLE foobar (date DATE)', '--after-insert',
'INSERT INTO dummy VALUES (0, 5, 6)'])

output_file = StringIO()
output_file = io.StringIO()
utility = SQL2CSV(['--db', 'sqlite:///' + self.db_file, '--query', 'SELECT * FROM foobar'], output_file)
utility.run()
output = output_file.getvalue()
output_file.close()
self.assertEqual(output, 'date\n')

output_file = StringIO()
output_file = io.StringIO()
utility = SQL2CSV(['--db', 'sqlite:///' + self.db_file, '--query', 'SELECT * FROM dummy'], output_file)
utility.run()
output = output_file.getvalue()
Expand Down
10 changes: 5 additions & 5 deletions tests/test_utilities/test_csvstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_skip_lines(self):
])

def test_skip_lines_stdin(self):
with open('examples/test_skip_lines.csv') as f, stdin_as_string(f):
with open('examples/test_skip_lines.csv', 'rb') as f, stdin_as_string(f):
self.assertRows(['--skip-lines', '3', '-', 'examples/test_skip_lines.csv'], [
['a', 'b', 'c'],
['1', '2', '3'],
Expand Down Expand Up @@ -62,14 +62,14 @@ def test_multiple_file_stack_col_ragged(self):
])

def test_multiple_file_stack_col_ragged_stdin(self):
with open('examples/dummy.csv') as f, stdin_as_string(f):
with open('examples/dummy.csv', 'rb') as f, stdin_as_string(f):
self.assertRows(['-', 'examples/dummy_col_shuffled_ragged.csv'], [
['a', 'b', 'c', 'd'],
['1', '2', '3', ''],
['1', '2', '3', '4'],
])

with open('examples/dummy.csv') as f, stdin_as_string(f):
with open('examples/dummy.csv', 'rb') as f, stdin_as_string(f):
self.assertRows(['examples/dummy_col_shuffled_ragged.csv', '-'], [
['b', 'c', 'a', 'd'],
['2', '3', '1', '4'],
Expand Down Expand Up @@ -101,14 +101,14 @@ def test_no_header_row_basic(self):
])

def test_no_header_row_basic_stdin(self):
with open('examples/no_header_row.csv') as f, stdin_as_string(f):
with open('examples/no_header_row.csv', 'rb') as f, stdin_as_string(f):
self.assertRows(['--no-header-row', '-', 'examples/no_header_row2.csv'], [
['a', 'b', 'c'],
['1', '2', '3'],
['4', '5', '6'],
])

with open('examples/no_header_row.csv') as f, stdin_as_string(f):
with open('examples/no_header_row.csv', 'rb') as f, stdin_as_string(f):
self.assertRows(['--no-header-row', 'examples/no_header_row2.csv', '-'], [
['a', 'b', 'c'],
['4', '5', '6'],
Expand Down
18 changes: 9 additions & 9 deletions tests/test_utilities/test_in2csv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import os
import sys
from io import StringIO
from unittest.mock import patch

from csvkit.utilities.in2csv import In2CSV, launch_new_instance
Expand Down Expand Up @@ -38,7 +38,7 @@ def test_blanks(self):
self.assertConverted('csv', 'examples/blanks.csv', 'examples/blanks.csv', ['--blanks'])

def test_null_value(self):
input_file = StringIO('a,b\nn/a,\\N')
input_file = io.BytesIO(b'a,b\nn/a,\\N')

with stdin_as_string(input_file):
self.assertLines(['-f', 'csv', '--null-value', '\\N'], [
Expand All @@ -49,7 +49,7 @@ def test_null_value(self):
input_file.close()

def test_null_value_blanks(self):
input_file = StringIO('a,b\nn/a,\\N')
input_file = io.BytesIO(b'a,b\nn/a,\\N')

with stdin_as_string(input_file):
self.assertLines(['-f', 'csv', '--null-value', '\\N', '--blanks'], [
Expand Down Expand Up @@ -153,7 +153,7 @@ def test_csv_no_headers_streaming(self):
['--no-header-row', '--no-inference', '--snifflimit', '0'])

def test_csv_datetime_inference(self):
input_file = StringIO('a\n2015-01-01T00:00:00Z')
input_file = io.BytesIO(b'a\n2015-01-01T00:00:00Z')

with stdin_as_string(input_file):
self.assertLines(['-f', 'csv'], [
Expand Down Expand Up @@ -182,9 +182,9 @@ def test_xlsx_no_inference(self):
])

def test_geojson_no_inference(self):
input_file = StringIO(
'{"a": 1, "b": 2, "type": "FeatureCollection", "features": [{"geometry": {}, "properties": '
'{"a": 1, "b": 2, "c": 3}}]}')
input_file = io.BytesIO(
b'{"a": 1, "b": 2, "type": "FeatureCollection", "features": [{"geometry": {}, "properties": '
b'{"a": 1, "b": 2, "c": 3}}]}')

with stdin_as_string(input_file):
self.assertLines(['--no-inference', '-f', 'geojson'], [
Expand All @@ -195,7 +195,7 @@ def test_geojson_no_inference(self):
input_file.close()

def test_json_no_inference(self):
input_file = StringIO('[{"a": 1, "b": 2, "c": 3}]')
input_file = io.BytesIO(b'[{"a": 1, "b": 2, "c": 3}]')

with stdin_as_string(input_file):
self.assertLines(['--no-inference', '-f', 'json'], [
Expand All @@ -206,7 +206,7 @@ def test_json_no_inference(self):
input_file.close()

def test_ndjson_no_inference(self):
input_file = StringIO('{"a": 1, "b": 2, "c": 3}')
input_file = io.BytesIO(b'{"a": 1, "b": 2, "c": 3}')

with stdin_as_string(input_file):
self.assertLines(['--no-inference', '-f', 'ndjson'], [
Expand Down
10 changes: 5 additions & 5 deletions tests/test_utilities/test_sql2csv.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import os
import sys
from io import StringIO
from unittest.mock import patch

try:
Expand Down Expand Up @@ -71,7 +71,7 @@ def test_file_with_query(self):
self.assertTrue('54' in csv)

def test_stdin(self):
input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')

with stdin_as_string(input_file):
csv = self.get_output([])
Expand All @@ -82,7 +82,7 @@ def test_stdin(self):
input_file.close()

def test_stdin_with_query(self):
input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')

with stdin_as_string(input_file):
csv = self.get_output(['--query', 'select 6*9 as question'])
Expand All @@ -93,7 +93,7 @@ def test_stdin_with_query(self):
input_file.close()

def test_stdin_with_file(self):
input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')

with stdin_as_string(input_file):
csv = self.get_output(['examples/test.sql'])
Expand All @@ -104,7 +104,7 @@ def test_stdin_with_file(self):
input_file.close()

def test_stdin_with_file_and_query(self):
input_file = StringIO('select cast(3.1415 * 13.37 as integer) as answer')
input_file = io.BytesIO(b'select cast(3.1415 * 13.37 as integer) as answer')

with stdin_as_string(input_file):
csv = self.get_output(['examples/test.sql', '--query', 'select 6*9 as question'])
Expand Down
Loading

0 comments on commit e5ef16e

Please sign in to comment.