Skip to content

Commit

Permalink
io_util: Detect utf-8-sig when using file utility
Browse files Browse the repository at this point in the history
  • Loading branch information
ales-erjavec committed Jan 27, 2025
1 parent 091e88c commit 2fd00d0
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
14 changes: 14 additions & 0 deletions Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import os.path
import subprocess
from collections import defaultdict
Expand Down Expand Up @@ -45,6 +46,16 @@ def open_compressed(filename, *args, _open=open, **kwargs):
# Else already a file, just pass it through
return filename

def _is_utf8_sig(filename: str) -> bool:
    """
    Return True if the file at `filename` starts with a UTF-8 BOM.

    Only the leading bytes of the file are read. If the file cannot be
    opened or read (any `OSError`), False is returned instead of raising.
    """
    try:
        with open(filename, "rb") as f:
            # Read exactly as many bytes as the BOM is long (3 for UTF-8).
            head = f.read(len(codecs.BOM_UTF8))
    except OSError:
        # Best effort: an unreadable file is reported as "no BOM".
        return False
    return head == codecs.BOM_UTF8

Check warning on line 56 in Orange/data/io_util.py

View check run for this annotation

Codecov / codecov/patch

Orange/data/io_util.py#L55-L56

Added lines #L55 - L56 were not covered by tests



def detect_encoding(filename):
"""
Expand All @@ -59,6 +70,9 @@ def detect_encoding(filename):
proc.wait()
if proc.returncode == 0:
encoding = proc.stdout.read().strip()
# the `file` utility does not detect/report a UTF-8 BOM
if encoding == b'utf-8':
return "utf-8-sig" if _is_utf8_sig(filename) else "utf-8"
# file only supports these encodings; for others it says
# unknown-8bit or binary. So we give chardet a chance to do
# better
Expand Down
8 changes: 7 additions & 1 deletion Orange/tests/test_txt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from Orange.data import Table, ContinuousVariable, DiscreteVariable
from Orange.data.io import CSVReader
from Orange.tests import test_filename
from Orange.tests import test_filename, named_file

tab_file = """\
Feature 1\tFeature 2\tFeature 3
Expand Down Expand Up @@ -124,6 +124,12 @@ def test_csv_sniffer(self):
self.assertEqual(len(data), 8)
self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15)

def test_utf_8_sig(self):
    # Read a file produced by named_file() with encoding="utf-8-sig" and
    # check the name of the first variable in the resulting domain.
    # NOTE(review): presumably named_file writes `csv_file` to a temporary
    # file using the given encoding -- confirm against Orange.tests.
    with named_file(csv_file, encoding="utf-8-sig") as path:
        table = CSVReader(path).read()
        first_variable = table.domain[0]
        self.assertEqual(first_variable.name, "Feature 1")


if __name__ == "__main__":
unittest.main()

0 comments on commit 2fd00d0

Please sign in to comment.