Skip to content

Commit

Permalink
io_util: Detect utf-8-sig when using file utility
Browse files Browse the repository at this point in the history
  • Loading branch information
ales-erjavec committed Jan 29, 2025
1 parent 091e88c commit 2df43bf
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 1 deletion.
14 changes: 14 additions & 0 deletions Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import os.path
import subprocess
from collections import defaultdict
Expand Down Expand Up @@ -45,6 +46,16 @@ def open_compressed(filename, *args, _open=open, **kwargs):
# Else already a file, just pass it through
return filename

def _is_utf8_sig(filename: str) -> bool:
"""Does filename start with an UTF-8 BOM."""
try:
with open(filename, "rb") as f:
bom = f.read(3)
return bom == codecs.BOM_UTF8
except OSError: # pragma: no cover
return False



def detect_encoding(filename):
"""
Expand All @@ -59,6 +70,9 @@ def detect_encoding(filename):
proc.wait()
if proc.returncode == 0:
encoding = proc.stdout.read().strip()
# the `file` utility does not detect/report a UTF-8 BOM
if encoding == b'utf-8':
return "utf-8-sig" if _is_utf8_sig(filename) else "utf-8"
# file only supports these encodings; for others it says
# unknown-8bit or binary. So we give chardet a chance to do
# better
Expand Down
8 changes: 7 additions & 1 deletion Orange/tests/test_txt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from Orange.data import Table, ContinuousVariable, DiscreteVariable
from Orange.data.io import CSVReader
from Orange.tests import test_filename
from Orange.tests import test_filename, named_file

tab_file = """\
Feature 1\tFeature 2\tFeature 3
Expand Down Expand Up @@ -124,6 +124,12 @@ def test_csv_sniffer(self):
self.assertEqual(len(data), 8)
self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15)

def test_utf_8_sig(self):
    """
    Read a CSV created with encoding="utf-8-sig" and check the first
    column's name is exactly "Feature 1".

    NOTE(review): presumably `named_file` writes `csv_file` to a temporary
    file using the given encoding, so the file starts with a BOM -- confirm
    against Orange.tests.named_file.
    """
    with named_file(csv_file, encoding="utf-8-sig") as path:
        table = CSVReader(path).read()
        first_variable = table.domain[0]
        self.assertEqual(first_variable.name, "Feature 1")


if __name__ == "__main__":
unittest.main()
3 changes: 3 additions & 0 deletions i18n/si/msgs.jaml
Original file line number Diff line number Diff line change
Expand Up @@ -1252,11 +1252,14 @@ data/io_util.py:
.gz: false
.bz2: false
.xz: false
def `_is_utf8_sig`:
rb: false
def `detect_encoding`:
file: false
--brief: false
--mime-encoding: false
utf-8: false
utf-8-sig: false
us-ascii: false
iso-8859-1: false
utf-7: false
Expand Down

0 comments on commit 2df43bf

Please sign in to comment.