Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] io_util: Detect utf-8-sig when using file utility #7006

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import codecs
import os.path
import subprocess
from collections import defaultdict
Expand Down Expand Up @@ -45,6 +46,16 @@ def open_compressed(filename, *args, _open=open, **kwargs):
# Else already a file, just pass it through
return filename

def _is_utf8_sig(filename: str) -> bool:
    """
    Return True if the file `filename` begins with a UTF-8 byte order mark.

    Only the first three bytes of the file are read, in binary mode, and
    compared against `codecs.BOM_UTF8`. If the file cannot be opened or
    read (`OSError`), the answer is False.
    """
    signature = codecs.BOM_UTF8
    try:
        with open(filename, "rb") as stream:
            head = stream.read(3)
    except OSError:  # pragma: no cover
        return False
    # A file shorter than the BOM yields fewer bytes and compares unequal.
    return head == signature



def detect_encoding(filename):
"""
Expand All @@ -59,6 +70,9 @@ def detect_encoding(filename):
proc.wait()
if proc.returncode == 0:
encoding = proc.stdout.read().strip()
# The `file` utility does not detect/report a UTF-8 BOM, so check for it here
if encoding == b'utf-8':
return "utf-8-sig" if _is_utf8_sig(filename) else "utf-8"
# file only supports these encodings; for others it says
# unknown-8bit or binary. So we give chardet a chance to do
# better
Expand Down
8 changes: 7 additions & 1 deletion Orange/tests/test_txt_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from Orange.data import Table, ContinuousVariable, DiscreteVariable
from Orange.data.io import CSVReader
from Orange.tests import test_filename
from Orange.tests import test_filename, named_file

tab_file = """\
Feature 1\tFeature 2\tFeature 3
Expand Down Expand Up @@ -124,6 +124,12 @@ def test_csv_sniffer(self):
self.assertEqual(len(data), 8)
self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15)

def test_utf_8_sig(self):
    # Write the CSV fixture using the "utf-8-sig" encoding, read it back
    # through CSVReader, and check the first variable's name is intact.
    with named_file(csv_file, encoding="utf-8-sig") as fname:
        table = CSVReader(fname).read()
        self.assertEqual(table.domain[0].name, "Feature 1")


if __name__ == "__main__":
unittest.main()
3 changes: 3 additions & 0 deletions i18n/si/msgs.jaml
Original file line number Diff line number Diff line change
Expand Up @@ -1252,11 +1252,14 @@ data/io_util.py:
.gz: false
.bz2: false
.xz: false
def `_is_utf8_sig`:
rb: false
def `detect_encoding`:
file: false
--brief: false
--mime-encoding: false
utf-8: false
utf-8-sig: false
us-ascii: false
iso-8859-1: false
utf-7: false
Expand Down
Loading