biolab · ales-erjavec · Jan 24, 2025
diff --git a/Orange/data/io_util.py b/Orange/data/io_util.py
@@ -1,3 +1,4 @@
+import codecs
 import os.path
 import subprocess
 from collections import defaultdict
@@ -45,6 +46,16 @@ def open_compressed(filename, *args, _open=open, **kwargs):
     # Else already a file, just pass it through
     return filename
 
+
+def _is_utf8_sig(filename: str) -> bool:
+    """Return True if `filename` starts with a UTF-8 BOM (False on OSError)."""
+    try:
+        with open(filename, "rb") as f:
+            bom = f.read(len(codecs.BOM_UTF8))
+    except OSError:  # pragma: no cover
+        return False
+    return bom == codecs.BOM_UTF8
+
 
 def detect_encoding(filename):
     """
@@ -59,6 +70,9 @@ def detect_encoding(filename):
                 proc.wait()
                 if proc.returncode == 0:
                     encoding = proc.stdout.read().strip()
+                    # the `file` utility does not detect/report a UTF-8 BOM
+                    if encoding == b'utf-8':
+                        return "utf-8-sig" if _is_utf8_sig(filename) else "utf-8"
                     # file only supports these encodings; for others it says
                     # unknown-8bit or binary. So we give chardet a chance to do
                     # better

diff --git a/Orange/tests/test_txt_reader.py b/Orange/tests/test_txt_reader.py
@@ -7,7 +7,7 @@
 
 from Orange.data import Table, ContinuousVariable, DiscreteVariable
 from Orange.data.io import CSVReader
-from Orange.tests import test_filename
+from Orange.tests import test_filename, named_file
 
 tab_file = """\
 Feature 1\tFeature 2\tFeature 3
@@ -124,6 +124,12 @@ def test_csv_sniffer(self):
         self.assertEqual(len(data), 8)
         self.assertEqual(len(data.domain.variables) + len(data.domain.metas), 15)
 
+    def test_utf_8_sig(self):
+        """First variable of a CSV read from a utf-8-sig file is 'Feature 1'."""
+        with named_file(csv_file, encoding="utf-8-sig") as path:
+            table = CSVReader(path).read()
+            self.assertEqual(table.domain[0].name, "Feature 1")
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/i18n/si/msgs.jaml b/i18n/si/msgs.jaml
@@ -1252,11 +1252,14 @@ data/io_util.py:
         .gz: false
         .bz2: false
         .xz: false
+    def `_is_utf8_sig`:
+        rb: false
     def `detect_encoding`:
         file: false
         --brief: false
         --mime-encoding: false
         utf-8: false
+        utf-8-sig: false
         us-ascii: false
         iso-8859-1: false
         utf-7: false