diff --git a/CHANGES.txt b/CHANGES.txt index 357cb453a6..90156778dd 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -80,6 +80,9 @@ RELEASE VERSION/DATE TO BE FILLED IN LATER Fixes #4468. - Fix bad typing in Action.py: process() and strfunction(). - Add Pseudo() to global functions, had been omitted. Fixes #4474. + - Improve handling of file data that SCons itself processes - try + harder to decode non-UTF-8 text. SCons.Util.to_Text now exists + to convert a byte stream, such as "raw" file data. Fixes #3569, #4462. RELEASE 4.6.0 - Sun, 19 Nov 2023 17:22:20 -0700 diff --git a/RELEASE.txt b/RELEASE.txt index ae43db1c33..d52bc55ebc 100644 --- a/RELEASE.txt +++ b/RELEASE.txt @@ -54,6 +54,8 @@ FIXES make sure decoding of bytes doesn't fail. - Documentation indicated that both Pseudo() and env.Pseudo() were usable, but Pseudo() did not work; is now enabled. +- Improve handling of file data that SCons itself processes - as in + scanners - try harder to decode non-UTF-8 text. IMPROVEMENTS ------------ diff --git a/SCons/Node/FS.py b/SCons/Node/FS.py index a5282e6aa5..8d9c5fd233 100644 --- a/SCons/Node/FS.py +++ b/SCons/Node/FS.py @@ -1057,7 +1057,7 @@ def get_contents(self): contents of the file.""" return SCons.Node._get_contents_map[self._func_get_contents](self) - def get_text_contents(self): + def get_text_contents(self) -> str: """Fetch the decoded text contents of a Unicode encoded Entry. Since this should return the text contents from the file @@ -1073,6 +1073,7 @@ def get_text_contents(self): # hand or catch the exception. return '' else: + # now we're a different node type, call its method to get the text. return self.get_text_contents() def must_be_same(self, klass) -> None: @@ -2751,38 +2752,13 @@ def get_contents(self) -> bytes: return SCons.Node._get_contents_map[self._func_get_contents](self) def get_text_contents(self) -> str: - """Return the contents of the file in text form. 
- - This attempts to figure out what the encoding of the text is - based upon the BOM bytes, and then decodes the contents so that - it's a valid python string. - """ - contents = self.get_contents() - # The behavior of various decode() methods and functions - # w.r.t. the initial BOM bytes is different for different - # encodings and/or Python versions. ('utf-8' does not strip - # them, but has a 'utf-8-sig' which does; 'utf-16' seems to - # strip them; etc.) Just sidestep all the complication by - # explicitly stripping the BOM before we decode(). - if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8: - return contents[len(codecs.BOM_UTF8):].decode('utf-8') - if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE: - return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le') - if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE: - return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be') - try: - return contents.decode('utf-8') - except UnicodeDecodeError as e: - try: - return contents.decode('latin-1') - except UnicodeDecodeError as e: - return contents.decode('utf-8', errors='backslashreplace') + """Return the contents of the file as text.""" + return SCons.Util.to_Text(self.get_contents()) def get_content_hash(self) -> str: - """ - Compute and return the hash for this file. 
- """ + """Compute and return the hash for this file.""" if not self.rexists(): + # special marker to help distinguish from empty file return hash_signature(SCons.Util.NOFILE) fname = self.rfile().get_abspath() try: diff --git a/SCons/Scanner/C.py b/SCons/Scanner/C.py index 2f1cb41e58..aafe0d9a56 100644 --- a/SCons/Scanner/C.py +++ b/SCons/Scanner/C.py @@ -58,10 +58,9 @@ def find_include_file(self, t): self.missing.append((fname, self.current_file)) return result - def read_file(self, file): + def read_file(self, file) -> str: try: - with open(str(file.rfile())) as fp: - return fp.read() + return file.rfile().get_text_contents() except OSError as e: self.missing.append((file, self.current_file)) return '' @@ -209,10 +208,9 @@ def find_include_file(self, t): self.missing.append((fname, self.current_file)) return result - def read_file(self, file): + def read_file(self, file) -> str: try: - with open(str(file.rfile())) as fp: - return fp.read() + return file.rfile().get_text_contents() except OSError: self.missing.append((file, self.current_file)) return "" diff --git a/SCons/Tool/JavaCommon.py b/SCons/Tool/JavaCommon.py index 31695c21b2..c7e62b88ce 100644 --- a/SCons/Tool/JavaCommon.py +++ b/SCons/Tool/JavaCommon.py @@ -29,6 +29,8 @@ from pathlib import Path from typing import List +import SCons.Util + java_parsing = True default_java_version = '1.4' @@ -451,8 +453,8 @@ def parseToken(self, token): def parse_java_file(fn, version=default_java_version): - with open(fn, encoding='utf-8') as f: - data = f.read() + with open(fn, "rb") as f: + data = SCons.Util.to_Text(f.read()) return parse_java(data, version) diff --git a/SCons/Tool/JavaCommonTests.py b/SCons/Tool/JavaCommonTests.py index fa462b6983..bb5c57f036 100644 --- a/SCons/Tool/JavaCommonTests.py +++ b/SCons/Tool/JavaCommonTests.py @@ -74,8 +74,9 @@ def test_file_parser(self) -> None: { public static void main(String[] args) { - /* This tests that unicde is handled . */ + /* This tests that unicode is handled . 
*/ String hello1 = new String("ఎత్తువెడల్పు"); + /* and even smart quotes “like this” ‘and this’ */ } } """ diff --git a/SCons/Util/__init__.py b/SCons/Util/__init__.py index be2142f034..95c1b9978d 100644 --- a/SCons/Util/__init__.py +++ b/SCons/Util/__init__.py @@ -81,6 +81,7 @@ to_String, to_String_for_subst, to_String_for_signature, + to_Text, to_bytes, to_str, get_env_bool, diff --git a/SCons/Util/sctypes.py b/SCons/Util/sctypes.py index 53fcc562a5..bcbefb6c80 100644 --- a/SCons/Util/sctypes.py +++ b/SCons/Util/sctypes.py @@ -7,6 +7,7 @@ Routines which check types and do type conversions. """ +import codecs import os import pprint import re @@ -187,7 +188,11 @@ def to_String( # pylint: disable=redefined-outer-name,redefined-builtin UserString=UserString, BaseStringTypes=BaseStringTypes, ) -> str: - """Return a string version of obj.""" + """Return a string version of obj. + + Use this for data likely to be well-behaved. Use + :func:`to_Text` for unknown file data that needs to be decoded. + """ if isinstance(obj, BaseStringTypes): # Early out when already a string! return obj @@ -244,6 +249,42 @@ def to_String_for_signature( # pylint: disable=redefined-outer-name,redefined-b return f() +def to_Text(data: bytes) -> str: + """Return bytes data converted to text. + + Useful for whole-file reads where the data needs some interpretation, + particularly for Scanners. Attempts to figure out what the encoding of + the text is based upon the BOM bytes, and then decodes the contents so + that it's a valid python string. + """ + _encoding_map = [ + (codecs.BOM_UTF8, 'utf-8'), + (codecs.BOM_UTF32_LE, 'utf-32le'), # must precede UTF-16: BOM prefix clash + (codecs.BOM_UTF32_BE, 'utf-32be'), + (codecs.BOM_UTF16_LE, 'utf-16le'), + (codecs.BOM_UTF16_BE, 'utf-16be'), + ] + + # First look for Byte-order-mark sequences to identify the encoding. + # Strip these since some codecs do, some don't. 
+ for bom, encoding in _encoding_map: + if data.startswith(bom): + return data[len(bom):].decode(encoding, errors='backslashreplace') + + # If we didn't see a BOM, try UTF-8, then the "preferred" encoding + # (the files might be written on this system), then finally latin-1. + # TODO: possibly should be a way for the build to set an encoding. + try: + return data.decode('utf-8') + except UnicodeDecodeError: + try: + import locale + prefencoding = locale.getpreferredencoding() + return data.decode(prefencoding) + except (UnicodeDecodeError, LookupError): + return data.decode('latin-1', errors='backslashreplace') + + def get_env_bool(env, name: str, default: bool=False) -> bool: """Convert a construction variable to bool. diff --git a/SCons/cpp.py b/SCons/cpp.py index 97aba8cc34..1093ae2ac2 100644 --- a/SCons/cpp.py +++ b/SCons/cpp.py @@ -26,6 +26,8 @@ import os import re +import SCons.Util + # First "subsystem" of regular expressions that we set up: # # Stuff to turn the C preprocessor directives in a file's contents into @@ -401,9 +403,9 @@ def find_include_file(self, t): return f return None - def read_file(self, file): - with open(file) as f: - return f.read() + def read_file(self, file) -> str: + with open(file, 'rb') as f: + return SCons.Util.to_Text(f.read()) # Start and stop processing include lines.