Skip to content

Commit

Permalink
Be more careful about file reading
Browse files Browse the repository at this point in the history
If SCons reads a file to interpret the contents, codecs are a concern.
The second-level Entry node class (inherits from node Base) has a
get_text_contents() method which makes a best effort at decoding
bytes data, but there are other places that neither get their
file contents via that method nor do their own careful decoding.

Move the decode-bytes portion out of Entry.get_text_contents() to
SCons.Util.to_Text() so that everyone that needs this can call it.
Add a couple of additional known BOM codes (after consulting Python's
codecs module).

Note that while get_text_contents acts on nodes, the new (moved)
routine to_Text acts on passed bytes, so it can be used in a non-Node
context as well - for example the Java tool initializer reads a file and
tries to decode it, and can get it wrong (see #3569); this change provides
it some help.

Fixes #3569
Fixes #4462

Signed-off-by: Mats Wichmann <[email protected]>
  • Loading branch information
mwichmann committed Feb 8, 2024
1 parent 7e120e8 commit 5c6bb9a
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 43 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ RELEASE VERSION/DATE TO BE FILLED IN LATER
Fixes #4468.
- Fix bad typing in Action.py: process() and strfunction().
- Add Pseudo() to global functions, had been omitted. Fixes #4474.
- Improve handling of file data that SCons itself processes - try
harder to decode non-UTF-8 text. SCons.Util.to_Text now exists
to convert a byte stream, such as "raw" file data. Fixes #3569, #4462.


RELEASE 4.6.0 - Sun, 19 Nov 2023 17:22:20 -0700
Expand Down
2 changes: 2 additions & 0 deletions RELEASE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ FIXES
make sure decoding of bytes doesn't fail.
- Documentation indicated that both Pseudo() and env.Pseudo() were usable,
but Pseudo() did not work; is now enabled.
- Improve handling of file data that SCons itself processes - as in
scanners - try harder to decode non-UTF-8 text.

IMPROVEMENTS
------------
Expand Down
36 changes: 6 additions & 30 deletions SCons/Node/FS.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ def get_contents(self):
contents of the file."""
return SCons.Node._get_contents_map[self._func_get_contents](self)

def get_text_contents(self):
def get_text_contents(self) -> str:
"""Fetch the decoded text contents of a Unicode encoded Entry.
Since this should return the text contents from the file
Expand All @@ -1073,6 +1073,7 @@ def get_text_contents(self):
# hand or catch the exception.
return ''
else:
# now we're a different node type, call its method to get the text.
return self.get_text_contents()

def must_be_same(self, klass) -> None:
Expand Down Expand Up @@ -2751,38 +2752,13 @@ def get_contents(self) -> bytes:
return SCons.Node._get_contents_map[self._func_get_contents](self)

def get_text_contents(self) -> str:
    # NOTE(review): this diff span shows the old and new implementations
    # interleaved. The BOM-sniffing/decode logic below is the code being
    # removed; the final two lines are the replacement, which delegates
    # to SCons.Util.to_Text(). As literally shown, everything after the
    # first return path is unreachable.
    """Return the contents of the file in text form.

    This attempts to figure out what the encoding of the text is
    based upon the BOM bytes, and then decodes the contents so that
    it's a valid python string.
    """
    contents = self.get_contents()

    # The behavior of various decode() methods and functions
    # w.r.t. the initial BOM bytes is different for different
    # encodings and/or Python versions. ('utf-8' does not strip
    # them, but has a 'utf-8-sig' which does; 'utf-16' seems to
    # strip them; etc.) Just sidestep all the complication by
    # explicitly stripping the BOM before we decode().
    if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
        return contents[len(codecs.BOM_UTF8):].decode('utf-8')
    if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
        return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
    if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
        return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')
    try:
        return contents.decode('utf-8')
    except UnicodeDecodeError as e:  # NOTE(review): 'e' is bound but never used
        try:
            return contents.decode('latin-1')
        except UnicodeDecodeError as e:  # NOTE(review): unreachable in practice; latin-1 maps all byte values, and 'e' is unused
            return contents.decode('utf-8', errors='backslashreplace')
    """Return the contents of the file as text."""
    return SCons.Util.to_Text(self.get_contents())

def get_content_hash(self) -> str:
"""
Compute and return the hash for this file.
"""
"""Compute and return the hash for this file."""
if not self.rexists():
# special marker to help distinguish from empty file
return hash_signature(SCons.Util.NOFILE)
fname = self.rfile().get_abspath()
try:
Expand Down
10 changes: 4 additions & 6 deletions SCons/Scanner/C.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,9 @@ def find_include_file(self, t):
self.missing.append((fname, self.current_file))
return result

def read_file(self, file):
def read_file(self, file) -> str:
    """Return the text contents of the node's file, or '' on failure.

    NOTE(review): this diff span interleaves the old implementation
    (plain text-mode open()/read(), which returns immediately) with the
    new get_text_contents() call, which is unreachable as displayed.
    A failed read is recorded in self.missing rather than raised.
    """
    try:
        with open(str(file.rfile())) as fp:
            return fp.read()
        return file.rfile().get_text_contents()
    except OSError as e:  # NOTE(review): 'e' is bound but never used
        self.missing.append((file, self.current_file))
        return ''
Expand Down Expand Up @@ -209,10 +208,9 @@ def find_include_file(self, t):
self.missing.append((fname, self.current_file))
return result

def read_file(self, file):
def read_file(self, file) -> str:
    """Return the text contents of the node's file, or "" on failure.

    NOTE(review): as with the sibling scanner method, this diff span
    interleaves the old text-mode read (which returns immediately) with
    the new get_text_contents() call, unreachable as displayed.
    A failed read is recorded in self.missing rather than raised.
    """
    try:
        with open(str(file.rfile())) as fp:
            return fp.read()
        return file.rfile().get_text_contents()
    except OSError:
        self.missing.append((file, self.current_file))
        return ""
Expand Down
6 changes: 4 additions & 2 deletions SCons/Tool/JavaCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from pathlib import Path
from typing import List

import SCons.Util

java_parsing = True

default_java_version = '1.4'
Expand Down Expand Up @@ -451,8 +453,8 @@ def parseToken(self, token):


def parse_java_file(fn, version=default_java_version):
    """Read the Java source file *fn* and hand its text to parse_java().

    NOTE(review): this diff span shows both the old text-mode read
    (hard-coded utf-8) and the new binary read decoded through
    SCons.Util.to_Text(); as displayed, the second read overwrites
    ``data`` from the first.
    """
    with open(fn, encoding='utf-8') as f:
        data = f.read()
    with open(fn, "rb") as f:
        data = SCons.Util.to_Text(f.read())
    return parse_java(data, version)


Expand Down
3 changes: 2 additions & 1 deletion SCons/Tool/JavaCommonTests.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ def test_file_parser(self) -> None:
{
public static void main(String[] args)
{
/* This tests that unicde is handled . */
/* This tests that unicode is handled . */
String hello1 = new String("ఎత్తువెడల్పు");
/* and even smart quotes “like this” ‘and this’ */
}
}
"""
Expand Down
1 change: 1 addition & 0 deletions SCons/Util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
to_String,
to_String_for_subst,
to_String_for_signature,
to_Text,
to_bytes,
to_str,
get_env_bool,
Expand Down
43 changes: 42 additions & 1 deletion SCons/Util/sctypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Routines which check types and do type conversions.
"""

import codecs
import os
import pprint
import re
Expand Down Expand Up @@ -187,7 +188,11 @@ def to_String( # pylint: disable=redefined-outer-name,redefined-builtin
UserString=UserString,
BaseStringTypes=BaseStringTypes,
) -> str:
"""Return a string version of obj."""
"""Return a string version of obj.
Use this for data likely to be well-behaved. Use
:func:`to_Text` for unknown file data that needs to be decoded.
"""
if isinstance(obj, BaseStringTypes):
# Early out when already a string!
return obj
Expand Down Expand Up @@ -244,6 +249,42 @@ def to_String_for_signature( # pylint: disable=redefined-outer-name,redefined-b
return f()


def to_Text(data: bytes) -> str:
    """Return bytes data converted to text.

    Useful for whole-file reads where the data needs some interpretation,
    particularly for Scanners. Attempts to figure out what the encoding of
    the text is based upon the BOM bytes, and then decodes the contents so
    that it's a valid python string.
    """
    # Check the UTF-32 BOMs *before* the UTF-16 ones: BOM_UTF32_LE
    # (b'\xff\xfe\x00\x00') starts with the same two bytes as
    # BOM_UTF16_LE (b'\xff\xfe'), so testing UTF-16 first would
    # misdetect UTF-32-LE data and decode it incorrectly.
    _encoding_map = [
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF32_LE, 'utf-32le'),
        (codecs.BOM_UTF32_BE, 'utf-32be'),
        (codecs.BOM_UTF16_LE, 'utf-16le'),
        (codecs.BOM_UTF16_BE, 'utf-16be'),
    ]

    # First look for Byte-order-mark sequences to identify the encoding.
    # Strip these since some codecs do, some don't.
    for bom, encoding in _encoding_map:
        if data.startswith(bom):
            return data[len(bom):].decode(encoding, errors='backslashreplace')

    # If we didn't see a BOM, try UTF-8, then the "preferred" encoding
    # (the files might be written on this system), then finally latin-1.
    # TODO: possibly should be a way for the build to set an encoding.
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        try:
            import locale
            prefencoding = locale.getpreferredencoding()
            return data.decode(prefencoding)
        except (UnicodeDecodeError, LookupError):
            # latin-1 maps every byte value, so this cannot fail;
            # errors='backslashreplace' is kept as belt-and-suspenders.
            return data.decode('latin-1', errors='backslashreplace')


def get_env_bool(env, name: str, default: bool=False) -> bool:
"""Convert a construction variable to bool.
Expand Down
8 changes: 5 additions & 3 deletions SCons/cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import os
import re

import SCons.Util

# First "subsystem" of regular expressions that we set up:
#
# Stuff to turn the C preprocessor directives in a file's contents into
Expand Down Expand Up @@ -401,9 +403,9 @@ def find_include_file(self, t):
return f
return None

def read_file(self, file):
with open(file) as f:
return f.read()
def read_file(self, file) -> str:
    """Read *file* as raw bytes and return the decoded text.

    Decoding is delegated to SCons.Util.to_Text so that files with a
    BOM or a non-UTF-8 encoding are handled.
    """
    with open(file, 'rb') as fobj:
        raw = fobj.read()
    return SCons.Util.to_Text(raw)

# Start and stop processing include lines.

Expand Down

0 comments on commit 5c6bb9a

Please sign in to comment.