Skip to content

Commit

Permalink
Merge pull request #4476 from mwichmann/rawfile-convert
Browse files Browse the repository at this point in the history
Be more careful about file reading
  • Loading branch information
bdbaddog authored Feb 14, 2024
2 parents b8fffb3 + 3e60ee1 commit 759ed8c
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 43 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ RELEASE VERSION/DATE TO BE FILLED IN LATER
Fixes #4468.
- Fix bad typing in Action.py: process() and strfunction().
- Add Pseudo() to global functions, had been omitted. Fixes #4474.
- Improve handling of file data that SCons itself processes - try
harder to decode non-UTF-8 text. SCons.Util.to_Text now exists
to convert a byte stream, such as "raw" file data, to text (str).
Fixes #3569, #4462.
The Pseudo manpage entry was updated to provide more clarity.
- The internal routine which implements the PyPackageDir function
would fail with an exception if called with a module which is
Expand Down
2 changes: 2 additions & 0 deletions RELEASE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ FIXES
make sure decoding of bytes doesn't fail.
- Documentation indicated that both Pseudo() and env.Pseudo() were usable,
but Pseudo() did not work; is now enabled.
- Improve handling of file data that SCons itself processes - as in
scanners - try harder to decode non-UTF-8 text.
- PyPackageDir no longer fails if passed a module name which cannot be found,
now returns None.

Expand Down
36 changes: 6 additions & 30 deletions SCons/Node/FS.py
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ def get_contents(self):
contents of the file."""
return SCons.Node._get_contents_map[self._func_get_contents](self)

def get_text_contents(self):
def get_text_contents(self) -> str:
"""Fetch the decoded text contents of a Unicode encoded Entry.
Since this should return the text contents from the file
Expand All @@ -1073,6 +1073,7 @@ def get_text_contents(self):
# hand or catch the exception.
return ''
else:
# now we're a different node type, call its method to get the text.
return self.get_text_contents()

def must_be_same(self, klass) -> None:
Expand Down Expand Up @@ -2754,38 +2755,13 @@ def get_contents(self) -> bytes:
return SCons.Node._get_contents_map[self._func_get_contents](self)

def get_text_contents(self) -> str:
"""Return the contents of the file in text form.
This attempts to figure out what the encoding of the text is
based upon the BOM bytes, and then decodes the contents so that
it's a valid python string.
"""
contents = self.get_contents()
# The behavior of various decode() methods and functions
# w.r.t. the initial BOM bytes is different for different
# encodings and/or Python versions. ('utf-8' does not strip
# them, but has a 'utf-8-sig' which does; 'utf-16' seems to
# strip them; etc.) Just sidestep all the complication by
# explicitly stripping the BOM before we decode().
if contents[:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8:
return contents[len(codecs.BOM_UTF8):].decode('utf-8')
if contents[:len(codecs.BOM_UTF16_LE)] == codecs.BOM_UTF16_LE:
return contents[len(codecs.BOM_UTF16_LE):].decode('utf-16-le')
if contents[:len(codecs.BOM_UTF16_BE)] == codecs.BOM_UTF16_BE:
return contents[len(codecs.BOM_UTF16_BE):].decode('utf-16-be')
try:
return contents.decode('utf-8')
except UnicodeDecodeError as e:
try:
return contents.decode('latin-1')
except UnicodeDecodeError as e:
return contents.decode('utf-8', errors='backslashreplace')
"""Return the contents of the file as text."""
return SCons.Util.to_Text(self.get_contents())

def get_content_hash(self) -> str:
"""
Compute and return the hash for this file.
"""
"""Compute and return the hash for this file."""
if not self.rexists():
# special marker to help distinguish from empty file
return hash_signature(SCons.Util.NOFILE)
fname = self.rfile().get_abspath()
try:
Expand Down
10 changes: 4 additions & 6 deletions SCons/Scanner/C.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,9 @@ def find_include_file(self, t):
self.missing.append((fname, self.current_file))
return result

def read_file(self, file):
def read_file(self, file) -> str:
try:
with open(str(file.rfile())) as fp:
return fp.read()
return file.rfile().get_text_contents()
except OSError as e:
self.missing.append((file, self.current_file))
return ''
Expand Down Expand Up @@ -209,10 +208,9 @@ def find_include_file(self, t):
self.missing.append((fname, self.current_file))
return result

def read_file(self, file):
def read_file(self, file) -> str:
try:
with open(str(file.rfile())) as fp:
return fp.read()
return file.rfile().get_text_contents()
except OSError:
self.missing.append((file, self.current_file))
return ""
Expand Down
6 changes: 4 additions & 2 deletions SCons/Tool/JavaCommon.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
from pathlib import Path
from typing import List

import SCons.Util

java_parsing = True

default_java_version = '1.4'
Expand Down Expand Up @@ -451,8 +453,8 @@ def parseToken(self, token):


def parse_java_file(fn, version=default_java_version):
with open(fn, encoding='utf-8') as f:
data = f.read()
with open(fn, "rb") as f:
data = SCons.Util.to_Text(f.read())
return parse_java(data, version)


Expand Down
3 changes: 2 additions & 1 deletion SCons/Tool/JavaCommonTests.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ def test_file_parser(self) -> None:
{
public static void main(String[] args)
{
/* This tests that unicde is handled . */
/* This tests that unicode is handled . */
String hello1 = new String("ఎత్తువెడల్పు");
/* and even smart quotes “like this” ‘and this’ */
}
}
"""
Expand Down
1 change: 1 addition & 0 deletions SCons/Util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
to_String,
to_String_for_subst,
to_String_for_signature,
to_Text,
to_bytes,
to_str,
get_env_bool,
Expand Down
43 changes: 42 additions & 1 deletion SCons/Util/sctypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
Routines which check types and do type conversions.
"""

import codecs
import os
import pprint
import re
Expand Down Expand Up @@ -187,7 +188,11 @@ def to_String( # pylint: disable=redefined-outer-name,redefined-builtin
UserString=UserString,
BaseStringTypes=BaseStringTypes,
) -> str:
"""Return a string version of obj."""
"""Return a string version of obj.
Use this for data likely to be well-behaved. Use
:func:`to_Text` for unknown file data that needs to be decoded.
"""
if isinstance(obj, BaseStringTypes):
# Early out when already a string!
return obj
Expand Down Expand Up @@ -244,6 +249,42 @@ def to_String_for_signature( # pylint: disable=redefined-outer-name,redefined-b
return f()


def to_Text(data: bytes) -> str:
    """Return bytes data converted to text.

    Useful for whole-file reads where the data needs some interpretation,
    particularly for Scanners. Attempts to figure out what the encoding of
    the text is based upon the BOM bytes, and then decodes the contents so
    that it's a valid python string.

    Args:
        data: raw bytes, typically the complete contents of a file.

    Returns:
        The decoded text. If *data* starts with a recognized byte-order
        mark, the mark is removed and the rest is decoded with the matching
        codec (undecodable bytes are backslash-escaped). Otherwise UTF-8,
        the locale's preferred encoding, and finally latin-1 are tried,
        in that order.
    """
    # Order matters: the UTF-32-LE BOM (FF FE 00 00) begins with the
    # UTF-16-LE BOM (FF FE), so the longer UTF-32 marks have to be tested
    # before the UTF-16 ones or UTF-32-LE data is mis-detected as UTF-16-LE.
    # The cost is that UTF-16-LE text whose first character after the BOM
    # is U+0000 is read as UTF-32-LE, which is not expected in real files.
    _encoding_map = (
        (codecs.BOM_UTF8, 'utf-8'),
        (codecs.BOM_UTF32_LE, 'utf-32le'),
        (codecs.BOM_UTF32_BE, 'utf-32be'),
        (codecs.BOM_UTF16_LE, 'utf-16le'),
        (codecs.BOM_UTF16_BE, 'utf-16be'),
    )

    # First look for Byte-order-mark sequences to identify the encoding.
    # Strip these since some codecs do, some don't.
    for bom, encoding in _encoding_map:
        if data.startswith(bom):
            return data[len(bom):].decode(encoding, errors='backslashreplace')

    # If we didn't see a BOM, try UTF-8, then the "preferred" encoding
    # (the files might be written on this system), then finally latin-1.
    # TODO: possibly should be a way for the build to set an encoding.
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        try:
            import locale
            prefencoding = locale.getpreferredencoding()
            return data.decode(prefencoding)
        except (UnicodeDecodeError, LookupError):
            # latin-1 maps every byte value, so this last resort cannot fail.
            return data.decode('latin-1', errors='backslashreplace')


def get_env_bool(env, name: str, default: bool=False) -> bool:
"""Convert a construction variable to bool.
Expand Down
8 changes: 5 additions & 3 deletions SCons/cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import os
import re

import SCons.Util

# First "subsystem" of regular expressions that we set up:
#
# Stuff to turn the C preprocessor directives in a file's contents into
Expand Down Expand Up @@ -401,9 +403,9 @@ def find_include_file(self, t):
return f
return None

def read_file(self, file):
with open(file) as f:
return f.read()
def read_file(self, file) -> str:
with open(file, 'rb') as f:
return SCons.Util.to_Text(f.read())

# Start and stop processing include lines.

Expand Down

0 comments on commit 759ed8c

Please sign in to comment.