From d2bb6c3aa04105deaec8a04489582de3de7ce4e5 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Wed, 20 Nov 2024 18:05:48 +0100 Subject: [PATCH 1/5] Fix Unicode encoding issues in Bazel's use of Starlark --- .../starlark/java/eval/StarlarkSemantics.java | 6 +++++ src/test/java/net/starlark/java/eval/BUILD | 26 +++++++++++++++---- .../net/starlark/java/eval/ScriptTest.java | 22 +++++++++++----- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/src/main/java/net/starlark/java/eval/StarlarkSemantics.java b/src/main/java/net/starlark/java/eval/StarlarkSemantics.java index 658977e013bf08..ef98716ba88456 100644 --- a/src/main/java/net/starlark/java/eval/StarlarkSemantics.java +++ b/src/main/java/net/starlark/java/eval/StarlarkSemantics.java @@ -174,6 +174,9 @@ public Builder setBool(String name, boolean value) { /** Returns an immutable StarlarkSemantics. */ public StarlarkSemantics build() { + if (map.isEmpty()) { + return DEFAULT; + } return new StarlarkSemantics(ImmutableMap.copyOf(map)); } } @@ -259,4 +262,7 @@ public final String toString() { /** Whether StarlarkSet objects may be constructed by the interpreter. */ public static final String EXPERIMENTAL_ENABLE_STARLARK_SET = "-experimental_enable_starlark_set"; + + public static final String INTERNAL_BAZEL_ONLY_STRINGS_ARE_BYTES = + "-internal_bazel_only_strings_are_bytes"; } diff --git a/src/test/java/net/starlark/java/eval/BUILD b/src/test/java/net/starlark/java/eval/BUILD index 39473a4dc04e00..75d84133b5ac5b 100644 --- a/src/test/java/net/starlark/java/eval/BUILD +++ b/src/test/java/net/starlark/java/eval/BUILD @@ -49,13 +49,10 @@ java_test( ], ) -# Script-based tests of the Starlark interpreter. -java_test( - name = "ScriptTest", +java_library( + name = "ScriptTest_lib", srcs = ["ScriptTest.java"], data = glob(["testdata/*.star"]), - jvm_flags = ["-Dfile.encoding=UTF8"], - use_testrunner = False, deps = [ "//src/main/java/net/starlark/java/annot", "//src/main/java/net/starlark/java/eval", @@ -66,6 +63,25 @@ java_test( ], ) +# Script-based tests of the Starlark interpreter. +java_test( + name = "ScriptTest", + use_testrunner = False, + runtime_deps = [ + ":ScriptTest_lib", + ], +) + +java_test( + name = "ScriptTest_Latin1", + jvm_flags = ["-Dnet.starlark.java.eval.ScriptTest.latin1=true"], + main_class = "net.starlark.java.eval.ScriptTest", + use_testrunner = False, + runtime_deps = [ + ":ScriptTest_lib", + ], +) + # Script-based benchmarks of the Starlark interpreter. java_binary( name = "Benchmarks", diff --git a/src/test/java/net/starlark/java/eval/ScriptTest.java b/src/test/java/net/starlark/java/eval/ScriptTest.java index f8409a21807bbb..5ab6ed829f87e0 100644 --- a/src/test/java/net/starlark/java/eval/ScriptTest.java +++ b/src/test/java/net/starlark/java/eval/ScriptTest.java @@ -223,19 +223,27 @@ public static void main(String[] args) throws Exception { } // parse & execute - ParserInput input = ParserInput.fromString(buf.toString(), file.toString()); + boolean useLatin1 = Boolean.getBoolean("net.starlark.java.eval.ScriptTest.latin1"); + ParserInput input; + if (useLatin1) { + input = ParserInput.fromLatin1(buf.toString().getBytes(UTF_8), file.toString()); + } else { + input = ParserInput.fromString(buf.toString(), file.toString()); + } ImmutableMap.Builder predeclared = ImmutableMap.builder(); Starlark.addMethods(predeclared, new ScriptTest()); // e.g. assert_eq predeclared.put("json", Json.INSTANCE); // TODO(b/376078033): remove special set.star handling once Starlark sets are enabled by // default. - StarlarkSemantics semantics = - name.equals("set.star") - ? StarlarkSemantics.builder() - .setBool(StarlarkSemantics.EXPERIMENTAL_ENABLE_STARLARK_SET, true) - .build() - : StarlarkSemantics.DEFAULT; + StarlarkSemantics.Builder semanticsBuilder = StarlarkSemantics.builder(); + if (name.equals("set.star")) { + semanticsBuilder.setBool(StarlarkSemantics.EXPERIMENTAL_ENABLE_STARLARK_SET, true); + } + if (useLatin1) { + semanticsBuilder.setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_STRINGS_ARE_BYTES, true); + } + StarlarkSemantics semantics = semanticsBuilder.build(); Module module = Module.withPredeclared(semantics, predeclared.buildOrThrow()); try (Mutability mu = Mutability.createAllowingShallowFreeze("test")) { StarlarkThread thread = StarlarkThread.createTransient(mu, semantics); From 1f2bb58865c83b6228d5a43fbc0cb4b12ff01e19 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Thu, 21 Nov 2024 20:46:57 +0100 Subject: [PATCH 2/5] Fix JSON tests --- .../starlark/java/eval/StarlarkSemantics.java | 4 +- .../java/net/starlark/java/lib/json/Json.java | 100 ++++++++++++++---- src/test/java/net/starlark/java/eval/BUILD | 2 +- .../net/starlark/java/eval/ScriptTest.java | 14 ++- .../net/starlark/java/eval/testdata/json.star | 3 +- .../starlark/java/eval/testdata/sorted.star | 2 +- 6 files changed, 93 insertions(+), 32 deletions(-) diff --git a/src/main/java/net/starlark/java/eval/StarlarkSemantics.java b/src/main/java/net/starlark/java/eval/StarlarkSemantics.java index ef98716ba88456..ac2fe6f79e9c62 100644 --- a/src/main/java/net/starlark/java/eval/StarlarkSemantics.java +++ b/src/main/java/net/starlark/java/eval/StarlarkSemantics.java @@ -263,6 +263,6 @@ public final String toString() { /** Whether StarlarkSet objects may be constructed by the interpreter. */ public static final String EXPERIMENTAL_ENABLE_STARLARK_SET = "-experimental_enable_starlark_set"; - public static final String INTERNAL_BAZEL_ONLY_STRINGS_ARE_BYTES = - "-internal_bazel_only_strings_are_bytes"; + public static final String INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS = + "-internal_bazel_only_utf_8_byte_strings"; } diff --git a/src/main/java/net/starlark/java/lib/json/Json.java b/src/main/java/net/starlark/java/lib/json/Json.java index a21338b6dc7ea2..da247bfca9f566 100644 --- a/src/main/java/net/starlark/java/lib/json/Json.java +++ b/src/main/java/net/starlark/java/lib/json/Json.java @@ -14,6 +14,9 @@ package net.starlark.java.lib.json; +import static java.nio.charset.StandardCharsets.ISO_8859_1; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.util.Arrays; import java.util.Map; import net.starlark.java.annot.Param; @@ -27,6 +30,7 @@ import net.starlark.java.eval.StarlarkInt; import net.starlark.java.eval.StarlarkIterable; import net.starlark.java.eval.StarlarkList; +import net.starlark.java.eval.StarlarkSemantics; import net.starlark.java.eval.StarlarkThread; import net.starlark.java.eval.StarlarkValue; import net.starlark.java.eval.Structure; @@ -89,9 +93,14 @@ public interface Encodable { + "\n" + "An application-defined type may define its own JSON encoding.\n" + "Encoding any other value yields an error.\n", - parameters = {@Param(name = "x")}) - public String encode(Object x) throws EvalException { - Encoder enc = new Encoder(); + parameters = {@Param(name = "x")}, + useStarlarkThread = true) + public String encode(Object x, StarlarkThread starlarkThread) throws EvalException { + Encoder enc = + new Encoder( + starlarkThread + .getSemantics() + .getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS)); try { enc.encode(x); } catch (StackOverflowError unused) { @@ -103,6 +112,11 @@ public String encode(Object x) throws EvalException { private static final class Encoder { private final StringBuilder out = new StringBuilder(); + private final boolean utf8ByteStrings; + + public Encoder(boolean utf8ByteStrings) { + this.utf8ByteStrings = utf8ByteStrings; + } private void encode(Object x) throws EvalException { if (x == Starlark.NONE) { @@ -302,7 +316,13 @@ private void appendQuoted(String s) { useStarlarkThread = true) public Object decode(String x, Object defaultValue, StarlarkThread thread) throws EvalException { try { - return new Decoder(thread.mutability(), x).decode(); + return new Decoder( + thread.mutability(), + x, + thread + .getSemantics() + .getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS)) + .decode(); } catch (EvalException e) { if (defaultValue != Starlark.UNBOUND) { return defaultValue; @@ -321,11 +341,13 @@ private static final class Decoder { private final Mutability mu; private final String s; // the input string + private final boolean utf8ByteStrings; private int i = 0; // current index in s - private Decoder(Mutability mu, String s) { + private Decoder(Mutability mu, String s, boolean utf8ByteStrings) { this.mu = mu; this.s = s; + this.utf8ByteStrings = utf8ByteStrings; } // decode is the entry point into the decoder. @@ -492,23 +514,33 @@ private String parseString() throws EvalException { if (i + 4 >= s.length()) { throw Starlark.errorf("incomplete \\uXXXX escape"); } - int hex = 0; - for (int j = 0; j < 4; j++) { - c = s.charAt(i + j); - int nybble = 0; - if (isdigit(c)) { - nybble = c - '0'; - } else if ('a' <= c && c <= 'f') { - nybble = 10 + c - 'a'; - } else if ('A' <= c && c <= 'F') { - nybble = 10 + c - 'A'; - } else { - throw Starlark.errorf("invalid hex char %s in \\uXXXX escape", quoteChar(c)); + int hex = parseUnicodeEscape(); + if (utf8ByteStrings) { + String unicodeString; + try { + unicodeString = Character.toString(hex); + } catch (IllegalArgumentException unused) { + unicodeString = Character.toString(0xFFFD); } - hex = (hex << 4) | nybble; + if (Character.MIN_SURROGATE <= hex && hex <= Character.MAX_SURROGATE) { + // If the code point is a surrogate, consume the next \\uXXXX escape. + if (i + 6 >= s.length() || s.charAt(i) != '\\' || s.charAt(i + 1) != 'u') { + unicodeString = Character.toString(0xFFFD); + } else { + i += 2; + int hex2 = parseUnicodeEscape(); + if (Character.MIN_SURROGATE <= hex2 && hex2 <= Character.MAX_SURROGATE) { + unicodeString += Character.toString(hex2); + } else { + unicodeString += Character.toString(0xFFFD); + } + } + } + // Append the escaped Unicode code point as a UTF-8 byte sequence. + str.append(new String(unicodeString.getBytes(UTF_8), ISO_8859_1)); + } else { + str.append((char) hex); } - str.append((char) hex); - i += 4; break; default: throw Starlark.errorf("invalid escape '\\%s'", c); @@ -517,6 +549,26 @@ private String parseString() throws EvalException { throw Starlark.errorf("unclosed string literal"); } + private int parseUnicodeEscape() throws EvalException { + int hex = 0; + for (int j = 0; j < 4; j++) { + char c = s.charAt(i + j); + int nybble = 0; + if (isdigit(c)) { + nybble = c - '0'; + } else if ('a' <= c && c <= 'f') { + nybble = 10 + c - 'a'; + } else if ('A' <= c && c <= 'F') { + nybble = 10 + c - 'A'; + } else { + throw Starlark.errorf("invalid hex char %s in \\uXXXX escape", quoteChar(c)); + } + hex = (hex << 4) | nybble; + } + i += 4; + return hex; + } + private Object parseNumber(char c) throws EvalException { // For now, allow any sequence of [0-9.eE+-]*. boolean isfloat = false; // whether digit string contains [.Ee+-] (other than leading minus) @@ -624,9 +676,11 @@ public String indent(String s, String prefix, String indent) throws EvalExceptio @Param(name = "x"), @Param(name = "prefix", positional = false, named = true, defaultValue = "''"), @Param(name = "indent", positional = false, named = true, defaultValue = "'\\t'"), - }) - public String encodeIndent(Object x, String prefix, String indent) throws EvalException { - return indent(encode(x), prefix, indent); + }, + useStarlarkThread = true) + public String encodeIndent(Object x, String prefix, String indent, StarlarkThread starlarkThread) + throws EvalException { + return indent(encode(x, starlarkThread), prefix, indent); } private static final class Indenter { diff --git a/src/test/java/net/starlark/java/eval/BUILD b/src/test/java/net/starlark/java/eval/BUILD index 75d84133b5ac5b..9712c19e394b7d 100644 --- a/src/test/java/net/starlark/java/eval/BUILD +++ b/src/test/java/net/starlark/java/eval/BUILD @@ -74,7 +74,7 @@ java_test( java_test( name = "ScriptTest_Latin1", - jvm_flags = ["-Dnet.starlark.java.eval.ScriptTest.latin1=true"], + jvm_flags = ["-Dnet.starlark.java.eval.ScriptTest.utf8ByteStrings=true"], main_class = "net.starlark.java.eval.ScriptTest", use_testrunner = False, runtime_deps = [ diff --git a/src/test/java/net/starlark/java/eval/ScriptTest.java b/src/test/java/net/starlark/java/eval/ScriptTest.java index 5ab6ed829f87e0..8fa88c903480dc 100644 --- a/src/test/java/net/starlark/java/eval/ScriptTest.java +++ b/src/test/java/net/starlark/java/eval/ScriptTest.java @@ -14,6 +14,7 @@ package net.starlark.java.eval; +import static java.nio.charset.StandardCharsets.ISO_8859_1; import static java.nio.charset.StandardCharsets.UTF_8; import com.google.common.base.Splitter; @@ -223,9 +224,9 @@ public static void main(String[] args) throws Exception { } // parse & execute - boolean useLatin1 = Boolean.getBoolean("net.starlark.java.eval.ScriptTest.latin1"); + boolean utf8ByteStrings = Boolean.getBoolean("net.starlark.java.eval.ScriptTest.utf8ByteStrings"); ParserInput input; - if (useLatin1) { + if (utf8ByteStrings) { input = ParserInput.fromLatin1(buf.toString().getBytes(UTF_8), file.toString()); } else { input = ParserInput.fromString(buf.toString(), file.toString()); @@ -233,6 +234,7 @@ public static void main(String[] args) throws Exception { ImmutableMap.Builder predeclared = ImmutableMap.builder(); Starlark.addMethods(predeclared, new ScriptTest()); // e.g. assert_eq predeclared.put("json", Json.INSTANCE); + predeclared.put("_utf8_byte_strings", utf8ByteStrings); // TODO(b/376078033): remove special set.star handling once Starlark sets are enabled by // default. @@ -240,8 +242,8 @@ public static void main(String[] args) throws Exception { if (name.equals("set.star")) { semanticsBuilder.setBool(StarlarkSemantics.EXPERIMENTAL_ENABLE_STARLARK_SET, true); } - if (useLatin1) { - semanticsBuilder.setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_STRINGS_ARE_BYTES, true); + if (utf8ByteStrings) { + semanticsBuilder.setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, true); } StarlarkSemantics semantics = semanticsBuilder.build(); Module module = Module.withPredeclared(semantics, predeclared.buildOrThrow()); @@ -310,6 +312,10 @@ private static void reportError(StarlarkThread thread, String message) { for (StarlarkThread.CallStackEntry fr : stack) { System.err.printf("%s: called from %s\n", fr.location, fr.name); } + if (thread.getSemantics().getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS)) { + // Reencode the message for display. + message = new String(message.getBytes(ISO_8859_1), UTF_8); + } System.err.println("Error: " + message); ok = false; } diff --git a/src/test/java/net/starlark/java/eval/testdata/json.star b/src/test/java/net/starlark/java/eval/testdata/json.star index 0ce226c72c9905..47d5bfac502e52 100644 --- a/src/test/java/net/starlark/java/eval/testdata/json.star +++ b/src/test/java/net/starlark/java/eval/testdata/json.star @@ -25,7 +25,8 @@ assert_eq(json.encode("\""), r'"\""') assert_eq(json.encode("/"), '"/"') assert_eq(json.encode("\\"), r'"\\"') assert_eq(json.encode(""), '""') -assert_eq(json.encode("😹"[:1]), '"�"') # invalid UTF-16 -> replacement char U+FFFD +# TODO: Invalid UTF-8 byte sequences are not replaced with U+FFFD. +assert_eq(json.encode("😹"[:1]), '"�"') if not _utf8_byte_strings else None # invalid UTF-16 -> replacement char U+FFFD assert_eq(json.encode([1, 2, 3]), "[1,2,3]") assert_eq(json.encode((1, 2, 3)), "[1,2,3]") diff --git a/src/test/java/net/starlark/java/eval/testdata/sorted.star b/src/test/java/net/starlark/java/eval/testdata/sorted.star index d7446794b5a5aa..d5bde7ad5879ba 100644 --- a/src/test/java/net/starlark/java/eval/testdata/sorted.star +++ b/src/test/java/net/starlark/java/eval/testdata/sorted.star @@ -102,7 +102,7 @@ assert_([None] <= [None]) # U+FFFD � = [FFFD] REPLACEMENT CHAR # U+1F33F 🌿 = [D83C DF3F] HERB # The first compares greater than the second. -assert_eq(sorted(["�", "🌿"]), ["🌿", "�"]) +assert_eq(sorted(["�", "🌿"]), ["�", "🌿"] if _utf8_byte_strings else ["🌿", "�"]) assert_(False < True) assert_fails(lambda: False < 1, "unsupported comparison: bool <=> int") From 67867c99e7f32ff812aa93bf43c343d8feefe68e Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Thu, 21 Nov 2024 20:48:38 +0100 Subject: [PATCH 3/5] Enable in Bazel --- .../semantics/BuildLanguageOptions.java | 1 + .../java/net/starlark/java/eval/Starlark.java | 11 ++-- .../net/starlark/java/eval/StringModule.java | 57 ++++++++++++++----- .../java/eval/testdata/string_misc.star | 10 ++++ 4 files changed, 60 insertions(+), 19 deletions(-) diff --git a/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java b/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java index 39ffa6524a6719..dd32b6a1f24877 100644 --- a/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java +++ b/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java @@ -849,6 +849,7 @@ public StarlarkSemantics toStarlarkSemantics() { // This function connects command-line flags to their corresponding StarlarkSemantics keys. StarlarkSemantics semantics = StarlarkSemantics.builder() + .setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, true) .setBool(EXPERIMENTAL_JAVA_LIBRARY_EXPORT, experimentalJavaLibraryExport) .setBool( INCOMPATIBLE_STOP_EXPORTING_LANGUAGE_MODULES, diff --git a/src/main/java/net/starlark/java/eval/Starlark.java b/src/main/java/net/starlark/java/eval/Starlark.java index e36e7da6d196c4..0a47a196431583 100644 --- a/src/main/java/net/starlark/java/eval/Starlark.java +++ b/src/main/java/net/starlark/java/eval/Starlark.java @@ -589,11 +589,12 @@ public static String trimDocString(String docString) { return ""; } // First line is special: we fully strip it and ignore it for leading spaces calculation - String firstLineTrimmed = StringModule.INSTANCE.strip(lines.get(0), NONE); + String firstLineTrimmed = + StringModule.INSTANCE.stripSemantics(lines.get(0), NONE, StarlarkSemantics.DEFAULT); Iterable subsequentLines = Iterables.skip(lines, 1); int minLeadingSpaces = Integer.MAX_VALUE; for (String line : subsequentLines) { - String strippedLeading = StringModule.INSTANCE.lstrip(line, NONE); + String strippedLeading = StringModule.INSTANCE.lstripSemantics(line, NONE, StarlarkSemantics.DEFAULT); if (!strippedLeading.isEmpty()) { int leadingSpaces = line.length() - strippedLeading.length(); minLeadingSpaces = min(leadingSpaces, minLeadingSpaces); @@ -611,11 +612,13 @@ public static String trimDocString(String docString) { result.append("\n"); } if (line.length() > minLeadingSpaces) { - result.append(StringModule.INSTANCE.rstrip(line.substring(minLeadingSpaces), NONE)); + result.append( + StringModule.INSTANCE.rstripSemantics( + line.substring(minLeadingSpaces), NONE, StarlarkSemantics.DEFAULT)); } } // Remove trailing empty lines - return StringModule.INSTANCE.rstrip(result.toString(), NONE); + return StringModule.INSTANCE.rstripSemantics(result.toString(), NONE, StarlarkSemantics.DEFAULT); } /** diff --git a/src/main/java/net/starlark/java/eval/StringModule.java b/src/main/java/net/starlark/java/eval/StringModule.java index 3d481e50158ac1..3f7c812ea89c35 100644 --- a/src/main/java/net/starlark/java/eval/StringModule.java +++ b/src/main/java/net/starlark/java/eval/StringModule.java @@ -191,6 +191,11 @@ public String upper(String self) { "\u0009" + "\n" + "\u000B" + "\u000C" + "\r" + "\u001C" + "\u001D" + "\u001E" + "\u001F " + "\u0085" + "\u00A0"); + private static final CharMatcher ASCII_WHITESPACE = + CharMatcher.anyOf( + "\u0009" + "\n" + "\u000B" + "\u000C" + "\r" + "\u001C" + "\u001D" + "\u001E" + + "\u001F "); + private static String stringLStrip(String self, CharMatcher matcher) { for (int i = 0; i < self.length(); i++) { if (!matcher.matches(self.charAt(i))) { @@ -232,11 +237,15 @@ private static String stringStrip(String self, CharMatcher matcher) { }, doc = "The characters to remove, or all whitespace if None.", defaultValue = "None") - }) - public String lstrip(String self, Object charsOrNone) { - CharMatcher matcher = - charsOrNone != Starlark.NONE ? CharMatcher.anyOf((String) charsOrNone) : LATIN1_WHITESPACE; - return stringLStrip(self, matcher); + }, + useStarlarkThread = true) + public String lstrip(String self, Object charsOrNone, StarlarkThread starlarkThread) { + return lstripSemantics(self, charsOrNone, starlarkThread.getSemantics()); + } + + public String lstripSemantics( + String self, Object charsOrNone, StarlarkSemantics starlarkSemantics) { + return stringLStrip(self, matcher(charsOrNone, starlarkSemantics)); } @StarlarkMethod( @@ -258,11 +267,15 @@ public String lstrip(String self, Object charsOrNone) { }, doc = "The characters to remove, or all whitespace if None.", defaultValue = "None") - }) - public String rstrip(String self, Object charsOrNone) { - CharMatcher matcher = - charsOrNone != Starlark.NONE ? CharMatcher.anyOf((String) charsOrNone) : LATIN1_WHITESPACE; - return stringRStrip(self, matcher); + }, + useStarlarkThread = true) + public String rstrip(String self, Object charsOrNone, StarlarkThread starlarkThread) { + return rstripSemantics(self, charsOrNone, starlarkThread.getSemantics()); + } + + public String rstripSemantics( + String self, Object charsOrNone, StarlarkSemantics starlarkSemantics) { + return stringRStrip(self, matcher(charsOrNone, starlarkSemantics)); } @StarlarkMethod( @@ -285,11 +298,25 @@ public String rstrip(String self, Object charsOrNone) { }, doc = "The characters to remove, or all whitespace if None.", defaultValue = "None") - }) - public String strip(String self, Object charsOrNone) { - CharMatcher matcher = - charsOrNone != Starlark.NONE ? CharMatcher.anyOf((String) charsOrNone) : LATIN1_WHITESPACE; - return stringStrip(self, matcher); + }, + useStarlarkThread = true) + public String strip(String self, Object charsOrNone, StarlarkThread starlarkThread) { + return stripSemantics(self, charsOrNone, starlarkThread.getSemantics()); + } + + public String stripSemantics( + String self, Object charsOrNone, StarlarkSemantics starlarkSemantics) { + return stringStrip(self, matcher(charsOrNone, starlarkSemantics)); + } + + private static CharMatcher matcher(Object charsOrNone, StarlarkSemantics starlarkSemantics) { + return charsOrNone != Starlark.NONE + // TODO: This doesn't work correctly for non-ASCII characters in charsOrNone if using UTF-8 + // byte strings. + ? CharMatcher.anyOf((String) charsOrNone) + : (starlarkSemantics.getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS) + ? ASCII_WHITESPACE + : LATIN1_WHITESPACE); } @StarlarkMethod( diff --git a/src/test/java/net/starlark/java/eval/testdata/string_misc.star b/src/test/java/net/starlark/java/eval/testdata/string_misc.star index 5660205e2d76dd..dc97c7cc61991f 100644 --- a/src/test/java/net/starlark/java/eval/testdata/string_misc.star +++ b/src/test/java/net/starlark/java/eval/testdata/string_misc.star @@ -187,10 +187,12 @@ assert_eq("".removeprefix(""), "") assert_eq("".removeprefix("a"), "") assert_eq("Apricot".removeprefix("pr"), "Apricot") assert_eq("AprApricot".removeprefix("Apr"), "Apricot") + def removeprefix_self_unmodified(): original_string = "Apricot" assert_eq(original_string.removeprefix("Apr"), "icot") assert_eq(original_string, "Apricot") + removeprefix_self_unmodified() assert_fails(lambda: "1234".removeprefix(1), "got value of type 'int', want 'string") @@ -203,9 +205,17 @@ assert_eq("".removesuffix(""), "") assert_eq("".removesuffix("a"), "") assert_eq("Apricot".removesuffix("co"), "Apricot") assert_eq("Apricotcot".removesuffix("cot"), "Apricot") + def removesuffix_self_unmodified(): original_string = "Apricot" assert_eq(original_string.removesuffix("cot"), "Apri") assert_eq(original_string, "Apricot") + removesuffix_self_unmodified() assert_fails(lambda: "1234".removesuffix(4), "got value of type 'int', want 'string") + +# strip +assert_eq(" abc ".strip(), "abc") +assert_eq("薠".strip(), "薠") +assert_eq("薠".lstrip(), "薠") +assert_eq("薠".rstrip(), "薠") From 78b8b6379b127e24e0bdd2dd38f3505b675f9588 Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Sat, 23 Nov 2024 23:35:55 +0100 Subject: [PATCH 4/5] Fix test --- .../lib/packages/semantics/BuildLanguageOptions.java | 11 +++++++++++ .../build/lib/packages/semantics/ConsistencyTest.java | 8 +++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java b/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java index dd32b6a1f24877..6891676f1841d8 100644 --- a/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java +++ b/src/main/java/com/google/devtools/build/lib/packages/semantics/BuildLanguageOptions.java @@ -836,6 +836,14 @@ public final class BuildLanguageOptions extends OptionsBase { help = "If true, enable the set data type and set() constructor in Starlark.") public boolean experimentalEnableStarlarkSet; + @Option( + name = "internal_starlark_utf_8_byte_strings", + defaultValue = "true", + documentationCategory = OptionDocumentationCategory.UNDOCUMENTED, + effectTags = {OptionEffectTag.BUILD_FILE_SEMANTICS}, + metadataTags = {OptionMetadataTag.HIDDEN}) + public boolean internalStarlarkUtf8ByteStrings; + /** * An interner to reduce the number of StarlarkSemantics instances. A single Blaze instance should * never accumulate a large number of these and being able to shortcut on object identity makes a @@ -953,6 +961,9 @@ public StarlarkSemantics toStarlarkSemantics() { incompatibleSimplifyUnconditionalSelectsInRuleAttrs) .setBool( StarlarkSemantics.EXPERIMENTAL_ENABLE_STARLARK_SET, experimentalEnableStarlarkSet) + .setBool( + StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, + internalStarlarkUtf8ByteStrings) .build(); return INTERNER.intern(semantics); } diff --git a/src/test/java/com/google/devtools/build/lib/packages/semantics/ConsistencyTest.java b/src/test/java/com/google/devtools/build/lib/packages/semantics/ConsistencyTest.java index d2b8299d5cd777..8d6899b7c04b14 100644 --- a/src/test/java/com/google/devtools/build/lib/packages/semantics/ConsistencyTest.java +++ b/src/test/java/com/google/devtools/build/lib/packages/semantics/ConsistencyTest.java @@ -96,7 +96,11 @@ public void serializationRoundTrip() throws Exception { @Test public void checkDefaultsMatch() { BuildLanguageOptions defaultOptions = Options.getDefaults(BuildLanguageOptions.class); - StarlarkSemantics defaultSemantics = StarlarkSemantics.DEFAULT; + StarlarkSemantics defaultSemantics = + StarlarkSemantics.DEFAULT.toBuilder() + // This flag must be false in Starlark, but true in Bazel by default. + .setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, true) + .build(); StarlarkSemantics semanticsFromOptions = defaultOptions.toStarlarkSemantics(); assertThat(semanticsFromOptions).isEqualTo(defaultSemantics); } @@ -153,6 +157,7 @@ private static BuildLanguageOptions buildRandomOptions(Random rand) throws Excep "--incompatible_use_cc_configure_from_rules_cc=" + rand.nextBoolean(), "--incompatible_unambiguous_label_stringification=" + rand.nextBoolean(), "--internal_starlark_flag_test_canary=" + rand.nextBoolean(), + "--internal_starlark_utf_8_byte_strings=" + rand.nextBoolean(), "--max_computation_steps=" + rand.nextLong()); } @@ -208,6 +213,7 @@ private static StarlarkSemantics buildRandomSemantics(Random rand) { .setBool( BuildLanguageOptions.INCOMPATIBLE_UNAMBIGUOUS_LABEL_STRINGIFICATION, rand.nextBoolean()) .setBool(StarlarkSemantics.PRINT_TEST_MARKER, rand.nextBoolean()) + .setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, rand.nextBoolean()) .set(BuildLanguageOptions.MAX_COMPUTATION_STEPS, rand.nextLong()) .build(); } From 0435c0979c15151df1dd762bfc8162b98630cf1a Mon Sep 17 00:00:00 2001 From: Fabian Meumertzheim Date: Mon, 25 Nov 2024 16:40:56 +0100 Subject: [PATCH 5/5] Drop Encoder argument --- .../java/net/starlark/java/lib/json/Json.java | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/main/java/net/starlark/java/lib/json/Json.java b/src/main/java/net/starlark/java/lib/json/Json.java index da247bfca9f566..6b8eabb2d9a5d9 100644 --- a/src/main/java/net/starlark/java/lib/json/Json.java +++ b/src/main/java/net/starlark/java/lib/json/Json.java @@ -93,14 +93,9 @@ public interface Encodable { + "\n" + "An application-defined type may define its own JSON encoding.\n" + "Encoding any other value yields an error.\n", - parameters = {@Param(name = "x")}, - useStarlarkThread = true) - public String encode(Object x, StarlarkThread starlarkThread) throws EvalException { - Encoder enc = - new Encoder( - starlarkThread - .getSemantics() - .getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS)); + parameters = {@Param(name = "x")}) + public String encode(Object x) throws EvalException { + Encoder enc = new Encoder(); try { enc.encode(x); } catch (StackOverflowError unused) { @@ -112,11 +107,6 @@ public String encode(Object x, StarlarkThread starlarkThread) throws EvalExcepti private static final class Encoder { private final StringBuilder out = new StringBuilder(); - private final boolean utf8ByteStrings; - - public Encoder(boolean utf8ByteStrings) { - this.utf8ByteStrings = utf8ByteStrings; - } private void encode(Object x) throws EvalException { if (x == Starlark.NONE) { @@ -680,7 +670,7 @@ public String indent(String s, String prefix, String indent) throws EvalExceptio useStarlarkThread = true) public String encodeIndent(Object x, String prefix, String indent, StarlarkThread starlarkThread) throws EvalException { - return indent(encode(x, starlarkThread), prefix, indent); + return indent(encode(x), prefix, indent); } private static final class Indenter {