Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Unicode encoding issues in Bazel's use of Starlark #24417

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,14 @@ public final class BuildLanguageOptions extends OptionsBase {
help = "If true, enable the set data type and set() constructor in Starlark.")
public boolean experimentalEnableStarlarkSet;

@Option(
name = "internal_starlark_utf_8_byte_strings",
defaultValue = "true",
documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
effectTags = {OptionEffectTag.BUILD_FILE_SEMANTICS},
metadataTags = {OptionMetadataTag.HIDDEN})
public boolean internalStarlarkUtf8ByteStrings;

/**
* An interner to reduce the number of StarlarkSemantics instances. A single Blaze instance should
* never accumulate a large number of these and being able to shortcut on object identity makes a
Expand All @@ -849,6 +857,7 @@ public StarlarkSemantics toStarlarkSemantics() {
// This function connects command-line flags to their corresponding StarlarkSemantics keys.
StarlarkSemantics semantics =
StarlarkSemantics.builder()
.setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, true)
.setBool(EXPERIMENTAL_JAVA_LIBRARY_EXPORT, experimentalJavaLibraryExport)
.setBool(
INCOMPATIBLE_STOP_EXPORTING_LANGUAGE_MODULES,
Expand Down Expand Up @@ -952,6 +961,9 @@ public StarlarkSemantics toStarlarkSemantics() {
incompatibleSimplifyUnconditionalSelectsInRuleAttrs)
.setBool(
StarlarkSemantics.EXPERIMENTAL_ENABLE_STARLARK_SET, experimentalEnableStarlarkSet)
.setBool(
StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS,
internalStarlarkUtf8ByteStrings)
.build();
return INTERNER.intern(semantics);
}
Expand Down
11 changes: 7 additions & 4 deletions src/main/java/net/starlark/java/eval/Starlark.java
Original file line number Diff line number Diff line change
Expand Up @@ -589,11 +589,12 @@ public static String trimDocString(String docString) {
return "";
}
// First line is special: we fully strip it and ignore it for leading spaces calculation
String firstLineTrimmed = StringModule.INSTANCE.strip(lines.get(0), NONE);
String firstLineTrimmed =
StringModule.INSTANCE.stripSemantics(lines.get(0), NONE, StarlarkSemantics.DEFAULT);
Iterable<String> subsequentLines = Iterables.skip(lines, 1);
int minLeadingSpaces = Integer.MAX_VALUE;
for (String line : subsequentLines) {
String strippedLeading = StringModule.INSTANCE.lstrip(line, NONE);
String strippedLeading = StringModule.INSTANCE.lstripSemantics(line, NONE, StarlarkSemantics.DEFAULT);
if (!strippedLeading.isEmpty()) {
int leadingSpaces = line.length() - strippedLeading.length();
minLeadingSpaces = min(leadingSpaces, minLeadingSpaces);
Expand All @@ -611,11 +612,13 @@ public static String trimDocString(String docString) {
result.append("\n");
}
if (line.length() > minLeadingSpaces) {
result.append(StringModule.INSTANCE.rstrip(line.substring(minLeadingSpaces), NONE));
result.append(
StringModule.INSTANCE.rstripSemantics(
line.substring(minLeadingSpaces), NONE, StarlarkSemantics.DEFAULT));
}
}
// Remove trailing empty lines
return StringModule.INSTANCE.rstrip(result.toString(), NONE);
return StringModule.INSTANCE.rstripSemantics(result.toString(), NONE, StarlarkSemantics.DEFAULT);
}

/**
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/net/starlark/java/eval/StarlarkSemantics.java
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,9 @@ public Builder setBool(String name, boolean value) {

/** Returns an immutable StarlarkSemantics. */
public StarlarkSemantics build() {
if (map.isEmpty()) {
return DEFAULT;
}
return new StarlarkSemantics(ImmutableMap.copyOf(map));
}
}
Expand Down Expand Up @@ -259,4 +262,7 @@ public final String toString() {

/** Whether StarlarkSet objects may be constructed by the interpreter. */
public static final String EXPERIMENTAL_ENABLE_STARLARK_SET = "-experimental_enable_starlark_set";

public static final String INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS =
"-internal_bazel_only_utf_8_byte_strings";
}
57 changes: 42 additions & 15 deletions src/main/java/net/starlark/java/eval/StringModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,11 @@ public String upper(String self) {
"\u0009" + "\n" + "\u000B" + "\u000C" + "\r" + "\u001C" + "\u001D" + "\u001E" + "\u001F "
+ "\u0085" + "\u00A0");

private static final CharMatcher ASCII_WHITESPACE =
CharMatcher.anyOf(
"\u0009" + "\n" + "\u000B" + "\u000C" + "\r" + "\u001C" + "\u001D" + "\u001E"
+ "\u001F ");

private static String stringLStrip(String self, CharMatcher matcher) {
for (int i = 0; i < self.length(); i++) {
if (!matcher.matches(self.charAt(i))) {
Expand Down Expand Up @@ -232,11 +237,15 @@ private static String stringStrip(String self, CharMatcher matcher) {
},
doc = "The characters to remove, or all whitespace if None.",
defaultValue = "None")
})
public String lstrip(String self, Object charsOrNone) {
CharMatcher matcher =
charsOrNone != Starlark.NONE ? CharMatcher.anyOf((String) charsOrNone) : LATIN1_WHITESPACE;
return stringLStrip(self, matcher);
},
useStarlarkThread = true)
public String lstrip(String self, Object charsOrNone, StarlarkThread starlarkThread) {
return lstripSemantics(self, charsOrNone, starlarkThread.getSemantics());
}

public String lstripSemantics(
String self, Object charsOrNone, StarlarkSemantics starlarkSemantics) {
return stringLStrip(self, matcher(charsOrNone, starlarkSemantics));
}

@StarlarkMethod(
Expand All @@ -258,11 +267,15 @@ public String lstrip(String self, Object charsOrNone) {
},
doc = "The characters to remove, or all whitespace if None.",
defaultValue = "None")
})
public String rstrip(String self, Object charsOrNone) {
CharMatcher matcher =
charsOrNone != Starlark.NONE ? CharMatcher.anyOf((String) charsOrNone) : LATIN1_WHITESPACE;
return stringRStrip(self, matcher);
},
useStarlarkThread = true)
public String rstrip(String self, Object charsOrNone, StarlarkThread starlarkThread) {
return rstripSemantics(self, charsOrNone, starlarkThread.getSemantics());
}

public String rstripSemantics(
String self, Object charsOrNone, StarlarkSemantics starlarkSemantics) {
return stringRStrip(self, matcher(charsOrNone, starlarkSemantics));
}

@StarlarkMethod(
Expand All @@ -285,11 +298,25 @@ public String rstrip(String self, Object charsOrNone) {
},
doc = "The characters to remove, or all whitespace if None.",
defaultValue = "None")
})
public String strip(String self, Object charsOrNone) {
CharMatcher matcher =
charsOrNone != Starlark.NONE ? CharMatcher.anyOf((String) charsOrNone) : LATIN1_WHITESPACE;
return stringStrip(self, matcher);
},
useStarlarkThread = true)
public String strip(String self, Object charsOrNone, StarlarkThread starlarkThread) {
return stripSemantics(self, charsOrNone, starlarkThread.getSemantics());
}

public String stripSemantics(
String self, Object charsOrNone, StarlarkSemantics starlarkSemantics) {
return stringStrip(self, matcher(charsOrNone, starlarkSemantics));
}

private static CharMatcher matcher(Object charsOrNone, StarlarkSemantics starlarkSemantics) {
return charsOrNone != Starlark.NONE
// TODO: This doesn't work correctly for non-ASCII characters in charsOrNone if using UTF-8
// byte strings.
? CharMatcher.anyOf((String) charsOrNone)
: (starlarkSemantics.getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS)
? ASCII_WHITESPACE
: LATIN1_WHITESPACE);
}

@StarlarkMethod(
Expand Down
82 changes: 63 additions & 19 deletions src/main/java/net/starlark/java/lib/json/Json.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

package net.starlark.java.lib.json;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.util.Arrays;
import java.util.Map;
import net.starlark.java.annot.Param;
Expand All @@ -27,6 +30,7 @@
import net.starlark.java.eval.StarlarkInt;
import net.starlark.java.eval.StarlarkIterable;
import net.starlark.java.eval.StarlarkList;
import net.starlark.java.eval.StarlarkSemantics;
import net.starlark.java.eval.StarlarkThread;
import net.starlark.java.eval.StarlarkValue;
import net.starlark.java.eval.Structure;
Expand Down Expand Up @@ -302,7 +306,13 @@ private void appendQuoted(String s) {
useStarlarkThread = true)
public Object decode(String x, Object defaultValue, StarlarkThread thread) throws EvalException {
try {
return new Decoder(thread.mutability(), x).decode();
return new Decoder(
thread.mutability(),
x,
thread
.getSemantics()
.getBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS))
.decode();
} catch (EvalException e) {
if (defaultValue != Starlark.UNBOUND) {
return defaultValue;
Expand All @@ -321,11 +331,13 @@ private static final class Decoder {

private final Mutability mu;
private final String s; // the input string
private final boolean utf8ByteStrings;
private int i = 0; // current index in s

private Decoder(Mutability mu, String s) {
private Decoder(Mutability mu, String s, boolean utf8ByteStrings) {
this.mu = mu;
this.s = s;
this.utf8ByteStrings = utf8ByteStrings;
}

// decode is the entry point into the decoder.
Expand Down Expand Up @@ -492,23 +504,33 @@ private String parseString() throws EvalException {
if (i + 4 >= s.length()) {
throw Starlark.errorf("incomplete \\uXXXX escape");
}
int hex = 0;
for (int j = 0; j < 4; j++) {
c = s.charAt(i + j);
int nybble = 0;
if (isdigit(c)) {
nybble = c - '0';
} else if ('a' <= c && c <= 'f') {
nybble = 10 + c - 'a';
} else if ('A' <= c && c <= 'F') {
nybble = 10 + c - 'A';
} else {
throw Starlark.errorf("invalid hex char %s in \\uXXXX escape", quoteChar(c));
int hex = parseUnicodeEscape();
if (utf8ByteStrings) {
String unicodeString;
try {
unicodeString = Character.toString(hex);
} catch (IllegalArgumentException unused) {
unicodeString = Character.toString(0xFFFD);
}
if (Character.MIN_SURROGATE <= hex && hex <= Character.MAX_SURROGATE) {
// If the code point is a surrogate, consume the next \\uXXXX escape.
if (i + 6 >= s.length() || s.charAt(i) != '\\' || s.charAt(i + 1) != 'u') {
unicodeString = Character.toString(0xFFFD);
} else {
i += 2;
int hex2 = parseUnicodeEscape();
if (Character.MIN_SURROGATE <= hex2 && hex2 <= Character.MAX_SURROGATE) {
unicodeString += Character.toString(hex2);
} else {
unicodeString += Character.toString(0xFFFD);
}
}
}
hex = (hex << 4) | nybble;
// Append the escaped Unicode code point as a UTF-8 byte sequence.
str.append(new String(unicodeString.getBytes(UTF_8), ISO_8859_1));
} else {
str.append((char) hex);
}
str.append((char) hex);
i += 4;
break;
default:
throw Starlark.errorf("invalid escape '\\%s'", c);
Expand All @@ -517,6 +539,26 @@ private String parseString() throws EvalException {
throw Starlark.errorf("unclosed string literal");
}

private int parseUnicodeEscape() throws EvalException {
int hex = 0;
for (int j = 0; j < 4; j++) {
char c = s.charAt(i + j);
int nybble = 0;
if (isdigit(c)) {
nybble = c - '0';
} else if ('a' <= c && c <= 'f') {
nybble = 10 + c - 'a';
} else if ('A' <= c && c <= 'F') {
nybble = 10 + c - 'A';
} else {
throw Starlark.errorf("invalid hex char %s in \\uXXXX escape", quoteChar(c));
}
hex = (hex << 4) | nybble;
}
i += 4;
return hex;
}

private Object parseNumber(char c) throws EvalException {
// For now, allow any sequence of [0-9.eE+-]*.
boolean isfloat = false; // whether digit string contains [.Ee+-] (other than leading minus)
Expand Down Expand Up @@ -624,8 +666,10 @@ public String indent(String s, String prefix, String indent) throws EvalExceptio
@Param(name = "x"),
@Param(name = "prefix", positional = false, named = true, defaultValue = "''"),
@Param(name = "indent", positional = false, named = true, defaultValue = "'\\t'"),
})
public String encodeIndent(Object x, String prefix, String indent) throws EvalException {
},
useStarlarkThread = true)
public String encodeIndent(Object x, String prefix, String indent, StarlarkThread starlarkThread)
throws EvalException {
return indent(encode(x), prefix, indent);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ public void serializationRoundTrip() throws Exception {
@Test
public void checkDefaultsMatch() {
BuildLanguageOptions defaultOptions = Options.getDefaults(BuildLanguageOptions.class);
StarlarkSemantics defaultSemantics = StarlarkSemantics.DEFAULT;
StarlarkSemantics defaultSemantics =
StarlarkSemantics.DEFAULT.toBuilder()
// This flag must be false in Starlark, but true in Bazel by default.
.setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, true)
.build();
StarlarkSemantics semanticsFromOptions = defaultOptions.toStarlarkSemantics();
assertThat(semanticsFromOptions).isEqualTo(defaultSemantics);
}
Expand Down Expand Up @@ -153,6 +157,7 @@ private static BuildLanguageOptions buildRandomOptions(Random rand) throws Excep
"--incompatible_use_cc_configure_from_rules_cc=" + rand.nextBoolean(),
"--incompatible_unambiguous_label_stringification=" + rand.nextBoolean(),
"--internal_starlark_flag_test_canary=" + rand.nextBoolean(),
"--internal_starlark_utf_8_byte_strings=" + rand.nextBoolean(),
"--max_computation_steps=" + rand.nextLong());
}

Expand Down Expand Up @@ -208,6 +213,7 @@ private static StarlarkSemantics buildRandomSemantics(Random rand) {
.setBool(
BuildLanguageOptions.INCOMPATIBLE_UNAMBIGUOUS_LABEL_STRINGIFICATION, rand.nextBoolean())
.setBool(StarlarkSemantics.PRINT_TEST_MARKER, rand.nextBoolean())
.setBool(StarlarkSemantics.INTERNAL_BAZEL_ONLY_UTF_8_BYTE_STRINGS, rand.nextBoolean())
.set(BuildLanguageOptions.MAX_COMPUTATION_STEPS, rand.nextLong())
.build();
}
Expand Down
26 changes: 21 additions & 5 deletions src/test/java/net/starlark/java/eval/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,10 @@ java_test(
],
)

# Script-based tests of the Starlark interpreter.
java_test(
name = "ScriptTest",
java_library(
name = "ScriptTest_lib",
srcs = ["ScriptTest.java"],
data = glob(["testdata/*.star"]),
jvm_flags = ["-Dfile.encoding=UTF8"],
use_testrunner = False,
deps = [
"//src/main/java/net/starlark/java/annot",
"//src/main/java/net/starlark/java/eval",
Expand All @@ -66,6 +63,25 @@ java_test(
],
)

# Script-based tests of the Starlark interpreter.
java_test(
name = "ScriptTest",
use_testrunner = False,
runtime_deps = [
":ScriptTest_lib",
],
)

java_test(
name = "ScriptTest_Latin1",
jvm_flags = ["-Dnet.starlark.java.eval.ScriptTest.utf8ByteStrings=true"],
main_class = "net.starlark.java.eval.ScriptTest",
use_testrunner = False,
runtime_deps = [
":ScriptTest_lib",
],
)

# Script-based benchmarks of the Starlark interpreter.
java_binary(
name = "Benchmarks",
Expand Down
Loading