From 2bb646c9c7aeb489fb76123325575462a56e9267 Mon Sep 17 00:00:00 2001 From: Josef Haider Date: Mon, 17 Jul 2023 12:29:48 +0200 Subject: [PATCH 01/14] TruffleString: add ToWellFormedStringNode --- truffle/CHANGELOG.md | 1 + truffle/docs/TruffleStrings.md | 2 + .../ops/TStringToWellFormedStringTest.java | 168 +++++++++++++++ .../snapshot.sigtest | 11 +- .../api/strings/AbstractTruffleString.java | 4 + .../oracle/truffle/api/strings/Encodings.java | 4 + .../truffle/api/strings/JCodingsImpl.java | 3 +- .../api/strings/TStringInternalNodes.java | 195 +++++++++++++++++- .../truffle/api/strings/TStringOps.java | 2 +- .../truffle/api/strings/TruffleString.java | 75 ++++++- 10 files changed, 455 insertions(+), 10 deletions(-) create mode 100644 truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java diff --git a/truffle/CHANGELOG.md b/truffle/CHANGELOG.md index 4e20b099a244..54721eadc48c 100644 --- a/truffle/CHANGELOG.md +++ b/truffle/CHANGELOG.md @@ -32,6 +32,7 @@ This changelog summarizes major changes between Truffle versions relevant to lan * GR-44420 Added `TruffleLanguage.finalizeThread(Object, Thread)` to allow languages run finalization hooks for initialized threads before the context is disposed. * GR-45923 Added `EventBinding.tryAttach()` to try to attach a binding, if not disposed or attached already. * GR-39571 Added `TranscodingErrorHandler` to `TruffleString.SwitchEncodingNode`. +* GR-44464 Added `TruffleString.ToWellFormedStringNode`. ## Version 23.0.0 diff --git a/truffle/docs/TruffleStrings.md b/truffle/docs/TruffleStrings.md index 8989ab47360d..2751418b6539 100644 --- a/truffle/docs/TruffleStrings.md +++ b/truffle/docs/TruffleStrings.md @@ -108,6 +108,8 @@ Conversion: Convert a MutableTruffleString to an immutable TruffleString. 
* [AsManaged](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.AsManagedNode.html): Convert a TruffleString backed by a native pointer to one backed by a java byte array. +* [ToWellFormed](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.ToWellFormedStringNode.html): + Convert a TruffleString to a version that is encoded correctly. * [CopyToByteArray](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.CopyToByteArrayNode.html): Copy a string's content into a byte array. * [GetInternalByteArray](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.GetInternalByteArrayNode.html): diff --git a/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java b/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java new file mode 100644 index 000000000000..890e46bc090f --- /dev/null +++ b/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * The Universal Permissive License (UPL), Version 1.0 + * + * Subject to the condition set forth below, permission is hereby granted to any + * person obtaining a copy of this software, associated documentation and/or + * data (collectively the "Software"), free of charge and under any and all + * copyright rights in the Software, and any and all patent rights owned or + * freely licensable by each licensor hereunder covering either (i) the + * unmodified Software as contributed to or provided by such licensor, or (ii) + * the Larger Works (as defined below), to deal in both + * + * (a) the Software, and + * + * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if + * one is included with the Software each a "Larger Work" to which the Software + * is contributed by such licensors), + * + * without restriction, including without limitation the rights to copy, create + * derivative works of, display, perform, and distribute the Software and make, + * use, sell, offer for sale, import, export, have made, and have sold the + * Software and the Larger Work(s), and to sublicense the foregoing rights on + * either these or other terms. + * + * This license is subject to the following condition: + * + * The above copyright notice and either this complete permission notice or at a + * minimum a reference to the UPL must be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package com.oracle.truffle.api.strings.test.ops; + +import static com.oracle.truffle.api.strings.TruffleString.Encoding.BYTES; +import static com.oracle.truffle.api.strings.TruffleString.Encoding.ISO_8859_1; +import static com.oracle.truffle.api.strings.TruffleString.Encoding.US_ASCII; +import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_16; +import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_32; +import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_8; +import static com.oracle.truffle.api.strings.test.TStringTestUtil.byteArray; +import static org.junit.runners.Parameterized.Parameter; + +import java.util.Arrays; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +import com.oracle.truffle.api.strings.TruffleString; +import com.oracle.truffle.api.strings.test.TStringTestBase; + +@RunWith(Parameterized.class) +public class TStringToWellFormedStringTest extends TStringTestBase { + + @Parameter public TruffleString.ToWellFormedStringNode node; + + @Parameters(name = "{0}") + public static Iterable data() { + return Arrays.asList(TruffleString.ToWellFormedStringNode.create(), TruffleString.ToWellFormedStringNode.getUncached()); + } + + @Test + public void testAll() throws Exception { + forAllStrings(new TruffleString.Encoding[]{US_ASCII, ISO_8859_1, BYTES, UTF_8, UTF_16, UTF_32}, true, (a, array, codeRange, isValid, encoding, codepoints, byteIndices) -> { + TruffleString wellFormed = node.execute(a, encoding); + if (isValid && a instanceof TruffleString) { + Assert.assertSame(a, wellFormed); + } + Assert.assertTrue(wellFormed.isValidUncached(encoding)); + }); + } + + @Test + public void testAscii() { + testAscii(byteArray('a', '?'), byteArray('a', 0xff)); + testAscii(byteArray('a', '?'), byteArray('a', 0x80)); + testAscii(byteArray('a', '?', 'b'), byteArray('a', 0xff, 'b')); 
+ testAscii(byteArray('a', '?', 'b'), byteArray('a', 0x80, 'b')); + testAscii(byteArray('a', 0x7f, 'b'), byteArray('a', 0x7f, 'b')); + } + + @Test + public void testUTF8() { + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD), byteArray('a', 0xff)); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD), byteArray('a', 0xf0, 0x90)); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD), byteArray('a', 0xf0, 0x90, 0x80)); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xf0, 0x90, 0x80, 0x80), byteArray('a', 0xf0, 0x90, 0x80, 0xf0, 0x90, 0x80, 0x80)); + testUTF8(byteArray('a', 0xf0, 0x90, 0x80, 0x80, 0xEF, 0xBF, 0xBD), byteArray('a', 0xf0, 0x90, 0x80, 0x80, 0xf0, 0x90, 0x80)); + testUTF8(byteArray('a', 0xf0, 0x90, 0x80, 0x80), byteArray('a', 0xf0, 0x90, 0x80, 0x80)); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD), byteArray('a', 0xf8, 0x90)); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xff, 'b')); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xf0, 0x90, 'b')); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xf0, 0x90, 0x80, 'b')); + testUTF8(byteArray('a', 0xf0, 0x90, 0x80, 0x80, 'b'), byteArray('a', 0xf0, 0x90, 0x80, 0x80, 'b')); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xf0, 0x90, 0x80, 0x80, 'b'), byteArray('a', 0xf0, 0x90, 0x80, 0xf0, 0x90, 0x80, 0x80, 'b')); + testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xf8, 0x90, 'b')); + } + + private void testAscii(byte[] expected, byte[] input) { + testByteArray(expected, input, US_ASCII); + } + + private void testUTF8(byte[] expected, byte[] input) { + testByteArray(expected, input, UTF_8); + } + + private void testByteArray(byte[] expected, byte[] input, TruffleString.Encoding encoding) { + TruffleString wellFormed = node.execute(TruffleString.fromByteArrayUncached(input, encoding), encoding); + for (int i = 0; i < expected.length; i++) { + Assert.assertEquals(Byte.toUnsignedInt(expected[i]), wellFormed.readByteUncached(i, encoding)); + } + 
Assert.assertTrue(wellFormed.isValidUncached(encoding)); + } + + @Test + public void testUTF16() { + testUTF16("a\ufffd", "a\udfff"); + testUTF16("a\ufffd", "a\udbff"); + testUTF16("a\ufffd\ufffd", "a\udfff\udfff"); + testUTF16("a\ufffd\ufffd", "a\udbff\udbff"); + testUTF16("a\udbff\udfff\ufffd", "a\udbff\udfff\udbff"); + testUTF16("a\udbff\udfff\ufffdb", "a\udbff\udfff\udbffb"); + } + + private void testUTF16(String expected, String input) { + TruffleString wellFormed = node.execute(TruffleString.fromJavaStringUncached(input, UTF_16), UTF_16); + Assert.assertEquals(expected, wellFormed.toJavaStringUncached()); + Assert.assertTrue(wellFormed.isValidUncached(UTF_16)); + } + + @Test + public void testUTF32() { + testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Character.MIN_SURROGATE}); + testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Character.MAX_SURROGATE}); + testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Integer.MAX_VALUE}); + testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Integer.MIN_VALUE}); + testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', 0x110000}); + testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', 0xffff_ffff}); + testUTF32(new int[]{'a', Character.MAX_CODE_POINT}, new int[]{'a', Character.MAX_CODE_POINT}); + testUTF32(new int[]{'a', Character.MAX_CODE_POINT, 0xfffd}, new int[]{'a', Character.MAX_CODE_POINT, Character.MIN_SURROGATE}); + } + + private void testUTF32(int[] expected, int[] input) { + TruffleString wellFormed = node.execute(TruffleString.fromIntArrayUTF32Uncached(input), UTF_32); + for (int i = 0; i < expected.length; i++) { + Assert.assertEquals(expected[i], wellFormed.codePointAtIndexUncached(i, UTF_32)); + } + Assert.assertTrue(wellFormed.isValidUncached(UTF_32)); + } + + @Test + public void testNull() throws Exception { + expectNullPointerException(() -> node.execute(null, UTF_16)); + expectNullPointerException(() -> node.execute(S_UTF16, null)); + } +} diff --git a/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest 
b/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest index bd3d40410803..41387d60dff9 100644 --- a/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest +++ b/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest @@ -353,6 +353,7 @@ innr public abstract static SubstringByteIndexNode innr public abstract static SubstringNode innr public abstract static SwitchEncodingNode innr public abstract static ToJavaStringNode +innr public abstract static ToWellFormedStringNode innr public final static !enum CodeRange innr public final static !enum CompactionLevel innr public final static !enum Encoding @@ -362,6 +363,7 @@ innr public final static IllegalByteArrayLengthException innr public final static NumberFormatException innr public final static WithMask meth public com.oracle.truffle.api.strings.TruffleString asNativeUncached(com.oracle.truffle.api.strings.NativeAllocator,com.oracle.truffle.api.strings.TruffleString$Encoding,boolean,boolean) +meth public com.oracle.truffle.api.strings.TruffleString toWellFormedStringUncached(com.oracle.truffle.api.strings.TruffleString$Encoding) meth public static com.oracle.truffle.api.strings.TruffleString fromByteArrayUncached(byte[],com.oracle.truffle.api.strings.TruffleString$Encoding) meth public static com.oracle.truffle.api.strings.TruffleString fromByteArrayUncached(byte[],com.oracle.truffle.api.strings.TruffleString$Encoding,boolean) meth public static com.oracle.truffle.api.strings.TruffleString fromByteArrayUncached(byte[],int,int,com.oracle.truffle.api.strings.TruffleString$Encoding,boolean) @@ -985,6 +987,13 @@ meth public static com.oracle.truffle.api.strings.TruffleString$ToJavaStringNode meth public static com.oracle.truffle.api.strings.TruffleString$ToJavaStringNode getUncached() supr com.oracle.truffle.api.nodes.Node +CLSS public abstract static com.oracle.truffle.api.strings.TruffleString$ToWellFormedStringNode + outer com.oracle.truffle.api.strings.TruffleString +meth public abstract 
com.oracle.truffle.api.strings.TruffleString execute(com.oracle.truffle.api.strings.AbstractTruffleString,com.oracle.truffle.api.strings.TruffleString$Encoding) +meth public static com.oracle.truffle.api.strings.TruffleString$ToWellFormedStringNode create() +meth public static com.oracle.truffle.api.strings.TruffleString$ToWellFormedStringNode getUncached() +supr com.oracle.truffle.api.nodes.Node + CLSS public final static com.oracle.truffle.api.strings.TruffleString$WithMask outer com.oracle.truffle.api.strings.TruffleString innr public abstract static CreateNode @@ -1139,7 +1148,7 @@ CLSS public final com.oracle.truffle.api.strings.TruffleStringFactory cons public init() innr public final static WithMaskFactory supr java.lang.Object -hcls AsManagedNodeGen,AsNativeNodeGen,AsTruffleStringNodeGen,ByteIndexOfAnyByteNodeGen,ByteIndexOfCodePointNodeGen,ByteIndexOfCodePointSetNodeGen,ByteIndexOfStringNodeGen,ByteIndexToCodePointIndexNodeGen,ByteLengthOfCodePointNodeGen,CharIndexOfAnyCharUTF16NodeGen,CodePointAtByteIndexNodeGen,CodePointAtIndexNodeGen,CodePointIndexToByteIndexNodeGen,CodePointLengthNodeGen,CodeRangeEqualsNodeGen,CompareBytesNodeGen,CompareCharsUTF16NodeGen,CompareIntsUTF32NodeGen,ConcatNodeGen,CopyToByteArrayNodeGen,CopyToNativeMemoryNodeGen,CreateBackwardCodePointIteratorNodeGen,CreateCodePointIteratorNodeGen,EqualNodeGen,ForceEncodingNodeGen,FromByteArrayNodeGen,FromCharArrayUTF16NodeGen,FromCodePointNodeGen,FromIntArrayUTF32NodeGen,FromJavaStringNodeGen,FromLongNodeGen,FromNativePointerNodeGen,GetByteCodeRangeNodeGen,GetCodeRangeImpreciseNodeGen,GetCodeRangeNodeGen,GetInternalByteArrayNodeGen,GetInternalNativePointerNodeGen,GetStringCompactionLevelNodeGen,HashCodeNodeGen,IndexOfCodePointNodeGen,IndexOfStringNodeGen,IntIndexOfAnyIntUTF32NodeGen,InternalAsTruffleStringNodeGen,InternalCopyToByteArrayNodeGen,InternalSwitchEncodingNodeGen,IsValidNodeGen,LastByteIndexOfCodePointNodeGen,LastByteIndexOfStringNodeGen,LastIndexOfCodePointNodeGen,LastIndexOfStrin
gNodeGen,MaterializeNodeGen,ParseDoubleNodeGen,ParseIntNodeGen,ParseLongNodeGen,ReadByteNodeGen,ReadCharUTF16NodeGen,RegionEqualByteIndexNodeGen,RegionEqualNodeGen,RepeatNodeGen,SubstringByteIndexNodeGen,SubstringNodeGen,SwitchEncodingNodeGen,ToIndexableNodeGen,ToJavaStringNodeGen +hcls AsManagedNodeGen,AsNativeNodeGen,AsTruffleStringNodeGen,ByteIndexOfAnyByteNodeGen,ByteIndexOfCodePointNodeGen,ByteIndexOfCodePointSetNodeGen,ByteIndexOfStringNodeGen,ByteIndexToCodePointIndexNodeGen,ByteLengthOfCodePointNodeGen,CharIndexOfAnyCharUTF16NodeGen,CodePointAtByteIndexNodeGen,CodePointAtIndexNodeGen,CodePointIndexToByteIndexNodeGen,CodePointLengthNodeGen,CodeRangeEqualsNodeGen,CompareBytesNodeGen,CompareCharsUTF16NodeGen,CompareIntsUTF32NodeGen,ConcatNodeGen,CopyToByteArrayNodeGen,CopyToNativeMemoryNodeGen,CreateBackwardCodePointIteratorNodeGen,CreateCodePointIteratorNodeGen,EqualNodeGen,ForceEncodingNodeGen,FromByteArrayNodeGen,FromCharArrayUTF16NodeGen,FromCodePointNodeGen,FromIntArrayUTF32NodeGen,FromJavaStringNodeGen,FromLongNodeGen,FromNativePointerNodeGen,GetByteCodeRangeNodeGen,GetCodeRangeImpreciseNodeGen,GetCodeRangeNodeGen,GetInternalByteArrayNodeGen,GetInternalNativePointerNodeGen,GetStringCompactionLevelNodeGen,HashCodeNodeGen,IndexOfCodePointNodeGen,IndexOfStringNodeGen,IntIndexOfAnyIntUTF32NodeGen,InternalAsTruffleStringNodeGen,InternalCopyToByteArrayNodeGen,InternalSwitchEncodingNodeGen,IsValidNodeGen,LastByteIndexOfCodePointNodeGen,LastByteIndexOfStringNodeGen,LastIndexOfCodePointNodeGen,LastIndexOfStringNodeGen,MaterializeNodeGen,ParseDoubleNodeGen,ParseIntNodeGen,ParseLongNodeGen,ReadByteNodeGen,ReadCharUTF16NodeGen,RegionEqualByteIndexNodeGen,RegionEqualNodeGen,RepeatNodeGen,SubstringByteIndexNodeGen,SubstringNodeGen,SwitchEncodingNodeGen,ToIndexableNodeGen,ToJavaStringNodeGen,ToWellFormedStringNodeGen CLSS public final static com.oracle.truffle.api.strings.TruffleStringFactory$WithMaskFactory outer com.oracle.truffle.api.strings.TruffleStringFactory 
diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java index b34ec075d23e..77b8365b2990 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java @@ -382,6 +382,10 @@ final void invalidateCodePointLength() { codePointLength = -1; } + final boolean isCodePointLengthKnown() { + return codePointLength >= 0; + } + final void invalidateHashCode() { hashCode = 0; } diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/Encodings.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/Encodings.java index 6fa69b59bb32..d6f59b6645b7 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/Encodings.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/Encodings.java @@ -163,6 +163,10 @@ final class Encodings { static final byte UTF8_ACCEPT = 0; static final byte UTF8_REJECT = 12; static final byte UTF8_REVERSE_INCOMPLETE_SEQ = 24; + /** + * UTF-8 encoded 0xfffd. + */ + static final byte[] CONVERSION_REPLACEMENT_UTF_8 = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD}; static byte[] getUTF8DecodingStateMachine(DecodingErrorHandler errorHandler) { return errorHandler == DecodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8 ? 
Encodings.UTF_8_STATE_MACHINE_ALLOW_UTF16_SURROGATES : Encodings.UTF_8_STATE_MACHINE; diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/JCodingsImpl.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/JCodingsImpl.java index 79a9fff04f44..662aeac67f8c 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/JCodingsImpl.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/JCodingsImpl.java @@ -282,7 +282,6 @@ private static void econvInsertOutput(TruffleString.Encoding targetEncoding, Enc } private static final byte[] CONVERSION_REPLACEMENT = {'?'}; - private static final byte[] CONVERSION_REPLACEMENT_UTF_8 = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD}; private static final byte[] CONVERSION_REPLACEMENT_UTF_16 = TStringGuards.littleEndian() ? new byte[]{(byte) 0xFD, (byte) 0xFF} : new byte[]{(byte) 0xFF, (byte) 0xFD}; private static final byte[] CONVERSION_REPLACEMENT_UTF_32 = TStringGuards.littleEndian() ? 
new byte[]{(byte) 0xFD, (byte) 0xFF, 0, 0} : new byte[]{0, 0, (byte) 0xFF, (byte) 0xFD}; @@ -318,7 +317,7 @@ public TruffleString transcode(Node location, AbstractTruffleString a, Object ar } else { final byte[] replacement; if (isUTF8(targetEncoding)) { - replacement = CONVERSION_REPLACEMENT_UTF_8; + replacement = Encodings.CONVERSION_REPLACEMENT_UTF_8; } else if (isUTF16(targetEncoding)) { replacement = CONVERSION_REPLACEMENT_UTF_16; } else if (isUTF32(targetEncoding)) { diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java index 5aec5d1bc171..f84209da3627 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java @@ -43,8 +43,11 @@ import static com.oracle.truffle.api.strings.AbstractTruffleString.checkArrayRange; import static com.oracle.truffle.api.strings.AbstractTruffleString.checkByteLengthUTF16; import static com.oracle.truffle.api.strings.AbstractTruffleString.checkByteLengthUTF32; +import static com.oracle.truffle.api.strings.Encodings.UTF8_ACCEPT; +import static com.oracle.truffle.api.strings.Encodings.UTF8_REJECT; import static com.oracle.truffle.api.strings.Encodings.isUTF16Surrogate; import static com.oracle.truffle.api.strings.TSCodeRange.isBroken; +import static com.oracle.truffle.api.strings.TSCodeRange.isPrecise; import static com.oracle.truffle.api.strings.TStringGuards.indexOfCannotMatch; import static com.oracle.truffle.api.strings.TStringGuards.is16Bit; import static com.oracle.truffle.api.strings.TStringGuards.is7Bit; @@ -54,9 +57,9 @@ import static com.oracle.truffle.api.strings.TStringGuards.isAsciiBytesOrLatin1; import static com.oracle.truffle.api.strings.TStringGuards.isBrokenFixedWidth; import static 
com.oracle.truffle.api.strings.TStringGuards.isBrokenMultiByte; +import static com.oracle.truffle.api.strings.TStringGuards.isBuiltin; import static com.oracle.truffle.api.strings.TStringGuards.isBytes; import static com.oracle.truffle.api.strings.TStringGuards.isFixedWidth; -import static com.oracle.truffle.api.strings.TStringGuards.isBuiltin; import static com.oracle.truffle.api.strings.TStringGuards.isStride0; import static com.oracle.truffle.api.strings.TStringGuards.isStride1; import static com.oracle.truffle.api.strings.TStringGuards.isStride2; @@ -95,6 +98,10 @@ final class TStringInternalNodes { + /** + * Gets a string's code range with enough precision to decide whether the code range makes the + * string fixed-width. + */ abstract static class GetCodeRangeForIndexCalculationNode extends AbstractInternalNode { abstract int execute(Node node, AbstractTruffleString a, Encoding encoding); @@ -112,6 +119,26 @@ static int get(Node node, AbstractTruffleString a, Encoding encoding, } } + /** + * Gets a string's code range with enough precision to decide whether the string is valid. 
+ */ + abstract static class GetValidOrBrokenCodeRangeNode extends AbstractInternalNode { + + abstract int execute(Node node, AbstractTruffleString a, Encoding encoding); + + @Specialization + static int get(Node node, AbstractTruffleString a, Encoding encoding, + @Cached InlinedConditionProfile impreciseProfile, + @Cached TruffleString.ToIndexableNode toIndexableNode, + @Cached CalcStringAttributesNode calcStringAttributesNode) { + int codeRange = a.codeRange(); + if (impreciseProfile.profile(node, !TSCodeRange.isPrecise(codeRange) && TSCodeRange.isBroken(codeRange))) { + return StringAttributes.getCodeRange(updateAttributes(node, a, encoding, codeRange, toIndexableNode, calcStringAttributesNode)); + } + return codeRange; + } + } + abstract static class GetPreciseCodeRangeNode extends AbstractInternalNode { abstract int execute(Node node, AbstractTruffleString a, Encoding encoding); @@ -1578,7 +1605,173 @@ static String createJavaString(Node node, AbstractTruffleString a, Object arrayA } return TStringUnsafe.createJavaString(bytes, stride); } + } + + abstract static class ToWellFormedStringNode extends AbstractInternalNode { + + private static final int[] UTF_32_ASTRAL_RANGE = {0x10000, 0x10ffff}; + private static final int[] UTF_32_INVALID_RANGES = {Character.MIN_SURROGATE, Character.MAX_SURROGATE, 0x11_0000, 0xffff_ffff}; + + abstract TruffleString execute(Node node, AbstractTruffleString a, Object arrayA, Encoding encoding); + + @Specialization(guards = "isAscii(encoding)") + static TruffleString ascii(Node node, AbstractTruffleString a, Object arrayA, @SuppressWarnings("unused") Encoding encoding) { + assert isStride0(a); + int length = a.length(); + byte[] array = TStringOps.arraycopyOfWithStride(node, arrayA, a.offset(), length, 0, length, 0); + int pos = 0; + int loopCount = 0; + while (pos < length) { + pos = TStringOps.indexOfCodePointWithMaskWithStrideIntl(node, array, 0, length, 0, pos, 0xff, 0x7f); + if (pos >= 0) { + TStringOps.writeToByteArray(array, 
0, pos++, '?'); + } else { + break; + } + TStringConstants.truffleSafePointPoll(node, ++loopCount); + } + return TruffleString.createFromByteArray(array, length, 0, Encoding.US_ASCII, length, TSCodeRange.get7Bit()); + } + + @Specialization(guards = "isUTF8(encoding)") + static TruffleString utf8(Node node, AbstractTruffleString a, Object arrayA, @SuppressWarnings("unused") Encoding encoding, + @Cached InlinedBranchProfile outOfMemoryProfile) { + assert isStride0(a); + assert isPrecise(a.codeRange()); + assert a.isCodePointLengthKnown(); + + boolean isLarge = TransCodeIntlNode.isLarge(a.codePointLength()); + byte[] buffer = new byte[isLarge ? TStringConstants.MAX_ARRAY_SIZE : a.codePointLength() * 4]; + int length = 0; + int state = UTF8_ACCEPT; + int lastCodePointPos = 0; + int lastErrorPos = 0; + int codePointLength = a.codePointLength(); + byte[] stateMachine = Encodings.UTF_8_STATE_MACHINE; + int i = 0; + while (i < a.length()) { + int b = readS0(a, arrayA, i++); + int type = stateMachine[b]; + state = stateMachine[256 + state + type]; + if (state == UTF8_ACCEPT) { + lastCodePointPos = i; + } else if (state == UTF8_REJECT) { + int curCPLength = i - (lastCodePointPos + 1); /* number of bytes consumed before the rejecting byte */ + length = utf8CopyValidRegion(node, a, arrayA, outOfMemoryProfile, isLarge, buffer, length, lastCodePointPos, lastErrorPos); + System.arraycopy(Encodings.CONVERSION_REPLACEMENT_UTF_8, 0, buffer, length, Encodings.CONVERSION_REPLACEMENT_UTF_8.length); + length += Encodings.CONVERSION_REPLACEMENT_UTF_8.length; + state = UTF8_ACCEPT; + if (curCPLength > 0) { /* the rejecting byte only terminated a broken sequence: rewind so it is decoded again instead of being swallowed */ + codePointLength -= curCPLength - 1; + i--; + } + lastErrorPos = i; + lastCodePointPos = i; + } + TStringConstants.truffleSafePointPoll(node, i); + } + length = utf8CopyValidRegion(node, a, arrayA, outOfMemoryProfile, isLarge, buffer, length, lastCodePointPos, lastErrorPos); + if (lastCodePointPos != a.length()) { /* input ends in an incomplete sequence, which always needs a replacement character */ + System.arraycopy(Encodings.CONVERSION_REPLACEMENT_UTF_8, 0, buffer, length, 
Encodings.CONVERSION_REPLACEMENT_UTF_8.length); + length += Encodings.CONVERSION_REPLACEMENT_UTF_8.length; + int curCPLength = a.length() - lastCodePointPos; + if (curCPLength > 1) { + codePointLength -= curCPLength - 1; + } + } + return TruffleString.createFromByteArray(Arrays.copyOf(buffer, length), length, 0, Encoding.UTF_8, codePointLength, TSCodeRange.getValidMultiByte()); + } + + private static int utf8CopyValidRegion(Node node, AbstractTruffleString a, Object arrayA, + InlinedBranchProfile outOfMemoryProfile, boolean isLarge, byte[] buffer, int length, int lastCodePointPos, int lastErrorPos) { + int lengthCPY = lastCodePointPos - lastErrorPos; + if (isLarge && Integer.compareUnsigned(length + lengthCPY + Encodings.CONVERSION_REPLACEMENT_UTF_8.length, buffer.length) > 0) { + outOfMemoryProfile.enter(node); + throw InternalErrors.outOfMemory(); + } + TStringOps.arraycopyWithStride(node, arrayA, a.offset(), 0, lastErrorPos, buffer, 0, 0, length, lengthCPY); + return length + lengthCPY; + } + + @Specialization(guards = "isUTF16(encoding)") + static TruffleString utf16(Node node, AbstractTruffleString a, Object arrayA, @SuppressWarnings("unused") Encoding encoding) { + assert isStride1(a); + int length = a.length(); + byte[] array = TStringOps.arraycopyOfWithStride(node, arrayA, a.offset(), length, 1, length, 1); + int pos = 0; + int codeRange = TSCodeRange.get16Bit(); + int loopCount = 0; + while (true) { + pos = TStringOps.indexOfCodePointWithMaskWithStrideIntl(node, array, 0, length, 1, pos, 0xdfff, 0x7ff); + if (pos >= 0) { + boolean invalid = true; + if (pos != length - 1) { + char c = (char) TStringOps.readFromByteArray(array, 1, pos); + assert Encodings.isUTF16Surrogate(c); + if (!Encodings.isUTF16LowSurrogate(c)) { + assert Encodings.isUTF16HighSurrogate(c); + if (Encodings.isUTF16LowSurrogate((char) TStringOps.readFromByteArray(array, 1, pos + 1))) { + invalid = false; + codeRange = TSCodeRange.getValidMultiByte(); + pos++; + } + } + } + if (invalid) { + 
+ TStringOps.writeToByteArray(array, 1, pos, 0xfffd); + } + if (++pos == length) { + break; + } + } else { + break; + } + TStringConstants.truffleSafePointPoll(node, ++loopCount); + } + return TruffleString.createFromByteArray(array, length, 1, Encoding.UTF_16, a.codePointLength(), codeRange); + } + + @Specialization(guards = "isUTF32(encoding)") + static TruffleString utf32(Node node, AbstractTruffleString a, Object arrayA, @SuppressWarnings("unused") Encoding encoding, + @Cached InlinedConditionProfile strideProfile) { + assert isStride2(a); + int length = a.length(); + final byte[] array; + final int stride; + final int codeRange; + if (strideProfile.profile(node, TStringOps.indexOfAnyIntRange(node, arrayA, a.offset(), 2, 0, a.length(), UTF_32_ASTRAL_RANGE) < 0)) { /* scan the same region that is copied below, i.e. starting at a.offset() */ + array = TStringOps.arraycopyOfWithStride(node, arrayA, a.offset(), length, 2, length, 1); + stride = 1; + codeRange = TSCodeRange.get16Bit(); + utf32ReplaceInvalid(node, arrayA, a.offset(), length, array, 1); + } else { + array = TStringOps.arraycopyOfWithStride(node, arrayA, a.offset(), length, 2, length, 2); + stride = 2; + codeRange = TSCodeRange.getValidFixedWidth(); + utf32ReplaceInvalid(node, arrayA, a.offset(), length, array, 2); + } + return TruffleString.createFromByteArray(array, length, stride, Encoding.UTF_32, a.codePointLength(), codeRange); + } + + private static void utf32ReplaceInvalid(Node node, Object arrayA, int offsetA, int length, byte[] array, int stride) { /* finds invalid values in the original array (at offsetA) and overwrites them with U+FFFD in the copy */ + int pos = 0; + int loopCount = 0; + while (pos < length) { + pos = TStringOps.indexOfAnyIntRange(node, arrayA, offsetA, 2, pos, length, UTF_32_INVALID_RANGES); + if (pos >= 0) { + TStringOps.writeToByteArray(array, stride, pos++, 0xfffd); + } else { + break; + } + TStringConstants.truffleSafePointPoll(node, ++loopCount); + } + } + @SuppressWarnings("unused") + @Specialization(guards = "isUnsupportedEncoding(encoding)") + static TruffleString unsupported(Node node, AbstractTruffleString a, Object arrayA, Encoding encoding) { + throw InternalErrors.unsupportedOperation(); + } } 
abstract static class TransCodeNode extends AbstractInternalNode { diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringOps.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringOps.java index 022b20d2fd54..66fe6ccfe47d 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringOps.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringOps.java @@ -398,7 +398,7 @@ static int indexOfCodePointWithOrMaskWithStride(Node location, AbstractTruffleSt return indexOfCodePointWithMaskWithStrideIntl(location, arrayA, a.offset(), toIndex, strideA, fromIndex, codepoint, maskA); } - private static int indexOfCodePointWithMaskWithStrideIntl(Node location, Object array, int offset, int length, int stride, int fromIndex, int v1, int mask1) { + static int indexOfCodePointWithMaskWithStrideIntl(Node location, Object array, int offset, int length, int stride, int fromIndex, int v1, int mask1) { final boolean isNative = isNativePointer(array); final byte[] stubArray = stubArray(array, isNative); validateRegionIndex(stubArray, offset, length, stride, fromIndex, isNative); diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java index 3ac56c79a0bd..b10a792567c9 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java @@ -2161,7 +2161,7 @@ final TruffleString doNonEmpty(int[] value, int intOffset, int length, if (length == 0) { return Encoding.UTF_32.getEmpty(); } - if (length == 1 && value[intOffset] <= 0xff) { + if (length == 1 && Integer.compareUnsigned(value[intOffset], 0xff) <= 0) { return 
TStringConstants.getSingleByte(Encoding.UTF_32, value[intOffset]); } int offsetV = intOffset << 2; @@ -2794,9 +2794,9 @@ public abstract static class IsValidNode extends AbstractPublicNode { @Specialization final boolean isValid(AbstractTruffleString a, Encoding expectedEncoding, - @Cached TStringInternalNodes.GetPreciseCodeRangeNode getPreciseCodeRangeNode) { + @Cached TStringInternalNodes.GetValidOrBrokenCodeRangeNode getCodeRangeNode) { a.checkEncoding(expectedEncoding); - return !isBroken(getPreciseCodeRangeNode.execute(this, a, expectedEncoding)); + return !isBroken(getCodeRangeNode.execute(this, a, expectedEncoding)); } /** @@ -6151,6 +6151,72 @@ public TruffleString asNativeUncached(NativeAllocator allocator, Encoding expect return AsNativeNode.getUncached().execute(this, allocator, expectedEncoding, useCompaction, cacheResult); } + /** + * Node to replace all invalid bytes in a given string, such that the resulting string is + * encoded correctly in the given encoding. See + * {@link #execute(AbstractTruffleString, TruffleString.Encoding)} for details. + * + * @since 23.1 + */ + public abstract static class ToWellFormedStringNode extends AbstractPublicNode { + + ToWellFormedStringNode() { + } + + /** + * Returns a version of string {@code a} that is encoded correctly in the given + * encoding, which may be the string itself or a converted version. Invalid byte sequences + * are replaced with {@code '\ufffd'} (for UTF-*) or {@code '?'}. 
+ * + * @since 23.1 + */ + public abstract TruffleString execute(AbstractTruffleString a, Encoding expectedEncoding); + + @Specialization + final TruffleString toWellFormed(AbstractTruffleString a, Encoding encoding, + @Cached InlinedConditionProfile isValidProfile, + @Cached TStringInternalNodes.GetValidOrBrokenCodeRangeNode getCodeRangeNode, + @Cached InternalAsTruffleStringNode asTruffleStringNode, + @Cached TStringInternalNodes.ToWellFormedStringNode internalNode, + @Cached ToIndexableNode toIndexableNode) { + a.checkEncoding(encoding); + int codeRangeA = getCodeRangeNode.execute(this, a, encoding); + if (isValidProfile.profile(this, !isBroken(codeRangeA))) { + return asTruffleStringNode.execute(this, a, encoding); + } + return internalNode.execute(this, a, toIndexableNode.execute(this, a, a.data()), encoding); + } + + /** + * Create a new {@link SwitchEncodingNode}. + * + * @since 23.1 + */ + @NeverDefault + public static ToWellFormedStringNode create() { + return TruffleStringFactory.ToWellFormedStringNodeGen.create(); + } + + /** + * Get the uncached version of {@link SwitchEncodingNode}. + * + * @since 23.1 + */ + public static ToWellFormedStringNode getUncached() { + return TruffleStringFactory.ToWellFormedStringNodeGen.getUncached(); + } + } + + /** + * Shorthand for calling the uncached version of {@link TruffleString.ToWellFormedStringNode}. + * + * @since 23.1 + */ + @TruffleBoundary + public TruffleString toWellFormedStringUncached(Encoding expectedEncoding) { + return ToWellFormedStringNode.getUncached().execute(this, expectedEncoding); + } + /** * Node to get a given string in a specific encoding. See * {@link #execute(AbstractTruffleString, TruffleString.Encoding)} for details. 
@@ -6217,7 +6283,7 @@ public static SwitchEncodingNode getUncached() { abstract static class InternalSwitchEncodingNode extends AbstractInternalNode { - public abstract TruffleString execute(Node node, AbstractTruffleString a, Encoding targetEncoding, TranscodingErrorHandler errorHandler); + abstract TruffleString execute(Node node, AbstractTruffleString a, Encoding targetEncoding, TranscodingErrorHandler errorHandler); @Specialization(guards = "a.isCompatibleToIntl(targetEncoding)") static TruffleString compatibleImmutable(TruffleString a, @SuppressWarnings("unused") Encoding targetEncoding, @SuppressWarnings("unused") TranscodingErrorHandler errorHandler) { @@ -6287,7 +6353,6 @@ static TruffleString transCodeMutable(Node node, MutableTruffleString a, Encodin return transCodeNode.execute(node, a, a.data(), codePointLengthA, codeRangeA, targetEncoding, errorHandler); } } - } /** From a7d0905cac8fb869b3f18997f4f297779827b96c Mon Sep 17 00:00:00 2001 From: Josef Haider Date: Thu, 20 Jul 2023 14:54:23 +0200 Subject: [PATCH 02/14] TruffleString: fix NPE in AbstractTruffleString.equals() --- .../api/strings/test/TStringCornerCaseTests.java | 11 +++++++++++ .../truffle/api/strings/AbstractTruffleString.java | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringCornerCaseTests.java b/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringCornerCaseTests.java index efcf28abf88c..2c2859140a4a 100644 --- a/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringCornerCaseTests.java +++ b/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/TStringCornerCaseTests.java @@ -44,6 +44,7 @@ import static com.oracle.truffle.api.strings.test.TStringTestUtil.byteArray; import java.nio.charset.StandardCharsets; +import java.util.Arrays; import org.junit.Assert; 
import org.junit.Test; @@ -117,4 +118,14 @@ public void testMutableImpreciseCodeRange() { Assert.assertTrue(a.isCompatibleToUncached(TruffleString.Encoding.BYTES)); Assert.assertEquals(TruffleString.CodeRange.VALID, a.getCodeRangeImpreciseUncached(TruffleString.Encoding.BYTES)); } + + @Test + public void testSafePointPollInObjectEquals() { + char[] chars = new char[2000000]; + Arrays.fill(chars, 'a'); + String s = new String(chars); + TruffleString t1 = TruffleString.fromConstant(s, TruffleString.Encoding.UTF_16); + TruffleString t2 = TruffleString.fromConstant(s, TruffleString.Encoding.UTF_16); + Assert.assertEquals(t1, t2); + } } diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java index 77b8365b2990..c62d0131a77a 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java @@ -1302,7 +1302,7 @@ public final boolean equals(Object obj) { return false; } } - return TruffleString.EqualNode.checkContentEquals(null, this, b, + return TruffleString.EqualNode.checkContentEquals(TruffleString.EqualNode.getUncached(), this, b, ToIndexableNodeGen.getUncached(), ToIndexableNodeGen.getUncached(), InlinedConditionProfile.getUncached(), @@ -1432,7 +1432,7 @@ private static void flatten(Node location, TruffleString src, int srcBegin, int @TruffleBoundary private static void copy(Node location, TruffleString src, byte[] dst, int dstFrom, int dstStride) { - Object arrayA = ToIndexableNodeGen.getUncached().execute(null, src, src.data()); + Object arrayA = ToIndexableNodeGen.getUncached().execute(location, src, src.data()); TStringOps.arraycopyWithStride(location, arrayA, src.offset(), src.stride(), 0, dst, 0, dstStride, dstFrom, src.length()); 
From 84c147d7fc6437463a0e0b35aa03ad40aff22c9a Mon Sep 17 00:00:00 2001 From: Josef Haider Date: Fri, 21 Jul 2023 08:44:37 +0200 Subject: [PATCH 03/14] TruffleString: clarify docs --- truffle/CHANGELOG.md | 2 +- .../com/oracle/truffle/api/strings/TruffleString.java | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/truffle/CHANGELOG.md b/truffle/CHANGELOG.md index 54721eadc48c..97c29bfec007 100644 --- a/truffle/CHANGELOG.md +++ b/truffle/CHANGELOG.md @@ -32,7 +32,7 @@ This changelog summarizes major changes between Truffle versions relevant to lan * GR-44420 Added `TruffleLanguage.finalizeThread(Object, Thread)` to allow languages run finalization hooks for initialized threads before the context is disposed. * GR-45923 Added `EventBinding.tryAttach()` to try to attach a binding, if not disposed or attached already. * GR-39571 Added `TranscodingErrorHandler` to `TruffleString.SwitchEncodingNode`. -* GR-44464 Added `TruffleString.ToWellFormedStringNode`. +* GR-44464 Added `TruffleString.ToWellFormedStringNode` for encoding-level string sanitization. ## Version 23.0.0 diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java index b10a792567c9..b44b65f4debb 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java @@ -6153,7 +6153,7 @@ public TruffleString asNativeUncached(NativeAllocator allocator, Encoding expect /** * Node to replace all invalid bytes in a given string, such that the resulting string is - * encoded correctly in the given encoding. See + * {@link IsValidNode valid}. See * {@link #execute(AbstractTruffleString, TruffleString.Encoding)} for details. 
* * @since 23.1 @@ -6164,9 +6164,11 @@ public abstract static class ToWellFormedStringNode extends AbstractPublicNode { } /** - * Returns a version of string {@code a} that is encoded correctly in the given - * encoding, which may be the string itself or a converted version. Invalid byte sequences - * are replaced with {@code '\ufffd'} (for UTF-*) or {@code '?'}. + * Returns a version of string {@code a} that contains only valid codepoints, which may be + * the string itself or a converted version. Invalid byte sequences are replaced with + * {@code '\ufffd'} (for UTF-*) or {@code '?'}. This is useful for string sanitization in + * all use cases where a string is required to actually be {@link IsValidNode valid}, such + * as libraries that actively reject broken input, network and file system I/O, etc. * * @since 23.1 */ From d48321f8cc608e3f18f056b6233d10b9c15ea996 Mon Sep 17 00:00:00 2001 From: Josef Haider Date: Fri, 11 Aug 2023 11:38:50 +0200 Subject: [PATCH 04/14] TruffleStrings: Rename ToWellFormedStringNode to ToValidStringNode. 
--- ...est.java => TStringToValidStringTest.java} | 8 +++--- .../snapshot.sigtest | 12 ++++----- .../api/strings/AbstractTruffleString.java | 10 +++++++ .../api/strings/TStringInternalNodes.java | 2 +- .../truffle/api/strings/TruffleString.java | 26 ++++++------------- 5 files changed, 29 insertions(+), 29 deletions(-) rename truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/{TStringToWellFormedStringTest.java => TStringToValidStringTest.java} (95%) diff --git a/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java b/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToValidStringTest.java similarity index 95% rename from truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java rename to truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToValidStringTest.java index 890e46bc090f..2bbd8dae9ea2 100644 --- a/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToWellFormedStringTest.java +++ b/truffle/src/com.oracle.truffle.api.strings.test/src/com/oracle/truffle/api/strings/test/ops/TStringToValidStringTest.java @@ -62,13 +62,13 @@ import com.oracle.truffle.api.strings.test.TStringTestBase; @RunWith(Parameterized.class) -public class TStringToWellFormedStringTest extends TStringTestBase { +public class TStringToValidStringTest extends TStringTestBase { - @Parameter public TruffleString.ToWellFormedStringNode node; + @Parameter public TruffleString.ToValidStringNode node; @Parameters(name = "{0}") - public static Iterable data() { - return Arrays.asList(TruffleString.ToWellFormedStringNode.create(), TruffleString.ToWellFormedStringNode.getUncached()); + public static Iterable data() { + return 
Arrays.asList(TruffleString.ToValidStringNode.create(), TruffleString.ToValidStringNode.getUncached()); } @Test diff --git a/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest b/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest index 41387d60dff9..04d3b179b977 100644 --- a/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest +++ b/truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest @@ -79,6 +79,7 @@ hfds GIL_LOCK,PARENT_LIMIT,SAME_LANGUAGE_CHECK_VISITOR,parent CLSS public abstract interface com.oracle.truffle.api.nodes.NodeInterface CLSS public abstract com.oracle.truffle.api.strings.AbstractTruffleString +meth public com.oracle.truffle.api.strings.TruffleString toValidStringUncached(com.oracle.truffle.api.strings.TruffleString$Encoding) meth public final boolean codeRangeEqualsUncached(com.oracle.truffle.api.strings.TruffleString$CodeRange) meth public final boolean equals(java.lang.Object) meth public final boolean equalsUncached(com.oracle.truffle.api.strings.AbstractTruffleString,com.oracle.truffle.api.strings.TruffleString$Encoding) @@ -353,7 +354,7 @@ innr public abstract static SubstringByteIndexNode innr public abstract static SubstringNode innr public abstract static SwitchEncodingNode innr public abstract static ToJavaStringNode -innr public abstract static ToWellFormedStringNode +innr public abstract static ToValidStringNode innr public final static !enum CodeRange innr public final static !enum CompactionLevel innr public final static !enum Encoding @@ -363,7 +364,6 @@ innr public final static IllegalByteArrayLengthException innr public final static NumberFormatException innr public final static WithMask meth public com.oracle.truffle.api.strings.TruffleString asNativeUncached(com.oracle.truffle.api.strings.NativeAllocator,com.oracle.truffle.api.strings.TruffleString$Encoding,boolean,boolean) -meth public com.oracle.truffle.api.strings.TruffleString 
toWellFormedStringUncached(com.oracle.truffle.api.strings.TruffleString$Encoding) meth public static com.oracle.truffle.api.strings.TruffleString fromByteArrayUncached(byte[],com.oracle.truffle.api.strings.TruffleString$Encoding) meth public static com.oracle.truffle.api.strings.TruffleString fromByteArrayUncached(byte[],com.oracle.truffle.api.strings.TruffleString$Encoding,boolean) meth public static com.oracle.truffle.api.strings.TruffleString fromByteArrayUncached(byte[],int,int,com.oracle.truffle.api.strings.TruffleString$Encoding,boolean) @@ -987,11 +987,11 @@ meth public static com.oracle.truffle.api.strings.TruffleString$ToJavaStringNode meth public static com.oracle.truffle.api.strings.TruffleString$ToJavaStringNode getUncached() supr com.oracle.truffle.api.nodes.Node -CLSS public abstract static com.oracle.truffle.api.strings.TruffleString$ToWellFormedStringNode +CLSS public abstract static com.oracle.truffle.api.strings.TruffleString$ToValidStringNode outer com.oracle.truffle.api.strings.TruffleString meth public abstract com.oracle.truffle.api.strings.TruffleString execute(com.oracle.truffle.api.strings.AbstractTruffleString,com.oracle.truffle.api.strings.TruffleString$Encoding) -meth public static com.oracle.truffle.api.strings.TruffleString$ToWellFormedStringNode create() -meth public static com.oracle.truffle.api.strings.TruffleString$ToWellFormedStringNode getUncached() +meth public static com.oracle.truffle.api.strings.TruffleString$ToValidStringNode create() +meth public static com.oracle.truffle.api.strings.TruffleString$ToValidStringNode getUncached() supr com.oracle.truffle.api.nodes.Node CLSS public final static com.oracle.truffle.api.strings.TruffleString$WithMask @@ -1148,7 +1148,7 @@ CLSS public final com.oracle.truffle.api.strings.TruffleStringFactory cons public init() innr public final static WithMaskFactory supr java.lang.Object -hcls 
AsManagedNodeGen,AsNativeNodeGen,AsTruffleStringNodeGen,ByteIndexOfAnyByteNodeGen,ByteIndexOfCodePointNodeGen,ByteIndexOfCodePointSetNodeGen,ByteIndexOfStringNodeGen,ByteIndexToCodePointIndexNodeGen,ByteLengthOfCodePointNodeGen,CharIndexOfAnyCharUTF16NodeGen,CodePointAtByteIndexNodeGen,CodePointAtIndexNodeGen,CodePointIndexToByteIndexNodeGen,CodePointLengthNodeGen,CodeRangeEqualsNodeGen,CompareBytesNodeGen,CompareCharsUTF16NodeGen,CompareIntsUTF32NodeGen,ConcatNodeGen,CopyToByteArrayNodeGen,CopyToNativeMemoryNodeGen,CreateBackwardCodePointIteratorNodeGen,CreateCodePointIteratorNodeGen,EqualNodeGen,ForceEncodingNodeGen,FromByteArrayNodeGen,FromCharArrayUTF16NodeGen,FromCodePointNodeGen,FromIntArrayUTF32NodeGen,FromJavaStringNodeGen,FromLongNodeGen,FromNativePointerNodeGen,GetByteCodeRangeNodeGen,GetCodeRangeImpreciseNodeGen,GetCodeRangeNodeGen,GetInternalByteArrayNodeGen,GetInternalNativePointerNodeGen,GetStringCompactionLevelNodeGen,HashCodeNodeGen,IndexOfCodePointNodeGen,IndexOfStringNodeGen,IntIndexOfAnyIntUTF32NodeGen,InternalAsTruffleStringNodeGen,InternalCopyToByteArrayNodeGen,InternalSwitchEncodingNodeGen,IsValidNodeGen,LastByteIndexOfCodePointNodeGen,LastByteIndexOfStringNodeGen,LastIndexOfCodePointNodeGen,LastIndexOfStringNodeGen,MaterializeNodeGen,ParseDoubleNodeGen,ParseIntNodeGen,ParseLongNodeGen,ReadByteNodeGen,ReadCharUTF16NodeGen,RegionEqualByteIndexNodeGen,RegionEqualNodeGen,RepeatNodeGen,SubstringByteIndexNodeGen,SubstringNodeGen,SwitchEncodingNodeGen,ToIndexableNodeGen,ToJavaStringNodeGen,ToWellFormedStringNodeGen +hcls 
AsManagedNodeGen,AsNativeNodeGen,AsTruffleStringNodeGen,ByteIndexOfAnyByteNodeGen,ByteIndexOfCodePointNodeGen,ByteIndexOfCodePointSetNodeGen,ByteIndexOfStringNodeGen,ByteIndexToCodePointIndexNodeGen,ByteLengthOfCodePointNodeGen,CharIndexOfAnyCharUTF16NodeGen,CodePointAtByteIndexNodeGen,CodePointAtIndexNodeGen,CodePointIndexToByteIndexNodeGen,CodePointLengthNodeGen,CodeRangeEqualsNodeGen,CompareBytesNodeGen,CompareCharsUTF16NodeGen,CompareIntsUTF32NodeGen,ConcatNodeGen,CopyToByteArrayNodeGen,CopyToNativeMemoryNodeGen,CreateBackwardCodePointIteratorNodeGen,CreateCodePointIteratorNodeGen,EqualNodeGen,ForceEncodingNodeGen,FromByteArrayNodeGen,FromCharArrayUTF16NodeGen,FromCodePointNodeGen,FromIntArrayUTF32NodeGen,FromJavaStringNodeGen,FromLongNodeGen,FromNativePointerNodeGen,GetByteCodeRangeNodeGen,GetCodeRangeImpreciseNodeGen,GetCodeRangeNodeGen,GetInternalByteArrayNodeGen,GetInternalNativePointerNodeGen,GetStringCompactionLevelNodeGen,HashCodeNodeGen,IndexOfCodePointNodeGen,IndexOfStringNodeGen,IntIndexOfAnyIntUTF32NodeGen,InternalAsTruffleStringNodeGen,InternalCopyToByteArrayNodeGen,InternalSwitchEncodingNodeGen,IsValidNodeGen,LastByteIndexOfCodePointNodeGen,LastByteIndexOfStringNodeGen,LastIndexOfCodePointNodeGen,LastIndexOfStringNodeGen,MaterializeNodeGen,ParseDoubleNodeGen,ParseIntNodeGen,ParseLongNodeGen,ReadByteNodeGen,ReadCharUTF16NodeGen,RegionEqualByteIndexNodeGen,RegionEqualNodeGen,RepeatNodeGen,SubstringByteIndexNodeGen,SubstringNodeGen,SwitchEncodingNodeGen,ToIndexableNodeGen,ToJavaStringNodeGen,ToValidStringNodeGen CLSS public final static com.oracle.truffle.api.strings.TruffleStringFactory$WithMaskFactory outer com.oracle.truffle.api.strings.TruffleStringFactory diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java index c62d0131a77a..232daa373efd 100644 --- 
a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/AbstractTruffleString.java @@ -1211,6 +1211,16 @@ public final void copyToNativeMemoryUncached(int byteFromIndexA, Object pointerO TruffleString.CopyToNativeMemoryNode.getUncached().execute(this, byteFromIndexA, pointerObject, byteFromIndexDst, byteLength, expectedEncoding); } + /** + * Shorthand for calling the uncached version of {@link TruffleString.ToValidStringNode}. + * + * @since 23.1 + */ + @TruffleBoundary + public TruffleString toValidStringUncached(Encoding expectedEncoding) { + return TruffleString.ToValidStringNode.getUncached().execute(this, expectedEncoding); + } + /** * Shorthand for calling the uncached version of {@link TruffleString.ToJavaStringNode}. * diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java index f84209da3627..d1562cbb12ae 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TStringInternalNodes.java @@ -1607,7 +1607,7 @@ static String createJavaString(Node node, AbstractTruffleString a, Object arrayA } } - abstract static class ToWellFormedStringNode extends AbstractInternalNode { + abstract static class ToValidStringNode extends AbstractInternalNode { private static final int[] UTF_32_ASTRAL_RANGE = {0x10000, 0x10ffff}; private static final int[] UTF_32_INVALID_RANGES = {Character.MIN_SURROGATE, Character.MAX_SURROGATE, 0x11_0000, 0xffff_ffff}; diff --git a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java 
b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java index b44b65f4debb..8809519c22c5 100644 --- a/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java +++ b/truffle/src/com.oracle.truffle.api.strings/src/com/oracle/truffle/api/strings/TruffleString.java @@ -6158,9 +6158,9 @@ public TruffleString asNativeUncached(NativeAllocator allocator, Encoding expect * * @since 23.1 */ - public abstract static class ToWellFormedStringNode extends AbstractPublicNode { + public abstract static class ToValidStringNode extends AbstractPublicNode { - ToWellFormedStringNode() { + ToValidStringNode() { } /** @@ -6175,11 +6175,11 @@ public abstract static class ToWellFormedStringNode extends AbstractPublicNode { public abstract TruffleString execute(AbstractTruffleString a, Encoding expectedEncoding); @Specialization - final TruffleString toWellFormed(AbstractTruffleString a, Encoding encoding, + final TruffleString toValid(AbstractTruffleString a, Encoding encoding, @Cached InlinedConditionProfile isValidProfile, @Cached TStringInternalNodes.GetValidOrBrokenCodeRangeNode getCodeRangeNode, @Cached InternalAsTruffleStringNode asTruffleStringNode, - @Cached TStringInternalNodes.ToWellFormedStringNode internalNode, + @Cached TStringInternalNodes.ToValidStringNode internalNode, @Cached ToIndexableNode toIndexableNode) { a.checkEncoding(encoding); int codeRangeA = getCodeRangeNode.execute(this, a, encoding); @@ -6195,8 +6195,8 @@ final TruffleString toWellFormed(AbstractTruffleString a, Encoding encoding, * @since 23.1 */ @NeverDefault - public static ToWellFormedStringNode create() { - return TruffleStringFactory.ToWellFormedStringNodeGen.create(); + public static ToValidStringNode create() { + return TruffleStringFactory.ToValidStringNodeGen.create(); } /** @@ -6204,21 +6204,11 @@ public static ToWellFormedStringNode create() { * * @since 23.1 */ - public static ToWellFormedStringNode getUncached() 
{ - return TruffleStringFactory.ToWellFormedStringNodeGen.getUncached(); + public static ToValidStringNode getUncached() { + return TruffleStringFactory.ToValidStringNodeGen.getUncached(); } } - /** - * Shorthand for calling the uncached version of {@link TruffleString.ToWellFormedStringNode}. - * - * @since 23.1 - */ - @TruffleBoundary - public TruffleString toWellFormedStringUncached(Encoding expectedEncoding) { - return ToWellFormedStringNode.getUncached().execute(this, expectedEncoding); - } - /** * Node to get a given string in a specific encoding. See * {@link #execute(AbstractTruffleString, TruffleString.Encoding)} for details. From 87691f37044fea6d992c49949d668d7e1c173511 Mon Sep 17 00:00:00 2001 From: Josef Haider Date: Fri, 11 Aug 2023 12:02:21 +0000 Subject: [PATCH 05/14] TruffleStrings: fix changelog entry. --- truffle/CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/truffle/CHANGELOG.md b/truffle/CHANGELOG.md index 97c29bfec007..8b497651c366 100644 --- a/truffle/CHANGELOG.md +++ b/truffle/CHANGELOG.md @@ -32,7 +32,7 @@ This changelog summarizes major changes between Truffle versions relevant to lan * GR-44420 Added `TruffleLanguage.finalizeThread(Object, Thread)` to allow languages run finalization hooks for initialized threads before the context is disposed. * GR-45923 Added `EventBinding.tryAttach()` to try to attach a binding, if not disposed or attached already. * GR-39571 Added `TranscodingErrorHandler` to `TruffleString.SwitchEncodingNode`. -* GR-44464 Added `TruffleString.ToWellFormedStringNode` for encoding-level string sanitization. +* GR-44464 Added `TruffleString.ToValidStringNode` for encoding-level string sanitization. ## Version 23.0.0 From 772dd1366716d1cb8b6d6fa4439ec129f15d3ee0 Mon Sep 17 00:00:00 2001 From: Josef Haider Date: Fri, 11 Aug 2023 13:15:24 +0000 Subject: [PATCH 06/14] TruffleStrings: fix docs. 
--- truffle/docs/TruffleStrings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/truffle/docs/TruffleStrings.md b/truffle/docs/TruffleStrings.md index 2751418b6539..a0bde6b7e478 100644 --- a/truffle/docs/TruffleStrings.md +++ b/truffle/docs/TruffleStrings.md @@ -108,7 +108,7 @@ Conversion: Convert a MutableTruffleString to an immutable TruffleString. * [AsManaged](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.AsManagedNode.html): Convert a TruffleString backed by a native pointer to one backed by a java byte array. -* [ToWellFormed](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.ToWellFormedStringNode.html): +* [ToValidString](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.ToValidStringNode.html): Convert a TruffleString to a version that is encoded correctly. * [CopyToByteArray](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.CopyToByteArrayNode.html): Copy a string's content into a byte array. From 8ed7de11d332305e06e1761391914591b25fa21d Mon Sep 17 00:00:00 2001 From: Christian Haeubl Date: Fri, 6 Oct 2023 15:42:04 +0200 Subject: [PATCH 07/14] Verify that early-parsed isolate arguments don't change later on. 
--- .../com/oracle/svm/core/IsolateArgumentParser.java | 13 ++++++++++++- .../src/com/oracle/svm/core/jdk/RuntimeSupport.java | 3 ++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/IsolateArgumentParser.java b/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/IsolateArgumentParser.java index 10a9e4bee165..9f05d7c798c0 100644 --- a/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/IsolateArgumentParser.java +++ b/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/IsolateArgumentParser.java @@ -183,10 +183,21 @@ public void persistOptions(CLongPointer parsedArgs) { public void verifyOptionValues() { for (int i = 0; i < OPTION_COUNT; i++) { - validate(OPTIONS[i], getOptionValue(i)); + RuntimeOptionKey option = OPTIONS[i]; + if (shouldValidate(option)) { + validate(option, getOptionValue(i)); + } } } + private static boolean shouldValidate(RuntimeOptionKey option) { + if (SubstrateOptions.UseSerialGC.getValue()) { + /* The serial GC supports changing the heap size at run-time to some degree. 
*/ + return option != SubstrateGCOptions.MinHeapSize && option != SubstrateGCOptions.MaxHeapSize && option != SubstrateGCOptions.MaxNewSize; + } + return true; + } + @Uninterruptible(reason = "Called from uninterruptible code.", mayBeInlined = true) public static boolean getBooleanOptionValue(int index) { return PARSED_OPTION_VALUES[index] == 1; diff --git a/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/jdk/RuntimeSupport.java b/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/jdk/RuntimeSupport.java index 105c9a9f23ac..c927913ad065 100644 --- a/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/jdk/RuntimeSupport.java +++ b/substratevm/src/com.oracle.svm.core/src/com/oracle/svm/core/jdk/RuntimeSupport.java @@ -35,6 +35,7 @@ import org.graalvm.nativeimage.VMRuntime; import org.graalvm.nativeimage.impl.VMRuntimeSupport; +import com.oracle.svm.core.IsolateArgumentParser; import com.oracle.svm.core.Isolates; import com.oracle.svm.core.feature.AutomaticallyRegisteredImageSingleton; import com.oracle.svm.core.heap.HeapSizeVerifier; @@ -92,7 +93,7 @@ public boolean isUninitialized() { public void initialize() { boolean shouldInitialize = initializationState.compareAndSet(InitializationState.Uninitialized, InitializationState.InProgress); if (shouldInitialize) { - // GR-35186: we should verify that none of the early parsed isolate arguments changed. + IsolateArgumentParser.singleton().verifyOptionValues(); HeapSizeVerifier.verifyHeapOptions(); executeHooks(startupHooks); From 252d084d9c55a011c6d53b9e05e2a6a6cedf769d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20W=C3=B6gerer?= Date: Wed, 18 Oct 2023 17:25:04 +0200 Subject: [PATCH 08/14] [GR-46837] Ensure libjvmcicompiler.so does not contain fully qualified build-path. 
--- substratevm/mx.substratevm/mx_substratevm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/substratevm/mx.substratevm/mx_substratevm.py b/substratevm/mx.substratevm/mx_substratevm.py index 56351fa00ec7..9ca01c23297b 100644 --- a/substratevm/mx.substratevm/mx_substratevm.py +++ b/substratevm/mx.substratevm/mx_substratevm.py @@ -1337,6 +1337,8 @@ def _native_image_launcher_extra_jvm_args(): ] if mx.get_arch() == 'aarch64' else []) + ([ # Build libgraal with 'Full RELRO' to prevent GOT overwriting exploits on Linux (GR-46838) '-H:NativeLinkerOption=-Wl,-z,relro,-z,now', + # Ensure shared library name in binary does not use fully qualified build-path (GR-46837) + '-H:NativeLinkerOption=-Wl,-soname=libjvmcicompiler.so', ] if mx.is_linux() else [])) libgraal = mx_sdk_vm.GraalVmJreComponent( From e02e9a13cd392610158584603f018bdd81cf6861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20W=C3=B6gerer?= Date: Thu, 19 Oct 2023 11:42:14 +0200 Subject: [PATCH 09/14] Shared library image builds must not embed fully qualified build-path --- substratevm/mx.substratevm/mx_substratevm.py | 2 -- .../src/com/oracle/svm/hosted/image/CCLinkerInvocation.java | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/substratevm/mx.substratevm/mx_substratevm.py b/substratevm/mx.substratevm/mx_substratevm.py index 9ca01c23297b..56351fa00ec7 100644 --- a/substratevm/mx.substratevm/mx_substratevm.py +++ b/substratevm/mx.substratevm/mx_substratevm.py @@ -1337,8 +1337,6 @@ def _native_image_launcher_extra_jvm_args(): ] if mx.get_arch() == 'aarch64' else []) + ([ # Build libgraal with 'Full RELRO' to prevent GOT overwriting exploits on Linux (GR-46838) '-H:NativeLinkerOption=-Wl,-z,relro,-z,now', - # Ensure shared library name in binary does not use fully qualified build-path (GR-46837) - '-H:NativeLinkerOption=-Wl,-soname=libjvmcicompiler.so', ] if mx.is_linux() else [])) libgraal = mx_sdk_vm.GraalVmJreComponent( diff --git 
a/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java b/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java index 14de7653ec1b..967aa3f642d5 100644 --- a/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java +++ b/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java @@ -320,6 +320,8 @@ protected void setOutputKind(List cmd) { break; case SHARED_LIBRARY: cmd.add("-shared"); + // Ensure shared library name in image does not use fully qualified build-path (GR-46837) + cmd.add("-Wl,-soname=" + outputFile.getFileName()); break; default: VMError.shouldNotReachHereUnexpectedInput(imageKind); // ExcludeFromJacocoGeneratedReport From 8e6ba488d3b548c9b89b771d9eb8a510946e2c79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20W=C3=B6gerer?= Date: Thu, 19 Oct 2023 13:53:22 +0200 Subject: [PATCH 10/14] Check every image of a GraalVM-build for bad strings --- sdk/mx.sdk/mx_sdk_vm_impl.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/sdk/mx.sdk/mx_sdk_vm_impl.py b/sdk/mx.sdk/mx_sdk_vm_impl.py index 1deaffaffed8..0cbae43e07a6 100644 --- a/sdk/mx.sdk/mx_sdk_vm_impl.py +++ b/sdk/mx.sdk/mx_sdk_vm_impl.py @@ -1210,7 +1210,10 @@ def is_ee_supported(self): def is_pgo_supported(self): return self.is_ee_supported() - def native_image(self, build_args, output_file, allow_server=False, nonZeroIsFatal=True, out=None, err=None): + search_tool = 'strings' + has_search_tool = shutil.which(search_tool) is not None + + def native_image(self, build_args, output_file, allow_server=False, nonZeroIsFatal=True, out=None, err=None, find_bad_strings=False): assert self._svm_supported stage1 = get_stage1_graalvm_distribution() native_image_project_name = GraalVmLauncher.launcher_project_name(mx_sdk.LauncherConfig(mx.exe_suffix('native-image'), [], "", []), stage1=True) @@ -1220,7 +1223,24 @@ def 
native_image(self, build_args, output_file, allow_server=False, nonZeroIsFat native_image_command += svm_experimental_options([ '-H:Path=' + output_directory or ".", ]) - return mx.run(native_image_command, nonZeroIsFatal=nonZeroIsFatal, out=out, err=err) + + retcode = mx.run(native_image_command, nonZeroIsFatal=nonZeroIsFatal, out=out, err=err) + + if find_bad_strings and not mx.is_windows(): + if not self.__class__.has_search_tool: + mx.abort(f"Searching for strings requires '{self.__class__.search_tool}' executable.") + try: + strings_in_image = subprocess.check_output([self.__class__.search_tool, output_file], stderr=None, text=True).strip().split('\n') + bad_strings = (output_directory, dirname(native_image_bin)) + for entry in strings_in_image: + for bad_string in bad_strings: + if bad_string in entry: + mx.abort(f"Found forbidden string '{bad_string}' in native image {output_file}.") + + except subprocess.CalledProcessError: + mx.abort(f"Using '{self.__class__.search_tool}' to search for strings in native image {output_file} failed.") + + return retcode def is_debug_supported(self): return self._debug_supported @@ -2367,7 +2387,7 @@ def build(self): mx.ensure_dir_exists(dirname(output_file)) # Disable build server (different Java properties on each build prevent server reuse) - self.svm_support.native_image(build_args, output_file) + self.svm_support.native_image(build_args, output_file, find_bad_strings=True) with open(self._get_command_file(), 'w') as f: f.writelines((l + os.linesep for l in build_args)) From 10742cad8bf5a045a7226ed736215d48e0963bc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20W=C3=B6gerer?= Date: Thu, 19 Oct 2023 14:27:08 +0200 Subject: [PATCH 11/14] Replace text=True argument with explicit decode() for backward compatibility --- sdk/mx.sdk/mx_sdk_vm_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/mx.sdk/mx_sdk_vm_impl.py b/sdk/mx.sdk/mx_sdk_vm_impl.py index 0cbae43e07a6..0bce847704cd 100644 --- 
a/sdk/mx.sdk/mx_sdk_vm_impl.py +++ b/sdk/mx.sdk/mx_sdk_vm_impl.py @@ -1230,7 +1230,7 @@ def native_image(self, build_args, output_file, allow_server=False, nonZeroIsFat if not self.__class__.has_search_tool: mx.abort(f"Searching for strings requires '{self.__class__.search_tool}' executable.") try: - strings_in_image = subprocess.check_output([self.__class__.search_tool, output_file], stderr=None, text=True).strip().split('\n') + strings_in_image = subprocess.check_output([self.__class__.search_tool, output_file], stderr=None).decode().strip().split('\n') bad_strings = (output_directory, dirname(native_image_bin)) for entry in strings_in_image: for bad_string in bad_strings: From 1042b22d29d03fe38005422bb7c901773e1616f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20W=C3=B6gerer?= Date: Thu, 19 Oct 2023 15:30:29 +0200 Subject: [PATCH 12/14] An unsuccessful GraalVM image build is always fatal --- sdk/mx.sdk/mx_sdk_vm_impl.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sdk/mx.sdk/mx_sdk_vm_impl.py b/sdk/mx.sdk/mx_sdk_vm_impl.py index 0bce847704cd..7a1325091ea1 100644 --- a/sdk/mx.sdk/mx_sdk_vm_impl.py +++ b/sdk/mx.sdk/mx_sdk_vm_impl.py @@ -1213,7 +1213,7 @@ def is_pgo_supported(self): search_tool = 'strings' has_search_tool = shutil.which(search_tool) is not None - def native_image(self, build_args, output_file, allow_server=False, nonZeroIsFatal=True, out=None, err=None, find_bad_strings=False): + def native_image(self, build_args, output_file, out=None, err=None, find_bad_strings=False): assert self._svm_supported stage1 = get_stage1_graalvm_distribution() native_image_project_name = GraalVmLauncher.launcher_project_name(mx_sdk.LauncherConfig(mx.exe_suffix('native-image'), [], "", []), stage1=True) @@ -1224,7 +1224,7 @@ def native_image(self, build_args, output_file, allow_server=False, nonZeroIsFat '-H:Path=' + output_directory or ".", ]) - retcode = mx.run(native_image_command, nonZeroIsFatal=nonZeroIsFatal, out=out, err=err) + 
mx.run(native_image_command, nonZeroIsFatal=True, out=out, err=err) if find_bad_strings and not mx.is_windows(): if not self.__class__.has_search_tool: @@ -1240,8 +1240,6 @@ def native_image(self, build_args, output_file, allow_server=False, nonZeroIsFat except subprocess.CalledProcessError: mx.abort(f"Using '{self.__class__.search_tool}' to search for strings in native image {output_file} failed.") - return retcode - def is_debug_supported(self): return self._debug_supported From 35c453660a7c892fa0aa13966ec0609dfab81756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20W=C3=B6gerer?= Date: Thu, 19 Oct 2023 16:47:19 +0200 Subject: [PATCH 13/14] Style fix --- .../src/com/oracle/svm/hosted/image/CCLinkerInvocation.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java b/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java index 967aa3f642d5..2faa0a2bc2d2 100644 --- a/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java +++ b/substratevm/src/com.oracle.svm.hosted/src/com/oracle/svm/hosted/image/CCLinkerInvocation.java @@ -320,7 +320,10 @@ protected void setOutputKind(List cmd) { break; case SHARED_LIBRARY: cmd.add("-shared"); - // Ensure shared library name in image does not use fully qualified build-path (GR-46837) + /* + * Ensure shared library name in image does not use fully qualified build-path + * (GR-46837) + */ cmd.add("-Wl,-soname=" + outputFile.getFileName()); break; default: From 22ac787b286ed44b361561b6c4d98e78ca7f6012 Mon Sep 17 00:00:00 2001 From: Martin Entlicher Date: Tue, 24 Oct 2023 08:53:10 +0200 Subject: [PATCH 14/14] Assert ProbeNode parity calls on thread finalization rather than thread context leave. 
(GR-49626) --- .../truffle/polyglot/PolyglotContextImpl.java | 14 +++--- .../truffle/polyglot/PolyglotEngineImpl.java | 8 ++-- .../truffle/polyglot/PolyglotThreadInfo.java | 46 ++++++++++--------- 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotContextImpl.java b/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotContextImpl.java index 2a42890ba4bc..2569e7e22982 100644 --- a/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotContextImpl.java +++ b/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotContextImpl.java @@ -906,7 +906,7 @@ Object[] enterThreadChanged(boolean enterReverted, boolean pollSafepoint, boolea initializeThreadLocals(threadInfo); } - prev = threadInfo.enterInternal(engine); + prev = threadInfo.enterInternal(); if (leaveAndEnter) { threadInfo.setLeaveAndEnterInterrupter(null); notifyAll(); @@ -918,7 +918,7 @@ Object[] enterThreadChanged(boolean enterReverted, boolean pollSafepoint, boolea try { threadInfo.notifyEnter(engine, this); } catch (Throwable t) { - threadInfo.leaveInternal(engine, prev); + threadInfo.leaveInternal(prev); throw t; } } @@ -1130,7 +1130,7 @@ void leaveThreadChanged(Object[] prev, boolean entered, boolean finalizeAndDispo * Thread finalization notification is invoked outside of the context lock so that the * guest languages can operate freely without the risk of a deadlock. 
*/ - ex = notifyThreadFinalizing(threadInfo, null); + ex = notifyThreadFinalizing(threadInfo, null, false); } synchronized (this) { if (finalizeAndDispose) { @@ -1145,7 +1145,7 @@ void leaveThreadChanged(Object[] prev, boolean entered, boolean finalizeAndDispo threadInfo.notifyLeave(engine, this); } } finally { - threadInfo.leaveInternal(engine, prev); + threadInfo.leaveInternal(prev); } } if (threadInfo.getEnteredCount() == 0) { @@ -1205,7 +1205,7 @@ private void finishThreadDispose(Thread current, PolyglotThreadInfo info, Throwa } } - private Throwable notifyThreadFinalizing(PolyglotThreadInfo threadInfo, Throwable previousEx) { + private Throwable notifyThreadFinalizing(PolyglotThreadInfo threadInfo, Throwable previousEx, boolean mustSucceed) { Throwable ex = previousEx; Thread thread = threadInfo.getThread(); if (thread == null) { @@ -1258,7 +1258,7 @@ private Throwable notifyThreadFinalizing(PolyglotThreadInfo threadInfo, Throwabl } synchronized (this) { if (finalizedContexts.cardinality() == threadInfo.initializedLanguageContextsCount()) { - threadInfo.setFinalizationComplete(); + threadInfo.setFinalizationComplete(engine, mustSucceed); break; } } @@ -3243,7 +3243,7 @@ private void finalizeContext(boolean notifyInstruments, boolean mustSucceed) { embedderThreads = getSeenThreads().values().stream().filter(threadInfo -> !threadInfo.isPolyglotThread(this)).toList().toArray(new PolyglotThreadInfo[0]); } for (PolyglotThreadInfo threadInfo : embedderThreads) { - ex = notifyThreadFinalizing(threadInfo, ex); + ex = notifyThreadFinalizing(threadInfo, ex, mustSucceed); } if (ex != null) { if (!mustSucceed || isInternalError(ex)) { diff --git a/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotEngineImpl.java b/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotEngineImpl.java index c1ae5dfb6968..c3dd1ec34ac5 100644 --- 
a/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotEngineImpl.java +++ b/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotEngineImpl.java @@ -2060,7 +2060,7 @@ Object[] enterCached(PolyglotContextImpl context, boolean pollSafepoint) { boolean enterReverted = false; if (CompilerDirectives.injectBranchProbability(CompilerDirectives.LIKELY_PROBABILITY, info.getThread() == Thread.currentThread())) { // Volatile increment is safe if only one thread does it. - prev = info.enterInternal(this); + prev = info.enterInternal(); // Check again whether the cached thread info is still the same as expected if (CompilerDirectives.injectBranchProbability(CompilerDirectives.FASTPATH_PROBABILITY, info == context.getCachedThread())) { @@ -2073,7 +2073,7 @@ Object[] enterCached(PolyglotContextImpl context, boolean pollSafepoint) { try { info.notifyEnter(this, context); } catch (Throwable e) { - info.leaveInternal(this, prev); + info.leaveInternal(prev); throw e; } return prev; @@ -2082,7 +2082,7 @@ Object[] enterCached(PolyglotContextImpl context, boolean pollSafepoint) { * If we go this path and enteredCount drops to 0, the subsequent slowpath enter * must call deactivateThread. 
*/ - info.leaveInternal(this, prev); + info.leaveInternal(prev); enterReverted = true; } } @@ -2133,7 +2133,7 @@ void leaveCached(Object[] prev, PolyglotContextImpl context) { try { info.notifyLeave(this, context); } finally { - info.leaveInternal(this, prev); + info.leaveInternal(prev); entered = false; } if (CompilerDirectives.injectBranchProbability(CompilerDirectives.FASTPATH_PROBABILITY, info == context.getCachedThread())) { diff --git a/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotThreadInfo.java b/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotThreadInfo.java index 564c212596af..1ab839de5bc4 100644 --- a/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotThreadInfo.java +++ b/truffle/src/com.oracle.truffle.polyglot/src/com/oracle/truffle/polyglot/PolyglotThreadInfo.java @@ -46,6 +46,7 @@ import java.util.ArrayList; import java.util.LinkedList; import java.util.List; +import java.util.Objects; import com.oracle.truffle.api.CompilerAsserts; import com.oracle.truffle.api.CompilerDirectives; @@ -171,9 +172,13 @@ boolean isFinalizationComplete() { return finalizationComplete; } - void setFinalizationComplete() { + void setFinalizationComplete(PolyglotEngineImpl engine, boolean mustSucceed) { assert Thread.holdsLock(context); this.finalizationComplete = true; + // Assert only when !mustSucceed, parity might not be met on cancellation. + if (ASSERT_ENTER_RETURN_PARITY && !mustSucceed && engine.probeAssertionsEnabled) { + assertProbeThreadFinalized(); + } } boolean isSafepointActive() { @@ -218,13 +223,10 @@ void setLeaveAndEnterInterrupter(TruffleSafepoint.Interrupter interrupter) { * {@link PolyglotEngineImpl#enter(PolyglotContextImpl, boolean, Node, boolean)} instead.
*/ @SuppressFBWarnings("VO_VOLATILE_INCREMENT") - Object[] enterInternal(PolyglotEngineImpl engine) { + Object[] enterInternal() { Object[] prev = PolyglotFastThreadLocals.enter(this); assert Thread.currentThread() == getThread() : "Volatile increment is safe on a single thread only."; enteredCount++; - if (ASSERT_ENTER_RETURN_PARITY && engine.probeAssertionsEnabled) { - assertProbeThreadEnter(); - } return prev; } @@ -238,12 +240,9 @@ int getEnteredCount() { * {@link PolyglotEngineImpl#leave(PolyglotContextImpl, PolyglotContextImpl)} instead. */ @SuppressFBWarnings("VO_VOLATILE_INCREMENT") - void leaveInternal(PolyglotEngineImpl engine, Object[] prev) { + void leaveInternal(Object[] prev) { assert Thread.currentThread() == getThread() : "Volatile decrement is safe on a single thread only."; enteredCount--; - if (ASSERT_ENTER_RETURN_PARITY && engine.probeAssertionsEnabled) { - assertProbeThreadLeave(); - } PolyglotFastThreadLocals.leave(prev); } @@ -282,33 +281,38 @@ void notifyLeave(PolyglotEngineImpl engine, PolyglotContextImpl profiledContext) } @TruffleBoundary - private void assertProbeThreadEnter() { + private void assertProbeThreadFinalized() { if (probesEnterList != null) { - probesEnterList.add(null); + assert probesEnterList.isEmpty() : getEnteredProbesMessage(probesEnterList); } } - @TruffleBoundary - private void assertProbeThreadLeave() { - if (probesEnterList != null) { - int size = probesEnterList.size(); - assert size > 0 : "Leave of polyglot thread does not have a preceding enter."; - ProbeNode probe = probesEnterList.remove(size - 1); - assert probe == null : "Found an entered probe without return: " + probe + " with parent node " + probe.getParent().getClass() + "\n" + - "Specifically, a call to ProbeNode.onEnter()/onResume() does not have a corresponding call to ProbeNode.onReturnValue()/onReturnExceptionalOrUnwind()/onYield()."; + private static String getEnteredProbesMessage(List<ProbeNode> probes) { + StringBuilder sb = new StringBuilder("Found entered
probes without return: "); + sb.append(probes); + sb.append("\nSpecifically, a call to ProbeNode.onEnter()/onResume() does not have a corresponding call to ProbeNode.onReturnValue()/onReturnExceptionalOrUnwind()/onYield()."); + for (ProbeNode probe : probes) { + sb.append("\n probe "); + sb.append(probe); + sb.append(" with parent node "); + sb.append(probe.getParent().getClass()); } + sb.append('\n'); + return sb.toString(); } @TruffleBoundary void assertProbeEntered(ProbeNode probe) { + Objects.requireNonNull(probe); probesEnterList.add(probe); } @TruffleBoundary void assertProbeReturned(ProbeNode probe) { - assert !probesEnterList.isEmpty() : "ProbeNode exited without enter"; + assert !probesEnterList.isEmpty() : "ProbeNode " + probe + " with parent " + probe.getParent().getClass() + " exited without enter"; ProbeNode lastProbe = probesEnterList.remove(probesEnterList.size() - 1); - assert probe == lastProbe : "Entered probe " + lastProbe + " differs from the returned probe " + probe + " with parent " + probe.getParent().getClass() + "\n" + + assert probe == lastProbe : "Entered probe " + lastProbe + " with parent " + lastProbe.getParent().getClass() + " differs from the returned probe " + + probe + " with parent " + probe.getParent().getClass() + "\n" + "Specifically, a call to onEnter()/onResume() on " + lastProbe + " was not followed by a call to onReturnValue()/onReturnExceptionalOrUnwind()/onYield() on the same probe, " + "but on " + probe + " instead."; }