From 4f34a4710108fe156ea1b8b60a86f8ec1dae4ef8 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 1 Mar 2024 16:00:24 -0500 Subject: [PATCH 01/75] More tests, and various minor fixes --- benchmark/UTF8_runtime.cs | 2 +- benchmark/benchmark.csproj | 1 - src/UTF8.cs | 46 ++++++-------------- test/UTF8ValidationTests.cs | 87 +++++++++++++++++++++++++++---------- test/tests.csproj | 3 +- 5 files changed, 79 insertions(+), 60 deletions(-) diff --git a/benchmark/UTF8_runtime.cs b/benchmark/UTF8_runtime.cs index 82e7384..8583e5a 100644 --- a/benchmark/UTF8_runtime.cs +++ b/benchmark/UTF8_runtime.cs @@ -28,7 +28,7 @@ namespace DotnetRuntime { - internal static unsafe partial class Utf8Utility + public static unsafe partial class Utf8Utility { /// /// Returns iff the low byte of diff --git a/benchmark/benchmark.csproj b/benchmark/benchmark.csproj index 2d36be2..efb963c 100644 --- a/benchmark/benchmark.csproj +++ b/benchmark/benchmark.csproj @@ -14,7 +14,6 @@ - diff --git a/src/UTF8.cs b/src/UTF8.cs index 9d5f6ca..d7b5c74 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -45,21 +45,6 @@ public static class UTF8 uint codePoint = 0; while (pos < inputLength) { - // If the next 16 bytes are ascii, we can skip them. - nextPos = pos + 16; - if (nextPos <= inputLength) - { // if it is safe to read 16 more bytes, check that they are ascii - ulong v1 = *(ulong*)pInputBuffer; - ulong v2 = *(ulong*)(pInputBuffer + 8); - ulong v = v1 | v2; - - if ((v & 0x8080808080808080) == 0) - { - pos = nextPos; - continue; - } - - } byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { @@ -233,7 +218,6 @@ public static class UTF8 for (; processedLength + 32 <= inputLength; processedLength += 32) { - Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); int mask = Avx2.MoveMask(currentBlock); @@ -241,9 +225,10 @@ public static class UTF8 { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. - if (Avx2.MoveMask(prevIncomplete) != 0) + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + int off = processedLength >= 32 ? processedLength - 32 : processedLength; + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } prevIncomplete = Vector256.Zero; } @@ -264,34 +249,31 @@ public static class UTF8 Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); - if (Avx2.MoveMask(error) != 0) + if (!Avx2.TestZ(error, error)) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + int off = processedLength >= 32 ? processedLength - 32 : processedLength; + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); } } + + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) + { + int off = processedLength >= 32 ? processedLength - 32 : processedLength; + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + } } } - // We have processed all the blocks using SIMD, we need to process the remaining bytes. + // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function if (processedLength < inputLength) { // We need to possibly backtrack to the start of the last code point - // worst possible case is 4 bytes, where we need to backtrack 3 bytes - // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte - if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) + while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) { processedLength -= 1; - if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) - { - processedLength -= 1; - if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) - { - processedLength -= 1; - } - } } byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength); if (invalidBytePointer != pInputBuffer + inputLength) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index da02acc..fb9869d 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -11,6 +11,35 @@ public class Utf8SIMDValidationTests private readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1); private static readonly Random rand = new Random(); + + /* + [Fact] + public void TooLongErrorTest() + { + for (int trial = 0; trial < 10; trial++) + { + Console.WriteLine("Trial {0}", trial); + + byte[] utf8 = generator.Generate(3*64); + for (int i = 0; i < utf8.Length; i++) + { + Console.WriteLine("Trial {0} i = {1}", trial, i); + + if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes + { + byte oldByte = utf8[i]; + utf8[i] = 0b10000000; // Forcing a too long error + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); + utf8[i] = oldByte; // Restore the original byte + } + Console.WriteLine("Trial {0} i = {1} DONE", trial, i); + + } + } + }*/ + + [Fact] public void TestGoodSequences() { @@ -38,7 +67,7 @@ public void TestGoodSequences() byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, - $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); // byte* result = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); + $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); } } } @@ -92,7 +121,7 @@ public void TestBadSequences() byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, - $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); // byte* result = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); + $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); } } @@ -111,9 +140,7 @@ public void NoErrorTest() { for (int trial = 0; trial < NumTrials; trial++) { - // Console.WriteLine("Trial run:" + trial); byte[] utf8 = generator.Generate(512); - // Assert.True(ValidateUtf8(utf8),$"Failure NoErrorTest: {utf8}"); bool isValidUtf8 = ValidateUtf8(utf8); string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); @@ -150,19 +177,6 @@ public void NoErrorTest4Bytes() RunTestForByteLength(4); } - // private void RunTestForByteLength(int byteLength) - // { - // for (int trial = 0; trial < NumTrials; trial++) - // { - // // Console.WriteLine($"Trial run {trial} for byte length {byteLength}"); - // byte[] utf8 = generator.Generate(990, byteLength); - // bool isValidUtf8 = ValidateUtf8(utf8); - // // string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); - // // Assert.True(isValidUtf8, $"Failure NoErrorTest for {byteLength}-byte UTF8. Sequence: {utf8HexString}"); - // Assert.True(isValidUtf8); - // } - // } - private void RunTestForByteLength(int byteLength) { int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths @@ -191,6 +205,7 @@ public void HeaderBitsErrorTest() byte oldByte = utf8[i]; utf8[i] = 0b11111000; // Forcing a header bits error Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); utf8[i] = oldByte; // Restore the original byte } } @@ -211,6 +226,7 @@ public void TooShortErrorTest() byte oldByte = utf8[i]; utf8[i] = 0b11100000; // Forcing a too short error Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); utf8[i] = oldByte; // Restore the original byte } } @@ -231,14 +247,13 @@ public void TooLongErrorTest() byte oldByte = utf8[i]; utf8[i] = 0b10000000; // Forcing a too long error Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); utf8[i] = oldByte; // Restore the original byte } } } } - // - [Fact] public void OverlongErrorTest() { @@ -294,7 +309,7 @@ public void TooLargeErrorTest() utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100); Assert.False(ValidateUtf8(utf8)); - + Assert.True(InvalidateUtf8(utf8, i)); utf8[i] = old; } } @@ -322,6 +337,7 @@ public void SurrogateErrorTest() utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2)); Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); } utf8[i] = old; @@ -334,8 +350,6 @@ public void SurrogateErrorTest() [Fact] public void BruteForceTest() { - // Random rand = new Random(); // Random instance for test - for (int i = 0; i < NumTrials; i++) { // Generate random UTF-8 sequence @@ -368,9 +382,7 @@ public void BruteForceTest() } } - // credit: based on code from Google Fuchsia (Apache Licensed) - public static bool ValidateUtf8Fuschia(byte[] data) { int pos = 0; @@ -424,7 +436,34 @@ public static bool ValidateUtf8Fuschia(byte[] data) return true; } + // Check that all functions agree on the result when the input might be invalid. + private bool InvalidateUtf8(byte[] utf8, int badindex) + { + unsafe + { + fixed (byte* pInput = utf8) + { + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length); + int scalarOffset = (int)(scalarResult - pInput); + byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length); + int simdOffset = (int)(simdResult - pInput); + int utf16CodeUnitCountAdjustment, scalarCountAdjustment; + byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, utf8.Length, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + int dotnetOffset = (int)(dotnetResult - pInput); + if (scalarOffset != simdOffset) + { + Console.WriteLine("Suprisingly, scalarResult != simdResult {0} != {1}, badindex = {2}, length = {3}", scalarOffset, simdOffset, badindex, utf8.Length); + } + if (dotnetOffset != simdOffset) + { + Console.WriteLine("Suprisingly, dotnetOffset != simdResult {0} != {1}, badindex = {2}, length = {3}", dotnetOffset, simdOffset, badindex, utf8.Length); + } + return (scalarResult == simdResult) && (simdResult == dotnetResult); + } + } + } + // check that all methods agree that the result is valid private bool ValidateUtf8(byte[] utf8) { unsafe diff --git a/test/tests.csproj b/test/tests.csproj index 351e00f..221066e 100644 --- a/test/tests.csproj +++ b/test/tests.csproj @@ -26,8 +26,7 @@ - - + From 34328491377e3bb7c43ce7cad8e698e8696fb083 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 1 Mar 2024 16:23:51 -0500 Subject: [PATCH 02/75] code simplification --- src/UTF8.cs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index d7b5c74..63519b7 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -227,7 +227,7 @@ public static class UTF8 // we need to check if the previous block was incomplete. if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - int off = processedLength >= 32 ? processedLength - 32 : processedLength; + int off = processedLength >= 3 ? processedLength - 3 : processedLength; return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } prevIncomplete = Vector256.Zero; @@ -258,11 +258,19 @@ public static class UTF8 } } - if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) - { - int off = processedLength >= 32 ? processedLength - 32 : processedLength; - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); - } + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) + { + // We have an unterminated sequence. + processedLength -= 3; + for(int k = 0; k < 3; k++) + { + if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000) + { + processedLength += k; + break; + } + } + } } } From 50fa499e5fc6140e438adc5c037709391ba35767 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 4 Mar 2024 12:13:44 -0500 Subject: [PATCH 03/75] minor improvements and cleanup --- test/UTF8ValidationTests.cs | 285 +++++++++++++++++++++--------------- test/helpers/randomutf8.cs | 11 +- 2 files changed, 170 insertions(+), 126 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index fb9869d..fb52708 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -11,33 +11,12 @@ public class Utf8SIMDValidationTests private readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1); private static readonly Random rand = new Random(); + // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; + int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths + + + - /* - [Fact] - public void TooLongErrorTest() - { - for (int trial = 0; trial < 10; trial++) - { - Console.WriteLine("Trial {0}", trial); - - byte[] utf8 = generator.Generate(3*64); - for (int i = 0; i < utf8.Length; i++) - { - Console.WriteLine("Trial {0} i = {1}", trial, i); - - if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes - { - byte oldByte = utf8[i]; - utf8[i] = 0b10000000; // Forcing a too long error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); - utf8[i] = oldByte; // Restore the original byte - } - Console.WriteLine("Trial {0} i = {1} DONE", trial, i); - - } - } - }*/ [Fact] @@ -138,12 +117,15 @@ public void Node48995Test() [Fact] public void NoErrorTest() { - for (int trial = 0; trial < NumTrials; trial++) + foreach (int outputLength in outputLengths) { - byte[] utf8 = generator.Generate(512); - bool isValidUtf8 = ValidateUtf8(utf8); - string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); - Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); + for (int trial = 0; trial < NumTrials; trial++) + { + byte[] utf8 = generator.Generate(outputLength); + bool isValidUtf8 = ValidateUtf8(utf8); + string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); + Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); + } } } @@ -179,7 +161,7 @@ public void NoErrorTest4Bytes() private void RunTestForByteLength(int byteLength) { - int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths + // int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) @@ -215,40 +197,47 @@ public void HeaderBitsErrorTest() [Fact] public void TooShortErrorTest() { - for (int trial = 0; trial < NumTrials; trial++) + foreach (int outputLength in outputLengths) { - - byte[] utf8 = generator.Generate(512); - for (int i = 0; i < utf8.Length; i++) + for (int trial = 0; trial < NumTrials; trial++) { - if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes + byte[] utf8 = generator.Generate(outputLength); + + for (int i = 0; i < utf8.Length; i++) { - byte oldByte = utf8[i]; - utf8[i] = 0b11100000; // Forcing a too short error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); - utf8[i] = oldByte; // Restore the original byte + if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes + { + byte oldByte = utf8[i]; + utf8[i] = 0b11100000; // Forcing a too short error + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); + utf8[i] = oldByte; // Restore the original byte + } } } } + } [Fact] public void TooLongErrorTest() { - for (int trial = 0; trial < NumTrials; trial++) + foreach (int outputLength in outputLengths) { - - byte[] utf8 = generator.Generate(512); - for (int i = 0; i < utf8.Length; i++) + for (int trial = 0; trial < NumTrials; trial++) { - if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes + byte[] utf8 = generator.Generate(outputLength); + + for (int i = 0; i < utf8.Length; i++) { - byte oldByte = utf8[i]; - utf8[i] = 0b10000000; // Forcing a too long error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); - utf8[i] = oldByte; // Restore the original byte + if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes + { + byte oldByte = utf8[i]; + utf8[i] = 0b10000000; // Forcing a too long error + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); + utf8[i] = oldByte; // Restore the original byte + } } } } @@ -259,58 +248,107 @@ public void OverlongErrorTest() { for (int trial = 0; trial < NumTrials; trial++) { + foreach (int outputLength in outputLengths) + { + byte[] utf8 = generator.Generate(outputLength); - byte[] utf8 = generator.Generate(512); - for (int i = 0; i < utf8.Length; i++) - { - if (utf8[i] >= 0b11000000) // Only non-ASCII leading bytes can be overlong + for (int i = 0; i < utf8.Length; i++) { - byte old = utf8[i]; - byte secondOld = utf8[i + 1]; - - if ((old & 0b11100000) == 0b11000000) // two-bytes case, change to a value less or equal than 0x7f + if (utf8[i] >= 0b11000000) // Only non-ASCII leading bytes can be overlong { - utf8[i] = 0b11000000; - } - else if ((old & 0b11110000) == 0b11100000) // three-bytes case, change to a value less or equal than 0x7ff - { - utf8[i] = 0b11100000; - utf8[i + 1] = (byte)(utf8[i + 1] & 0b11011111); - } - else if ((old & 0b11111000) == 0b11110000) // four-bytes case, change to a value less or equal than 0xffff - { - utf8[i] = 0b11110000; - utf8[i + 1] = (byte)(utf8[i + 1] & 0b11001111); - } + byte old = utf8[i]; + byte secondOld = utf8[i + 1]; + + if ((old & 0b11100000) == 0b11000000) // two-bytes case, change to a value less or equal than 0x7f + { + utf8[i] = 0b11000000; + } + else if ((old & 0b11110000) == 0b11100000) // three-bytes case, change to a value less or equal than 0x7ff + { + utf8[i] = 0b11100000; + utf8[i + 1] = (byte)(utf8[i + 1] & 0b11011111); + } + else if ((old & 0b11111000) == 0b11110000) // four-bytes case, change to a value less or equal than 0xffff + { + utf8[i] = 0b11110000; + utf8[i + 1] = (byte)(utf8[i + 1] & 0b11001111); + } - Assert.False(ValidateUtf8(utf8)); + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); - utf8[i] = old; - utf8[i + 1] = secondOld; + utf8[i] = old; + utf8[i + 1] = secondOld; + } } } } } + +// This might seems redundant with but it actually failed PR #17. +// The issue is fixed in PR#18 but I thought it a good idea to formally cover it as further changes are possible. [Fact] - public void TooLargeErrorTest() + public void TooShortTest2() { for (int trial = 0; trial < NumTrials; trial++) { + foreach (int outputLength in outputLengths) + { + byte[] oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); + // PrintHexAndBinary(oneUTFunit); + byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1); + // for (int i = 0; i < utf8.Length; i++) + // { + // if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes + // { + byte oldByte = utf8[outputLength - 1]; + utf8[outputLength -1] = oneUTFunit[0];//0b11000000; // Forcing a too short error at the very end + // PrintHexAndBinary(utf8); + Assert.False(ValidateUtf8(utf8)); + utf8[outputLength -1] = oldByte; // Restore the original byte + + // } + } + } + } - byte[] utf8 = generator.Generate(512); + // Prints both hexadecimal and binary representations of a byte array + static void PrintHexAndBinary(byte[] bytes) + { + // Convert to hexadecimal + string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " "); + Console.WriteLine($"Hex: {hexRepresentation}"); - for (int i = 0; i < utf8.Length; i++) + // Convert to binary + string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0'))); + Console.WriteLine($"Binary: {binaryRepresentation}"); + } + + + [Fact] + public void TooLargeErrorTest() + { + foreach (int outputLength in outputLengths) + { + + for (int trial = 0; trial < NumTrials; trial++) { - if ((utf8[i] & 0b11111000) == 0b11110000) // Only in 4-bytes case + + byte[] utf8 = generator.Generate(outputLength); + + for (int i = 0; i < utf8.Length; i++) { - byte old = utf8[i]; - utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100); + if ((utf8[i] & 0b11111000) == 0b11110000) // Only in 4-bytes case + { + byte old = utf8[i]; + utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100); - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); - utf8[i] = old; + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); + utf8[i] = old; + } } } } @@ -319,29 +357,32 @@ public void TooLargeErrorTest() [Fact] public void SurrogateErrorTest() { - for (int trial = 0; trial < NumTrials; trial++) + foreach (int outputLength in outputLengths) { + for (int trial = 0; trial < NumTrials; trial++) + { - byte[] utf8 = generator.Generate(512); + byte[] utf8 = generator.Generate(outputLength); - for (int i = 0; i < utf8.Length; i++) - { - if ((utf8[i] & 0b11110000) == 0b11100000) // Only in 3-bytes case + for (int i = 0; i < utf8.Length; i++) { - byte old = utf8[i]; - byte secondOld = utf8[i + 1]; - - utf8[i] = 0b11101101; // Leading byte for surrogate - for (int s = 0x8; s < 0xf; s++) + if ((utf8[i] & 0b11110000) == 0b11100000) // Only in 3-bytes case { - utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2)); + byte old = utf8[i]; + byte secondOld = utf8[i + 1]; - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); - } + utf8[i] = 0b11101101; // Leading byte for surrogate + for (int s = 0x8; s < 0xf; s++) + { + utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2)); + + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); + } - utf8[i] = old; - utf8[i + 1] = secondOld; + utf8[i] = old; + utf8[i + 1] = secondOld; + } } } } @@ -350,34 +391,38 @@ public void SurrogateErrorTest() [Fact] public void BruteForceTest() { - for (int i = 0; i < NumTrials; i++) + foreach (int outputLength in outputLengths) { - // Generate random UTF-8 sequence - byte[] utf8 = generator.Generate(rand.Next(2000)); + for (int i = 0; i < NumTrials; i++) + { - Assert.True(ValidateUtf8(utf8), "Initial UTF-8 validation (primary) failed."); + // Generate random UTF-8 sequence + byte[] utf8 = generator.Generate(rand.Next(outputLength)); - Assert.True(ValidateUtf8Fuschia(utf8), "Initial UTF-8 validation (Fuschia) failed."); + Assert.True(ValidateUtf8(utf8), "Initial UTF-8 validation (primary) failed."); - // Perform random bit flips - for (int flip = 0; flip < 1000; flip++) - { - if (utf8.Length == 0) + Assert.True(ValidateUtf8Fuschia(utf8), "Initial UTF-8 validation (Fuschia) failed."); + + // Perform random bit flips + for (int flip = 0; flip < 1000; flip++) { - break; - } + if (utf8.Length == 0) + { + break; + } - byte[] modifiedUtf8 = (byte[])utf8.Clone(); - int byteIndex = rand.Next(modifiedUtf8.Length); - int bitFlip = 1 << rand.Next(8); - modifiedUtf8[byteIndex] ^= (byte)bitFlip; + byte[] modifiedUtf8 = (byte[])utf8.Clone(); + int byteIndex = rand.Next(modifiedUtf8.Length); + int bitFlip = 1 << rand.Next(8); + modifiedUtf8[byteIndex] ^= (byte)bitFlip; - // Validate the modified sequence with both methods - bool isValidPrimary = ValidateUtf8(modifiedUtf8); - bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); + // Validate the modified sequence with both methods + bool isValidPrimary = ValidateUtf8(modifiedUtf8); + bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); - // Ensure both methods agree on the validation result - Assert.Equal(isValidPrimary, isValidFuschia); + // Ensure both methods agree on the validation result + Assert.Equal(isValidPrimary, isValidFuschia); + } } } } diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index a403a48..5498576 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -15,23 +15,22 @@ public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, i probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes }; } - public byte[] Generate(int outputBytes, int? byteCount = null) + public byte[] Generate(int howManyUnits, int? byteCountInUnit = null) { var result = new List(); - while (result.Count < outputBytes) + while (result.Count < howManyUnits) { - int count = byteCount ?? PickRandomByteCount(); + int count = byteCountInUnit ?? PickRandomByteCount(); int codePoint = GenerateCodePoint(count); byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - if (result.Count + utf8Bytes.Length > outputBytes) - break; result.AddRange(utf8Bytes); + if (result.Count + utf8Bytes.Length > howManyUnits) + break; } return result.ToArray(); } - private int GenerateCodePoint(int byteCount) { switch (byteCount) From b1097846dcc50039b2882ffb67a38027a8ed472b Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 5 Mar 2024 17:28:13 -0500 Subject: [PATCH 04/75] Short test 's end improvements --- test/UTF8ValidationTests.cs | 70 +++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index fb52708..3d7c49c 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -12,7 +12,7 @@ public class Utf8SIMDValidationTests private static readonly Random rand = new Random(); // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; - int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths + static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths @@ -289,29 +289,53 @@ public void OverlongErrorTest() // This might seems redundant with but it actually failed PR #17. // The issue is fixed in PR#18 but I thought it a good idea to formally cover it as further changes are possible. - [Fact] - public void TooShortTest2() - { - for (int trial = 0; trial < NumTrials; trial++) - { - foreach (int outputLength in outputLengths) - { - byte[] oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); - // PrintHexAndBinary(oneUTFunit); - byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1); - // for (int i = 0; i < utf8.Length; i++) - // { - // if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes - // { - byte oldByte = utf8[outputLength - 1]; - utf8[outputLength -1] = oneUTFunit[0];//0b11000000; // Forcing a too short error at the very end - // PrintHexAndBinary(utf8); - Assert.False(ValidateUtf8(utf8)); - utf8[outputLength -1] = oldByte; // Restore the original byte + // [Fact] + // public void TooShortTest2() + // { + // for (int trial = 0; trial < NumTrials; trial++) + // { + // foreach (int outputLength in outputLengths) + // { + // byte[] oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); + // // PrintHexAndBinary(oneUTFunit); + // byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1); + // // for (int i = 0; i < utf8.Length; i++) + // // { + // // if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes + // // { + // byte oldByte = utf8[outputLength - 1]; + // utf8[outputLength -1] = oneUTFunit[0];//0b11000000; // Forcing a too short error at the very end + // // PrintHexAndBinary(utf8); + // Assert.False(ValidateUtf8(utf8)); + // utf8[outputLength -1] = oldByte; // Restore the original byte - // } - } - } + // // } + // } + // } + // } + + public static IEnumerable TestData() + { + var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. + return outputLengths.SelectMany( + outputLength => Enumerable.Range(0, outputLength), + (outputLength, position) => new object[] { outputLength, position }); + } + + + [Theory] + [MemberData(nameof(TestData))] + public void TooShortTestEnd(int outputLength, int position) + { + byte[] oneUTFunit = generator.Generate(howManyUnits: 1, byteCountInUnit: 2); + byte[] utf8 = generator.Generate(outputLength, byteCountInUnit: 1); + + byte oldByte = utf8[position]; + utf8[position] = oneUTFunit[0]; // Force a condition + + Assert.False(ValidateUtf8(utf8)); // Test the condition + + utf8[position] = oldByte; // Restore } // Prints both hexadecimal and binary representations of a byte array From 42cd2cd2483ee11eb58af7cfd210d26f3d9ad237 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 6 Mar 2024 16:39:22 -0500 Subject: [PATCH 05/75] 0x7f 9xff test --- test/UTF8ValidationTests.cs | 51 ++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 3d7c49c..ba7d0a9 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -316,7 +316,7 @@ public void OverlongErrorTest() public static IEnumerable TestData() { - var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. + // var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. return outputLengths.SelectMany( outputLength => Enumerable.Range(0, outputLength), (outputLength, position) => new object[] { outputLength, position }); @@ -338,6 +338,55 @@ public void TooShortTestEnd(int outputLength, int position) utf8[position] = oldByte; // Restore } + // public static IEnumerable InvalidTestData() + // { + // var random = new Random(); + // foreach (var length in outputLengths) + // { + // for (int trial = 0; trial < NumTrials; trial++) + // { + // int position = random.Next(length - 3); // Choose a random position + // byte invalidByte = (byte)random.Next(0xF5, 0x100); // Generate a random invalid byte + + // yield return new object[] { length, position, invalidByte }; + // } + // } + // } + + public static IEnumerable InvalidTestData() +{ + var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF + foreach (var length in outputLengths) + { + for (int position = 0; position < length; position++) + { + foreach (var invalidByte in invalidBytes) + { + yield return new object[] { length, position, invalidByte }; + } + } + } +} + + + //corresponds to condition 5.4.1 in the paper + [Theory] + [MemberData(nameof(InvalidTestData))] + public void Invalid0xf50xff(int outputLength, int position, byte invalidByte) + { + byte[] utf8 = generator.Generate(outputLength,1); + + // Initialize utf8 with some valid data, if necessary + // Array.Fill(utf8, (byte)0x20); // Filling with spaces for simplicity + + utf8[position] = invalidByte; // Inject an invalid byte at a random position + + // PrintHexAndBinary(utf8); + + Assert.False(ValidateUtf8(utf8)); // Expect the validation to fail due to the invalid byte + } + + // Prints both hexadecimal and binary representations of a byte array static void PrintHexAndBinary(byte[] bytes) { From 7ae0283cf02c9293592db2b7b9cde6d9c1139ed4 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 8 Mar 2024 21:07:19 -0500 Subject: [PATCH 06/75] More exaustive testing of shortend --- test/UTF8ValidationTests.cs | 107 ++++++++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 24 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index ba7d0a9..a4869e6 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -8,13 +8,40 @@ public class Utf8SIMDValidationTests private const int NumTrials = 1000; - private readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1); + private static readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1); private static readonly Random rand = new Random(); // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths + public static void ShiftLeft(T[] array, int shiftAmount) + { + int length = array.Length; + if (length == 0 || shiftAmount % length == 0) return; // No need to shift + T[] copy = new T[length]; + Array.Copy(array, copy, length); + + for (int i = 0; i < length; i++) + { + int newIndex = (i + length - shiftAmount % length) % length; + array[newIndex] = copy[i]; + } + } + + public static void ShiftRight(T[] array, int shiftAmount) + { + int length = array.Length; + if (length == 0 || shiftAmount % length == 0) return; // No need to shift + T[] copy = new T[length]; + Array.Copy(array, copy, length); + + for (int i = 0; i < length; i++) + { + int newIndex = (i + shiftAmount) % length; + array[newIndex] = copy[i]; + } + } @@ -176,19 +203,22 @@ private void RunTestForByteLength(int byteLength) [Fact] public void HeaderBitsErrorTest() { - for (int trial = 0; trial < NumTrials; trial++) - { - - byte[] utf8 = generator.Generate(512); - for (int i = 0; i < utf8.Length; i++) + foreach (int outputLength in outputLengths) + { + for (int trial = 0; trial < NumTrials; trial++) { - if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes + + byte[] utf8 = generator.Generate(outputLength); + for (int i = 0; i < utf8.Length; i++) { - byte oldByte = utf8[i]; - utf8[i] = 0b11111000; // Forcing a header bits error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); - utf8[i] = oldByte; // Restore the original byte + if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes + { + byte oldByte = utf8[i]; + utf8[i] = 0b11111000; // Forcing a header bits error + Assert.False(ValidateUtf8(utf8)); + Assert.True(InvalidateUtf8(utf8, i)); + utf8[i] = oldByte; // Restore the original byte + } } } } @@ -318,24 +348,52 @@ public static IEnumerable TestData() { // var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. return outputLengths.SelectMany( - outputLength => Enumerable.Range(0, outputLength), + outputLength => Enumerable.Range(1, outputLength), (outputLength, position) => new object[] { outputLength, position }); } + // [Theory] + // [MemberData(nameof(TestData))] + // public void TooShortTestEnd(int outputLength, int position) + // { + // byte[] oneUTFunit = generator.Generate(howManyUnits: 1, byteCountInUnit: 2); + // byte[] utf8 = generator.Generate(outputLength, byteCountInUnit: 1); + + // byte oldByte = utf8[position]; + // utf8[position] = oneUTFunit[0]; // Force a condition + + // Assert.False(ValidateUtf8(utf8)); // Test the condition + + // utf8[position] = oldByte; // Restore + // } + + public byte[] PrependAndTake(byte[] first, byte[] second, int takeCount) + { + // Concatenate 'first' array at the beginning of 'second' array + var combined = first.Concat(second).ToArray(); + + // Take the first 'takeCount' elements from the combined array + return combined.Take(takeCount).ToArray(); + } + + [Theory] [MemberData(nameof(TestData))] public void TooShortTestEnd(int outputLength, int position) { - byte[] oneUTFunit = generator.Generate(howManyUnits: 1, byteCountInUnit: 2); - byte[] utf8 = generator.Generate(outputLength, byteCountInUnit: 1); + // (Nick: I know this is slow ... but I think for a first pass, it might be ok?) + byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1); + byte[] utf8 = generator.Generate(outputLength); - byte oldByte = utf8[position]; - utf8[position] = oneUTFunit[0]; // Force a condition - - Assert.False(ValidateUtf8(utf8)); // Test the condition + // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode + byte[] result = PrependAndTake(filler, utf8, position); - utf8[position] = oldByte; // Restore + if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array otherwise + { + Assert.False(ValidateUtf8(utf8)); // Test the condition + } + } // public static IEnumerable InvalidTestData() @@ -355,14 +413,16 @@ public void TooShortTestEnd(int outputLength, int position) public static IEnumerable InvalidTestData() { + var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF foreach (var length in outputLengths) { - for (int position = 0; position < length; position++) + byte[] utf8 = generator.Generate(length); + for (int position = 0; position < utf8.Length; position++) { foreach (var invalidByte in invalidBytes) { - yield return new object[] { length, position, invalidByte }; + yield return new object[] { length, position, invalidByte ,utf8 }; } } } @@ -372,9 +432,8 @@ public static IEnumerable InvalidTestData() //corresponds to condition 5.4.1 in the paper [Theory] [MemberData(nameof(InvalidTestData))] - public void Invalid0xf50xff(int outputLength, int position, byte invalidByte) + public void Invalid0xf50xff(int outputLength, int position, byte invalidByte,byte[] utf8) { - byte[] utf8 = generator.Generate(outputLength,1); // Initialize utf8 with some valid data, if necessary // Array.Fill(utf8, (byte)0x20); // Filling with spaces for simplicity From 3f6e5efc3ce37663f4097277fa3c727dcb04a370 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 8 Mar 2024 21:07:50 -0500 Subject: [PATCH 07/75] correction --- test/UTF8ValidationTests.cs | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index a4869e6..16c388b 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -15,34 +15,6 @@ public class Utf8SIMDValidationTests static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths - public static void ShiftLeft(T[] array, int shiftAmount) - { - int length = array.Length; - if (length == 0 || shiftAmount % length == 0) return; // No need to shift - T[] copy = new T[length]; - Array.Copy(array, copy, length); - - for (int i = 0; i < length; i++) - { - int newIndex = (i + length - shiftAmount % length) % length; - array[newIndex] = copy[i]; - } - } - - public static void ShiftRight(T[] array, int shiftAmount) - { - int length = array.Length; - if (length == 0 || shiftAmount % length == 0) return; // No need to shift - T[] copy = new T[length]; - Array.Copy(array, copy, length); - - for (int i = 0; i < length; i++) - { - int newIndex = (i + shiftAmount) % length; - array[newIndex] = copy[i]; - } - } - From 388e290e98812732f6e5042f62d4844efad54386 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 9 Mar 2024 22:38:56 -0500 Subject: [PATCH 08/75] added optional rangeto validate --- test/UTF8ValidationTests.cs | 82 +++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 16c388b..ab72275 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -354,20 +354,46 @@ public byte[] PrependAndTake(byte[] first, byte[] second, int takeCount) [MemberData(nameof(TestData))] public void TooShortTestEnd(int outputLength, int position) { - // (Nick: I know this is slow ... but I think for a first pass, it might be ok?) - byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1); + // ( know this is slow ... but I think for a first pass, it might be ok?) byte[] utf8 = generator.Generate(outputLength); + byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1); + // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode byte[] result = PrependAndTake(filler, utf8, position); - - if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array otherwise + + + if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array { Assert.False(ValidateUtf8(utf8)); // Test the condition } } + // [Fact] + // public void TooLongErrorTestEnd() + // { + // foreach (int outputLength in outputLengths) + // { + // for (int trial = 0; trial < NumTrials; trial++) + // { + // byte[] utf8 = generator.Generate(outputLength); + + // for (int i = 0; i < utf8.Length; i++) + // { + // if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes + // { + // byte oldByte = utf8[i]; + // utf8[i] = 0b10000000; // Forcing a too long error + // Assert.False(ValidateUtf8(utf8)); + // Assert.True(InvalidateUtf8(utf8, i)); + // utf8[i] = oldByte; // Restore the original byte + // } + // } + // } + // } + // } + // public static IEnumerable InvalidTestData() // { // var random = new Random(); @@ -613,20 +639,49 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) } // check that all methods agree that the result is valid - private bool ValidateUtf8(byte[] utf8) + // private bool ValidateUtf8(byte[] utf8) + // { + // unsafe + // { + // fixed (byte* pInput = utf8) + // { + // byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length); + // if (scalarResult != pInput + utf8.Length) + // { + // return false; + // } + + // byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length); + // if (simdResult != pInput + utf8.Length) + // { + // return false; + // } + + // return true; + // } + // } + // } + + + private bool ValidateUtf8(byte[] utf8, Range range = default) { + // Adjusted check for default Range + var isDefaultRange = range.Equals(default(Range)); + var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); + unsafe { fixed (byte* pInput = utf8) { - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length); - if (scalarResult != pInput + utf8.Length) + byte* startPtr = pInput + offset; + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(startPtr, length); + if (scalarResult != startPtr + length) { return false; } - byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length); - if (simdResult != pInput + utf8.Length) + byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length); + if (simdResult != startPtr + length) { return false; } @@ -635,4 +690,13 @@ private bool ValidateUtf8(byte[] utf8) } } } + + // Helper method to calculate the actual offset and length from a Range + private (int offset, int length) GetOffsetAndLength(int totalLength, Range range) + { + var start = range.Start.GetOffset(totalLength); + var end = range.End.GetOffset(totalLength); + var length = end - start; + return (start, length); + } } \ No newline at end of file From 2391958dad1f6ca12f1e7fe16e4352c3c45bbec5 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 10 Mar 2024 16:42:38 -0400 Subject: [PATCH 09/75] tests for too long at the end --- test/UTF8ValidationTests.cs | 26 ++++++++++++++++++++++ test/helpers/randomutf8.cs | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index ab72275..1f827a2 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -224,6 +224,9 @@ public void TooShortErrorTest() [Fact] public void TooLongErrorTest() { + + int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths + foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) @@ -484,6 +487,29 @@ public void TooLargeErrorTest() } } + [Fact] + public void TooLargeErrorTestEnd() + { + foreach (int outputLength in outputLengths) + { + for (int trial = 0; trial < NumTrials; trial++) + { + + byte[] filler = generator.Generate(outputLength,byteCountInUnit:1); + byte[] twobytetoolong = generator.AppendContinuationByte(generator.Generate(1,2)); + byte[] threebytetoolong = generator.Generate(1,3); + byte[] fourbytetoolong = generator.Generate(1,4); + + generator.ReplaceEndOfArray(filler,twobytetoolong); + + Assert.False(ValidateUtf8(filler )); + // Assert.True(InvalidateUtf8(utf8, outputLength)); + + } + } + } + + [Fact] public void SurrogateErrorTest() { diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 5498576..6526ebb 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -63,6 +63,49 @@ private int GenerateCodePoint(int byteCount) } } + public void AppendContinuationByte(List utf8Bytes) + { + byte continuationByte = (byte)gen.Next(0x80, 0xBF + 1); + utf8Bytes.Add(continuationByte); + } + + public byte[] AppendContinuationByte(byte[] utf8Bytes) +{ + // Create a new array that is one byte larger than the original + byte[] newArray = new byte[utf8Bytes.Length + 1]; + + // Copy the original bytes into the new array + Array.Copy(utf8Bytes, newArray, utf8Bytes.Length); + + // Generate a random continuation byte (0x80 to 0xBF) + byte continuationByte = (byte)gen.Next(0x80, 0xBF + 1); + + // Append the continuation byte at the end of the new array + newArray[utf8Bytes.Length] = continuationByte; + + // Return the new array with the appended continuation byte + return newArray; +} + + + public void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex) + { + // // Check if the startIndex is within the bounds of the original array + // if (startIndex < 0 || startIndex > original.Length) + // { + // throw new ArgumentOutOfRangeException(nameof(startIndex), "Start index is out of the range of the original array."); + // } + + // Calculate the start index for replacement + int startIndex = original.Length - replacement.Length; + + // Copy the replacement array into the original starting at startIndex + Array.Copy(replacement, 0, original, startIndex, Math.Min(replacement.Length, original.Length - startIndex)); + } + + + + private int PickRandomByteCount() { double randomValue = gen.NextDouble() * probabilities.Sum(); From 0d739fd6a5411fc8f94ae1aa95c29493d1adab0c Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 11 Mar 2024 19:55:15 -0400 Subject: [PATCH 10/75] some improvements --- test/UTF8ValidationTests.cs | 19 ++++++++++++------- test/helpers/randomutf8.cs | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 1f827a2..a553da9 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -369,8 +369,11 @@ public void TooShortTestEnd(int outputLength, int position) if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array { Assert.False(ValidateUtf8(utf8)); // Test the condition + Assert.True(InvalidateUtf8(utf8,position)); } + Assert.True(ValidateUtf8(utf8)); // Test the condition + } // [Fact] @@ -444,6 +447,7 @@ public void Invalid0xf50xff(int outputLength, int position, byte invalidByte,byt // PrintHexAndBinary(utf8); Assert.False(ValidateUtf8(utf8)); // Expect the validation to fail due to the invalid byte + Assert.True(InvalidateUtf8(utf8,position)); } @@ -494,18 +498,19 @@ public void TooLargeErrorTestEnd() { for (int trial = 0; trial < NumTrials; trial++) { - + for (int i = 1; i <= 4; i++) + { byte[] filler = generator.Generate(outputLength,byteCountInUnit:1); - byte[] twobytetoolong = generator.AppendContinuationByte(generator.Generate(1,2)); - byte[] threebytetoolong = generator.Generate(1,3); - byte[] fourbytetoolong = generator.Generate(1,4); + byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)); - generator.ReplaceEndOfArray(filler,twobytetoolong); + generator.ReplaceEndOfArray(filler,toolong); Assert.False(ValidateUtf8(filler )); - // Assert.True(InvalidateUtf8(utf8, outputLength)); + Assert.True(InvalidateUtf8(filler, outputLength -1)); + } - } + + } } } diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 6526ebb..237c759 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -31,6 +31,31 @@ public byte[] Generate(int howManyUnits, int? byteCountInUnit = null) return result.ToArray(); } + // public object Generate(int howManyUnits, int? byteCountInUnit = null, bool returnAsList = false) + // { + // var result = new List(); + // while (result.Count < howManyUnits) + // { + // int count = byteCountInUnit ?? PickRandomByteCount(); + // int codePoint = GenerateCodePoint(count); + // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); + + // if (result.Count + utf8Bytes.Length > howManyUnits) + // break; + + // result.AddRange(utf8Bytes); + // } + + // if (returnAsList) + // { + // return result; + // } + // else + // { + // return result.ToArray(); + // } + // } + private int GenerateCodePoint(int byteCount) { switch (byteCount) @@ -69,6 +94,7 @@ public void AppendContinuationByte(List utf8Bytes) utf8Bytes.Add(continuationByte); } +//TODO(Nick): redo this monstruosity public byte[] AppendContinuationByte(byte[] utf8Bytes) { // Create a new array that is one byte larger than the original From c3632692ac87eb46f0bbe8332773533d300b696e Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 12 Mar 2024 21:47:44 -0400 Subject: [PATCH 11/75] Slight but nescessary cleanup --- test/AsciiTest.cs | 6 ++--- test/UTF8ValidationTests.cs | 28 +++++++++++------------ test/helpers/randomutf8.cs | 44 +++++++++++++++++-------------------- 3 files changed, 37 insertions(+), 41 deletions(-) diff --git a/test/AsciiTest.cs b/test/AsciiTest.cs index 5f41ba4..88fd035 100644 --- a/test/AsciiTest.cs +++ b/test/AsciiTest.cs @@ -101,7 +101,7 @@ public void Test_ASCII_generator() for (int i = 0; i < NUM_TRIALS; i++) { - byte[] sequence = utf8Generator.Generate(length); + byte[] sequence = utf8Generator.Generate(length).ToArray(); if (sequence.All(b => b >= 0x00 && b <= 0x7F)) { @@ -132,7 +132,7 @@ public void TestNoErrorGetIndexOfFirstNonAsciiByte() for (int trial = 0; trial < NUM_TRIALS; trial++) { - byte[] ascii = utf8Generator.Generate(LENGTH); + byte[] ascii = utf8Generator.Generate(LENGTH).ToArray(); unsafe { @@ -158,7 +158,7 @@ public void TestErrorGetIndexOfFirstNonAsciiByte() for (int trial = 0; trial < NUM_TRIALS; trial++) { - byte[] ascii = utf8Generator.Generate(LENGTH); + byte[] ascii = utf8Generator.Generate(LENGTH).ToArray(); for (int i = 0; i < ascii.Length; i++) { diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index a553da9..5e8dbf9 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -120,7 +120,7 @@ public void NoErrorTest() { for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); bool isValidUtf8 = ValidateUtf8(utf8); string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); @@ -165,7 +165,7 @@ private void RunTestForByteLength(int byteLength) { for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength, byteLength); + byte[] utf8 = generator.Generate(outputLength, byteLength).ToArray(); bool isValidUtf8 = ValidateUtf8(utf8); Assert.True(isValidUtf8, $"Failure for {byteLength}-byte UTF8 of length {outputLength} in trial {trial}"); } @@ -180,7 +180,7 @@ public void HeaderBitsErrorTest() for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) { if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes @@ -203,7 +203,7 @@ public void TooShortErrorTest() { for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) { @@ -231,7 +231,7 @@ public void TooLongErrorTest() { for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) { @@ -255,7 +255,7 @@ public void OverlongErrorTest() { foreach (int outputLength in outputLengths) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) @@ -358,8 +358,8 @@ public byte[] PrependAndTake(byte[] first, byte[] second, int takeCount) public void TooShortTestEnd(int outputLength, int position) { // ( know this is slow ... but I think for a first pass, it might be ok?) - byte[] utf8 = generator.Generate(outputLength); - byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1); + byte[] utf8 = generator.Generate(outputLength).ToArray(); + byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1).ToArray(); // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode @@ -421,7 +421,7 @@ public static IEnumerable InvalidTestData() var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF foreach (var length in outputLengths) { - byte[] utf8 = generator.Generate(length); + byte[] utf8 = generator.Generate(length).ToArray(); for (int position = 0; position < utf8.Length; position++) { foreach (var invalidByte in invalidBytes) @@ -473,7 +473,7 @@ public void TooLargeErrorTest() for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) { @@ -500,8 +500,8 @@ public void TooLargeErrorTestEnd() { for (int i = 1; i <= 4; i++) { - byte[] filler = generator.Generate(outputLength,byteCountInUnit:1); - byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)); + byte[] filler = generator.Generate(outputLength,byteCountInUnit:1).ToArray(); + byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)).ToArray(); generator.ReplaceEndOfArray(filler,toolong); @@ -523,7 +523,7 @@ public void SurrogateErrorTest() for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) { @@ -558,7 +558,7 @@ public void BruteForceTest() { // Generate random UTF-8 sequence - byte[] utf8 = generator.Generate(rand.Next(outputLength)); + byte[] utf8 = generator.Generate(rand.Next(outputLength)).ToArray(); Assert.True(ValidateUtf8(utf8), "Initial UTF-8 validation (primary) failed."); diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 237c759..61d9564 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -15,7 +15,23 @@ public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, i probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes }; } - public byte[] Generate(int howManyUnits, int? byteCountInUnit = null) + // public byte[] Generate(int howManyUnits, int? byteCountInUnit = null) + // { + // var result = new List(); + // while (result.Count < howManyUnits) + // { + // int count = byteCountInUnit ?? PickRandomByteCount(); + // int codePoint = GenerateCodePoint(count); + // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); + + // result.AddRange(utf8Bytes); + // if (result.Count + utf8Bytes.Length > howManyUnits) + // break; + // } + // return result.ToArray(); + // } + + public List Generate(int howManyUnits, int? byteCountInUnit = null) { var result = new List(); while (result.Count < howManyUnits) @@ -28,7 +44,7 @@ public byte[] Generate(int howManyUnits, int? byteCountInUnit = null) if (result.Count + utf8Bytes.Length > howManyUnits) break; } - return result.ToArray(); + return result; } // public object Generate(int howManyUnits, int? byteCountInUnit = null, bool returnAsList = false) @@ -88,30 +104,10 @@ private int GenerateCodePoint(int byteCount) } } - public void AppendContinuationByte(List utf8Bytes) - { - byte continuationByte = (byte)gen.Next(0x80, 0xBF + 1); - utf8Bytes.Add(continuationByte); - } - -//TODO(Nick): redo this monstruosity - public byte[] AppendContinuationByte(byte[] utf8Bytes) -{ - // Create a new array that is one byte larger than the original - byte[] newArray = new byte[utf8Bytes.Length + 1]; - - // Copy the original bytes into the new array - Array.Copy(utf8Bytes, newArray, utf8Bytes.Length); + public List AppendContinuationByte(List utf8Bytes) => + utf8Bytes.Concat(new byte[] {(byte)gen.Next(0x80, 0xBF + 1)}).ToList(); - // Generate a random continuation byte (0x80 to 0xBF) - byte continuationByte = (byte)gen.Next(0x80, 0xBF + 1); - // Append the continuation byte at the end of the new array - newArray[utf8Bytes.Length] = continuationByte; - - // Return the new array with the appended continuation byte - return newArray; -} public void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex) From 37ea8c30967c73f77dbc5f295f3a640e2eb11cf8 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 15 Mar 2024 11:32:55 -0400 Subject: [PATCH 12/75] save game --- src/UTF16.cs | 9 -- src/scalar_tail.cs | 209 ------------------------------------ test/UTF8ValidationTests.cs | 43 ++++++++ 3 files changed, 43 insertions(+), 218 deletions(-) delete mode 100644 src/UTF16.cs delete mode 100644 src/scalar_tail.cs diff --git a/src/UTF16.cs b/src/UTF16.cs deleted file mode 100644 index eef7d57..0000000 --- a/src/UTF16.cs +++ /dev/null @@ -1,9 +0,0 @@ -using System; - -// This may not be needed? Placeholder -namespace SimdUnicode -{ - public static class UTF16 - { - } -} \ No newline at end of file diff --git a/src/scalar_tail.cs b/src/scalar_tail.cs deleted file mode 100644 index 7e0bede..0000000 --- a/src/scalar_tail.cs +++ /dev/null @@ -1,209 +0,0 @@ -// // Helpers.CheckForGCCollections("After AVX2 procession"); - - -// // | Method | FileName | Mean | Error | StdDev | Allocated | -// // |---------------------------- |----------------------- |----------:|----------:|----------:|----------:| -// // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 33.062 us | 0.5046 us | 0.4720 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 35.609 us | 0.3369 us | 0.3152 us | 56 B | -// // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 11.603 us | 0.2232 us | 0.2293 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 12.317 us | 0.1826 us | 0.1708 us | 56 B | -// // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 13.726 us | 0.2471 us | 0.2311 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 13.392 us | 0.0520 us | 0.0487 us | 56 B | -// // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 24.345 us | 0.2012 us | 0.1882 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 23.778 us | 0.1892 us | 0.1769 us | 56 B | -// // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 9.323 us | 0.0155 us | 0.0130 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 8.336 us | 0.0502 us | 0.0470 us | 56 B | -// // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 10.728 us | 0.1370 us | 0.1282 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 10.837 us | 0.1389 us | 0.1300 us | 56 B | -// // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 11.086 us | 0.1190 us | 0.1113 us | 56 B | -// // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 10.017 us | 0.0615 us | 0.0514 us | 56 B | - - -// // if (processedLength < inputLength) -// // { -// // // Unfortunalely, this approach with stackalloc might be expensive. -// // // TODO: replace it by a simple scalar routine. You need to handle -// // // prev_incomplete but it should be doable. - - -// // Span remainingBytes = stackalloc byte[32]; -// // for (int i = 0; i < inputLength - processedLength; i++) -// // { -// // remainingBytes[i] = pInputBuffer[processedLength + i]; -// // } - -// // Vector256 remainingBlock = Vector256.Create(remainingBytes.ToArray()); -// // Utf8Validation.utf8_checker.CheckNextInput(remainingBlock, ref prev_input_block, ref prev_incomplete, ref error); -// // processedLength += inputLength - processedLength; - -// // } - -// // CheckForGCCollections("After processed remaining bytes"); - -// // | Method | FileName | Mean | Error | StdDev | Allocated | -// // |---------------------------- |----------------------- |----------:|----------:|----------:|----------:| -// // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 31.509 us | 0.2234 us | 0.2089 us | - | -// // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 28.280 us | 0.2042 us | 0.1810 us | - | -// // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 6.682 us | 0.0400 us | 0.0354 us | - | -// // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 6.750 us | 0.1294 us | 0.1080 us | - | -// // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 9.291 us | 0.0345 us | 0.0323 us | - | -// // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 9.483 us | 0.0486 us | 0.0454 us | - | -// // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 19.547 us | 0.3349 us | 0.3132 us | - | -// // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 18.264 us | 0.2890 us | 0.2703 us | - | -// // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 4.972 us | 0.0402 us | 0.0357 us | - | -// // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 4.936 us | 0.0468 us | 0.0438 us | - | -// // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 6.039 us | 0.0680 us | 0.0636 us | - | -// // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 5.683 us | 0.0970 us | 0.0907 us | - | -// // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 6.054 us | 0.1161 us | 0.1627 us | - | -// // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 5.909 us | 0.0483 us | 0.0452 us | - | -// // scalar results: -// // if (processedLength < inputLength) -// // { -// // byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength); -// // // This makes little difference -// // if (invalidBytePointer != pInputBuffer + inputLength) -// // { -// // // An invalid byte was found. Adjust error handling as needed. -// // error = Vector256.Create((byte)1); -// // } -// // processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength)); -// // } - - -// // ThreadStaticAttribute approach is buggy -// // if (processedLength < inputLength) -// // { - -// // // int mask = Avx2.MoveMask(prev_incomplete.AsSByte()); -// // // int index = BitOperations.TrailingZeroCount(mask); - - -// // // byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength); -// // // // This makes little difference -// // // if (invalidBytePointer != pInputBuffer + inputLength) -// // // { -// // // // An invalid byte was found. Adjust error handling as needed. -// // // error = Vector256.Create((byte)1); -// // // } - -// // // Find the position of the first set bit in incompleteMask, indicating the start of an incomplete sequence. -// // int incompleteMask = Avx2.MoveMask(prev_incomplete.AsSByte()); -// // int firstIncompletePos = BitOperations.LeadingZeroCount((uint)incompleteMask); - -// // // Calculate the pointer adjustment based on the position of the incomplete sequence. -// // byte* startPtrForScalarValidation = pInputBuffer + processedLength + firstIncompletePos; - -// // // Ensure startPtrForScalarValidation does not precede pInputBuffer. -// // // startPtrForScalarValidation = Math.Max(pInputBuffer, startPtrForScalarValidation); - -// // // Now, ensure startPtrForScalarValidation points to a leading byte by backtracking if it's pointing to a continuation byte. -// // // while (startPtrForScalarValidation > pInputBuffer && (*startPtrForScalarValidation & 0xC0) == 0x80) { -// // // startPtrForScalarValidation--; -// // // } - -// // // Invoke scalar validation from the identified leading byte position. -// // byte* invalidBytePointer = UTF8.GetPointerToFirstInvalidByte(startPtrForScalarValidation, inputLength - (int)(startPtrForScalarValidation - pInputBuffer)); -// // if (invalidBytePointer != pInputBuffer + inputLength) -// // { -// // // An invalid byte was found. Adjust error handling as needed. -// // error = Vector256.Create((byte)1); -// // } -// // processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength)); -// // } - - - -// // | Method | FileName | Mean | Error | StdDev | Allocated | -// // |---------------------------- |----------------------- |----------:|----------:|----------:|----------:| -// // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 20.136 us | 0.3869 us | 0.5031 us | - | -// // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 19.576 us | 0.2366 us | 0.2098 us | - | -// // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 6.207 us | 0.0479 us | 0.0400 us | - | -// // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 6.169 us | 0.0541 us | 0.0506 us | - | -// // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 9.212 us | 0.0121 us | 0.0107 us | - | -// // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 9.373 us | 0.0250 us | 0.0209 us | - | -// // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 13.726 us | 0.2609 us | 0.2900 us | - | -// // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 13.948 us | 0.2122 us | 0.1985 us | - | -// // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 4.916 us | 0.0176 us | 0.0147 us | - | -// // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 4.897 us | 0.0525 us | 0.0491 us | - | -// // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 5.526 us | 0.0463 us | 0.0411 us | - | -// // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 5.538 us | 0.0405 us | 0.0379 us | - | -// // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 5.838 us | 0.0363 us | 0.0340 us | - | -// // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 5.813 us | 0.0440 us | 0.0412 us | - | - - -// if (processedLength < inputLength) -// { - -// Span remainingBytes = stackalloc byte[32]; -// for (int i = 0; i < inputLength - processedLength; i++) -// { -// remainingBytes[i] = pInputBuffer[processedLength + i]; -// } - -// ReadOnlySpan remainingBytesReadOnly = remainingBytes; -// Vector256 remainingBlock = Vector256.Create(remainingBytesReadOnly); -// Utf8Validation.utf8_checker.CheckNextInput(remainingBlock, ref prev_input_block, ref prev_incomplete, ref error); -// processedLength += inputLength - processedLength; - -// } - - - - -// // | Method | FileName | Mean | Error | StdDev | Allocated | -// // |---------------------------- |----------------------- |----------:|----------:|----------:|----------:| -// // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 31.216 us | 0.2960 us | 0.2624 us | - | -// // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 31.732 us | 0.3772 us | 0.3528 us | - | -// // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 10.281 us | 0.1234 us | 0.1154 us | - | -// // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 10.370 us | 0.2019 us | 0.1889 us | - | -// // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 12.003 us | 0.2378 us | 0.4102 us | - | -// // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 11.403 us | 0.1818 us | 0.1700 us | - | -// // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 25.936 us | 0.3735 us | 0.3311 us | - | -// // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 22.630 us | 0.3594 us | 0.3362 us | - | -// // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 7.186 us | 0.0220 us | 0.0195 us | - | -// // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 7.425 us | 0.1450 us | 0.1985 us | - | -// // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 9.359 us | 0.1549 us | 0.1294 us | - | -// // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 10.929 us | 0.2096 us | 0.1961 us | - | -// // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 10.493 us | 0.2098 us | 0.5708 us | - | -// // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 9.575 us | 0.1878 us | 0.1757 us | - | -// // if (processedLength < inputLength) -// // { - -// // Span remainingBytes = stackalloc byte[32]; -// // new Span(pInputBuffer + processedLength, inputLength - processedLength).CopyTo(remainingBytes); - -// // ReadOnlySpan remainingBytesReadOnly = remainingBytes; -// // Vector256 remainingBlock = Vector256.Create(remainingBytesReadOnly); -// // Utf8Validation.utf8_checker.CheckNextInput(remainingBlock, ref prev_input_block, ref prev_incomplete, ref error); -// // processedLength += inputLength - processedLength; - -// // } - -// // if (processedLength < inputLength) -// // { - -// // Span remainingBytes = stackalloc byte[32]; -// // new Span(pInputBuffer + processedLength, inputLength - processedLength).CopyTo(remainingBytes); - -// // ReadOnlySpan remainingBytesReadOnly = remainingBytes; -// // Vector256 remainingBlock = Vector256.Create(remainingBytesReadOnly); -// // Utf8Validation.utf8_checker.CheckNextInput(remainingBlock, ref prev_input_block, ref prev_incomplete, ref error); -// // processedLength += inputLength - processedLength; - -// // } - -// // if (processedLength < inputLength) -// // { -// // // Directly call the scalar function on the remaining part of the buffer -// // byte* startOfRemaining = pInputBuffer + processedLength; -// // int lengthOfRemaining = inputLength - processedLength; -// // byte* invalidBytePointer = UTF8.GetPointerToFirstInvalidByte(startOfRemaining, lengthOfRemaining); - -// // // Use `invalidBytePointer` as needed, for example: -// // // if (invalidBytePointer != startOfRemaining + lengthOfRemaining) { -// // // // Handle the case where an invalid byte is found -// // // } - -// // // Update processedLength based on the result of the scalar function -// // processedLength += (int)(invalidBytePointer - pInputBuffer); -// // } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 5e8dbf9..7c75af2 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -376,6 +376,49 @@ public void TooShortTestEnd(int outputLength, int position) } + // public List PrependAndTake(List first, List second, int takeCount) + // { + // // Concatenate 'first' list at the beginning of 'second' list + // List combined = new List(first); + // combined.AddRange(second); + + // // Take the first 'takeCount' elements from the combined list + // // Ensure we don't exceed the combined list's count + // takeCount = Math.Min(takeCount, combined.Count); + + // return combined.GetRange(0, takeCount); + // } + + + + // // [Theory] + // // [MemberData(nameof(TestData))] + // [Fact] + // public void TooShortTestEnd() + // { + // foreach (int outputLength in outputLengths) + // { + // // for (int trial = 0; trial < NumTrials; trial++) + // // { + // List utf8 = generator.Generate(outputLength); + + // for (int i = 0; i < utf8.Count; i++) + // { + // List filler = generator.Generate(howManyUnits: i, byteCountInUnit: 1); + + // // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode + // byte[] result = PrependAndTake(filler, utf8, i).ToArray(); + + + // if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array + // { + // Assert.False(ValidateUtf8(result)); // Test the condition + // Assert.True(InvalidateUtf8(result,result.Length)); + // } + // } + // // } + // } + // [Fact] // public void TooLongErrorTestEnd() // { From 017192e354f656ad10eb80e13781c4d3b5dd9272 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 15 Mar 2024 11:45:23 -0400 Subject: [PATCH 13/75] save game --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0cdbf5c..d51930e 100644 --- a/README.md +++ b/README.md @@ -29,14 +29,21 @@ dotnet test To get a list of available tests, enter the command: +``` +dotnet test --list-tests | cut -d '(' -f 1 | uniq +``` + +For a far more verbose output: + ``` dotnet test --list-tests ``` To run specific tests, it is helpful to use the filter parameter: + ``` -dotnet test -c Release --filter Ascii +dotnet test --filter Ascii ``` ## Running Benchmarks From cf7c79fa6f498fa0f52bd4adf1955df28b40ce0f Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 17 Mar 2024 11:50:48 -0400 Subject: [PATCH 14/75] save game --- test/UTF8ValidationTests.cs | 42 +++++++++++++++++++++++++++++++++++++ test/helpers/randomutf8.cs | 29 ++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 7c75af2..ff51e63 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -773,4 +773,46 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) var length = end - start; return (start, length); } + + +[Fact] +public void ExtraArgsTest() +{ + int utf16Adjustment, scalarCountAdjustment; + // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. + byte[] utf8 = generator.Generate(howManyUnits: 3, byteCountInUnit: 2).ToArray(); + PrintHexAndBinary(utf8); + var (offset, length) = (0, utf8.Length); + + unsafe + { + fixed (byte* pInput = utf8) + { + byte* startPtr = pInput + offset; + // Invoke the method under test. + byte* result = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out utf16Adjustment, out scalarCountAdjustment); + + // Since we are generating presumably valid 2-byte sequences, and depending on the specifics + // of the generator and Utf8Utility implementation, we need to assert expectations for adjustments. + // These assertions need to match your understanding of how utf16CodeUnitCountAdjustment and + // scalarCountAdjustment are supposed to be calculated based on the input data. + + // Example: For simple 2-byte characters that map 1:1 from UTF-8 to UTF-16, + // utf16CodeUnitCountAdjustment might be 0 if the utility directly translates byte count. + // Assert.Equal(0, utf16Adjustment); // Placeholder, adjust based on actual logic. + // Assert.Equal(0, scalarCountAdjustment); // Placeholder, adjust based on actual logic. + + Console.WriteLine("Scalar:" + scalarCountAdjustment); + + Console.WriteLine("utf16:" + utf16Adjustment); + + // If your generator creates specific patterns or the utility calculates these adjustments differently, + // you'll need to adjust the expected values accordingly. + } + } +} + + + + } \ No newline at end of file diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 61d9564..05664cd 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -29,24 +29,47 @@ public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, i // break; // } // return result.ToArray(); + // } + + // public List Generate(int howManyUnits, int? byteCountInUnit = null) + // { + // var result = new List(); + // while (result.Count < howManyUnits) + // { + // int count = byteCountInUnit ?? PickRandomByteCount(); + // int codePoint = GenerateCodePoint(count); + // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); + + // result.AddRange(utf8Bytes); + // if (result.Count + utf8Bytes.Length > howManyUnits) + // break; + // } + // return result; // } public List Generate(int howManyUnits, int? byteCountInUnit = null) { var result = new List(); - while (result.Count < howManyUnits) + var unitsAdded = 0; // Track the number of characters added. + + while (unitsAdded < howManyUnits) { int count = byteCountInUnit ?? PickRandomByteCount(); int codePoint = GenerateCodePoint(count); byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - result.AddRange(utf8Bytes); - if (result.Count + utf8Bytes.Length > howManyUnits) + // Ensure adding the new character won't exceed the howManyUnits limit. + if (unitsAdded + 1 > howManyUnits) break; + + result.AddRange(utf8Bytes); + unitsAdded++; // Increment the units (characters) count. } + return result; } + // public object Generate(int howManyUnits, int? byteCountInUnit = null, bool returnAsList = false) // { // var result = new List(); From 9634bfe30c534b474618b23e6b64f75427b79f53 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 17 Mar 2024 21:26:39 -0400 Subject: [PATCH 15/75] save game --- test/UTF8ValidationTests.cs | 67 +++++++++++++++++++++---------------- test/helpers/randomutf8.cs | 50 +++++++++++++-------------- 2 files changed, 63 insertions(+), 54 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index ff51e63..13e3ef3 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -775,44 +775,53 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) } -[Fact] -public void ExtraArgsTest() -{ - int utf16Adjustment, scalarCountAdjustment; - // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. - byte[] utf8 = generator.Generate(howManyUnits: 3, byteCountInUnit: 2).ToArray(); - PrintHexAndBinary(utf8); - var (offset, length) = (0, utf8.Length); - - unsafe + [Fact] + public void ExtraArgsTest() { - fixed (byte* pInput = utf8) + int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; + + foreach (int outputLength in outputLengths) { - byte* startPtr = pInput + offset; - // Invoke the method under test. - byte* result = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out utf16Adjustment, out scalarCountAdjustment); + int utf16Adjustment, scalarCountAdjustment; + // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. + // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); + byte[] utf8 = generator.Generate(howManyUnits: 13).ToArray(); + PrintHexAndBinary(utf8); + var (offset, length) = (0, utf8.Length); + + unsafe + { + fixed (byte* pInput = utf8) + { + byte* startPtr = pInput + offset; + // Invoke the method under test. + byte* result = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out utf16Adjustment, out scalarCountAdjustment); + + // Since we are generating presumably valid 2-byte sequences, and depending on the specifics + // of the generator and Utf8Utility implementation, we need to assert expectations for adjustments. + // These assertions need to match your understanding of how utf16CodeUnitCountAdjustment and + // scalarCountAdjustment are supposed to be calculated based on the input data. - // Since we are generating presumably valid 2-byte sequences, and depending on the specifics - // of the generator and Utf8Utility implementation, we need to assert expectations for adjustments. - // These assertions need to match your understanding of how utf16CodeUnitCountAdjustment and - // scalarCountAdjustment are supposed to be calculated based on the input data. + // Example: For simple 2-byte characters that map 1:1 from UTF-8 to UTF-16, + // utf16CodeUnitCountAdjustment might be 0 if the utility directly translates byte count. + // Assert.Equal(0, utf16Adjustment); // Placeholder, adjust based on actual logic. + // Assert.Equal(0, scalarCountAdjustment); // Placeholder, adjust based on actual logic. - // Example: For simple 2-byte characters that map 1:1 from UTF-8 to UTF-16, - // utf16CodeUnitCountAdjustment might be 0 if the utility directly translates byte count. - // Assert.Equal(0, utf16Adjustment); // Placeholder, adjust based on actual logic. - // Assert.Equal(0, scalarCountAdjustment); // Placeholder, adjust based on actual logic. + Console.WriteLine("Lenght:" + utf8.Length); - Console.WriteLine("Scalar:" + scalarCountAdjustment); + Console.WriteLine("Scalar:" + scalarCountAdjustment); - Console.WriteLine("utf16:" + utf16Adjustment); + Console.WriteLine("utf16:" + utf16Adjustment); + Console.WriteLine("___________________________________________________"); - // If your generator creates specific patterns or the utility calculates these adjustments differently, - // you'll need to adjust the expected values accordingly. + + // If your generator creates specific patterns or the utility calculates these adjustments differently, + // you'll need to adjust the expected values accordingly. + } + } } } -} - +} -} \ No newline at end of file diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 05664cd..857b68e 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -29,46 +29,46 @@ public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, i // break; // } // return result.ToArray(); - // } - - // public List Generate(int howManyUnits, int? byteCountInUnit = null) - // { - // var result = new List(); - // while (result.Count < howManyUnits) - // { - // int count = byteCountInUnit ?? PickRandomByteCount(); - // int codePoint = GenerateCodePoint(count); - // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - - // result.AddRange(utf8Bytes); - // if (result.Count + utf8Bytes.Length > howManyUnits) - // break; - // } - // return result; // } public List Generate(int howManyUnits, int? byteCountInUnit = null) { var result = new List(); - var unitsAdded = 0; // Track the number of characters added. - - while (unitsAdded < howManyUnits) + while (result.Count < howManyUnits) { int count = byteCountInUnit ?? PickRandomByteCount(); int codePoint = GenerateCodePoint(count); byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - // Ensure adding the new character won't exceed the howManyUnits limit. - if (unitsAdded + 1 > howManyUnits) - break; - result.AddRange(utf8Bytes); - unitsAdded++; // Increment the units (characters) count. + if (result.Count + utf8Bytes.Length > howManyUnits) + break; } - return result; } + // public List Generate(int howManyUnits, int? byteCountInUnit = null) + // { + // var result = new List(); + // var unitsAdded = 0; // Track the number of characters added. + + // while (unitsAdded < howManyUnits) + // { + // int count = byteCountInUnit ?? PickRandomByteCount(); + // int codePoint = GenerateCodePoint(count); + // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); + + // // Ensure adding the new character won't exceed the howManyUnits limit. + // if (unitsAdded + 1 > howManyUnits) + // break; + + // result.AddRange(utf8Bytes); + // unitsAdded++; // Increment the units (characters) count. + // } + + // return result; + // } + // public object Generate(int howManyUnits, int? byteCountInUnit = null, bool returnAsList = false) // { From d8c035e412e8ef92e1e34553e0e90d6c30ee1449 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 18 Mar 2024 20:47:11 -0400 Subject: [PATCH 16/75] save game --- benchmark/Benchmark.cs | 20 ++++++++++- src/UTF8.cs | 82 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 15 deletions(-) diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs index 50153a3..48a5b68 100644 --- a/benchmark/Benchmark.cs +++ b/benchmark/Benchmark.cs @@ -188,14 +188,32 @@ public unsafe void SIMDUtf8ValidationRealData() } [Benchmark] + // [BenchmarkCategory("scalar")] + // public unsafe void Utf8ValidationRealDataScalar() + // { + // if (allLinesUtf8 != null) + // { + // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + // } + // } + [BenchmarkCategory("scalar")] public unsafe void Utf8ValidationRealDataScalar() { if (allLinesUtf8 != null) { - RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + // Assuming allLinesUtf8 is a byte* and its length is provided by another variable, for example, allLinesUtf8Length + RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) => + { + int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment; + // Call the method with additional out parameters within the lambda. + // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate. + return SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment); + }); } } + + [Benchmark] [BenchmarkCategory("arm64")] public unsafe void SIMDUtf8ValidationRealDataArm64() diff --git a/src/UTF8.cs b/src/UTF8.cs index 1ec5bc5..4305d22 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -33,14 +33,18 @@ public static class UTF8 // Now buf points to the start of a UTF-8 sequence or the start of the buffer. // Validate from this new start point with the adjusted length. - byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen); + byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); return invalidByte; } - public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength) + public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { + + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; + int pos = 0; int nextPos; uint codePoint = 0; @@ -49,25 +53,50 @@ public static class UTF8 byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { - if (++pos == inputLength) { return pInputBuffer + inputLength; } + if (++pos == inputLength) { + + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + inputLength; } firstByte = pInputBuffer[pos]; + TempUtf16CodeUnitCountAdjustment -= 2; } if ((firstByte & 0b11100000) == 0b11000000) { nextPos = pos + 2; - if (nextPos > inputLength) { return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; } // Too short + if (nextPos > inputLength) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short // range check codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(pInputBuffer[pos + 1] & 0b00111111); - if ((codePoint < 0x80) || (0x7ff < codePoint)) { return pInputBuffer + pos; } // Overlong + if ((codePoint < 0x80) || (0x7ff < codePoint)) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Overlong + TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11110000) == 0b11100000) { nextPos = pos + 3; - if (nextPos > inputLength) { return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; } // Too short + if (nextPos > inputLength) { + + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short + if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short // range check codePoint = (uint)(firstByte & 0b00001111) << 12 | (uint)(pInputBuffer[pos + 1] & 0b00111111) << 6 | @@ -76,29 +105,54 @@ public static class UTF8 if ((codePoint < 0x800) || (0xffff < codePoint) || (0xd7ff < codePoint && codePoint < 0xe000)) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } + TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11111000) == 0b11110000) { // 0b11110000 nextPos = pos + 4; - if (nextPos > inputLength) { return pInputBuffer + pos; } - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; } - if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; } - if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000) { return pInputBuffer + pos; } + if (nextPos > inputLength) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment;return pInputBuffer + pos; } + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } + if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } + if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // range check codePoint = (uint)(firstByte & 0b00000111) << 18 | (uint)(pInputBuffer[pos + 1] & 0b00111111) << 12 | (uint)(pInputBuffer[pos + 2] & 0b00111111) << 6 | (uint)(pInputBuffer[pos + 3] & 0b00111111); - if (codePoint <= 0xffff || 0x10ffff < codePoint) { return pInputBuffer + pos; } + if (codePoint <= 0xffff || 0x10ffff < codePoint) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } + TempUtf16CodeUnitCountAdjustment -= 2; + TempScalarCountAdjustment = -1; + + } else { // we may have a continuation + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } pos = nextPos; } + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + inputLength; } From 03e406a37c8e371c717bb9f2ab835107afc29d5d Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 19 Mar 2024 17:49:48 -0400 Subject: [PATCH 17/75] scalar attempt --- src/UTF8.cs | 26 +++++++++++++++------ test/UTF8ValidationTests.cs | 46 +++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 19 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 4305d22..84eb9eb 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -59,7 +59,7 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + inputLength; } firstByte = pInputBuffer[pos]; - TempUtf16CodeUnitCountAdjustment -= 2; + TempUtf16CodeUnitCountAdjustment -= 1; } if ((firstByte & 0b11100000) == 0b11000000) @@ -79,7 +79,7 @@ public static class UTF8 utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } // Overlong - TempUtf16CodeUnitCountAdjustment -= 2; + TempUtf16CodeUnitCountAdjustment -= 1; } else if ((firstByte & 0b11110000) == 0b11100000) { @@ -113,6 +113,8 @@ public static class UTF8 } else if ((firstByte & 0b11111000) == 0b11110000) { // 0b11110000 + TempScalarCountAdjustment = -1; + nextPos = pos + 4; if (nextPos > inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; @@ -138,7 +140,6 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } TempUtf16CodeUnitCountAdjustment -= 2; - TempScalarCountAdjustment = -1; } @@ -309,7 +310,9 @@ public static class UTF8 } } } - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength); + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { // An invalid byte was found by the scalar function @@ -494,7 +497,9 @@ public static class UTF8 { processedLength -= 1; } - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength); + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { // An invalid byte was found by the scalar function @@ -639,7 +644,9 @@ public static class UTF8 } } } - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength); + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { // An invalid byte was found by the scalar function @@ -667,7 +674,12 @@ public static class UTF8 { return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); } - return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); + // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); + + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + } } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 13e3ef3..4727d72 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -39,7 +39,9 @@ public void TestGoodSequences() { fixed (byte* pInput = input) { - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length); + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); @@ -93,8 +95,10 @@ public void TestBadSequences() { fixed (byte* pInput = input) { - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length); - Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); @@ -692,7 +696,10 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) { fixed (byte* pInput = utf8) { - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length); + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); int scalarOffset = (int)(scalarResult - pInput); byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length); int simdOffset = (int)(simdResult - pInput); @@ -748,7 +755,9 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) fixed (byte* pInput = utf8) { byte* startPtr = pInput + offset; - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(startPtr, length); + int TailScalarCodeUnitCountAdjustment =0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(startPtr, length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (scalarResult != startPtr + length) { return false; @@ -776,13 +785,15 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) [Fact] - public void ExtraArgsTest() + public void ScalarUTF16CountTest() { int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; + int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + foreach (int outputLength in outputLengths) { - int utf16Adjustment, scalarCountAdjustment; // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); byte[] utf8 = generator.Generate(howManyUnits: 13).ToArray(); @@ -795,7 +806,15 @@ public void ExtraArgsTest() { byte* startPtr = pInput + offset; // Invoke the method under test. - byte* result = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out utf16Adjustment, out scalarCountAdjustment); + + DotnetUtf16Adjustment= 0; + DotnetScalarCountAdjustment= 0; + DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + + SimdUnicodeUtf16Adjustment= 0; + SimdUnicodeScalarCountAdjustment= 0; + SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + // Since we are generating presumably valid 2-byte sequences, and depending on the specifics // of the generator and Utf8Utility implementation, we need to assert expectations for adjustments. @@ -804,14 +823,17 @@ public void ExtraArgsTest() // Example: For simple 2-byte characters that map 1:1 from UTF-8 to UTF-16, // utf16CodeUnitCountAdjustment might be 0 if the utility directly translates byte count. - // Assert.Equal(0, utf16Adjustment); // Placeholder, adjust based on actual logic. - // Assert.Equal(0, scalarCountAdjustment); // Placeholder, adjust based on actual logic. + // Assert.Equal(DotnetUtf16Adjustment, SimdUnicodeUtf16Adjustment); // Placeholder, adjust based on actual logic. + // Assert.Equal(DotnetScalarCountAdjustment, SimdUnicodeScalarCountAdjustment); // Placeholder, adjust based on actual logic. + Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + Console.WriteLine("Lenght:" + utf8.Length); - Console.WriteLine("Scalar:" + scalarCountAdjustment); + // Console.WriteLine("Scalar:" + scalarCountAdjustment); - Console.WriteLine("utf16:" + utf16Adjustment); + // Console.WriteLine("utf16:" + utf16Adjustment); Console.WriteLine("___________________________________________________"); From 1ec3b2414c075c4e309ae9139bf5ab4dc1d8c2a5 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Thu, 21 Mar 2024 20:21:56 -0400 Subject: [PATCH 18/75] fixes --- benchmark/UTF8_runtime.cs | 1 + src/UTF8.cs | 26 ++++++++++++++++---------- test/UTF8ValidationTests.cs | 25 ++++++++++--------------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/benchmark/UTF8_runtime.cs b/benchmark/UTF8_runtime.cs index 8583e5a..0530d65 100644 --- a/benchmark/UTF8_runtime.cs +++ b/benchmark/UTF8_runtime.cs @@ -500,6 +500,7 @@ private static bool UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(uint valu // the value isn't overlong using a single comparison. On big-endian platforms, we'll need // to validate the mask and validate that the sequence isn't overlong as two separate comparisons. + // Temp16 - 2 if ((BitConverter.IsLittleEndian && UInt32EndsWithValidUtf8TwoByteSequenceLittleEndian(thisDWord)) || (!BitConverter.IsLittleEndian && (UInt32EndsWithUtf8TwoByteMask(thisDWord) && !UInt32EndsWithOverlongUtf8TwoByteSequence(thisDWord)))) { diff --git a/src/UTF8.cs b/src/UTF8.cs index 84eb9eb..5e3059f 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -59,7 +59,7 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + inputLength; } firstByte = pInputBuffer[pos]; - TempUtf16CodeUnitCountAdjustment -= 1; + // TempUtf16CodeUnitCountAdjustment -= 1; } if ((firstByte & 0b11100000) == 0b11000000) @@ -86,14 +86,6 @@ public static class UTF8 nextPos = pos + 3; if (nextPos > inputLength) { - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } // Too short @@ -109,11 +101,23 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short + if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + pos; } // Too short + // if (pInputBuffer[pos + 3] < 0b10000000) { + // TempUtf16CodeUnitCountAdjustment -= 1; + // } else { + // TempUtf16CodeUnitCountAdjustment -= 2; + // } TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11111000) == 0b11110000) { // 0b11110000 - TempScalarCountAdjustment = -1; nextPos = pos + 4; if (nextPos > inputLength) { @@ -140,6 +144,8 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } TempUtf16CodeUnitCountAdjustment -= 2; + TempScalarCountAdjustment -= 1; + } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 4727d72..3383943 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -816,27 +816,22 @@ public void ScalarUTF16CountTest() SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - // Since we are generating presumably valid 2-byte sequences, and depending on the specifics - // of the generator and Utf8Utility implementation, we need to assert expectations for adjustments. - // These assertions need to match your understanding of how utf16CodeUnitCountAdjustment and - // scalarCountAdjustment are supposed to be calculated based on the input data. - - // Example: For simple 2-byte characters that map 1:1 from UTF-8 to UTF-16, - // utf16CodeUnitCountAdjustment might be 0 if the utility directly translates byte count. - // Assert.Equal(DotnetUtf16Adjustment, SimdUnicodeUtf16Adjustment); // Placeholder, adjust based on actual logic. - // Assert.Equal(DotnetScalarCountAdjustment, SimdUnicodeScalarCountAdjustment); // Placeholder, adjust based on actual logic. - Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - - Console.WriteLine("Lenght:" + utf8.Length); - // Console.WriteLine("Scalar:" + scalarCountAdjustment); + Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); + Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); - // Console.WriteLine("utf16:" + utf16Adjustment); + Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); + Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); Console.WriteLine("___________________________________________________"); + Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + + + + // If your generator creates specific patterns or the utility calculates these adjustments differently, // you'll need to adjust the expected values accordingly. } From 2c557bd490a9be9a17c91aebf4e58dfd1b838ce5 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 23 Mar 2024 23:44:08 -0400 Subject: [PATCH 19/75] save game --- test/UTF8ValidationTests.cs | 95 ++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 2 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 3383943..446ff17 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -2,8 +2,13 @@ namespace tests; using System.Text; using SimdUnicode; using System.Diagnostics; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics.Arm; -public class Utf8SIMDValidationTests + +public unsafe class Utf8SIMDValidationTests { @@ -14,10 +19,93 @@ public class Utf8SIMDValidationTests // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths + // private static readonly delegate* ValidateFunc; + + // static Utf8Validation() + // { + // if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + // { + // // ARM64-specific SIMD method + // ValidateFunc = &Utf8ValidateArm64; + // } + // else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) + // { + // if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported) + // { + // // AVX-512 specific SIMD method + // ValidateFunc = &Utf8ValidateAvx512; + // } + // else if (Avx2.IsSupported) + // { + // // AVX2 specific SIMD method + // ValidateFunc = &Utf8ValidateAvx2; + // } + // else if (Sse2.IsSupported) + // { + // // SSE2 specific SIMD method + // ValidateFunc = &Utf8ValidateSse2; + // } + // else + // { + // // Fallback to scalar method + // ValidateFunc = &Utf8ValidateScalar; + // } + // } + // else + // { + // // Fallback for other architectures to scalar method + // ValidateFunc = &Utf8ValidateScalar; + // } + // } + + // public static unsafe byte* Validate(byte* utf8, int length) => ValidateFunc(utf8, length); + + // // Method implementations... + // private static unsafe byte* Utf8ValidateArm64(byte* utf8, int length) => /* ARM64 specific validation */; + // private static unsafe byte* Utf8ValidateAvx512(byte* utf8, int length) => /* AVX-512 specific validation */; + // private static unsafe byte* Utf8ValidateAvx2(byte* utf8, int length) => /* AVX2 specific validation */; + // private static unsafe byte* Utf8ValidateSse2(byte* utf8, int length) => /* SSE2 specific validation */; + // private static unsafe byte* Utf8ValidateScalar(byte* utf8, int length) => /* Scalar validation */; + +// Declare the delegate at the class level +public delegate byte* ValidationFunction(byte* utf8, int length); + +// public static class FunctionSelector +// { + public static IEnumerable SupportedValidationFunctions() + { + var supportedFunctions = new List(); + + // Example check for architecture and SIMD support + if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) + { + supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64) }); + } + else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) + { + + + if (Avx2.IsSupported) + { + supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2) }); + } + if (Sse2.IsSupported) + { + supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse) }); + } + // Add other conditions and functions as needed + } + + return supportedFunctions; + } +// } + + + [Fact] public void TestGoodSequences() { @@ -784,7 +872,10 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) } + + [Fact] + [Trait("Category", "Scalar")] public void ScalarUTF16CountTest() { int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; @@ -796,7 +887,7 @@ public void ScalarUTF16CountTest() { // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); - byte[] utf8 = generator.Generate(howManyUnits: 13).ToArray(); + byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray(); PrintHexAndBinary(utf8); var (offset, length) = (0, utf8.Length); From 4681f2c702e204c5d6e17c70e79130895010a735 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 23 Mar 2024 23:44:38 -0400 Subject: [PATCH 20/75] save game --- test/UTF8ValidationTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 446ff17..ad2f9e1 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -862,6 +862,8 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) } } + + // Helper method to calculate the actual offset and length from a Range private (int offset, int length) GetOffsetAndLength(int totalLength, Range range) { From 5de82e75c7ae5fae3fe2075b2617b66a45aeb2be Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 25 Mar 2024 12:04:26 -0400 Subject: [PATCH 21/75] added generic tests for UTF16count --- src/UTF8.cs | 32 +++++++++++-- test/UTF8ValidationTests.cs | 89 +++++++++++++++++++++---------------- 2 files changed, 79 insertions(+), 42 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 5e3059f..de88ec8 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -330,12 +330,16 @@ public static class UTF8 } - public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength) + public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { int processedLength = 0; + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer; } if (inputLength > 128) @@ -445,6 +449,10 @@ public static class UTF8 // we need to check if the previous block was incomplete. if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { + + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + int off = processedLength >= 3 ? processedLength - 3 : processedLength; return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } @@ -471,6 +479,10 @@ public static class UTF8 Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { + + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + int off = processedLength >= 32 ? processedLength - 32 : processedLength; return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } @@ -508,11 +520,17 @@ public static class UTF8 byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; + // An invalid byte was found by the scalar function return invalidBytePointer; } } + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + return pInputBuffer + inputLength; } @@ -664,13 +682,21 @@ public static class UTF8 } public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength) { + + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + + + int SIMDScalarCodeUnitCountAdjustment = 0; // I know this is a horrible variable Iwill try to change it later + int SIMDUtf16CodeUnitCountAdjustment = 0; + if (AdvSimd.Arm64.IsSupported) { return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); } if (Avx2.IsSupported) { - return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength); + return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCodeUnitCountAdjustment); } /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) { @@ -682,8 +708,6 @@ public static class UTF8 } // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index ad2f9e1..1e013c2 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -71,38 +71,38 @@ public unsafe class Utf8SIMDValidationTests -// Declare the delegate at the class level -public delegate byte* ValidationFunction(byte* utf8, int length); - -// public static class FunctionSelector -// { - public static IEnumerable SupportedValidationFunctions() - { - var supportedFunctions = new List(); - - // Example check for architecture and SIMD support - if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) - { - supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64) }); - } - else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) - { +// // Declare the delegate at the class level +// public delegate byte* ValidationFunction(byte* utf8, int length); + +// // public static class FunctionSelector +// // { +// public static IEnumerable SupportedValidationFunctions() +// { +// var supportedFunctions = new List(); + +// // Example check for architecture and SIMD support +// if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) +// { +// supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64) }); +// } +// else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) +// { - if (Avx2.IsSupported) - { - supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2) }); - } - if (Sse2.IsSupported) - { - supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse) }); - } - // Add other conditions and functions as needed - } +// if (Avx2.IsSupported) +// { +// supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2) }); +// } +// if (Sse2.IsSupported) +// { +// supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse) }); +// } +// // Add other conditions and functions as needed +// } - return supportedFunctions; - } -// } +// return supportedFunctions; +// } +// // } @@ -874,13 +874,18 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) } - +// Define a delegate that matches the signature of the methods you want to test + public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); [Fact] [Trait("Category", "Scalar")] public void ScalarUTF16CountTest() { - int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; + UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + public void UTF16CountTest(Utf8ValidationDelegate utf8ValidationDelegate) + { + // int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; @@ -906,17 +911,17 @@ public void ScalarUTF16CountTest() SimdUnicodeUtf16Adjustment= 0; SimdUnicodeScalarCountAdjustment= 0; - SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - Console.WriteLine("Lenght:" + utf8.Length); + // Console.WriteLine("Lenght:" + utf8.Length); - Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); - Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); + // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); - Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); - Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - Console.WriteLine("___________________________________________________"); + // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); + // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); + // Console.WriteLine("___________________________________________________"); Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); @@ -932,6 +937,14 @@ public void ScalarUTF16CountTest() } } + [Fact] + [Trait("Category", "Avx")] + public void AvxUTF16CountTest() + { + UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + } From 7cd0986327bcc5b4fc1a664ab8ee7b8f92dbd8a9 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 26 Mar 2024 12:45:27 -0400 Subject: [PATCH 22/75] save game --- src/UTF8.cs | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/UTF8.cs b/src/UTF8.cs index de88ec8..8c3623e 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -355,6 +355,66 @@ public static class UTF8 { break; } + + // // 4byte utf 8 character + // Vector256 Counts = Avx2.SubtractSaturate(block1, fourByte); + // int mask = Avx2.MoveMask(Counts); + // // Assuming PopCount is a function that counts set bits. + // TempScalarCountAdjustment -= PopCount(mask); + + // // 3byte or 4 utf 8 character + // Counts = Avx2.SubtractSaturate(block1, threeorfourByte); + // mask = Avx2.MoveMask(Counts); + // TempUtf16CodeUnitCountAdjustment -= PopCount(mask) * 2; + + // // 3byte or 4 utf 8 character + // Counts = Avx2.SubtractSaturate(block1, threeorfourByte); + // mask = Avx2.MoveMask(Counts); + // TempUtf16CodeUnitCountAdjustment -= PopCount(mask); + // Assuming 'block1' contains the current block of UTF-8 data you're processing. + +// int popCountResult = Popcnt.IsSupported ? Popcnt.PopCount((uint)mask) : FallbackPopCount(mask); + + + // Vector to identify bytes right before the start of a 4-byte sequence in UTF-8. + Vector256 beforeFourByteMarker = Vector256.Create((byte)(0xF0 - 1)); + // Vector to identify bytes right before the start of a 3-byte sequence in UTF-8. + Vector256 beforeThreeByteMarker = Vector256.Create((byte)(0xE0 - 1)); + // Vector to identify bytes right before the start of a 2-byte sequence in UTF-8. + Vector256 beforeTwoByteMarker = Vector256.Create((byte)(0xC0 - 1)); + + // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. + + // Identify start of 4-byte sequences. + Vector256 isFourByteStart = Avx2.SubtractSaturate(block1, beforeFourByteMarker); + int fourByteMask = Avx2.MoveMask(isFourByteStart); + uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); + + // Identify start of 3-byte and 4-byte sequences. + Vector256 isThreeOrFourByteStart = Avx2.SubtractSaturate(block1, beforeThreeByteMarker); + int threeOrFourByteMask = Avx2.MoveMask(isThreeOrFourByteStart); + uint threeOrFourByteCount = Popcnt.PopCount((uint)threeOrFourByteMask); + + // Calculate only 3-byte sequence count by excluding 4-byte sequences. + uint threeByteCount = threeOrFourByteCount - fourByteCount; + + // Identify start of 2-byte sequences. + Vector256 isTwoByteStart = Avx2.SubtractSaturate(block1, beforeTwoByteMarker); + int twoByteMask = Avx2.MoveMask(isTwoByteStart); + uint twoByteCount = Popcnt.PopCount((uint)twoByteMask); + + // Calculate only 2-byte sequence count by excluding 3-byte and 4-byte sequences. + uint pureTwoByteCount = twoByteCount - threeOrFourByteCount; + + // Adjustments + TempUtf16CodeUnitCountAdjustment += fourByteCount * 2; // Two UTF-16 code units for each 4-byte sequence. + TempUtf16CodeUnitCountAdjustment += pureTwoByteCount; // One UTF-16 code unit for each 2-byte sequence. + TempScalarCountAdjustment += fourByteCount; // One scalar for each 4-byte sequence. + TempScalarCountAdjustment += threeByteCount; // One scalar for each 3-byte sequence. + TempScalarCountAdjustment += pureTwoByteCount; // One scalar for each 2-byte sequence. + + + } processedLength = asciirun; From 6511f641a123be05b894094d8b41adbfe89a0ae1 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 26 Mar 2024 20:58:12 -0400 Subject: [PATCH 23/75] save game --- src/UTF8.cs | 95 +++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 46 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 8c3623e..96a1af6 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -59,7 +59,6 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + inputLength; } firstByte = pInputBuffer[pos]; - // TempUtf16CodeUnitCountAdjustment -= 1; } if ((firstByte & 0b11100000) == 0b11000000) @@ -336,6 +335,9 @@ public static class UTF8 int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + if (pInputBuffer == null || inputLength <= 0) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; @@ -376,48 +378,11 @@ public static class UTF8 // int popCountResult = Popcnt.IsSupported ? Popcnt.PopCount((uint)mask) : FallbackPopCount(mask); - // Vector to identify bytes right before the start of a 4-byte sequence in UTF-8. - Vector256 beforeFourByteMarker = Vector256.Create((byte)(0xF0 - 1)); - // Vector to identify bytes right before the start of a 3-byte sequence in UTF-8. - Vector256 beforeThreeByteMarker = Vector256.Create((byte)(0xE0 - 1)); - // Vector to identify bytes right before the start of a 2-byte sequence in UTF-8. - Vector256 beforeTwoByteMarker = Vector256.Create((byte)(0xC0 - 1)); - - // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. - - // Identify start of 4-byte sequences. - Vector256 isFourByteStart = Avx2.SubtractSaturate(block1, beforeFourByteMarker); - int fourByteMask = Avx2.MoveMask(isFourByteStart); - uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); - - // Identify start of 3-byte and 4-byte sequences. - Vector256 isThreeOrFourByteStart = Avx2.SubtractSaturate(block1, beforeThreeByteMarker); - int threeOrFourByteMask = Avx2.MoveMask(isThreeOrFourByteStart); - uint threeOrFourByteCount = Popcnt.PopCount((uint)threeOrFourByteMask); - - // Calculate only 3-byte sequence count by excluding 4-byte sequences. - uint threeByteCount = threeOrFourByteCount - fourByteCount; - - // Identify start of 2-byte sequences. - Vector256 isTwoByteStart = Avx2.SubtractSaturate(block1, beforeTwoByteMarker); - int twoByteMask = Avx2.MoveMask(isTwoByteStart); - uint twoByteCount = Popcnt.PopCount((uint)twoByteMask); - - // Calculate only 2-byte sequence count by excluding 3-byte and 4-byte sequences. - uint pureTwoByteCount = twoByteCount - threeOrFourByteCount; - - // Adjustments - TempUtf16CodeUnitCountAdjustment += fourByteCount * 2; // Two UTF-16 code units for each 4-byte sequence. - TempUtf16CodeUnitCountAdjustment += pureTwoByteCount; // One UTF-16 code unit for each 2-byte sequence. - TempScalarCountAdjustment += fourByteCount; // One scalar for each 4-byte sequence. - TempScalarCountAdjustment += threeByteCount; // One scalar for each 3-byte sequence. - TempScalarCountAdjustment += pureTwoByteCount; // One scalar for each 2-byte sequence. - - - } processedLength = asciirun; + + if (processedLength + 32 < inputLength) { // We still have work to do! @@ -491,11 +456,20 @@ public static class UTF8 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + Vector256 secondByte = Vector256.Create((byte)(0b11000000u - 0x80)); Vector256 thirdByte = Vector256.Create((byte)(0b11100000u - 0x80)); Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); + Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); + // Vector to identify bytes right before the start of a 4-byte sequence in UTF-8. + // Vector256 beforeFourByteMarker = Vector256.Create((byte)(0xF0 - 1)); + // // Vector to identify bytes right before the start of a 3-byte sequence in UTF-8. + // Vector256 beforeThreeByteMarker = Vector256.Create((byte)(0xE0 - 1)); + // // Vector to identify bytes right before the start of a 2-byte sequence in UTF-8. + // Vector256 beforeTwoByteMarker = Vector256.Create((byte)(0xC0 - 1)); + for (; processedLength + 32 <= inputLength; processedLength += 32) @@ -518,9 +492,39 @@ public static class UTF8 } prevIncomplete = Vector256.Zero; } - else + else // Contains non-ASCII characters, we need to do non-trivial processing { - // Contains non-ASCII characters, we need to do non-trivial processing + // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. + + // Identify start of 4-byte sequences. + Vector256 isFourByteStart = Avx2.SubtractSaturate(currentBlock, fourthByte); + int fourByteMask = Avx2.MoveMask(isFourByteStart); + uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); + + // Identify start of 3-byte and 4-byte sequences. + Vector256 isThreeOrFourByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte); + int threeOrFourByteMask = Avx2.MoveMask(isThreeOrFourByteStart); + uint threeOrFourByteCount = Popcnt.PopCount((uint)threeOrFourByteMask); + + // Calculate only 3-byte sequence count by excluding 4-byte sequences. + uint threeByteCount = threeOrFourByteCount - fourByteCount; + + // Identify start of 2-byte sequences. + Vector256 isTwoByteStart = Avx2.SubtractSaturate(currentBlock, secondByte); + int twoByteMask = Avx2.MoveMask(isTwoByteStart); + uint twoByteCount = Popcnt.PopCount((uint)twoByteMask); + + // Calculate only 2-byte sequence count by excluding 3-byte and 4-byte sequences. + // uint pureTwoByteCount = twoByteCount - threeOrFourByteCount; + + // Console.WriteLine("2byte count:" + twoByteCount); + + // Adjustments + TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; // Two UTF-16 code units for each 4-byte sequence. + TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; // One UTF-16 code unit for each 2-byte sequence. + TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; // One UTF-16 code unit for each 2-byte sequence. + TempScalarCountAdjustment -= (int)fourByteCount; // One scalar for each 4-byte sequence. + Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; Vector256 prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1)); @@ -575,8 +579,7 @@ public static class UTF8 { processedLength -= 1; } - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { @@ -588,8 +591,8 @@ public static class UTF8 } } - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; return pInputBuffer + inputLength; } From 747cc54b7cbb65434db3b18467c0606b40f7f07b Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 27 Mar 2024 20:22:34 -0400 Subject: [PATCH 24/75] save game --- src/UTF8.cs | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/UTF8.cs b/src/UTF8.cs index 96a1af6..4ac273a 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -38,6 +38,67 @@ public static class UTF8 return invalidByte; } + public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedBytes, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + { + for (int i = 0; i < skippedBytes; i++) + { + byte currentByte = *(pInputBuffer + i); + if (currentByte >= 0xC0 && currentByte < 0xE0) + { + // 2-byte sequence + utf16CodeUnitCountAdjustment -= 1; + } + else if (currentByte >= 0xE0 && currentByte < 0xF0) + { + // 3-byte sequence + utf16CodeUnitCountAdjustment -= 2; + } + else if (currentByte >= 0xF0) + { + // 4-byte sequence + utf16CodeUnitCountAdjustment -= 2; // or any other logic specific to 4-byte sequences + scalarCountAdjustment -= 1; + } + // Adjust for other conditions as necessary + } + } + + + public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment, int skippedBytes = 0) + { + utf16CodeUnitCountAdjustment = 0; + scalarCountAdjustment = 0; + + // Call the original function first. Assuming GetPointerToFirstInvalidByteOriginal exists and does the primary checking. + byte* result = GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + + // If the adjustments are still 0 and there are skipped bytes to consider, + // loop through the skipped bytes and adjust the counts as needed. + if (utf16CodeUnitCountAdjustment == 0 && scalarCountAdjustment == 0 && skippedBytes > 0) + { + for (int i = 0; i < skippedBytes; i++) + { + byte currentByte = *(pInputBuffer + i); + if (currentByte >= 0xC0 && currentByte < 0xE0) + { + // 2-byte sequence + utf16CodeUnitCountAdjustment -= 1; // Adjust according to your logic + scalarCountAdjustment -= 1; + } + else if ((currentByte >= 0xE0 && currentByte < 0xF0) || (currentByte >= 0xF0)) + { + // 3-byte or 4-byte sequence + utf16CodeUnitCountAdjustment -= 1; // This might need to be adjusted based on your specific logic for 3-byte and 4-byte sequences + scalarCountAdjustment -= 1; + } + // Adjust for other conditions as necessary + } + } + + return result; // Return the pointer from the original check + } + + public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { From cb0000f01acf38c3a2926e96e923280874578f74 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 29 Mar 2024 13:18:29 -0400 Subject: [PATCH 25/75] save game(some progress) --- src/UTF8.cs | 186 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 106 insertions(+), 80 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 4ac273a..ef24bd2 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -38,67 +38,75 @@ public static class UTF8 return invalidByte; } - public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedBytes, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) - { - for (int i = 0; i < skippedBytes; i++) - { - byte currentByte = *(pInputBuffer + i); - if (currentByte >= 0xC0 && currentByte < 0xE0) - { - // 2-byte sequence - utf16CodeUnitCountAdjustment -= 1; - } - else if (currentByte >= 0xE0 && currentByte < 0xF0) - { - // 3-byte sequence - utf16CodeUnitCountAdjustment -= 2; - } - else if (currentByte >= 0xF0) - { - // 4-byte sequence - utf16CodeUnitCountAdjustment -= 2; // or any other logic specific to 4-byte sequences - scalarCountAdjustment -= 1; - } - // Adjust for other conditions as necessary - } - } - - - public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment, int skippedBytes = 0) + public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippedBytes, + ref int utf16CodeUnitCountAdjustment, + ref int scalarCountAdjustment, + bool shouldAdd = false) { - utf16CodeUnitCountAdjustment = 0; - scalarCountAdjustment = 0; - - // Call the original function first. Assuming GetPointerToFirstInvalidByteOriginal exists and does the primary checking. - byte* result = GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + int adjustmentFactor = shouldAdd ? 1 : -1; - // If the adjustments are still 0 and there are skipped bytes to consider, - // loop through the skipped bytes and adjust the counts as needed. - if (utf16CodeUnitCountAdjustment == 0 && scalarCountAdjustment == 0 && skippedBytes > 0) + // for (int i = 0; i < skippedBytes; i++) + for (int i = 0; i < 3; i++) { - for (int i = 0; i < skippedBytes; i++) + byte currentByte = *(pInputBuffer + i); + if (currentByte >= 0xC0 && currentByte < 0xE0) { - byte currentByte = *(pInputBuffer + i); - if (currentByte >= 0xC0 && currentByte < 0xE0) - { - // 2-byte sequence - utf16CodeUnitCountAdjustment -= 1; // Adjust according to your logic - scalarCountAdjustment -= 1; - } - else if ((currentByte >= 0xE0 && currentByte < 0xF0) || (currentByte >= 0xF0)) - { - // 3-byte or 4-byte sequence - utf16CodeUnitCountAdjustment -= 1; // This might need to be adjusted based on your specific logic for 3-byte and 4-byte sequences - scalarCountAdjustment -= 1; - } - // Adjust for other conditions as necessary + // 2-byte sequence + utf16CodeUnitCountAdjustment += 1 * adjustmentFactor; + } + else if (currentByte >= 0xE0 && currentByte < 0xF0) + { + // 3-byte sequence + utf16CodeUnitCountAdjustment += 2 * adjustmentFactor; + scalarCountAdjustment += 1 * adjustmentFactor; // Assuming each 3-byte sequence translates to one scalar. + } + else if (currentByte >= 0xF0) + { + // 4-byte sequence + utf16CodeUnitCountAdjustment += 2 * adjustmentFactor; // Two UTF-16 code units for each 4-byte sequence. + scalarCountAdjustment += 1 * adjustmentFactor; // One scalar for each 4-byte sequence. } + // Adjust for other conditions as necessary } - - return result; // Return the pointer from the original check } + + // public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment, int skippedBytes = 0) + // { + // utf16CodeUnitCountAdjustment = 0; + // scalarCountAdjustment = 0; + + // // Call the original function first. Assuming GetPointerToFirstInvalidByteOriginal exists and does the primary checking. + // byte* result = GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + + // // If the adjustments are still 0 and there are skipped bytes to consider, + // // loop through the skipped bytes and adjust the counts as needed. + // if (utf16CodeUnitCountAdjustment == 0 && scalarCountAdjustment == 0 && skippedBytes > 0) + // { + // for (int i = 0; i < skippedBytes; i++) + // { + // byte currentByte = *(pInputBuffer + i); + // if (currentByte >= 0xC0 && currentByte < 0xE0) + // { + // // 2-byte sequence + // utf16CodeUnitCountAdjustment -= 1; // Adjust according to your logic + // scalarCountAdjustment -= 1; + // } + // else if ((currentByte >= 0xE0 && currentByte < 0xF0) || (currentByte >= 0xF0)) + // { + // // 3-byte or 4-byte sequence + // utf16CodeUnitCountAdjustment -= 1; // This might need to be adjusted based on your specific logic for 3-byte and 4-byte sequences + // scalarCountAdjustment -= 1; + // } + // // Adjust for other conditions as necessary + // } + // } + + // return result; // Return the pointer from the original check + // } + + public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { @@ -419,26 +427,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedB break; } - // // 4byte utf 8 character - // Vector256 Counts = Avx2.SubtractSaturate(block1, fourByte); - // int mask = Avx2.MoveMask(Counts); - // // Assuming PopCount is a function that counts set bits. - // TempScalarCountAdjustment -= PopCount(mask); - - // // 3byte or 4 utf 8 character - // Counts = Avx2.SubtractSaturate(block1, threeorfourByte); - // mask = Avx2.MoveMask(Counts); - // TempUtf16CodeUnitCountAdjustment -= PopCount(mask) * 2; - - // // 3byte or 4 utf 8 character - // Counts = Avx2.SubtractSaturate(block1, threeorfourByte); - // mask = Avx2.MoveMask(Counts); - // TempUtf16CodeUnitCountAdjustment -= PopCount(mask); - // Assuming 'block1' contains the current block of UTF-8 data you're processing. - -// int popCountResult = Popcnt.IsSupported ? Popcnt.PopCount((uint)mask) : FallbackPopCount(mask); - - } processedLength = asciirun; @@ -521,6 +509,24 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedB Vector256 thirdByte = Vector256.Create((byte)(0b11100000u - 0x80)); Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); + // // Mask for the lower and upper parts of the vector + // Vector128 lowerMask = Vector128.Create( + // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF).AsByte(); + + // Vector128 upperMask = Vector128.Create( + // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + // 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00).AsByte(); + + // // Combine lower and upper masks into a Vector256 + // Vector256 mask = Vector256.Create(lowerMask, upperMask); + + // // Apply the mask to zero out the last 3 bytes of each vector + // Vector256 secondByteMasked = Avx2.And(secondByte, mask); + // Vector256 thirdByteMasked = Avx2.And(thirdByte, mask); + // Vector256 fourthByteMasked = Avx2.And(fourthByte, mask); + + Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); @@ -563,12 +569,12 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedB uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); // Identify start of 3-byte and 4-byte sequences. - Vector256 isThreeOrFourByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte); - int threeOrFourByteMask = Avx2.MoveMask(isThreeOrFourByteStart); - uint threeOrFourByteCount = Popcnt.PopCount((uint)threeOrFourByteMask); + Vector256 isThreeByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte); + int threeByteMask = Avx2.MoveMask(isThreeByteStart); + uint threeByteCount = Popcnt.PopCount((uint)threeByteMask); // Calculate only 3-byte sequence count by excluding 4-byte sequences. - uint threeByteCount = threeOrFourByteCount - fourByteCount; + // uint threeByteCount = threeOrFourByteCount - fourByteCount; // Identify start of 2-byte sequences. Vector256 isTwoByteStart = Avx2.SubtractSaturate(currentBlock, secondByte); @@ -581,10 +587,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedB // Console.WriteLine("2byte count:" + twoByteCount); // Adjustments - TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; // Two UTF-16 code units for each 4-byte sequence. - TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; // One UTF-16 code unit for each 2-byte sequence. - TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; // One UTF-16 code unit for each 2-byte sequence. - TempScalarCountAdjustment -= (int)fourByteCount; // One scalar for each 4-byte sequence. + TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; + TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; + TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; + TempScalarCountAdjustment -= (int)fourByteCount; Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; @@ -621,11 +627,31 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer, int skippedB processedLength -= 3; for(int k = 0; k < 3; k++) { + + int candidateByte = pInputBuffer[processedLength + k]; if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000) { + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; // Still adjusts for a single UTF-16 unit + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; // Adjusts for two UTF-16 units (surrogate pair) + TempScalarCountAdjustment += 1; // Adjust for one scalar value + } + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; // Adjust for a single UTF-16 unit + } + processedLength += k; break; + } + + + } } } From 4428f261547665090e6221a00e1830ee90bab439 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 30 Mar 2024 11:23:27 -0400 Subject: [PATCH 26/75] WIP adding runtime dispatch test --- src/UTF8.cs | 71 +++++++++------- test/UTF8ValidationTests.cs | 157 ++++++++++++------------------------ 2 files changed, 90 insertions(+), 138 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index ef24bd2..565ceb2 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -564,22 +564,38 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. // Identify start of 4-byte sequences. - Vector256 isFourByteStart = Avx2.SubtractSaturate(currentBlock, fourthByte); - int fourByteMask = Avx2.MoveMask(isFourByteStart); - uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); + // Vector256 isFourByteStart = Avx2.SubtractSaturate(currentBlock, fourthByte); + // int fourByteMask = Avx2.MoveMask(isFourByteStart); + // uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); - // Identify start of 3-byte and 4-byte sequences. - Vector256 isThreeByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte); - int threeByteMask = Avx2.MoveMask(isThreeByteStart); - uint threeByteCount = Popcnt.PopCount((uint)threeByteMask); + // // Identify start of 3-byte and 4-byte sequences. + // Vector256 isThreeByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte); + // int threeByteMask = Avx2.MoveMask(isThreeByteStart); + // uint threeByteCount = Popcnt.PopCount((uint)threeByteMask); - // Calculate only 3-byte sequence count by excluding 4-byte sequences. - // uint threeByteCount = threeOrFourByteCount - fourByteCount; + // // Calculate only 3-byte sequence count by excluding 4-byte sequences. + // // uint threeByteCount = threeOrFourByteCount - fourByteCount; - // Identify start of 2-byte sequences. - Vector256 isTwoByteStart = Avx2.SubtractSaturate(currentBlock, secondByte); - int twoByteMask = Avx2.MoveMask(isTwoByteStart); - uint twoByteCount = Popcnt.PopCount((uint)twoByteMask); + // // Identify start of 2-byte,3 or 4 bytes sequences. + // Vector256 isTwoByteStart = Avx2.SubtractSaturate(currentBlock, secondByte); + // int twoByteMask = Avx2.MoveMask(isTwoByteStart); + // uint twoByteCount = Popcnt.PopCount((uint)twoByteMask); + + // Detect start of 4-byte sequences. + Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); + uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); + + // Detect start of 3-byte sequences (including those that start 4-byte sequences). + Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); + uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); + + // Detect start of 2-byte sequences (including those that start 3-byte and 4-byte sequences). + Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); + uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); + + // Calculate counts by isolating each type. + uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. + uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. // Calculate only 2-byte sequence count by excluding 3-byte and 4-byte sequences. // uint pureTwoByteCount = twoByteCount - threeOrFourByteCount; @@ -830,35 +846,28 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe return pInputBuffer + inputLength; } - public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength) + public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength,out int Utf16CodeUnitCountAdjustment,out int ScalarCodeUnitCountAdjustment) { - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - - - int SIMDScalarCodeUnitCountAdjustment = 0; // I know this is a horrible variable Iwill try to change it later - int SIMDUtf16CodeUnitCountAdjustment = 0; - - if (AdvSimd.Arm64.IsSupported) - { - return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); - } + // if (AdvSimd.Arm64.IsSupported) + // { + // return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); + // } if (Avx2.IsSupported) { - return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCodeUnitCountAdjustment); + return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); } /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) { return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength); }*/ - if (Ssse3.IsSupported) - { - return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); - } + // if (Ssse3.IsSupported) + // { + // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); + // } // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); - return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 1e013c2..03d50ff 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -19,95 +19,8 @@ public unsafe class Utf8SIMDValidationTests // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths - // private static readonly delegate* ValidateFunc; - - // static Utf8Validation() - // { - // if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) - // { - // // ARM64-specific SIMD method - // ValidateFunc = &Utf8ValidateArm64; - // } - // else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) - // { - // if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported) - // { - // // AVX-512 specific SIMD method - // ValidateFunc = &Utf8ValidateAvx512; - // } - // else if (Avx2.IsSupported) - // { - // // AVX2 specific SIMD method - // ValidateFunc = &Utf8ValidateAvx2; - // } - // else if (Sse2.IsSupported) - // { - // // SSE2 specific SIMD method - // ValidateFunc = &Utf8ValidateSse2; - // } - // else - // { - // // Fallback to scalar method - // ValidateFunc = &Utf8ValidateScalar; - // } - // } - // else - // { - // // Fallback for other architectures to scalar method - // ValidateFunc = &Utf8ValidateScalar; - // } - // } - - // public static unsafe byte* Validate(byte* utf8, int length) => ValidateFunc(utf8, length); - - // // Method implementations... - // private static unsafe byte* Utf8ValidateArm64(byte* utf8, int length) => /* ARM64 specific validation */; - // private static unsafe byte* Utf8ValidateAvx512(byte* utf8, int length) => /* AVX-512 specific validation */; - // private static unsafe byte* Utf8ValidateAvx2(byte* utf8, int length) => /* AVX2 specific validation */; - // private static unsafe byte* Utf8ValidateSse2(byte* utf8, int length) => /* SSE2 specific validation */; - // private static unsafe byte* Utf8ValidateScalar(byte* utf8, int length) => /* Scalar validation */; - - - - - -// // Declare the delegate at the class level -// public delegate byte* ValidationFunction(byte* utf8, int length); - -// // public static class FunctionSelector -// // { -// public static IEnumerable SupportedValidationFunctions() -// { -// var supportedFunctions = new List(); - -// // Example check for architecture and SIMD support -// if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) -// { -// supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64) }); -// } -// else if (RuntimeInformation.ProcessArchitecture == Architecture.X64) -// { - - -// if (Avx2.IsSupported) -// { -// supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2) }); -// } -// if (Sse2.IsSupported) -// { -// supportedFunctions.Add(new object[] { new ValidationFunction(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse) }); -// } -// // Add other conditions and functions as needed -// } - -// return supportedFunctions; -// } -// // } - - - - [Fact] - public void TestGoodSequences() + + public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { string[] goodSequences = { "a", @@ -120,6 +33,9 @@ public void TestGoodSequences() "\xEF\xBB\xBF" }; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + + foreach (var seq in goodSequences) { byte[] input = System.Text.Encoding.UTF8.GetBytes(seq); @@ -129,11 +45,11 @@ public void TestGoodSequences() { int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + byte* scalarResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); + byte* SIMDResult = utf8ValidationDelegate(pInput, input.Length); Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); } @@ -834,6 +750,8 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) private bool ValidateUtf8(byte[] utf8, Range range = default) { + + // Adjusted check for default Range var isDefaultRange = range.Equals(default(Range)); var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); @@ -842,26 +760,47 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) { fixed (byte* pInput = utf8) { + int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + + byte* startPtr = pInput + offset; - int TailScalarCodeUnitCountAdjustment =0; - int TailUtf16CodeUnitCountAdjustment = 0; - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(startPtr, length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - if (scalarResult != startPtr + length) + byte* DotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(startPtr, length,out DotnetUtf16Adjustment,out DotnetScalarCountAdjustment); + if (DotnetResult != startPtr + length) { - return false; + PrintDebugInfo(DotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); } - byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length); + byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length, out SimdUnicodeUtf16Adjustment,out SimdUnicodeScalarCountAdjustment); if (simdResult != startPtr + length) { - return false; + PrintDebugInfo(DotnetResult, startPtr, utf8, "Our result Fails to return the correct invalid position"); } + return true; } } } + void PrintDebugInfo(byte* failedByte, byte* startPtr, byte[] utf8, string source) +{ + int failedIndex = (int)(failedByte - startPtr); + byte failedByteValue = *failedByte; + Console.WriteLine($"Failure in {source}: Index {failedIndex}, Byte {failedByteValue:X2}"); + + // Print surrounding sequence, assuming 5 bytes context around the failure point + int contextRadius = 5; + int startContext = Math.Max(0, failedIndex - contextRadius); + int endContext = Math.Min(utf8.Length, failedIndex + contextRadius + 1); // Include the failed byte and some after + Console.Write("Sequence around failure point: "); + for (int i = startContext; i < endContext; i++) + { + Console.Write($"{utf8[i]:X2} "); + } + Console.WriteLine(); +} + // Helper method to calculate the actual offset and length from a Range @@ -877,19 +816,16 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) // Define a delegate that matches the signature of the methods you want to test public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); - [Fact] - [Trait("Category", "Scalar")] - public void ScalarUTF16CountTest() - { - UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); - } + + + public void UTF16CountTest(Utf8ValidationDelegate utf8ValidationDelegate) { // int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; + int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - foreach (int outputLength in outputLengths) { // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. @@ -914,14 +850,14 @@ public void UTF16CountTest(Utf8ValidationDelegate utf8ValidationDelegate) utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - // Console.WriteLine("Lenght:" + utf8.Length); // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + Console.WriteLine("Lenght:" + utf8.Length); // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - // Console.WriteLine("___________________________________________________"); + Console.WriteLine("___________________________________________________"); Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); @@ -937,6 +873,13 @@ public void UTF16CountTest(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] + [Trait("Category", "Scalar")] + public void ScalarUTF16CountTest() + { + UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + [Fact] [Trait("Category", "Avx")] public void AvxUTF16CountTest() From 0cb260ed87414b2cf8bfdc7926285b22dc39a71f Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 30 Mar 2024 20:11:22 -0400 Subject: [PATCH 27/75] runtime dispatch progress --- test/UTF8ValidationTests.cs | 69 +++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 03d50ff..b727357 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -43,15 +43,16 @@ public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { fixed (byte* pInput = input) { - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - byte* scalarResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, - $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - - byte* SIMDResult = utf8ValidationDelegate(pInput, input.Length); - Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, - $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); + // int TailScalarCodeUnitCountAdjustment = 0; + // int TailUtf16CodeUnitCountAdjustment = 0; + // byte* scalarResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, + // $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); + + // byte* SIMDResult = utf8ValidationDelegate(pInput, input.Length); + // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, + // $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); + ValidateUtf8(input); } } } @@ -99,15 +100,16 @@ public void TestBadSequences() { fixed (byte* pInput = input) { - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, - $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); + // int TailScalarCodeUnitCountAdjustment = 0; + // int TailUtf16CodeUnitCountAdjustment = 0; + // byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, + // $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); - Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, - $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); + // byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); + // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, + // $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); + ValidateUtf8(input); } } @@ -702,11 +704,13 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) { int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; + int SIMDUtf16CodeUnitCountAdjustment, SIMDScalarCountAdjustment; byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); int scalarOffset = (int)(scalarResult - pInput); - byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length); + byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCountAdjustment); int simdOffset = (int)(simdResult - pInput); + int utf16CodeUnitCountAdjustment, scalarCountAdjustment; byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, utf8.Length, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); int dotnetOffset = (int)(dotnetResult - pInput); @@ -763,23 +767,36 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - byte* startPtr = pInput + offset; - byte* DotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(startPtr, length,out DotnetUtf16Adjustment,out DotnetScalarCountAdjustment); - if (DotnetResult != startPtr + length) + byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(startPtr, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + + if (dotnetResult != startPtr + length) { - PrintDebugInfo(DotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); + PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); } - byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length, out SimdUnicodeUtf16Adjustment,out SimdUnicodeScalarCountAdjustment); + byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); if (simdResult != startPtr + length) { - PrintDebugInfo(DotnetResult, startPtr, utf8, "Our result Fails to return the correct invalid position"); + PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); } - - return true; + bool utf16AdjustmentsMatch = DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment; + bool scalarCountAdjustmentsMatch = DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment; + + if (!utf16AdjustmentsMatch) + { + Console.WriteLine($"UTF16 Adjustment mismatch: Expected {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + } + + if (!scalarCountAdjustmentsMatch) + { + Console.WriteLine($"Scalar Count Adjustment mismatch: Expected {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + } + + return utf16AdjustmentsMatch && scalarCountAdjustmentsMatch; } + } } From 81ab5a3ac70bb5d43644cce05d3b71ee9d1eab0f Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 31 Mar 2024 12:30:36 -0400 Subject: [PATCH 28/75] WIP fixng tests (save game) --- test/UTF8ValidationTests.cs | 185 +++++++++++++++++++++--------------- 1 file changed, 108 insertions(+), 77 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index b727357..0d9fcd4 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -305,86 +305,114 @@ public void OverlongErrorTest() // This might seems redundant with but it actually failed PR #17. // The issue is fixed in PR#18 but I thought it a good idea to formally cover it as further changes are possible. // [Fact] - // public void TooShortTest2() - // { - // for (int trial = 0; trial < NumTrials; trial++) - // { - // foreach (int outputLength in outputLengths) - // { - // byte[] oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); - // // PrintHexAndBinary(oneUTFunit); - // byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1); - // // for (int i = 0; i < utf8.Length; i++) - // // { - // // if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes - // // { - // byte oldByte = utf8[outputLength - 1]; - // utf8[outputLength -1] = oneUTFunit[0];//0b11000000; // Forcing a too short error at the very end - // // PrintHexAndBinary(utf8); - // Assert.False(ValidateUtf8(utf8)); - // utf8[outputLength -1] = oldByte; // Restore the original byte - - // // } - // } - // } - // } - - public static IEnumerable TestData() + public void TooShortTest2(Utf8ValidationDelegate utf8ValidationDelegate) { - // var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. - return outputLengths.SelectMany( - outputLength => Enumerable.Range(1, outputLength), - (outputLength, position) => new object[] { outputLength, position }); + for (int trial = 0; trial < NumTrials; trial++) + { + foreach (int outputLength in outputLengths) + { + // List oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); + byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1).ToArray(); + + unsafe + { + fixed (byte* pInput = utf8) + { + + for (int i = 0; i < utf8.Length; i++) + { + // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + byte currentByte = utf8[i]; + int offset = 0,length = 0; + + if ((currentByte & 0b11100000) == 0b11000000) { // This is a header byte of a 2-byte sequence + + } + if ((currentByte & 0b11110000) == 0b11100000) { + // This is a header byte of a 3-byte sequence + } + if ((currentByte & 0b11111000) == 0b11110000) { + // This is a header byte of a 4-byte sequence + } + + byte* ThisResult = utf8ValidationDelegate(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + Assert.True(ThisResult == pInput + offset); + + // byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(startPtr, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + + // if (dotnetResult != startPtr + length) + // { + // // PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); + // return false; + // } + + byte* simdResult = utf8ValidationDelegate(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + if (simdResult != startPtr + length) + { + // PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); + return false; + } + + return true; + } + + } + + + // if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes + // { + // byte oldByte = utf8[outputLength - 1]; + // utf8[outputLength -1] = oneUTFunit[0];//0b11000000; // Forcing a too short error at the very end + // // PrintHexAndBinary(utf8); + // Assert.False(ValidateUtf8(utf8)); + // utf8[outputLength -1] = oldByte; // Restore the original byte + + } + } + } } - - // [Theory] - // [MemberData(nameof(TestData))] - // public void TooShortTestEnd(int outputLength, int position) + // public static IEnumerable TestData() // { - // byte[] oneUTFunit = generator.Generate(howManyUnits: 1, byteCountInUnit: 2); - // byte[] utf8 = generator.Generate(outputLength, byteCountInUnit: 1); - - // byte oldByte = utf8[position]; - // utf8[position] = oneUTFunit[0]; // Force a condition - - // Assert.False(ValidateUtf8(utf8)); // Test the condition - - // utf8[position] = oldByte; // Restore + // // var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. + // return outputLengths.SelectMany( + // outputLength => Enumerable.Range(1, outputLength), + // (outputLength, position) => new object[] { outputLength, position }); // } - public byte[] PrependAndTake(byte[] first, byte[] second, int takeCount) - { - // Concatenate 'first' array at the beginning of 'second' array - var combined = first.Concat(second).ToArray(); + // public byte[] PrependAndTake(byte[] first, byte[] second, int takeCount) + // { + // // Concatenate 'first' array at the beginning of 'second' array + // var combined = first.Concat(second).ToArray(); - // Take the first 'takeCount' elements from the combined array - return combined.Take(takeCount).ToArray(); - } + // // Take the first 'takeCount' elements from the combined array + // return combined.Take(takeCount).ToArray(); + // } - [Theory] - [MemberData(nameof(TestData))] - public void TooShortTestEnd(int outputLength, int position) - { - // ( know this is slow ... but I think for a first pass, it might be ok?) - byte[] utf8 = generator.Generate(outputLength).ToArray(); - byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1).ToArray(); + // [Theory] + // [MemberData(nameof(TestData))] + // public void TooShortTestEnd(int outputLength, int position) + // { + // // ( know this is slow ... but I think for a first pass, it might be ok?) + // byte[] utf8 = generator.Generate(outputLength).ToArray(); + // byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1).ToArray(); - // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode - byte[] result = PrependAndTake(filler, utf8, position); + // // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode + // byte[] result = PrependAndTake(filler, utf8, position); - if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array - { - Assert.False(ValidateUtf8(utf8)); // Test the condition - Assert.True(InvalidateUtf8(utf8,position)); - } + // if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array + // { + // Assert.False(ValidateUtf8(utf8)); // Test the condition + // Assert.True(InvalidateUtf8(utf8,position)); + // } - Assert.True(ValidateUtf8(utf8)); // Test the condition + // Assert.True(ValidateUtf8(utf8)); // Test the condition - } + // } // public List PrependAndTake(List first, List second, int takeCount) // { @@ -772,29 +800,32 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) if (dotnetResult != startPtr + length) { - PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); + // PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); + return false; } byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); if (simdResult != startPtr + length) { - PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); + // PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); + return false; } bool utf16AdjustmentsMatch = DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment; - bool scalarCountAdjustmentsMatch = DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment; + // bool scalarCountAdjustmentsMatch = DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment; - if (!utf16AdjustmentsMatch) - { - Console.WriteLine($"UTF16 Adjustment mismatch: Expected {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - } + // if (!utf16AdjustmentsMatch) + // { + // Console.WriteLine($"UTF16 Adjustment mismatch: Expected {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + // } - if (!scalarCountAdjustmentsMatch) - { - Console.WriteLine($"Scalar Count Adjustment mismatch: Expected {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - } + // if (!scalarCountAdjustmentsMatch) + // { + // Console.WriteLine($"Scalar Count Adjustment mismatch: Expected {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + // } - return utf16AdjustmentsMatch && scalarCountAdjustmentsMatch; + // return utf16AdjustmentsMatch && scalarCountAdjustmentsMatch; + return true; } } From d890bbdd8a515c2a4b3f5d3058c526d3a0863173 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 31 Mar 2024 17:16:37 -0400 Subject: [PATCH 29/75] fixed @#$^ short end test once and for all --- test/UTF8ValidationTests.cs | 240 ++++++------------------------------ 1 file changed, 37 insertions(+), 203 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 0d9fcd4..4b382be 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -302,10 +302,9 @@ public void OverlongErrorTest() } -// This might seems redundant with but it actually failed PR #17. -// The issue is fixed in PR#18 but I thought it a good idea to formally cover it as further changes are possible. - // [Fact] - public void TooShortTest2(Utf8ValidationDelegate utf8ValidationDelegate) + + + public void TooShortTest(Utf8ValidationDelegate utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -314,188 +313,49 @@ public void TooShortTest2(Utf8ValidationDelegate utf8ValidationDelegate) // List oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1).ToArray(); - unsafe - { - fixed (byte* pInput = utf8) - { + unsafe + { + fixed (byte* pInput = utf8) + { - for (int i = 0; i < utf8.Length; i++) - { - // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; - int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - byte currentByte = utf8[i]; - int offset = 0,length = 0; - - if ((currentByte & 0b11100000) == 0b11000000) { // This is a header byte of a 2-byte sequence - - } - if ((currentByte & 0b11110000) == 0b11100000) { - // This is a header byte of a 3-byte sequence - } - if ((currentByte & 0b11111000) == 0b11110000) { - // This is a header byte of a 4-byte sequence - } - - byte* ThisResult = utf8ValidationDelegate(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - Assert.True(ThisResult == pInput + offset); - - // byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(startPtr, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - - // if (dotnetResult != startPtr + length) - // { - // // PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); - // return false; - // } - - byte* simdResult = utf8ValidationDelegate(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - if (simdResult != startPtr + length) - { - // PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); - return false; - } - - return true; + for (int i = 0; i < utf8.Length; i++) + { + // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + byte currentByte = utf8[i]; + int offset = 0; + + if ((currentByte & 0b11100000) == 0b11000000) { // This is a header byte of a 2-byte sequence + offset = 0; + } + if ((currentByte & 0b11110000) == 0b11100000) { + // This is a header byte of a 3-byte sequence + + offset = rand.Next(0, 3); + } + if ((currentByte & 0b11111000) == 0b11110000) { + // This is a header byte of a 4-byte sequence + offset = rand.Next(0, 4); } - } + byte* ThisResult = utf8ValidationDelegate(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + Assert.True(ThisResult == pInput + i + offset); + byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + Assert.True(dotnetResult == pInput + i + offset); + } - // if ((utf8[i] & 0b11000000) == 0b10000000) // Only process continuation bytes - // { - // byte oldByte = utf8[outputLength - 1]; - // utf8[outputLength -1] = oneUTFunit[0];//0b11000000; // Forcing a too short error at the very end - // // PrintHexAndBinary(utf8); - // Assert.False(ValidateUtf8(utf8)); - // utf8[outputLength -1] = oldByte; // Restore the original byte - - } + } + } } } } - // public static IEnumerable TestData() - // { - // // var utf8CharacterLengths = new[] { 2, 3, 4 }; // UTF-8 characters can be 1-4 bytes. - // return outputLengths.SelectMany( - // outputLength => Enumerable.Range(1, outputLength), - // (outputLength, position) => new object[] { outputLength, position }); - // } - - // public byte[] PrependAndTake(byte[] first, byte[] second, int takeCount) - // { - // // Concatenate 'first' array at the beginning of 'second' array - // var combined = first.Concat(second).ToArray(); - - // // Take the first 'takeCount' elements from the combined array - // return combined.Take(takeCount).ToArray(); - // } - - - // [Theory] - // [MemberData(nameof(TestData))] - // public void TooShortTestEnd(int outputLength, int position) - // { - // // ( know this is slow ... but I think for a first pass, it might be ok?) - // byte[] utf8 = generator.Generate(outputLength).ToArray(); - // byte[] filler = generator.Generate(howManyUnits: position, byteCountInUnit: 1).ToArray(); - - - // // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode - // byte[] result = PrependAndTake(filler, utf8, position); - - - // if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array - // { - // Assert.False(ValidateUtf8(utf8)); // Test the condition - // Assert.True(InvalidateUtf8(utf8,position)); - // } - - // Assert.True(ValidateUtf8(utf8)); // Test the condition - - // } - - // public List PrependAndTake(List first, List second, int takeCount) - // { - // // Concatenate 'first' list at the beginning of 'second' list - // List combined = new List(first); - // combined.AddRange(second); - - // // Take the first 'takeCount' elements from the combined list - // // Ensure we don't exceed the combined list's count - // takeCount = Math.Min(takeCount, combined.Count); - - // return combined.GetRange(0, takeCount); - // } - - - - // // [Theory] - // // [MemberData(nameof(TestData))] - // [Fact] - // public void TooShortTestEnd() - // { - // foreach (int outputLength in outputLengths) - // { - // // for (int trial = 0; trial < NumTrials; trial++) - // // { - // List utf8 = generator.Generate(outputLength); - - // for (int i = 0; i < utf8.Count; i++) - // { - // List filler = generator.Generate(howManyUnits: i, byteCountInUnit: 1); - - // // Assuming 'prepend' and 'take' logic needs to be applied here as per the pseudocode - // byte[] result = PrependAndTake(filler, utf8, i).ToArray(); - - - // if (result[^1] >= 0b11000000)// non-ASCII bytes will provide an error as we're truncating a perfectly good array - // { - // Assert.False(ValidateUtf8(result)); // Test the condition - // Assert.True(InvalidateUtf8(result,result.Length)); - // } - // } - // // } - // } - - // [Fact] - // public void TooLongErrorTestEnd() - // { - // foreach (int outputLength in outputLengths) - // { - // for (int trial = 0; trial < NumTrials; trial++) - // { - // byte[] utf8 = generator.Generate(outputLength); - - // for (int i = 0; i < utf8.Length; i++) - // { - // if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes - // { - // byte oldByte = utf8[i]; - // utf8[i] = 0b10000000; // Forcing a too long error - // Assert.False(ValidateUtf8(utf8)); - // Assert.True(InvalidateUtf8(utf8, i)); - // utf8[i] = oldByte; // Restore the original byte - // } - // } - // } - // } - // } - - // public static IEnumerable InvalidTestData() - // { - // var random = new Random(); - // foreach (var length in outputLengths) - // { - // for (int trial = 0; trial < NumTrials; trial++) - // { - // int position = random.Next(length - 3); // Choose a random position - // byte invalidByte = (byte)random.Next(0xF5, 0x100); // Generate a random invalid byte - - // yield return new object[] { length, position, invalidByte }; - // } - // } - // } - + [Fact] + public void TooShortTestAvx2() + { + TooShortTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } public static IEnumerable InvalidTestData() { @@ -754,32 +614,6 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) } } } - - // check that all methods agree that the result is valid - // private bool ValidateUtf8(byte[] utf8) - // { - // unsafe - // { - // fixed (byte* pInput = utf8) - // { - // byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length); - // if (scalarResult != pInput + utf8.Length) - // { - // return false; - // } - - // byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length); - // if (simdResult != pInput + utf8.Length) - // { - // return false; - // } - - // return true; - // } - // } - // } - - private bool ValidateUtf8(byte[] utf8, Range range = default) { From 8153f9be57bfbbebac36b72d81edd091dfa24769 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 1 Apr 2024 11:33:50 -0400 Subject: [PATCH 30/75] WIP simplify tests + runtime dispatch update --- test/UTF8ValidationTests.cs | 322 ++++++++++++++---------------------- 1 file changed, 125 insertions(+), 197 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 4b382be..c25fb93 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -33,9 +33,6 @@ public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) "\xEF\xBB\xBF" }; - int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - - foreach (var seq in goodSequences) { byte[] input = System.Text.Encoding.UTF8.GetBytes(seq); @@ -43,23 +40,16 @@ public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { fixed (byte* pInput = input) { - // int TailScalarCodeUnitCountAdjustment = 0; - // int TailUtf16CodeUnitCountAdjustment = 0; - // byte* scalarResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, - // $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - - // byte* SIMDResult = utf8ValidationDelegate(pInput, input.Length); - // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, - // $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); - ValidateUtf8(input); + Assert.True(ValidateUtf8(input,utf8ValidationDelegate), + $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); + + ValidateCount(input,utf8ValidationDelegate); } } } } - [Fact] - public void TestBadSequences() + public void TestBadSequences(Utf8ValidationDelegate utf8ValidationDelegate) { string[] badSequences = { "\xC3\x28", @@ -100,75 +90,42 @@ public void TestBadSequences() { fixed (byte* pInput = input) { - // int TailScalarCodeUnitCountAdjustment = 0; - // int TailUtf16CodeUnitCountAdjustment = 0; - // byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, input.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)scalarResult, - // $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - - // byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length); - // Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult, - // $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); - ValidateUtf8(input); - + ValidateUtf8(input,utf8ValidationDelegate); } } } } - [Fact] - public void Node48995Test() + + public void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) { byte[] bad = new byte[] { 0x80 }; - Assert.False(ValidateUtf8(bad)); + Assert.False(ValidateUtf8(bad,utf8ValidationDelegate)); } - [Fact] - public void NoErrorTest() + public void NoErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) { byte[] utf8 = generator.Generate(outputLength).ToArray(); - bool isValidUtf8 = ValidateUtf8(utf8); + bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); } } } - [Fact] - public void NoErrorTestASCII() - { - RunTestForByteLength(1); - } - - [Fact] - public void NoErrorTest1Byte() - { - RunTestForByteLength(1); - } - - [Fact] - public void NoErrorTest2Bytes() - { - RunTestForByteLength(2); - } - - [Fact] - public void NoErrorTest3Bytes() - { - RunTestForByteLength(3); - } - - [Fact] - public void NoErrorTest4Bytes() + public void validateNoErrorTestSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate) { - RunTestForByteLength(4); + RunTestForByteLength(1,utf8ValidationDelegate); + RunTestForByteLength(2,utf8ValidationDelegate); + RunTestForByteLength(3,utf8ValidationDelegate); + RunTestForByteLength(4,utf8ValidationDelegate); } - private void RunTestForByteLength(int byteLength) + private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8ValidationDelegate) { // int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths foreach (int outputLength in outputLengths) @@ -176,14 +133,13 @@ private void RunTestForByteLength(int byteLength) for (int trial = 0; trial < NumTrials; trial++) { byte[] utf8 = generator.Generate(outputLength, byteLength).ToArray(); - bool isValidUtf8 = ValidateUtf8(utf8); + bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); Assert.True(isValidUtf8, $"Failure for {byteLength}-byte UTF8 of length {outputLength} in trial {trial}"); } } } - [Fact] - public void HeaderBitsErrorTest() + public void HeaderBitsErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -197,8 +153,8 @@ public void HeaderBitsErrorTest() { byte oldByte = utf8[i]; utf8[i] = 0b11111000; // Forcing a header bits error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); utf8[i] = oldByte; // Restore the original byte } } @@ -206,8 +162,7 @@ public void HeaderBitsErrorTest() } } - [Fact] - public void TooShortErrorTest() + public void TooShortErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -221,8 +176,8 @@ public void TooShortErrorTest() { byte oldByte = utf8[i]; utf8[i] = 0b11100000; // Forcing a too short error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); utf8[i] = oldByte; // Restore the original byte } } @@ -231,8 +186,30 @@ public void TooShortErrorTest() } + [Fact] + [Trait("Category", "scalar")] + public void TooShortErrorTestScalar() + { + TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // Uncomment when SSE is updated + // [Fact] + // [Trait("Category", "sse")] + // public void TooShortErrorTestSse() + // { + // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + [Fact] - public void TooLongErrorTest() + [Trait("Category", "avx")] + public void TooShortErrorTestAVX() + { + TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + + public void TooLongErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths @@ -249,8 +226,8 @@ public void TooLongErrorTest() { byte oldByte = utf8[i]; utf8[i] = 0b10000000; // Forcing a too long error - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); utf8[i] = oldByte; // Restore the original byte } } @@ -258,8 +235,7 @@ public void TooLongErrorTest() } } - [Fact] - public void OverlongErrorTest() + public void OverlongErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -290,8 +266,8 @@ public void OverlongErrorTest() utf8[i + 1] = (byte)(utf8[i + 1] & 0b11001111); } - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); utf8[i] = old; utf8[i + 1] = secondOld; @@ -356,39 +332,28 @@ public void TooShortTestAvx2() { TooShortTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public static IEnumerable InvalidTestData() -{ - - var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF - foreach (var length in outputLengths) - { - byte[] utf8 = generator.Generate(length).ToArray(); - for (int position = 0; position < utf8.Length; position++) - { - foreach (var invalidByte in invalidBytes) - { - yield return new object[] { length, position, invalidByte ,utf8 }; - } - } - } -} //corresponds to condition 5.4.1 in the paper - [Theory] - [MemberData(nameof(InvalidTestData))] - public void Invalid0xf50xff(int outputLength, int position, byte invalidByte,byte[] utf8) + public void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) { - // Initialize utf8 with some valid data, if necessary - // Array.Fill(utf8, (byte)0x20); // Filling with spaces for simplicity - - utf8[position] = invalidByte; // Inject an invalid byte at a random position - - // PrintHexAndBinary(utf8); + var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF + foreach (var length in outputLengths) + { + byte[] utf8 = generator.Generate(length).ToArray(); + for (int position = 0; position < utf8.Length; position++) + { + foreach (var invalidByte in invalidBytes) + { + utf8[position] = invalidByte; + // PrintHexAndBinary(utf8); - Assert.False(ValidateUtf8(utf8)); // Expect the validation to fail due to the invalid byte - Assert.True(InvalidateUtf8(utf8,position)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte + Assert.True(InvalidateUtf8(utf8,position,utf8ValidationDelegate)); + } + } + } } @@ -405,8 +370,7 @@ static void PrintHexAndBinary(byte[] bytes) } - [Fact] - public void TooLargeErrorTest() + public void TooLargeErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -423,8 +387,8 @@ public void TooLargeErrorTest() byte old = utf8[i]; utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100); - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); utf8[i] = old; } } @@ -432,23 +396,22 @@ public void TooLargeErrorTest() } } - [Fact] - public void TooLargeErrorTestEnd() + public void TooLargeErrorTestEnd(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) { for (int i = 1; i <= 4; i++) - { - byte[] filler = generator.Generate(outputLength,byteCountInUnit:1).ToArray(); - byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)).ToArray(); + { + byte[] filler = generator.Generate(outputLength,byteCountInUnit:1).ToArray(); + byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)).ToArray(); - generator.ReplaceEndOfArray(filler,toolong); + generator.ReplaceEndOfArray(filler,toolong); - Assert.False(ValidateUtf8(filler )); - Assert.True(InvalidateUtf8(filler, outputLength -1)); - } + Assert.False(ValidateUtf8(filler,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(filler, outputLength -1,utf8ValidationDelegate)); + } } @@ -456,8 +419,7 @@ public void TooLargeErrorTestEnd() } - [Fact] - public void SurrogateErrorTest() + public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -478,8 +440,8 @@ public void SurrogateErrorTest() { utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2)); - Assert.False(ValidateUtf8(utf8)); - Assert.True(InvalidateUtf8(utf8, i)); + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); } utf8[i] = old; @@ -490,8 +452,7 @@ public void SurrogateErrorTest() } } - [Fact] - public void BruteForceTest() + public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -501,7 +462,7 @@ public void BruteForceTest() // Generate random UTF-8 sequence byte[] utf8 = generator.Generate(rand.Next(outputLength)).ToArray(); - Assert.True(ValidateUtf8(utf8), "Initial UTF-8 validation (primary) failed."); + Assert.True(ValidateUtf8(utf8,utf8ValidationDelegate), "Initial UTF-8 validation (primary) failed."); Assert.True(ValidateUtf8Fuschia(utf8), "Initial UTF-8 validation (Fuschia) failed."); @@ -519,7 +480,7 @@ public void BruteForceTest() modifiedUtf8[byteIndex] ^= (byte)bitFlip; // Validate the modified sequence with both methods - bool isValidPrimary = ValidateUtf8(modifiedUtf8); + bool isValidPrimary = ValidateUtf8(modifiedUtf8,utf8ValidationDelegate); bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); // Ensure both methods agree on the validation result @@ -584,7 +545,7 @@ public static bool ValidateUtf8Fuschia(byte[] data) } // Check that all functions agree on the result when the input might be invalid. - private bool InvalidateUtf8(byte[] utf8, int badindex) + private bool InvalidateUtf8(byte[] utf8, int badindex,Utf8ValidationDelegate utf8ValidationDelegate) { unsafe { @@ -596,7 +557,7 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); int scalarOffset = (int)(scalarResult - pInput); - byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCountAdjustment); + byte* simdResult = utf8ValidationDelegate(pInput, utf8.Length,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCountAdjustment); int simdOffset = (int)(simdResult - pInput); int utf16CodeUnitCountAdjustment, scalarCountAdjustment; @@ -614,10 +575,8 @@ private bool InvalidateUtf8(byte[] utf8, int badindex) } } } - private bool ValidateUtf8(byte[] utf8, Range range = default) + private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) { - - // Adjusted check for default Range var isDefaultRange = range.Equals(default(Range)); var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); @@ -638,50 +597,35 @@ private bool ValidateUtf8(byte[] utf8, Range range = default) return false; } - byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + byte* simdResult = utf8ValidationDelegate(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); if (simdResult != startPtr + length) { // PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); return false; } - - bool utf16AdjustmentsMatch = DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment; - // bool scalarCountAdjustmentsMatch = DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment; - - // if (!utf16AdjustmentsMatch) - // { - // Console.WriteLine($"UTF16 Adjustment mismatch: Expected {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - // } - - // if (!scalarCountAdjustmentsMatch) - // { - // Console.WriteLine($"Scalar Count Adjustment mismatch: Expected {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - // } - - // return utf16AdjustmentsMatch && scalarCountAdjustmentsMatch; return true; } } } - void PrintDebugInfo(byte* failedByte, byte* startPtr, byte[] utf8, string source) -{ - int failedIndex = (int)(failedByte - startPtr); - byte failedByteValue = *failedByte; - Console.WriteLine($"Failure in {source}: Index {failedIndex}, Byte {failedByteValue:X2}"); - - // Print surrounding sequence, assuming 5 bytes context around the failure point - int contextRadius = 5; - int startContext = Math.Max(0, failedIndex - contextRadius); - int endContext = Math.Min(utf8.Length, failedIndex + contextRadius + 1); // Include the failed byte and some after - Console.Write("Sequence around failure point: "); - for (int i = startContext; i < endContext; i++) - { - Console.Write($"{utf8[i]:X2} "); - } - Console.WriteLine(); -} +// void PrintDebugInfo(byte* failedByte, byte* startPtr, byte[] utf8, string source) +// { +// int failedIndex = (int)(failedByte - startPtr); +// byte failedByteValue = *failedByte; +// Console.WriteLine($"Failure in {source}: Index {failedIndex}, Byte {failedByteValue:X2}"); + +// // Print surrounding sequence, assuming 5 bytes context around the failure point +// int contextRadius = 5; +// int startContext = Math.Max(0, failedIndex - contextRadius); +// int endContext = Math.Min(utf8.Length, failedIndex + contextRadius + 1); // Include the failed byte and some after +// Console.Write("Sequence around failure point: "); +// for (int i = startContext; i < endContext; i++) +// { +// Console.Write($"{utf8[i]:X2} "); +// } +// Console.WriteLine(); +// } @@ -701,20 +645,13 @@ void PrintDebugInfo(byte* failedByte, byte* startPtr, byte[] utf8, string source - public void UTF16CountTest(Utf8ValidationDelegate utf8ValidationDelegate) + public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) { - // int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; - int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - foreach (int outputLength in outputLengths) - { - // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. - // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); - byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray(); - PrintHexAndBinary(utf8); - var (offset, length) = (0, utf8.Length); + var isDefaultRange = range.Equals(default(Range)); + var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); unsafe { @@ -731,43 +668,34 @@ public void UTF16CountTest(Utf8ValidationDelegate utf8ValidationDelegate) SimdUnicodeScalarCountAdjustment= 0; utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - - // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); - Console.WriteLine("Lenght:" + utf8.Length); + // Console.WriteLine("Lenght:" + utf8.Length); // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - Console.WriteLine("___________________________________________________"); - + // Console.WriteLine("___________________________________________________"); Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - - - - - // If your generator creates specific patterns or the utility calculates these adjustments differently, - // you'll need to adjust the expected values accordingly. } } - } - } - - [Fact] - [Trait("Category", "Scalar")] - public void ScalarUTF16CountTest() - { - UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + // } } - [Fact] - [Trait("Category", "Avx")] - public void AvxUTF16CountTest() - { - UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); - } + // [Fact] + // [Trait("Category", "Scalar")] + // public void ScalarUTF16CountTest() + // { + // UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + // } + + // [Fact] + // [Trait("Category", "Avx")] + // public void AvxUTF16CountTest() + // { + // UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + // } } From 4c01282a094aed1b80339c452f27b41a5bc46ae7 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 1 Apr 2024 16:51:20 -0400 Subject: [PATCH 31/75] WIP Runtime dispatch tests --- test/UTF8ValidationTests.cs | 108 +++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 3 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index c25fb93..0b2ef72 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -6,7 +6,7 @@ namespace tests; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.Arm; - +using BenchmarkDotNet.Disassemblers; public unsafe class Utf8SIMDValidationTests { @@ -19,6 +19,89 @@ public unsafe class Utf8SIMDValidationTests // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths +// public class FactOnArchitectureAttribute : FactAttribute +// { +// public FactOnArchitectureAttribute(System.Runtime.InteropServices.Architecture architecture) +// { +// if (System.Runtime.InteropServices.RuntimeInformation.ProcessArchitecture != architecture) +// { +// Skip = $"Test is skipped because it runs only on {architecture} architecture"; +// } +// } +// } + +[Flags] +public enum TestSystemRequirements +{ + None = 0, + Arm64 = 1, + X64Avx512 = 2, + X64Avx2 = 4, + X64Sse = 8, + // Add more as needed +} + +public class FactOnSystemRequirementAttribute : FactAttribute +{ + private TestSystemRequirements RequiredSystems; + + public FactOnSystemRequirementAttribute(TestSystemRequirements requiredSystems) + { + RequiredSystems = requiredSystems; + + if (!IsSystemSupported(requiredSystems)) + { + Skip = "Test is skipped due to not meeting system requirements."; + } + } + + private bool IsSystemSupported(TestSystemRequirements requiredSystems) + { + var currentArchitecture = RuntimeInformation.ProcessArchitecture; + bool isSupported = false; + + if (currentArchitecture == Architecture.Arm64 && requiredSystems.HasFlag(TestSystemRequirements.Arm64)) + { + isSupported = true; + } + else if (currentArchitecture == Architecture.X64) + { + if (requiredSystems.HasFlag(TestSystemRequirements.X64Avx512) && Vector512.IsHardwareAccelerated && System.Runtime.Intrinsics.X86.Avx512F.IsSupported) + { + isSupported = true; + } + else if (requiredSystems.HasFlag(TestSystemRequirements.X64Avx2) && System.Runtime.Intrinsics.X86.Avx2.IsSupported) + { + isSupported = true; + } + else if (requiredSystems.HasFlag(TestSystemRequirements.X64Sse) && System.Runtime.Intrinsics.X86.Sse.IsSupported) + { + isSupported = true; + } + } + + // Implement other architecture checks as needed + + return isSupported; + } +} + + + public class TestIfCondition : FactAttribute + { + public TestIfCondition(Func condition, string skipReason) + { + // if (condition == null) throw new ArgumentNullException(nameof(condition)); + + // Only set the Skip property if the condition evaluates to false + if (!condition.Invoke()) + { + Skip = skipReason; + } + } + } + + public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { @@ -186,14 +269,17 @@ public void TooShortErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } - [Fact] + // [FactOnArchitecture(System.Runtime.InteropServices.Architecture.X64)] + // [TestIfCondition(RuntimeInformation.ProcessArchitecture == Architecture.X64,"a reason")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] [Trait("Category", "scalar")] public void TooShortErrorTestScalar() { TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // Uncomment when SSE is updated + // TODO:Uncomment when SSE is updated + // [FactOnArchitecture(System.Runtime.InteropServices.Architecture.X64)] // [Fact] // [Trait("Category", "sse")] // public void TooShortErrorTestSse() @@ -201,6 +287,22 @@ public void TooShortErrorTestScalar() // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); // } + // TODO:Uncomment when AVX512 is updated + // [Fact] + // [Trait("Category", "avx512")] + // public void TooShortErrorTestSse() + // { + // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnArchitecture(System.Runtime.InteropServices.Architecture.Arm64)] + // [Trait("Category", "arm64")] + // public void TooShortErrorTestArm64() + // { + // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + [Fact] [Trait("Category", "avx")] public void TooShortErrorTestAVX() From ea0f6c769ff6d79440a1ec46a6f377c48db48d0f Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 2 Apr 2024 11:56:02 -0400 Subject: [PATCH 32/75] Runtime dispatch test (WORKING) + slight cleanup --- test/UTF8ValidationTests.cs | 683 ++++++++++++++++++++++++++++++------ test/helpers/randomutf8.cs | 6 - 2 files changed, 584 insertions(+), 105 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 0b2ef72..4d70dfc 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -19,72 +19,46 @@ public unsafe class Utf8SIMDValidationTests // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths -// public class FactOnArchitectureAttribute : FactAttribute -// { -// public FactOnArchitectureAttribute(System.Runtime.InteropServices.Architecture architecture) -// { -// if (System.Runtime.InteropServices.RuntimeInformation.ProcessArchitecture != architecture) -// { -// Skip = $"Test is skipped because it runs only on {architecture} architecture"; -// } -// } -// } - -[Flags] -public enum TestSystemRequirements -{ - None = 0, - Arm64 = 1, - X64Avx512 = 2, - X64Avx2 = 4, - X64Sse = 8, - // Add more as needed -} - -public class FactOnSystemRequirementAttribute : FactAttribute -{ - private TestSystemRequirements RequiredSystems; - - public FactOnSystemRequirementAttribute(TestSystemRequirements requiredSystems) + [Flags] + public enum TestSystemRequirements { - RequiredSystems = requiredSystems; - - if (!IsSystemSupported(requiredSystems)) - { - Skip = "Test is skipped due to not meeting system requirements."; - } + None = 0, + Arm64 = 1, + X64Avx512 = 2, + X64Avx2 = 4, + X64Sse = 8, + // Add more as needed } - private bool IsSystemSupported(TestSystemRequirements requiredSystems) + public class FactOnSystemRequirementAttribute : FactAttribute { - var currentArchitecture = RuntimeInformation.ProcessArchitecture; - bool isSupported = false; + private TestSystemRequirements RequiredSystems; - if (currentArchitecture == Architecture.Arm64 && requiredSystems.HasFlag(TestSystemRequirements.Arm64)) + public FactOnSystemRequirementAttribute(TestSystemRequirements requiredSystems) { - isSupported = true; - } - else if (currentArchitecture == Architecture.X64) - { - if (requiredSystems.HasFlag(TestSystemRequirements.X64Avx512) && Vector512.IsHardwareAccelerated && System.Runtime.Intrinsics.X86.Avx512F.IsSupported) - { - isSupported = true; - } - else if (requiredSystems.HasFlag(TestSystemRequirements.X64Avx2) && System.Runtime.Intrinsics.X86.Avx2.IsSupported) + RequiredSystems = requiredSystems; + + if (!IsSystemSupported(requiredSystems)) { - isSupported = true; + Skip = "Test is skipped due to not meeting system requirements."; } - else if (requiredSystems.HasFlag(TestSystemRequirements.X64Sse) && System.Runtime.Intrinsics.X86.Sse.IsSupported) + } + + private bool IsSystemSupported(TestSystemRequirements requiredSystems) + { + switch (RuntimeInformation.ProcessArchitecture) { - isSupported = true; + case Architecture.Arm64: + return requiredSystems.HasFlag(TestSystemRequirements.Arm64); + case Architecture.X64: + return (requiredSystems.HasFlag(TestSystemRequirements.X64Avx512) && Vector512.IsHardwareAccelerated && System.Runtime.Intrinsics.X86.Avx512F.IsSupported) || + (requiredSystems.HasFlag(TestSystemRequirements.X64Avx2) && System.Runtime.Intrinsics.X86.Avx2.IsSupported) || + (requiredSystems.HasFlag(TestSystemRequirements.X64Sse) && System.Runtime.Intrinsics.X86.Sse.IsSupported); + default: + return false; // If architecture is not covered above, the test is not supported. } } - - // Implement other architecture checks as needed - - return isSupported; } -} public class TestIfCondition : FactAttribute @@ -103,7 +77,7 @@ public TestIfCondition(Func condition, string skipReason) - public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) + public void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { string[] goodSequences = { "a", @@ -132,7 +106,47 @@ public void TestGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) } } - public void TestBadSequences(Utf8ValidationDelegate utf8ValidationDelegate) + [Fact] + [Trait("Category", "scalar")] + public void simpleGoodSequencesScalar() + { + simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void simpleGoodSequencesSse() + // { + // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void simpleGoodSequencesAvx512() + // { + // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void simpleGoodSequencesArm64() + // { + // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void simpleGoodSequencesAVX() + { + simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + + public void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) { string[] badSequences = { "\xC3\x28", @@ -179,6 +193,45 @@ public void TestBadSequences(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] + [Trait("Category", "scalar")] + public void BadSequencesScalar() + { + BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void BadSequencesSse() + // { + // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void BadSequencesAvx512() + // { + // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void BadSequencesArm64() + // { + // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void BadSequencesAVX() + { + BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + public void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) { @@ -186,7 +239,7 @@ public void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(bad,utf8ValidationDelegate)); } - public void NoErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -200,7 +253,46 @@ public void NoErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } } - public void validateNoErrorTestSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate) + [Fact] + [Trait("Category", "scalar")] + public void NoErrorScalar() + { + NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void NoErrorSse() + // { + // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void NoErrorAvx512() + // { + // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void NoErrorArm64() + // { + // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void NoErrorAVX() + { + NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + public void NoErrorSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate) { RunTestForByteLength(1,utf8ValidationDelegate); RunTestForByteLength(2,utf8ValidationDelegate); @@ -222,7 +314,46 @@ private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8Vali } } - public void HeaderBitsErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + [Fact] + [Trait("Category", "scalar")] + public void NoErrorSpecificByteCountScalar() + { + NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void NoErrorSpecificByteCountSse() + // { + // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void NoErrorSpecificByteCountAvx512() + // { + // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void NoErrorSpecificByteCountArm64() + // { + // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void NoErrorSpecificByteCountAVX() + { + NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -245,7 +376,47 @@ public void HeaderBitsErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } } - public void TooShortErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + + [Fact] + [Trait("Category", "scalar")] + public void BadHeaderBitsScalar() + { + BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void BadHeaderBitsSse() + // { + // BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void BadHeaderBitsAvx512() + // { + // BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void NoErrorSpecificByteCountArm64() + // { + // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void BadHeaderBitsAVX() + { + BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -269,49 +440,46 @@ public void TooShortErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } - // [FactOnArchitecture(System.Runtime.InteropServices.Architecture.X64)] - // [TestIfCondition(RuntimeInformation.ProcessArchitecture == Architecture.X64,"a reason")] - [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + [Fact] [Trait("Category", "scalar")] - public void TooShortErrorTestScalar() + public void TooShortErrorScalar() { - TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } // TODO:Uncomment when SSE is updated - // [FactOnArchitecture(System.Runtime.InteropServices.Architecture.X64)] + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] // [Fact] // [Trait("Category", "sse")] - // public void TooShortErrorTestSse() + // public void TooShortErrorSse() // { - // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); // } // TODO:Uncomment when AVX512 is updated - // [Fact] + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] // [Trait("Category", "avx512")] - // public void TooShortErrorTestSse() + // public void TooShortErrorAvx512() // { - // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); // } // TODO:Uncomment when Arm64 is updated - // [FactOnArchitecture(System.Runtime.InteropServices.Architecture.Arm64)] + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] // [Trait("Category", "arm64")] - // public void TooShortErrorTestArm64() + // public void TooShortErrorArm64() // { - // TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } [Fact] [Trait("Category", "avx")] - public void TooShortErrorTestAVX() + public void TooShortErrorAVX() { - TooShortErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - - public void TooLongErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths @@ -337,7 +505,46 @@ public void TooLongErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } } - public void OverlongErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + [Fact] + [Trait("Category", "scalar")] + public void TooLongErrorScalar() + { + TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void TooLongErrorSse() + // { + // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void TooLongErrorAvx512() + // { + // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void TooLongErrorArm64() + // { + // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void TooLongErrorAVX() + { + TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + public void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -379,10 +586,47 @@ public void OverlongErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] + [Trait("Category", "scalar")] + public void OverlongErrorScalar() + { + OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void OverlongErrorSse() + // { + // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void OverlongErrorAvx512() + // { + // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void OverlongErrorArm64() + // { + // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void OverlongErrorAVX() + { + OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } - public void TooShortTest(Utf8ValidationDelegate utf8ValidationDelegate) + public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -429,10 +673,50 @@ public void TooShortTest(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] - public void TooShortTestAvx2() + [Trait("Category", "scalar")] + public void TooShortErrorAtEndScalar() { - TooShortTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void TooShortErrorAtEndSse() + // { + // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void TooShortErrorAtEndAvx512() + // { + // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void TooShortErrorAtEndArm64() + // { + // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void TooShortErrorAtEndAVX() + { + TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + [Fact] + public void TooShortErrorAtEndAvx2() + { + TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } @@ -458,6 +742,50 @@ public void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] + [Trait("Category", "scalar")] + public void Invalid0xf50xffScalar() + { + Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void Invalid0xf50xffSse() + // { + // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void Invalid0xf50xffAvx512() + // { + // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void Invalid0xf50xffArm64() + // { + // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void Invalid0xf50xffAVX() + { + Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + [Fact] + public void Invalid0xf50xffAvx2() + { + Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } // Prints both hexadecimal and binary representations of a byte array static void PrintHexAndBinary(byte[] bytes) @@ -472,7 +800,7 @@ static void PrintHexAndBinary(byte[] bytes) } - public void TooLargeErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -498,7 +826,54 @@ public void TooLargeErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } } - public void TooLargeErrorTestEnd(Utf8ValidationDelegate utf8ValidationDelegate) + [Fact] + [Trait("Category", "scalar")] + public void TooLargeErrorScalar() + { + TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void TooLargeErrorSse() + // { + // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void TooLargeErrorAvx512() + // { + // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void TooLargeErrorArm64() + // { + // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void TooLargeErrorAVX() + { + TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + [Fact] + public void TooLargeErrorAvx2() + { + TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + + // TODO: improve this test + public void TooLargeErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -520,6 +895,50 @@ public void TooLargeErrorTestEnd(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] + [Trait("Category", "scalar")] + public void TooLargeErrorAtEndScalar() + { + TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void TooLargeErrorAtEndSse() + // { + // TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void TooLargeErrorAtEndAvx512() + // { + // TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void TooLargeErrorAtEndArm64() + // { + // TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void TooLargeErrorAtEndAVX() + { + TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + [Fact] + public void TooLargeErrorAtEndAvx2() + { + TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { @@ -554,6 +973,47 @@ public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) } } + + [Fact] + [Trait("Category", "scalar")] + public void SurrogateErrorTestScalar() + { + SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void SurrogateErrorTestSse() + // { + // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void SurrogateErrorTestAvx512() + // { + // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void SurrogateErrorTestArm64() + // { + // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void SurrogateErrorTestAVX() + { + SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) @@ -592,6 +1052,46 @@ public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) } } + [Fact] + [Trait("Category", "scalar")] + public void BruteForceTestScalar() + { + BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void BruteForceTestSse() + // { + // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void BruteForceTestAvx512() + // { + // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void BruteForceTestArm64() + // { + // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void BruteForceTestAVX() + { + BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + // credit: based on code from Google Fuchsia (Apache Licensed) public static bool ValidateUtf8Fuschia(byte[] data) { @@ -785,21 +1285,6 @@ public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg // } } - // [Fact] - // [Trait("Category", "Scalar")] - // public void ScalarUTF16CountTest() - // { - // UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); - // } - - // [Fact] - // [Trait("Category", "Avx")] - // public void AvxUTF16CountTest() - // { - // UTF16CountTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); - // } - - } diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 857b68e..89c39ee 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -135,12 +135,6 @@ public List AppendContinuationByte(List utf8Bytes) => public void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex) { - // // Check if the startIndex is within the bounds of the original array - // if (startIndex < 0 || startIndex > original.Length) - // { - // throw new ArgumentOutOfRangeException(nameof(startIndex), "Start index is out of the range of the original array."); - // } - // Calculate the start index for replacement int startIndex = original.Length - replacement.Length; From fe73718b142b77d22f55ed234b003d5b6a43453c Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 2 Apr 2024 17:01:12 -0400 Subject: [PATCH 33/75] adda validate count to tests --- src/UTF8.cs | 21 +++++++++++---------- test/UTF8ValidationTests.cs | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 565ceb2..8976a96 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -17,7 +17,7 @@ public static class UTF8 for (int i = 0; i <= howFarBack; i++) { byte b = buf[0 - i]; - foundLeadingBytes = ((b & 0b11000000) != 0b10000000); + foundLeadingBytes = (b & 0b11000000) != 0b10000000; if (foundLeadingBytes) { buf -= i; @@ -647,22 +647,23 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe int candidateByte = pInputBuffer[processedLength + k]; if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000) { - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { - TempUtf16CodeUnitCountAdjustment += 1; // Still adjusts for a single UTF-16 unit + TempUtf16CodeUnitCountAdjustment += 1; } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence { - TempUtf16CodeUnitCountAdjustment += 1; // Adjusts for two UTF-16 units (surrogate pair) - TempScalarCountAdjustment += 1; // Adjust for one scalar value + TempUtf16CodeUnitCountAdjustment += 2; } - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { - TempUtf16CodeUnitCountAdjustment += 1; // Adjust for a single UTF-16 unit + TempUtf16CodeUnitCountAdjustment += 2; + TempScalarCountAdjustment += 1; } + processedLength += k; - break; + // break; } @@ -677,7 +678,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // Process the remaining bytes with the scalar function if (processedLength < inputLength) { - // We need to possibly backtrack to the start of the last code point + // // We need to possibly backtrack to the start of the last code point while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) { processedLength -= 1; diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 4d70dfc..aea310f 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -188,6 +188,7 @@ public void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) fixed (byte* pInput = input) { ValidateUtf8(input,utf8ValidationDelegate); + ValidateCount(input,utf8ValidationDelegate); } } } @@ -249,6 +250,7 @@ public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); + ValidateCount(utf8,utf8ValidationDelegate); } } } @@ -310,6 +312,7 @@ private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8Vali byte[] utf8 = generator.Generate(outputLength, byteLength).ToArray(); bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); Assert.True(isValidUtf8, $"Failure for {byteLength}-byte UTF8 of length {outputLength} in trial {trial}"); + ValidateCount(utf8,utf8ValidationDelegate); } } } @@ -369,6 +372,7 @@ public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i] = 0b11111000; // Forcing a header bits error Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); utf8[i] = oldByte; // Restore the original byte } } @@ -432,6 +436,7 @@ public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i] = 0b11100000; // Forcing a too short error Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); utf8[i] = oldByte; // Restore the original byte } } @@ -498,6 +503,7 @@ public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i] = 0b10000000; // Forcing a too long error Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); utf8[i] = oldByte; // Restore the original byte } } @@ -577,6 +583,7 @@ public void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); utf8[i] = old; utf8[i + 1] = secondOld; @@ -665,6 +672,8 @@ public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); Assert.True(dotnetResult == pInput + i + offset); + + ValidateCount(utf8,utf8ValidationDelegate); } } @@ -737,6 +746,7 @@ public void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte Assert.True(InvalidateUtf8(utf8,position,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); } } } @@ -819,6 +829,7 @@ public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); utf8[i] = old; } } @@ -888,6 +899,7 @@ public void TooLargeErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(filler,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(filler, outputLength -1,utf8ValidationDelegate)); + ValidateCount(filler,utf8ValidationDelegate); } @@ -963,6 +975,7 @@ public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); } utf8[i] = old; @@ -1044,6 +1057,7 @@ public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) // Validate the modified sequence with both methods bool isValidPrimary = ValidateUtf8(modifiedUtf8,utf8ValidationDelegate); bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); + ValidateCount(modifiedUtf8,utf8ValidationDelegate); // Ensure both methods agree on the validation result Assert.Equal(isValidPrimary, isValidFuschia); From 051e55b563b9b979ad377b40f892ea07575302a6 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 3 Apr 2024 11:52:09 -0400 Subject: [PATCH 34/75] Count no error test working --- src/UTF8.cs | 116 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 68 insertions(+), 48 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 8976a96..70950b2 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -70,43 +70,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe } } - - - // public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment, int skippedBytes = 0) - // { - // utf16CodeUnitCountAdjustment = 0; - // scalarCountAdjustment = 0; - - // // Call the original function first. Assuming GetPointerToFirstInvalidByteOriginal exists and does the primary checking. - // byte* result = GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); - - // // If the adjustments are still 0 and there are skipped bytes to consider, - // // loop through the skipped bytes and adjust the counts as needed. - // if (utf16CodeUnitCountAdjustment == 0 && scalarCountAdjustment == 0 && skippedBytes > 0) - // { - // for (int i = 0; i < skippedBytes; i++) - // { - // byte currentByte = *(pInputBuffer + i); - // if (currentByte >= 0xC0 && currentByte < 0xE0) - // { - // // 2-byte sequence - // utf16CodeUnitCountAdjustment -= 1; // Adjust according to your logic - // scalarCountAdjustment -= 1; - // } - // else if ((currentByte >= 0xE0 && currentByte < 0xF0) || (currentByte >= 0xF0)) - // { - // // 3-byte or 4-byte sequence - // utf16CodeUnitCountAdjustment -= 1; // This might need to be adjusted based on your specific logic for 3-byte and 4-byte sequences - // scalarCountAdjustment -= 1; - // } - // // Adjust for other conditions as necessary - // } - // } - - // return result; // Return the pointer from the original check - // } - - public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { @@ -400,6 +363,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { + Console.WriteLine("--------------------------Calling function----------------------------------"); int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -551,10 +515,42 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { + // TODO/think about : this path iss not explicitly tested + Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - int off = processedLength >= 3 ? processedLength - 3 : processedLength; + // int off = processedLength >= 3 ? processedLength - 3 : processedLength; + int off = 0; + + if (processedLength >= 32 + 3){ + off = processedLength -32 - 3; + int overlapCount =3; + + for(int k = 0; k < overlapCount; k++) + { + + int candidateByte = pInputBuffer[processedLength + k]; + if ((candidateByte & 0b11000000) == 0b11000000) + { + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + TempScalarCountAdjustment += 1; + } + } + } + } + else{ off = processedLength;} + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } prevIncomplete = Vector256.Zero; @@ -626,7 +622,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { - + // TODO: add error handling for Code count utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; @@ -639,13 +635,15 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { + + Console.WriteLine("----Checkpoint 2:SIMD rewind"); // We have an unterminated sequence. processedLength -= 3; for(int k = 0; k < 3; k++) { int candidateByte = pInputBuffer[processedLength + k]; - if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000) + if ((candidateByte & 0b11000000) == 0b11000000) { if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { @@ -660,15 +658,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; } - - - processedLength += k; - // break; - } - - - } } } @@ -678,10 +668,40 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // Process the remaining bytes with the scalar function if (processedLength < inputLength) { + + Console.WriteLine("----Process remaining Scalar"); + int overlapCount = 0; + // // We need to possibly backtrack to the start of the last code point while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) { processedLength -= 1; + overlapCount +=1; + } + + for(int k = 0; k < overlapCount; k++) + { + + int candidateByte = pInputBuffer[processedLength + k]; + if ((candidateByte & 0b11000000) == 0b11000000) + { + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + TempScalarCountAdjustment += 1; + } + + // processedLength += k; + break; + } } byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); From 49196728554121d64ac449d916c0e7ba64107bb6 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 3 Apr 2024 17:07:27 -0400 Subject: [PATCH 35/75] slight cleanup + progress --- src/UTF8.cs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 70950b2..a1fc901 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -9,8 +9,12 @@ namespace SimdUnicode public static class UTF8 { - public unsafe static byte* RewindAndValidateWithErrors(int priorBytes, byte* buf, int len) + public unsafe static byte* RewindAndValidateWithErrors(int priorBytes, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { + + int TempUtf16CodeUnitCountAdjustment = 0; + int TempScalarCountAdjustment = 0; + int howFarBack = priorBytes; int extraLen = 0; bool foundLeadingBytes = false; @@ -27,13 +31,20 @@ public static class UTF8 } if (!foundLeadingBytes) { + utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TempScalarCountAdjustment; return buf - howFarBack; } + // TODO : fix Count handling here + // Now buf points to the start of a UTF-8 sequence or the start of the buffer. // Validate from this new start point with the adjusted length. - byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); + byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TempUtf16CodeUnitCountAdjustment, out TempScalarCountAdjustment); + + utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TempScalarCountAdjustment; return invalidByte; } @@ -516,7 +527,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { // TODO/think about : this path iss not explicitly tested - Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); + // Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; @@ -636,7 +647,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - Console.WriteLine("----Checkpoint 2:SIMD rewind"); + // Console.WriteLine("----Checkpoint 2:SIMD rewind"); // We have an unterminated sequence. processedLength -= 3; for(int k = 0; k < 3; k++) @@ -669,7 +680,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (processedLength < inputLength) { - Console.WriteLine("----Process remaining Scalar"); + // Console.WriteLine("----Process remaining Scalar"); int overlapCount = 0; // // We need to possibly backtrack to the start of the last code point From c61c83b60c10ed49bb07134cd54f03451d15df47 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Thu, 4 Apr 2024 14:28:00 -0400 Subject: [PATCH 36/75] fixing count erre attempt --- src/UTF8.cs | 69 +++++++++++++++++++++++++++---------- test/UTF8ValidationTests.cs | 64 ++++++++++++++++++++++++++++++---- 2 files changed, 108 insertions(+), 25 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index a1fc901..45622f3 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -15,6 +15,9 @@ public static class UTF8 int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + int TailScalarCountAdjustment = 0; + int howFarBack = priorBytes; int extraLen = 0; bool foundLeadingBytes = false; @@ -24,27 +27,47 @@ public static class UTF8 foundLeadingBytes = (b & 0b11000000) != 0b10000000; if (foundLeadingBytes) { + + + if ((b & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; + } + if ((b & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + } + if ((b & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + TempScalarCountAdjustment += 1; + } + + buf -= i; extraLen = i; break; } } + + utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TempScalarCountAdjustment; + + if (!foundLeadingBytes) { - utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TempScalarCountAdjustment; return buf - howFarBack; } - // TODO : fix Count handling here + // Now buf points to the start of a UTF-8 sequence or the start of the buffer. // Validate from this new start point with the adjusted length. - byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TempUtf16CodeUnitCountAdjustment, out TempScalarCountAdjustment); + byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TempScalarCountAdjustment; + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TailScalarCountAdjustment; return invalidByte; } @@ -220,6 +243,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { int processedLength = 0; + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) { @@ -309,7 +334,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // return pInputBuffer + processedLength; // Console.WriteLine("not ascii"); - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); } prevIncomplete = Vector128.Zero; } @@ -331,7 +356,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector128 error = Sse2.Xor(must23As80, sc); if (Sse2.MoveMask(error) != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); } prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue); } @@ -527,12 +552,12 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { // TODO/think about : this path iss not explicitly tested - // Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); + Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; // int off = processedLength >= 3 ? processedLength - 3 : processedLength; - int off = 0; + int off = processedLength; if (processedLength >= 32 + 3){ off = processedLength -32 - 3; @@ -560,9 +585,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe } } } - else{ off = processedLength;} + // else{ off = processedLength;} - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; } @@ -633,12 +659,14 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { - // TODO: add error handling for Code count + Console.WriteLine("-----Error path!!"); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; int off = processedLength >= 32 ? processedLength - 32 : processedLength; - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + } prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); } @@ -647,7 +675,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // Console.WriteLine("----Checkpoint 2:SIMD rewind"); + Console.WriteLine("----Checkpoint 2:SIMD rewind"); // We have an unterminated sequence. processedLength -= 3; for(int k = 0; k < 3; k++) @@ -680,7 +708,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (processedLength < inputLength) { - // Console.WriteLine("----Process remaining Scalar"); + Console.WriteLine("----Process remaining Scalar"); int overlapCount = 0; // // We need to possibly backtrack to the start of the last code point @@ -736,6 +764,11 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { int processedLength = 0; + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; + + int utf16CodeUnitCountAdjustment=0, scalarCountAdjustment=0; + if (pInputBuffer == null || inputLength <= 0) { return pInputBuffer; @@ -817,7 +850,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // we need to check if the previous block was incomplete. if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector128.Zero; } @@ -839,7 +872,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector128 error = AdvSimd.Xor(must23As80, sc); if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue); } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index aea310f..b6c4bcd 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -871,13 +871,7 @@ public void TooLargeErrorScalar() [Fact] [Trait("Category", "avx")] - public void TooLargeErrorAVX() - { - TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); - } - - [Fact] - public void TooLargeErrorAvx2() + public void TooLargeErrorAvx() { TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } @@ -1299,6 +1293,62 @@ public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg // } } + [Fact] + [Trait("Category", "Scalar")] + public void DotnetUTF16Count() + { + int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; + int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + + + foreach (int outputLength in outputLengths) + { + // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. + // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); + byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray(); + PrintHexAndBinary(utf8); + var (offset, length) = (0, utf8.Length); + + unsafe + { + fixed (byte* pInput = utf8) + { + byte* startPtr = pInput + offset; + // Invoke the method under test. + + DotnetUtf16Adjustment= 0; + DotnetScalarCountAdjustment= 0; + DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + + SimdUnicodeUtf16Adjustment= 0; + SimdUnicodeScalarCountAdjustment= 0; + SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + + Console.WriteLine("Lenght:" + utf8.Length); + + Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); + Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + + Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); + Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); + Console.WriteLine("___________________________________________________"); + + + Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + + + + + // If your generator creates specific patterns or the utility calculates these adjustments differently, + // you'll need to adjust the expected values accordingly. + } + } + } + } + + } From dcbf949dd869afa89035756cead786cfc6b1efee Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Thu, 4 Apr 2024 22:21:47 -0400 Subject: [PATCH 37/75] A tiny bit of progress --- src/UTF8.cs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 45622f3..ea78991 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -28,7 +28,6 @@ public static class UTF8 if (foundLeadingBytes) { - if ((b & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { TempUtf16CodeUnitCountAdjustment += 1; @@ -43,7 +42,6 @@ public static class UTF8 TempScalarCountAdjustment += 1; } - buf -= i; extraLen = i; break; @@ -66,8 +64,8 @@ public static class UTF8 // Validate from this new start point with the adjusted length. byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TailScalarCountAdjustment; + utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TailScalarCountAdjustment; return invalidByte; } From f27117b03385fe31b507e5398b43fc382605c801 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 6 Apr 2024 09:26:15 -0400 Subject: [PATCH 38/75] more expressive validateCount test --- src/UTF8.cs | 81 +++++++++---------- test/UTF8ValidationTests.cs | 151 +++++++++++++++++++++++++++--------- 2 files changed, 151 insertions(+), 81 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index ea78991..60d8b5d 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -9,34 +9,32 @@ namespace SimdUnicode public static class UTF8 { - public unsafe static byte* RewindAndValidateWithErrors(int priorBytes, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + public unsafe static byte* RewindAndValidateWithErrors(int offset, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - int TailScalarCountAdjustment = 0; - - int howFarBack = priorBytes; + int howFarBack = offset; int extraLen = 0; bool foundLeadingBytes = false; for (int i = 0; i <= howFarBack; i++) { - byte b = buf[0 - i]; - foundLeadingBytes = (b & 0b11000000) != 0b10000000; + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) { - if ((b & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // adjustment to avoid double counting + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { TempUtf16CodeUnitCountAdjustment += 1; } - if ((b & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence { TempUtf16CodeUnitCountAdjustment += 2; } - if ((b & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; @@ -48,26 +46,26 @@ public static class UTF8 } } - utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TempScalarCountAdjustment; - if (!foundLeadingBytes) { return buf - howFarBack; } + utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TempScalarCountAdjustment; - + int TailUtf16CodeUnitCountAdjustment = 0; + int TailScalarCountAdjustment = 0; // Now buf points to the start of a UTF-8 sequence or the start of the buffer. // Validate from this new start point with the adjusted length. - byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); + byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TailScalarCountAdjustment; + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TailScalarCountAdjustment; - return invalidByte; + return invalidBytePointer; } public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippedBytes, @@ -594,24 +592,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. - // Identify start of 4-byte sequences. - // Vector256 isFourByteStart = Avx2.SubtractSaturate(currentBlock, fourthByte); - // int fourByteMask = Avx2.MoveMask(isFourByteStart); - // uint fourByteCount = Popcnt.PopCount((uint)fourByteMask); - - // // Identify start of 3-byte and 4-byte sequences. - // Vector256 isThreeByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte); - // int threeByteMask = Avx2.MoveMask(isThreeByteStart); - // uint threeByteCount = Popcnt.PopCount((uint)threeByteMask); - - // // Calculate only 3-byte sequence count by excluding 4-byte sequences. - // // uint threeByteCount = threeOrFourByteCount - fourByteCount; - - // // Identify start of 2-byte,3 or 4 bytes sequences. - // Vector256 isTwoByteStart = Avx2.SubtractSaturate(currentBlock, secondByte); - // int twoByteMask = Avx2.MoveMask(isTwoByteStart); - // uint twoByteCount = Popcnt.PopCount((uint)twoByteMask); - // Detect start of 4-byte sequences. Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); @@ -628,11 +608,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. - // Calculate only 2-byte sequence count by excluding 3-byte and 4-byte sequences. - // uint pureTwoByteCount = twoByteCount - threeOrFourByteCount; - - // Console.WriteLine("2byte count:" + twoByteCount); - // Adjustments TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; @@ -655,15 +630,29 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); - if (!Avx2.TestZ(error, error)) + if (!Avx2.TestZ(error, error)) //context: we are dealing with a 32 bit { Console.WriteLine("-----Error path!!"); - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; + TailScalarCodeUnitCountAdjustment =0; + TailUtf16CodeUnitCountAdjustment =0; + - int off = processedLength >= 32 ? processedLength - 32 : processedLength; + int off = processedLength >= 32 ? processedLength - 32 : 0;//processedLength; + // Console.WriteLine(off); // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + + // Adjustments not to double count + TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2; + TempUtf16CodeUnitCountAdjustment += (int)twoByteCount; + TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2; + TempScalarCountAdjustment += (int)fourByteCount; + + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; + + return invalidBytePointer; } prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index b6c4bcd..34b54ec 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -798,16 +798,55 @@ public void Invalid0xf50xffAvx2() } // Prints both hexadecimal and binary representations of a byte array - static void PrintHexAndBinary(byte[] bytes) + // static void PrintHexAndBinary(byte[] bytes) + // { + // // Convert to hexadecimal + // string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " "); + // Console.WriteLine($"Hex: {hexRepresentation}"); + + // // Convert to binary + // string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0'))); + // Console.WriteLine($"Binary: {binaryRepresentation}"); + // } + + static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) +{ + // Convert to hexadecimal + Console.Write("Hex: "); + for (int i = 0; i < bytes.Length; i++) { - // Convert to hexadecimal - string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " "); - Console.WriteLine($"Hex: {hexRepresentation}"); + if (i == highlightIndex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.Write($"{bytes[i]:X2} "); + Console.ResetColor(); + } + else + { + Console.Write($"{bytes[i]:X2} "); + } + } + Console.WriteLine(); // New line for readability - // Convert to binary - string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0'))); - Console.WriteLine($"Binary: {binaryRepresentation}"); + // Convert to binary + Console.Write("Binary: "); + for (int i = 0; i < bytes.Length; i++) + { + string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); + if (i == highlightIndex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.Write($"{binaryString} "); + Console.ResetColor(); + } + else + { + Console.Write($"{binaryString} "); + } } + Console.WriteLine(); // New line for readability +} + public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) @@ -1255,43 +1294,85 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg - public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) - { - int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; - int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + // public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) + // { + // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + // int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + + // var isDefaultRange = range.Equals(default(Range)); + // var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); + + // unsafe + // { + // fixed (byte* pInput = utf8) + // { + // byte* startPtr = pInput + offset; + // // Invoke the method under test. + + // DotnetUtf16Adjustment= 0; + // DotnetScalarCountAdjustment= 0; + // DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + + // SimdUnicodeUtf16Adjustment= 0; + // SimdUnicodeScalarCountAdjustment= 0; + // utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + + // // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); + // // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + + // // Console.WriteLine("Lenght:" + utf8.Length); + // // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); + // // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); + // // Console.WriteLine("___________________________________________________"); + + // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + // Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + // } + // } + // // } + // } - var isDefaultRange = range.Equals(default(Range)); - var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); + public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) +{ + int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - unsafe - { - fixed (byte* pInput = utf8) - { - byte* startPtr = pInput + offset; - // Invoke the method under test. + var isDefaultRange = range.Equals(default(Range)); + var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); - DotnetUtf16Adjustment= 0; - DotnetScalarCountAdjustment= 0; - DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + unsafe + { + fixed (byte* pInput = utf8) + { + byte* startPtr = pInput + offset; - SimdUnicodeUtf16Adjustment= 0; - SimdUnicodeScalarCountAdjustment= 0; - utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + DotnetUtf16Adjustment = 0; + DotnetScalarCountAdjustment = 0; + DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); - // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + SimdUnicodeUtf16Adjustment = 0; + SimdUnicodeScalarCountAdjustment = 0; + byte* simdResult = utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - // Console.WriteLine("Lenght:" + utf8.Length); - // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); - // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - // Console.WriteLine("___________________________________________________"); + // Determine the index of the invalid byte if simdResult doesn't point to the end. + int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1; - Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - } + try + { + Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); } - // } + catch (Exception) + { + // Upon failure, print the utf8 array for inspection + Console.WriteLine("Assertion failed. Inspecting utf8 array:"); + PrintHexAndBinary(utf8,failureIndex); + throw; // Re-throw the exception to preserve the failure state + } + } } +} + [Fact] [Trait("Category", "Scalar")] From be79615dc96ded47ebdbc5576f42478f05c07c63 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 6 Apr 2024 09:56:56 -0400 Subject: [PATCH 39/75] some progress --- src/UTF8.cs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 60d8b5d..b54eae8 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -9,22 +9,21 @@ namespace SimdUnicode public static class UTF8 { - public unsafe static byte* RewindAndValidateWithErrors(int offset, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; - int howFarBack = offset; int extraLen = 0; bool foundLeadingBytes = false; - for (int i = 0; i <= howFarBack; i++) + + for (int i = 0; i < howFarBack; i++) { byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) { - // adjustment to avoid double counting if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { @@ -39,7 +38,17 @@ public static class UTF8 TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; } - + break; + } + } + + + for (int i = 0; i <= howFarBack; i++) + { + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + if (foundLeadingBytes) + { buf -= i; extraLen = i; break; @@ -65,6 +74,9 @@ public static class UTF8 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; + Console.WriteLine("utf16count after rewint:" + utf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount after rewint:" + scalarCountAdjustment); + return invalidBytePointer; } @@ -651,6 +663,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; + + return invalidBytePointer; From 006738ce934cc79a998282c8a60adfb2d485a47e Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 7 Apr 2024 18:16:50 -0400 Subject: [PATCH 40/75] LongErrorAVX working + cleaner --- benchmark/Benchmark.cs | 20 +++++------ src/UTF8.cs | 40 ++++++++++++++-------- test/UTF8ValidationTests.cs | 67 +++++++++++++++++++++++++++++++++---- 3 files changed, 97 insertions(+), 30 deletions(-) diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs index 48a5b68..e6184cc 100644 --- a/benchmark/Benchmark.cs +++ b/benchmark/Benchmark.cs @@ -183,7 +183,7 @@ public unsafe void SIMDUtf8ValidationRealData() { if (allLinesUtf8 != null) { - RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByte); + // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByte); } } @@ -223,15 +223,15 @@ public unsafe void SIMDUtf8ValidationRealDataArm64() RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); } } - [Benchmark] - [BenchmarkCategory("avx")] - public unsafe void SIMDUtf8ValidationRealDataAvx2() - { - if (allLinesUtf8 != null) - { - RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); - } - } + // [Benchmark] + // [BenchmarkCategory("avx")] + // public unsafe void SIMDUtf8ValidationRealDataAvx2() + // { + // if (allLinesUtf8 != null) + // { + // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + // } + // } [Benchmark] [BenchmarkCategory("sse")] public unsafe void SIMDUtf8ValidationRealDataSse() diff --git a/src/UTF8.cs b/src/UTF8.cs index b54eae8..92ffe9a 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -18,23 +18,28 @@ public static class UTF8 int extraLen = 0; bool foundLeadingBytes = false; - for (int i = 0; i < howFarBack; i++) + for (int i = 0; i <= howFarBack; i++) { byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) { + if (i == 0) {break;} + Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); // adjustment to avoid double counting if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { + Console.WriteLine("Found 2 byte"); TempUtf16CodeUnitCountAdjustment += 1; } if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence { + Console.WriteLine("Found 3 byte"); TempUtf16CodeUnitCountAdjustment += 2; } if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { + Console.WriteLine("Found 4 byte"); TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; } @@ -74,8 +79,11 @@ public static class UTF8 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; - Console.WriteLine("utf16count after rewint:" + utf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount after rewint:" + scalarCountAdjustment); + Console.WriteLine("utf16count after rewint(Temp):" + TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount after rewint:" + TempScalarCountAdjustment); + + Console.WriteLine("utf16count after rewint(Scalar):" + TailUtf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount after rewint:" + TailScalarCountAdjustment); return invalidBytePointer; } @@ -620,11 +628,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. - // Adjustments - TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; - TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; - TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; - TempScalarCountAdjustment -= (int)fourByteCount; + Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; @@ -649,17 +653,19 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe TailUtf16CodeUnitCountAdjustment =0; - int off = processedLength >= 32 ? processedLength - 32 : 0;//processedLength; - // Console.WriteLine(off); + int off = processedLength >= 32 ? processedLength : 0;//processedLength; + + Console.WriteLine("This is off :" + off); // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer,processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); // Adjustments not to double count - TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2; - TempUtf16CodeUnitCountAdjustment += (int)twoByteCount; - TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2; - TempScalarCountAdjustment += (int)fourByteCount; + // TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2; + // TempUtf16CodeUnitCountAdjustment += (int)twoByteCount; + // TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2; + // TempScalarCountAdjustment += (int)fourByteCount; utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; @@ -669,6 +675,12 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe return invalidBytePointer; } + // Adjustments + TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; + TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; + TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; + TempScalarCountAdjustment -= (int)fourByteCount; + prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); } } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 34b54ec..e36ab08 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -387,6 +387,7 @@ public void BadHeaderBitsScalar() { BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } + // TODO:Uncomment when SSE is updated // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] @@ -487,7 +488,9 @@ public void TooShortErrorAVX() public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { - int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths + // int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths +// int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths + foreach (int outputLength in outputLengths) { @@ -809,12 +812,55 @@ public void Invalid0xf50xffAvx2() // Console.WriteLine($"Binary: {binaryRepresentation}"); // } - static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) +// static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) +// { +// // Convert to hexadecimal +// Console.Write("Hex: "); +// for (int i = 0; i < bytes.Length; i++) +// { +// if (i == highlightIndex) +// { +// Console.ForegroundColor = ConsoleColor.Red; +// Console.Write($"{bytes[i]:X2} "); +// Console.ResetColor(); +// } +// else +// { +// Console.Write($"{bytes[i]:X2} "); +// } +// } +// Console.WriteLine(); // New line for readability + +// // Convert to binary +// Console.Write("Binary: "); +// for (int i = 0; i < bytes.Length; i++) +// { +// string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); +// if (i == highlightIndex) +// { +// Console.ForegroundColor = ConsoleColor.Red; +// Console.Write($"{binaryString} "); +// Console.ResetColor(); +// } +// else +// { +// Console.Write($"{binaryString} "); +// } +// } +// Console.WriteLine(); // New line for readability +// } + +static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { - // Convert to hexadecimal + int chunkSize = 16; // 128 bits = 16 bytes + + // Process each chunk for hexadecimal Console.Write("Hex: "); for (int i = 0; i < bytes.Length; i++) { + if (i > 0 && i % chunkSize == 0) + Console.WriteLine(); // New line after every 16 bytes + if (i == highlightIndex) { Console.ForegroundColor = ConsoleColor.Red; @@ -825,13 +871,18 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { Console.Write($"{bytes[i]:X2} "); } + + if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line } - Console.WriteLine(); // New line for readability + Console.WriteLine("\n"); // New line for readability and to separate hex from binary - // Convert to binary + // Process each chunk for binary Console.Write("Binary: "); for (int i = 0; i < bytes.Length; i++) { + if (i > 0 && i % chunkSize == 0) + Console.WriteLine(); // New line after every 16 bytes + string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); if (i == highlightIndex) { @@ -843,19 +894,23 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { Console.Write($"{binaryString} "); } + + if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line } Console.WriteLine(); // New line for readability } - public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { + + Console.WriteLine("Outputlength:" + outputLength); for (int trial = 0; trial < NumTrials; trial++) { + Console.WriteLine("trial:",trial); byte[] utf8 = generator.Generate(outputLength).ToArray(); From b5445bba653828757c0d2d8f2e2afbcea8730603 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 8 Apr 2024 07:05:14 -0400 Subject: [PATCH 41/75] save game --- src/UTF8.cs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 92ffe9a..44562fa 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -25,21 +25,21 @@ public static class UTF8 if (foundLeadingBytes) { if (i == 0) {break;} - Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); // adjustment to avoid double counting if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { - Console.WriteLine("Found 2 byte"); + // Console.WriteLine("Found 2 byte"); TempUtf16CodeUnitCountAdjustment += 1; } if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence { - Console.WriteLine("Found 3 byte"); + // Console.WriteLine("Found 3 byte"); TempUtf16CodeUnitCountAdjustment += 2; } if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { - Console.WriteLine("Found 4 byte"); + // Console.WriteLine("Found 4 byte"); TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; } @@ -79,11 +79,11 @@ public static class UTF8 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; - Console.WriteLine("utf16count after rewint(Temp):" + TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount after rewint:" + TempScalarCountAdjustment); + // Console.WriteLine("utf16count after rewint(Temp):" + TempUtf16CodeUnitCountAdjustment); + // Console.WriteLine("scalarcount after rewint:" + TempScalarCountAdjustment); - Console.WriteLine("utf16count after rewint(Scalar):" + TailUtf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount after rewint:" + TailScalarCountAdjustment); + // Console.WriteLine("utf16count after rewint(Scalar):" + TailUtf16CodeUnitCountAdjustment); + // Console.WriteLine("scalarcount after rewint:" + TailScalarCountAdjustment); return invalidBytePointer; } @@ -415,7 +415,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - Console.WriteLine("--------------------------Calling function----------------------------------"); + // Console.WriteLine("--------------------------Calling function----------------------------------"); int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -568,7 +568,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { // TODO/think about : this path iss not explicitly tested - Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); + // Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; @@ -648,17 +648,17 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) //context: we are dealing with a 32 bit { - Console.WriteLine("-----Error path!!"); + // Console.WriteLine("-----Error path!!"); TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; - int off = processedLength >= 32 ? processedLength : 0;//processedLength; + int off = processedLength >= 32 ? processedLength : processedLength; - Console.WriteLine("This is off :" + off); + // Console.WriteLine("This is off :" + off); // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer,processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); // Adjustments not to double count @@ -688,7 +688,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - Console.WriteLine("----Checkpoint 2:SIMD rewind"); + // Console.WriteLine("----Checkpoint 2:SIMD rewind"); // We have an unterminated sequence. processedLength -= 3; for(int k = 0; k < 3; k++) @@ -721,7 +721,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (processedLength < inputLength) { - Console.WriteLine("----Process remaining Scalar"); + // Console.WriteLine("----Process remaining Scalar"); int overlapCount = 0; // // We need to possibly backtrack to the start of the last code point From 7d90fde16dfddb4c14d0f53781b8b481b3451dd7 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 9 Apr 2024 09:08:41 -0400 Subject: [PATCH 42/75] Cleanup + better ShortTest --- README.md | 10 ++- src/UTF8.cs | 125 ++++++------------------------------ test/UTF8ValidationTests.cs | 67 ++++--------------- 3 files changed, 38 insertions(+), 164 deletions(-) diff --git a/README.md b/README.md index b73c506..6b682b1 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,17 @@ To run specific tests, it is helpful to use the filter parameter: ``` -dotnet test --filter Ascii +dotnet test --filter TooShortErrorAVX ``` +Or to target specific categories: + +``` +dotnet test --filter "Category=scalar" +``` + + + ## Running Benchmarks To run the benchmarks, run the following command: diff --git a/src/UTF8.cs b/src/UTF8.cs index 44562fa..073a189 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -190,16 +190,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } // Too short - // if (pInputBuffer[pos + 3] < 0b10000000) { - // TempUtf16CodeUnitCountAdjustment -= 1; - // } else { - // TempUtf16CodeUnitCountAdjustment -= 2; - // } TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11111000) == 0b11110000) - { // 0b11110000 - + { nextPos = pos + 4; if (nextPos > inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; @@ -226,9 +220,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe return pInputBuffer + pos; } TempUtf16CodeUnitCountAdjustment -= 2; TempScalarCountAdjustment -= 1; - - - } else { @@ -525,36 +516,9 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector256 thirdByte = Vector256.Create((byte)(0b11100000u - 0x80)); Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); - // // Mask for the lower and upper parts of the vector - // Vector128 lowerMask = Vector128.Create( - // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF).AsByte(); - - // Vector128 upperMask = Vector128.Create( - // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - // 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00).AsByte(); - - // // Combine lower and upper masks into a Vector256 - // Vector256 mask = Vector256.Create(lowerMask, upperMask); - - // // Apply the mask to zero out the last 3 bytes of each vector - // Vector256 secondByteMasked = Avx2.And(secondByte, mask); - // Vector256 thirdByteMasked = Avx2.And(thirdByte, mask); - // Vector256 fourthByteMasked = Avx2.And(fourthByte, mask); - - Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); - // Vector to identify bytes right before the start of a 4-byte sequence in UTF-8. - // Vector256 beforeFourByteMarker = Vector256.Create((byte)(0xF0 - 1)); - // // Vector to identify bytes right before the start of a 3-byte sequence in UTF-8. - // Vector256 beforeThreeByteMarker = Vector256.Create((byte)(0xE0 - 1)); - // // Vector to identify bytes right before the start of a 2-byte sequence in UTF-8. - // Vector256 beforeTwoByteMarker = Vector256.Create((byte)(0xC0 - 1)); - - - for (; processedLength + 32 <= inputLength; processedLength += 32) { Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); @@ -601,34 +565,26 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe } } } - // else{ off = processedLength;} - // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; } else // Contains non-ASCII characters, we need to do non-trivial processing { - // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. + // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. - // Detect start of 4-byte sequences. - Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); - uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); - - // Detect start of 3-byte sequences (including those that start 4-byte sequences). - Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); - uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); - - // Detect start of 2-byte sequences (including those that start 3-byte and 4-byte sequences). - Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); - uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); - - // Calculate counts by isolating each type. - uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. - uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. + Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); + Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); + Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); + uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); + uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); + uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); + // Calculate counts by isolating each type. + uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. + uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; @@ -652,67 +608,24 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; - - int off = processedLength >= 32 ? processedLength : processedLength; - - // Console.WriteLine("This is off :" + off); - // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); - // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - - // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer,processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - // Adjustments not to double count - // TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2; - // TempUtf16CodeUnitCountAdjustment += (int)twoByteCount; - // TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2; - // TempScalarCountAdjustment += (int)fourByteCount; + int off = processedLength >= 32 ? processedLength: processedLength; + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; - - return invalidBytePointer; } - // Adjustments - TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; - TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; - TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; - TempScalarCountAdjustment -= (int)fourByteCount; + // Adjustments + TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; + TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; + TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; + TempScalarCountAdjustment -= (int)fourByteCount; prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); } } - - if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) - { - - // Console.WriteLine("----Checkpoint 2:SIMD rewind"); - // We have an unterminated sequence. - processedLength -= 3; - for(int k = 0; k < 3; k++) - { - - int candidateByte = pInputBuffer[processedLength + k]; - if ((candidateByte & 0b11000000) == 0b11000000) - { - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - TempScalarCountAdjustment += 1; - } - } - } - } } } @@ -750,8 +663,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; } - - // processedLength += k; break; } } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index e36ab08..517bd30 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -435,9 +435,19 @@ public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) { byte oldByte = utf8[i]; utf8[i] = 0b11100000; // Forcing a too short error + try + { Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling. + } + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Assertion failed at index: {i}"); + PrintHexAndBinary(utf8, i); + utf8[i] = oldByte; // Restore the original byte + throw; // Rethrow the exception to fail the test. + } utf8[i] = oldByte; // Restore the original byte } } @@ -800,56 +810,6 @@ public void Invalid0xf50xffAvx2() Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - // Prints both hexadecimal and binary representations of a byte array - // static void PrintHexAndBinary(byte[] bytes) - // { - // // Convert to hexadecimal - // string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " "); - // Console.WriteLine($"Hex: {hexRepresentation}"); - - // // Convert to binary - // string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0'))); - // Console.WriteLine($"Binary: {binaryRepresentation}"); - // } - -// static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) -// { -// // Convert to hexadecimal -// Console.Write("Hex: "); -// for (int i = 0; i < bytes.Length; i++) -// { -// if (i == highlightIndex) -// { -// Console.ForegroundColor = ConsoleColor.Red; -// Console.Write($"{bytes[i]:X2} "); -// Console.ResetColor(); -// } -// else -// { -// Console.Write($"{bytes[i]:X2} "); -// } -// } -// Console.WriteLine(); // New line for readability - -// // Convert to binary -// Console.Write("Binary: "); -// for (int i = 0; i < bytes.Length; i++) -// { -// string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); -// if (i == highlightIndex) -// { -// Console.ForegroundColor = ConsoleColor.Red; -// Console.Write($"{binaryString} "); -// Console.ResetColor(); -// } -// else -// { -// Console.Write($"{binaryString} "); -// } -// } -// Console.WriteLine(); // New line for readability -// } - static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { int chunkSize = 16; // 128 bits = 16 bytes @@ -905,13 +865,8 @@ public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { - - - Console.WriteLine("Outputlength:" + outputLength); for (int trial = 0; trial < NumTrials; trial++) { - Console.WriteLine("trial:",trial); - byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) From 9e31966dbff58fff34f6ff74a8a25f5fb60e2cd5 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 9 Apr 2024 09:39:11 -0400 Subject: [PATCH 43/75] Save game --- src/UTF8.cs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/UTF8.cs b/src/UTF8.cs index 073a189..37cc6d9 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -130,8 +130,25 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe int pos = 0; int nextPos; uint codePoint = 0; + while (pos < inputLength) { + // If the next 16 bytes are ascii, we can skip them. + nextPos = pos + 16; + if (nextPos <= inputLength) + { // if it is safe to read 16 more bytes, check that they are ascii + ulong v1 = *(ulong*)pInputBuffer; + ulong v2 = *(ulong*)(pInputBuffer + 8); + ulong v = v1 | v2; + + if ((v & 0x8080808080808080) == 0) + { + pos = nextPos; + continue; + } + + } + byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { @@ -631,6 +648,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function + // worst possible case is 4 bytes, where we need to backtrack 3 bytes + // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { From ab8b95cfc6f9ea4ddb744fd87314ca0d9eb30d8f Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 9 Apr 2024 10:49:06 -0400 Subject: [PATCH 44/75] put back code deleted by mistake --- src/UTF8.cs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/UTF8.cs b/src/UTF8.cs index 37cc6d9..39ac47a 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -641,6 +641,38 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe TempScalarCountAdjustment -= (int)fourByteCount; prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); + + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) + { + // We have an unterminated sequence. + processedLength -= 3; + for(int k = 0; k < 3; k++) + { + int candidateByte = pInputBuffer[processedLength + k]; + if ((candidateByte & 0b11000000) == 0b11000000) + { + // if (k != 0) + if (true) + { + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + TempUtf16CodeUnitCountAdjustment += 2; + TempScalarCountAdjustment += 1; + } + } + processedLength += k; + break; + } + } + } } } } From 5d47527f50b88e87a3f0130aaae5887f067d7da4 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 10 Apr 2024 11:36:04 -0400 Subject: [PATCH 45/75] fix error + clearer erre name --- src/UTF8.cs | 18 +--- test/UTF8ValidationTests.cs | 173 +++++++++++------------------------- 2 files changed, 55 insertions(+), 136 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 39ac47a..eee0cbb 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -133,21 +133,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe while (pos < inputLength) { - // If the next 16 bytes are ascii, we can skip them. - nextPos = pos + 16; - if (nextPos <= inputLength) - { // if it is safe to read 16 more bytes, check that they are ascii - ulong v1 = *(ulong*)pInputBuffer; - ulong v2 = *(ulong*)(pInputBuffer + 8); - ulong v = v1 | v2; - - if ((v & 0x8080808080808080) == 0) - { - pos = nextPos; - continue; - } - } byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) @@ -651,8 +637,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe int candidateByte = pInputBuffer[processedLength + k]; if ((candidateByte & 0b11000000) == 0b11000000) { - // if (k != 0) - if (true) + if (k != 0) + // if (true) { if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 517bd30..67a0b1f 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -877,7 +877,7 @@ public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100); Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i+1,utf8ValidationDelegate)); ValidateCount(utf8,utf8ValidationDelegate); utf8[i] = old; } @@ -926,8 +926,7 @@ public void TooLargeErrorAvx() } - // TODO: improve this test - public void TooLargeErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) + public void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -941,7 +940,7 @@ public void TooLargeErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) generator.ReplaceEndOfArray(filler,toolong); Assert.False(ValidateUtf8(filler,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(filler, outputLength -1,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(filler, filler.Length - 1,utf8ValidationDelegate)); ValidateCount(filler,utf8ValidationDelegate); } @@ -952,47 +951,47 @@ public void TooLargeErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) [Fact] [Trait("Category", "scalar")] - public void TooLargeErrorAtEndScalar() + public void AsciiPlusContinuationAtEndErrorScalar() { - TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } // TODO:Uncomment when SSE is updated // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] // [Fact] // [Trait("Category", "sse")] - // public void TooLargeErrorAtEndSse() + // public void AsciiPlusContinuationAtEndErrorSse() // { - // TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); // } // TODO:Uncomment when AVX512 is updated // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] // [Trait("Category", "avx512")] - // public void TooLargeErrorAtEndAvx512() + // public void AsciiPlusContinuationAtEndErrorAvx512() // { - // TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); // } // TODO:Uncomment when Arm64 is updated // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] // [Trait("Category", "arm64")] - // public void TooLargeErrorAtEndArm64() + // public void AsciiPlusContinuationAtEndErrorArm64() // { - // TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } [Fact] [Trait("Category", "avx")] - public void TooLargeErrorAtEndAVX() + public void AsciiPlusContinuationAtEndErrorAVX() { - TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } [Fact] - public void TooLargeErrorAtEndAvx2() + public void AsciiPlusContinuationAtEndErrorAvx2() { - TooLargeErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) @@ -1268,26 +1267,6 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg } } -// void PrintDebugInfo(byte* failedByte, byte* startPtr, byte[] utf8, string source) -// { -// int failedIndex = (int)(failedByte - startPtr); -// byte failedByteValue = *failedByte; -// Console.WriteLine($"Failure in {source}: Index {failedIndex}, Byte {failedByteValue:X2}"); - -// // Print surrounding sequence, assuming 5 bytes context around the failure point -// int contextRadius = 5; -// int startContext = Math.Max(0, failedIndex - contextRadius); -// int endContext = Math.Min(utf8.Length, failedIndex + contextRadius + 1); // Include the failed byte and some after -// Console.Write("Sequence around failure point: "); -// for (int i = startContext; i < endContext; i++) -// { -// Console.Write($"{utf8[i]:X2} "); -// } -// Console.WriteLine(); -// } - - - // Helper method to calculate the actual offset and length from a Range private (int offset, int length) GetOffsetAndLength(int totalLength, Range range) { @@ -1301,47 +1280,6 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg // Define a delegate that matches the signature of the methods you want to test public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); - - - - // public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) - // { - // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; - // int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - - // var isDefaultRange = range.Equals(default(Range)); - // var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); - - // unsafe - // { - // fixed (byte* pInput = utf8) - // { - // byte* startPtr = pInput + offset; - // // Invoke the method under test. - - // DotnetUtf16Adjustment= 0; - // DotnetScalarCountAdjustment= 0; - // DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - - // SimdUnicodeUtf16Adjustment= 0; - // SimdUnicodeScalarCountAdjustment= 0; - // utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - - // // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); - // // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); - - // // Console.WriteLine("Lenght:" + utf8.Length); - // // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); - // // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - // // Console.WriteLine("___________________________________________________"); - - // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - // Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - // } - // } - // // } - // } - public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) { int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; @@ -1384,60 +1322,55 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele } - [Fact] - [Trait("Category", "Scalar")] - public void DotnetUTF16Count() - { - int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; - int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; - int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - - - foreach (int outputLength in outputLengths) - { - // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. - // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); - byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray(); - PrintHexAndBinary(utf8); - var (offset, length) = (0, utf8.Length); - - unsafe - { - fixed (byte* pInput = utf8) - { - byte* startPtr = pInput + offset; - // Invoke the method under test. + // [Fact] + // [Trait("Category", "Scalar")] + // public void DotnetUTF16Count() + // { + // int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; + // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; + // int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - DotnetUtf16Adjustment= 0; - DotnetScalarCountAdjustment= 0; - DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - SimdUnicodeUtf16Adjustment= 0; - SimdUnicodeScalarCountAdjustment= 0; - SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + // foreach (int outputLength in outputLengths) + // { + // // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. + // // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); + // byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray(); + // PrintHexAndBinary(utf8); + // var (offset, length) = (0, utf8.Length); - Console.WriteLine("Lenght:" + utf8.Length); + // unsafe + // { + // fixed (byte* pInput = utf8) + // { + // byte* startPtr = pInput + offset; + // // Invoke the method under test. - Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); - Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + // DotnetUtf16Adjustment= 0; + // DotnetScalarCountAdjustment= 0; + // DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); - Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - Console.WriteLine("___________________________________________________"); + // SimdUnicodeUtf16Adjustment= 0; + // SimdUnicodeScalarCountAdjustment= 0; + // SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + // Console.WriteLine("Lenght:" + utf8.Length); - Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); + // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); + // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); + // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); + // Console.WriteLine("___________________________________________________"); + // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + // Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - // If your generator creates specific patterns or the utility calculates these adjustments differently, - // you'll need to adjust the expected values accordingly. - } - } - } - } + // } + // } + // } + // } } From 92b7e3b66964fdced35590e4a0611d509f5355d5 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 21 Apr 2024 21:42:50 -0400 Subject: [PATCH 46/75] Addad seems tests --- src/UTF8.cs | 345 +++++++++++++++++++++--------------- test/UTF8ValidationTests.cs | 110 +++++++++++- 2 files changed, 311 insertions(+), 144 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index eee0cbb..d55103a 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -3,50 +3,59 @@ using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.Arm; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace SimdUnicode { public static class UTF8 { + static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0'); + + public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { - + Console.WriteLine("--Rewind Validate with Errors"); int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; int extraLen = 0; bool foundLeadingBytes = false; - for (int i = 0; i <= howFarBack; i++) - { - byte candidateByte = buf[0 - i]; - foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - if (foundLeadingBytes) - { - if (i == 0) {break;} - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // adjustment to avoid double counting - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - // Console.WriteLine("Found 2 byte"); - TempUtf16CodeUnitCountAdjustment += 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - // Console.WriteLine("Found 3 byte"); - TempUtf16CodeUnitCountAdjustment += 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - // Console.WriteLine("Found 4 byte"); - TempUtf16CodeUnitCountAdjustment += 2; - TempScalarCountAdjustment += 1; - } - break; - } - } - + // TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length + // Even with no errors, it sometime double counts, why.. ? because it goes back even further + // even though the scalar doesnt thread + // adjust for double counting + // for (int i = 0; i <= howFarBack; i++) + // { + // byte candidateByte = buf[0 - i]; + // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + // if (foundLeadingBytes) + // { + // // if (i == 0) {break;} + // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // // adjustment to avoid double counting + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // // Console.WriteLine("Found 2 byte"); + // TempUtf16CodeUnitCountAdjustment += 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // // Console.WriteLine("Found 3 byte"); + // TempUtf16CodeUnitCountAdjustment += 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // // Console.WriteLine("Found 4 byte"); + // TempUtf16CodeUnitCountAdjustment += 2; + // TempScalarCountAdjustment += 1; + // } + // break; + // } + // } for (int i = 0; i <= howFarBack; i++) { @@ -56,6 +65,8 @@ public static class UTF8 { buf -= i; extraLen = i; + Console.WriteLine(howFarBack); + Console.WriteLine("Backed up " + i + 1 + " bytes"); break; } } @@ -74,56 +85,25 @@ public static class UTF8 // Now buf points to the start of a UTF-8 sequence or the start of the buffer. // Validate from this new start point with the adjusted length. + + // TODO:figure out why calling SIMD here breaks the tests filter.This just breaks stuff?!?!?! byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; - // Console.WriteLine("utf16count after rewint(Temp):" + TempUtf16CodeUnitCountAdjustment); - // Console.WriteLine("scalarcount after rewint:" + TempScalarCountAdjustment); - - // Console.WriteLine("utf16count after rewint(Scalar):" + TailUtf16CodeUnitCountAdjustment); - // Console.WriteLine("scalarcount after rewint:" + TailScalarCountAdjustment); + Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment); + Console.WriteLine(" "); + Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment); return invalidBytePointer; } - public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippedBytes, - ref int utf16CodeUnitCountAdjustment, - ref int scalarCountAdjustment, - bool shouldAdd = false) - { - int adjustmentFactor = shouldAdd ? 1 : -1; - - // for (int i = 0; i < skippedBytes; i++) - for (int i = 0; i < 3; i++) - { - byte currentByte = *(pInputBuffer + i); - if (currentByte >= 0xC0 && currentByte < 0xE0) - { - // 2-byte sequence - utf16CodeUnitCountAdjustment += 1 * adjustmentFactor; - } - else if (currentByte >= 0xE0 && currentByte < 0xF0) - { - // 3-byte sequence - utf16CodeUnitCountAdjustment += 2 * adjustmentFactor; - scalarCountAdjustment += 1 * adjustmentFactor; // Assuming each 3-byte sequence translates to one scalar. - } - else if (currentByte >= 0xF0) - { - // 4-byte sequence - utf16CodeUnitCountAdjustment += 2 * adjustmentFactor; // Two UTF-16 code units for each 4-byte sequence. - scalarCountAdjustment += 1 * adjustmentFactor; // One scalar for each 4-byte sequence. - } - // Adjust for other conditions as necessary - } - } - public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -134,7 +114,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe while (pos < inputLength) { - byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { @@ -409,7 +388,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - // Console.WriteLine("--------------------------Calling function----------------------------------"); + Console.WriteLine("--------------------------Calling function----------------------------------"); + Console.WriteLine("Length: " + inputLength); int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -438,6 +418,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe } } + Console.WriteLine("asciirun bytes: ", asciirun); processedLength = asciirun; @@ -535,59 +516,68 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe { // TODO/think about : this path iss not explicitly tested - // Console.WriteLine("----Checkpoint 1:All ASCII need rewind"); + Console.WriteLine("----All ASCII need rewind"); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; // int off = processedLength >= 3 ? processedLength - 3 : processedLength; int off = processedLength; - if (processedLength >= 32 + 3){ - off = processedLength -32 - 3; - int overlapCount =3; - for(int k = 0; k < overlapCount; k++) - { - - int candidateByte = pInputBuffer[processedLength + k]; - if ((candidateByte & 0b11000000) == 0b11000000) - { - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - TempScalarCountAdjustment += 1; - } - } - } - } + // No need to count + + // if (processedLength >= 32 + 3){ + // off = processedLength -32 - 3; + // int overlapCount =3; + // for(int k = 0; k < overlapCount; k++) + // { + + // int candidateByte = pInputBuffer[processedLength + k]; + // if ((candidateByte & 0b11000000) == 0b11000000) + // { + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 2; + // TempScalarCountAdjustment += 1; + // } + // } + // } + // } + + + // TODO this needs S return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; } else // Contains non-ASCII characters, we need to do non-trivial processing { - // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. + Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); - Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); - Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); - Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); + // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. + // TODO:integrate this better with the rest of the code + // Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); + // Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); + // Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); - uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); - uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); - uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); + // uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); + // uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); + // uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); // Calculate counts by isolating each type. - uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. - uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. + // uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. + // uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. + + Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; @@ -600,25 +590,44 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe Vector256 sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high); Vector256 prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2)); Vector256 prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3)); + + Vector256 isSecondByte = Avx2.SubtractSaturate(prev3, secondByte); Vector256 isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte); + + Vector256 isThirdByteAdjustment = Avx2.SubtractSaturate(prev3, thirdByte); + Vector256 isFourthByte = Avx2.SubtractSaturate(prev3, fourthByte); + + uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isSecondByte)); + uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isThirdByteAdjustment)); + uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isFourthByte)); + + uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. + uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. + Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) //context: we are dealing with a 32 bit { - // Console.WriteLine("-----Error path!!"); + Console.WriteLine("-----Error path!!"); TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; - - int off = processedLength >= 32 ? processedLength: processedLength; - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + + // TODO :I cant remember why I pu an off that does the same thing here but look intit + // int off = processedLength >= 32 ? processedLength: processedLength; + // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; - return invalidBytePointer; + Console.WriteLine("--------"); + Console.WriteLine("TempUTF16 after error rewind:"+ utf16CodeUnitCountAdjustment); + Console.WriteLine("TempScalar '' '' '':"+ scalarCountAdjustment); + + return invalidBytePointer; } // Adjustments TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; @@ -626,19 +635,56 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; TempScalarCountAdjustment -= (int)fourByteCount; + Console.WriteLine("Doublecount(Temp) after SIMD processing:" + TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("Scalarcount after SIMD processing:" + TempScalarCountAdjustment); + + prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { // We have an unterminated sequence. + Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); processedLength -= 3; + + // int incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting = 0; + // int incompleteUtf16CodeUnitPreventDoubleCounting = 0; + + // SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, 3,out incompleteUtf16CodeUnitPreventDoubleCounting,out incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting); + + // incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting *= -1; + // incompleteUtf16CodeUnitPreventDoubleCounting *= -1; + + // TempUtf16CodeUnitCountAdjustment+= incompleteUtf16CodeUnitPreventDoubleCounting; + // TempScalarCountAdjustment+= incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting; + + + + // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting); + int backedup= 0; + for(int k = 0; k < 3; k++) { int candidateByte = pInputBuffer[processedLength + k]; if ((candidateByte & 0b11000000) == 0b11000000) { - if (k != 0) - // if (true) + backedup = 3-k; + Console.WriteLine("Backing up " + backedup +" bytes"); + + // Whatever you do, do not delete this + processedLength += k; + break; + } + } + + for(int k = backedup; k < 3; k++) + { + int candidateByte = pInputBuffer[processedLength - k]; + if ((candidateByte & 0b11000000) == 0b11000000) + { + // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3 + // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error + // if (k != 0) { if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { @@ -650,20 +696,28 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe } if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { + Console.WriteLine("Found 4-byte"); TempUtf16CodeUnitCountAdjustment += 2; TempScalarCountAdjustment += 1; } + // break; + } - processedLength += k; - break; } } + + + Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); + Console.WriteLine("-----------------"); } } } } } + Console.WriteLine("-Done with SIMD part!"); + // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function // worst possible case is 4 bytes, where we need to backtrack 3 bytes @@ -671,38 +725,47 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe if (processedLength < inputLength) { - // Console.WriteLine("----Process remaining Scalar"); + Console.WriteLine("----Process remaining Scalar"); + // Console.WriteLine("processed length before:" + processedLength); int overlapCount = 0; // // We need to possibly backtrack to the start of the last code point while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) { processedLength -= 1; - overlapCount +=1; - } + // overlapCount +=1; + } + + // Console.WriteLine("processed length after:" + processedLength); + + + // Best use rewind I think + // for(int k = 0; k < overlapCount; k++) + // { + // // There is no error here hence the loop is straigthforward and we avoid double counting every byte + // int candidateByte = pInputBuffer[processedLength + k]; + // if ((candidateByte & 0b11000000) == 0b11000000) + // { + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 2; + // TempScalarCountAdjustment += 1; + // } + // break; + // } + // } + + // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); + // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); - for(int k = 0; k < overlapCount; k++) - { - - int candidateByte = pInputBuffer[processedLength + k]; - if ((candidateByte & 0b11000000) == 0b11000000) - { - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - TempScalarCountAdjustment += 1; - } - break; - } - } byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) @@ -713,6 +776,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe // An invalid byte was found by the scalar function return invalidBytePointer; } + + // Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); + // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + } utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 67a0b1f..5aec3f2 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -7,7 +7,9 @@ namespace tests; using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.Arm; using BenchmarkDotNet.Disassemblers; +using Iced.Intel; +// TODO: add test for unterminated sequeqce happeqiqg at SIMD transition public unsafe class Utf8SIMDValidationTests { @@ -249,8 +251,18 @@ public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) byte[] utf8 = generator.Generate(outputLength).ToArray(); bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); - Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); - ValidateCount(utf8,utf8ValidationDelegate); + try + { + Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); + Assert.True(InvalidateUtf8(utf8, outputLength,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); + } + catch (Xunit.Sdk.XunitException) + { + // Console.WriteLine($"Assertion failed at index: "); + PrintHexAndBinary(utf8); + throw; // Rethrow the exception to fail the test. + } } } } @@ -356,6 +368,88 @@ public void NoErrorSpecificByteCountAVX() NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate) + { + // foreach (int outputLength in outputLengths) + { + int outputLength = 256; + for (int trial = 0; trial < NumTrials; trial++) + { + + + // var allAscii = generator.Generate(outputLength,1); + var allAscii = new List(Enumerable.Repeat((byte)0, 256)); + int firstcodeLength = rand.Next(2,5); + int secondcodeLength = rand.Next(2,5); + List singlebytes = generator.Generate(1,firstcodeLength);//recall:generate a utf8 code between 2 and 4 bytes + List secondbyte = generator.Generate(1,secondcodeLength); + singlebytes.AddRange(secondbyte); + + int incompleteLocation = 127 - rand.Next(1,firstcodeLength + secondcodeLength); + allAscii.InsertRange(incompleteLocation,singlebytes); + + var utf8 = allAscii.ToArray(); + Console.WriteLine("---------------New trial"); + // PrintHexAndBinary(utf8,incompleteLocation); + + bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); + string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); + try + { + Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); + Assert.True(InvalidateUtf8(utf8, outputLength,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); + } + catch (Xunit.Sdk.XunitException) + { + // Console.WriteLine($"Assertion failed at index: "); + PrintHexAndBinary(utf8,incompleteLocation); + throw; // Rethrow the exception to fail the test. + } + } + } + } + + + [Fact] + [Trait("Category", "scalar")] + public void NoErrorIncompleteAt256VectorScalar() + { + NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void NoErrorIncompleteAt256VectorSse() + // { + // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void NoErrorIncompleteAt256VectorAvx512() + // { + // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void NoErrorIncompleteAt256VectorArm64() + // { + // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void NoErrorIncompleteAt256VectorAVX() + { + NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) @@ -850,6 +944,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) Console.Write($"{binaryString} "); Console.ResetColor(); } + else if (i % (chunkSize * 2) == 0) // print green every 256 bytes + { + Console.ForegroundColor = ConsoleColor.Green; + Console.Write($"{binaryString} "); + Console.ResetColor(); + } else { Console.Write($"{binaryString} "); @@ -1307,14 +1407,14 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele try { - Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); + // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); } catch (Exception) { // Upon failure, print the utf8 array for inspection - Console.WriteLine("Assertion failed. Inspecting utf8 array:"); - PrintHexAndBinary(utf8,failureIndex); + Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); + // PrintHexAndBinary(utf8,failureIndex); throw; // Re-throw the exception to preserve the failure state } } From cbf004dd01cc88aa3dc82fab3f5cd6961240346b Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 23 Apr 2024 09:59:45 -0400 Subject: [PATCH 47/75] Incomplete test progress (only scalarcount working ) --- src/UTF8.cs | 105 +++++++++++++++++++++--------------- test/UTF8ValidationTests.cs | 8 ++- 2 files changed, 69 insertions(+), 44 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index d55103a..09a2671 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -16,6 +16,8 @@ public static class UTF8 public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { Console.WriteLine("--Rewind Validate with Errors"); + Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); + int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; @@ -26,39 +28,42 @@ public static class UTF8 // Even with no errors, it sometime double counts, why.. ? because it goes back even further // even though the scalar doesnt thread // adjust for double counting - // for (int i = 0; i <= howFarBack; i++) - // { - // byte candidateByte = buf[0 - i]; - // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - // if (foundLeadingBytes) - // { - // // if (i == 0) {break;} - // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // // adjustment to avoid double counting - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // // Console.WriteLine("Found 2 byte"); - // TempUtf16CodeUnitCountAdjustment += 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // // Console.WriteLine("Found 3 byte"); - // TempUtf16CodeUnitCountAdjustment += 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // // Console.WriteLine("Found 4 byte"); - // TempUtf16CodeUnitCountAdjustment += 2; - // TempScalarCountAdjustment += 1; - // } - // break; - // } - // } + // for (int i = 0; i <= howFarBack; i++) + for (int i = 0; i <= howFarBack; i++) + { + if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + if (foundLeadingBytes) + { + + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // adjustment to avoid double counting + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + // Console.WriteLine("Found 2 byte"); + TempUtf16CodeUnitCountAdjustment += 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + // Console.WriteLine("Found 3 byte"); + TempUtf16CodeUnitCountAdjustment += 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + // Console.WriteLine("Found 4 byte"); + TempUtf16CodeUnitCountAdjustment += 2; + TempScalarCountAdjustment += 1; + } + break; + } + } for (int i = 0; i <= howFarBack; i++) { + Console.WriteLine("backup stat:" + i); byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) @@ -66,7 +71,10 @@ public static class UTF8 buf -= i; extraLen = i; Console.WriteLine(howFarBack); - Console.WriteLine("Backed up " + i + 1 + " bytes"); + Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + + // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); break; } } @@ -663,21 +671,31 @@ public static class UTF8 // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting); int backedup= 0; + int currentByte = pInputBuffer[processedLength]; + Console.WriteLine("CurrentByte:" + Convert.ToString(currentByte, 2).PadLeft(8, '0')); + for(int k = 0; k < 3; k++) { int candidateByte = pInputBuffer[processedLength + k]; + Console.WriteLine("Backing up " + k +" bytes"); + Console.WriteLine("CurrentByte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + backedup = 3-k +1; + // TODO: + // the weird + 1 is so I dont have to put an else to the conditional below + // less readable, there might be a more elegant way to rewrite it but I am taking the path of convenience for now + if ((candidateByte & 0b11000000) == 0b11000000) { - backedup = 3-k; - Console.WriteLine("Backing up " + backedup +" bytes"); - // Whatever you do, do not delete this processedLength += k; break; } } - for(int k = backedup; k < 3; k++) + Console.WriteLine("Backed up " + backedup +" bytes"); + + for(int k = backedup; k < 3 ; k++) { int candidateByte = pInputBuffer[processedLength - k]; if ((candidateByte & 0b11000000) == 0b11000000) @@ -726,20 +744,20 @@ public static class UTF8 { Console.WriteLine("----Process remaining Scalar"); - // Console.WriteLine("processed length before:" + processedLength); + Console.WriteLine("processed length before:" + processedLength); int overlapCount = 0; // // We need to possibly backtrack to the start of the last code point while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) { processedLength -= 1; - // overlapCount +=1; + overlapCount +=1; } - // Console.WriteLine("processed length after:" + processedLength); + Console.WriteLine("processed length after backtrack:" + processedLength); - // Best use rewind I think + // TOCHECK:See if rewind is better here // for(int k = 0; k < overlapCount; k++) // { // // There is no error here hence the loop is straigthforward and we avoid double counting every byte @@ -763,11 +781,12 @@ public static class UTF8 // } // } - // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); - // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; @@ -777,8 +796,8 @@ public static class UTF8 return invalidBytePointer; } - // Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); - // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 5aec3f2..38c8e99 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -921,6 +921,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) Console.Write($"{bytes[i]:X2} "); Console.ResetColor(); } + else if (i % (chunkSize * 2) == 0) // print green every 256 bytes + { + Console.ForegroundColor = ConsoleColor.Green; + Console.Write($"{bytes[i]:X2} "); + Console.ResetColor(); + } else { Console.Write($"{bytes[i]:X2} "); @@ -1408,7 +1414,7 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele try { Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); } catch (Exception) { From f3f2f9d6f4145af890a9549993d587778542b119 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Thu, 25 Apr 2024 12:48:53 -0400 Subject: [PATCH 48/75] incomplete test working --- src/UTF8.cs | 189 +++++++++++++++++++++++++++++++----- test/UTF8ValidationTests.cs | 4 +- 2 files changed, 167 insertions(+), 26 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 09a2671..99c2ef9 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -24,38 +24,133 @@ public static class UTF8 int extraLen = 0; bool foundLeadingBytes = false; + // this is the generic function called when there is an error: // TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length // Even with no errors, it sometime double counts, why.. ? because it goes back even further // even though the scalar doesnt thread // adjust for double counting // for (int i = 0; i <= howFarBack; i++) - for (int i = 0; i <= howFarBack; i++) + // { + // if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior + // // TODO: written like this for readability, I know its ugly so this needs to be rewritten + // byte candidateByte = buf[0 - i]; + // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + // if (foundLeadingBytes) + // { + + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // // adjustment to avoid double counting + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // // Console.WriteLine("Found 2 byte"); + // TempUtf16CodeUnitCountAdjustment += 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // // Console.WriteLine("Found 3 byte"); + // TempUtf16CodeUnitCountAdjustment += 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // // Console.WriteLine("Found 4 byte"); + // TempUtf16CodeUnitCountAdjustment += 2; + // TempScalarCountAdjustment += 1; + // } + // break; + // } + // } + + for (int i = 0; i <= howFarBack; i++) + { + Console.WriteLine("backup stat:" + i); + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + if (foundLeadingBytes) + { + buf -= i; + extraLen = i; + Console.WriteLine(howFarBack); + Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); + break; + } + } + + + if (!foundLeadingBytes) + { + return buf - howFarBack; + } + + utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TempScalarCountAdjustment; + + int TailUtf16CodeUnitCountAdjustment = 0; + int TailScalarCountAdjustment = 0; + + // Now buf points to the start of a UTF-8 sequence or the start of the buffer. + // Validate from this new start point with the adjusted length. + + // TODO:figure out why calling SIMD here breaks the tests filter.This just breaks stuff?!?!?! + byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); + + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TailScalarCountAdjustment; + + Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment); + Console.WriteLine(" "); + Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment); + Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment); + + return invalidBytePointer; + } + + // I seperate this function as for the tail, we know that there has been no error thus far: but remember the SIMD + // function calculates + public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + { + Console.WriteLine("--Rewind Validate with Errors Remaining"); + Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); + + int TempUtf16CodeUnitCountAdjustment = 0; + int TempScalarCountAdjustment = 0; + + int extraLen = 0; + bool foundLeadingBytes = false; + + for (int i = 0; i <= 3; i++) { if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior + // TODO: written like this for readability, I know its ugly so this needs to be rewritten byte candidateByte = buf[0 - i]; + Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2")); + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) { - - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); // adjustment to avoid double counting if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { - // Console.WriteLine("Found 2 byte"); - TempUtf16CodeUnitCountAdjustment += 1; + Console.WriteLine("Found 2 byte"); + TempUtf16CodeUnitCountAdjustment -= 1; } if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence { - // Console.WriteLine("Found 3 byte"); - TempUtf16CodeUnitCountAdjustment += 2; + Console.WriteLine("Found 3 byte"); + TempUtf16CodeUnitCountAdjustment -= 2; } if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { - // Console.WriteLine("Found 4 byte"); - TempUtf16CodeUnitCountAdjustment += 2; - TempScalarCountAdjustment += 1; + Console.WriteLine("Found 4 byte"); + TempUtf16CodeUnitCountAdjustment -= 2; + TempScalarCountAdjustment -= 1; } break; } @@ -73,7 +168,6 @@ public static class UTF8 Console.WriteLine(howFarBack); Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); break; } @@ -109,6 +203,7 @@ public static class UTF8 return invalidBytePointer; } + public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { @@ -511,8 +606,18 @@ public static class UTF8 Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); + bool prevWasSimd = false; + for (; processedLength + 32 <= inputLength; processedLength += 32) { + + + + // TODO: there is a problem with the fastpath : namely that if it is followed by a vector with all ascii, + // there is a gap where + // this is because + // Now we have 2 choices : either still use prev3 to count dutf and check if there is a gap here OR + // Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); int mask = Avx2.MoveMask(currentBlock); @@ -520,6 +625,38 @@ public static class UTF8 { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. + + if (prevWasSimd){ // recall that the non ascii simd checks counts the adjustment on prev3, hence we need to backtrack in case the + // it was called + Console.WriteLine("--prev was simd!"); + for(int k = 1; k <= 3 ; k++) // we dont want to double count the current byte + { + int candidateByte = pInputBuffer[processedLength - k]; + if ((candidateByte & 0b11000000) == 0b11000000) + { + { + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment -= 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment -= 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + Console.WriteLine("Found 4-byte"); + TempUtf16CodeUnitCountAdjustment -= 2; + TempScalarCountAdjustment -= 1; + } + // break; + + } + } + } + + } + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { @@ -562,14 +699,15 @@ public static class UTF8 // } - // TODO this needs S return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; + prevWasSimd = false; } else // Contains non-ASCII characters, we need to do non-trivial processing { Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); + prevWasSimd = true; // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. // TODO:integrate this better with the rest of the code @@ -616,6 +754,7 @@ public static class UTF8 Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); + if (!Avx2.TestZ(error, error)) //context: we are dealing with a 32 bit { Console.WriteLine("-----Error path!!"); @@ -676,9 +815,9 @@ public static class UTF8 for(int k = 0; k < 3; k++) { - int candidateByte = pInputBuffer[processedLength + k]; + int candidateByte = pInputBuffer[processedLength + 32 + k]; Console.WriteLine("Backing up " + k +" bytes"); - Console.WriteLine("CurrentByte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + Console.WriteLine("Byte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); backedup = 3-k +1; // TODO: @@ -743,21 +882,21 @@ public static class UTF8 if (processedLength < inputLength) { - Console.WriteLine("----Process remaining Scalar"); - Console.WriteLine("processed length before:" + processedLength); + Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes"); + // Console.WriteLine("processed length before:" + processedLength); int overlapCount = 0; // // We need to possibly backtrack to the start of the last code point - while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) - { - processedLength -= 1; - overlapCount +=1; - } + // while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) + // { + // processedLength -= 1; + // overlapCount +=1; + // } Console.WriteLine("processed length after backtrack:" + processedLength); - // TOCHECK:See if rewind is better here + // PERFORMANCE TOCHECK:See if rewind is better here // for(int k = 0; k < overlapCount; k++) // { // // There is no error here hence the loop is straigthforward and we avoid double counting every byte @@ -785,8 +924,8 @@ public static class UTF8 Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 38c8e99..a199369 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -9,7 +9,9 @@ namespace tests; using BenchmarkDotNet.Disassemblers; using Iced.Intel; -// TODO: add test for unterminated sequeqce happeqiqg at SIMD transition +// TODO: refine test for unterminated sequeqce happening at SIMD transition +// TODO: The various tests do not formally take into account the scenario where vector is all ASCII + public unsafe class Utf8SIMDValidationTests { From c5e400437d38af5175bd514741406390e7982779 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 26 Apr 2024 06:31:44 -0400 Subject: [PATCH 49/75] NoerrortestAVX working --- src/UTF8.cs | 27 ++++++++++++++++++++++++++- test/UTF8ValidationTests.cs | 13 +++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 99c2ef9..bc728fa 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -500,6 +500,9 @@ public static class UTF8 int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; + bool prevWasSimd = false; + + if (pInputBuffer == null || inputLength <= 0) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; @@ -606,7 +609,6 @@ public static class UTF8 Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); - bool prevWasSimd = false; for (; processedLength + 32 <= inputLength; processedLength += 32) { @@ -938,6 +940,29 @@ public static class UTF8 Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + } else if (processedLength == inputLength && prevWasSimd){ + for(int k = 0; k < 3; k++) + { + // There is no error here hence the loop is straigthforward and we avoid double counting every byte + int candidateByte = pInputBuffer[processedLength - k]; + if ((candidateByte & 0b11000000) == 0b11000000) + { + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + TempUtf16CodeUnitCountAdjustment -= 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + TempUtf16CodeUnitCountAdjustment -= 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + TempUtf16CodeUnitCountAdjustment -= 2; + TempScalarCountAdjustment -= 1; + } + break; + } + } } utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index a199369..87d5193 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -325,8 +325,17 @@ private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8Vali { byte[] utf8 = generator.Generate(outputLength, byteLength).ToArray(); bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); - Assert.True(isValidUtf8, $"Failure for {byteLength}-byte UTF8 of length {outputLength} in trial {trial}"); - ValidateCount(utf8,utf8ValidationDelegate); + try + { + Assert.True(isValidUtf8, $"Failure NoErrorTest. "); + ValidateCount(utf8,utf8ValidationDelegate); + } + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Test failed for {byteLength}-byte unit "); + PrintHexAndBinary(utf8); + throw; // Rethrow the exception to fail the test. + } } } } From 4802a10070589ad25aaf09fe6fe58f37f3373bc4 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 28 Apr 2024 15:03:09 -0400 Subject: [PATCH 50/75] Noerreavx test really working this time --- src/UTF8.cs | 139 +++++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 61 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index bc728fa..fdc9c90 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -111,7 +111,7 @@ public static class UTF8 // I seperate this function as for the tail, we know that there has been no error thus far: but remember the SIMD // function calculates - public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment,bool prevWasUnterminated = false) { Console.WriteLine("--Rewind Validate with Errors Remaining"); Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); @@ -122,40 +122,43 @@ public static class UTF8 int extraLen = 0; bool foundLeadingBytes = false; - for (int i = 0; i <= 3; i++) - { - if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior - // TODO: written like this for readability, I know its ugly so this needs to be rewritten - byte candidateByte = buf[0 - i]; - Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2")); - - foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - if (foundLeadingBytes) + // This was created in the context of incomplete tests: namely a gap is created when the SIMD vector is followed by a processremainingscalar + if (!prevWasUnterminated) // + { + for (int i = 0; i <= 3; i++) { - Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior + // TODO: written like this for readability, I know its ugly so this needs to be rewritten + byte candidateByte = buf[0 - i]; + Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2")); - // adjustment to avoid double counting - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - Console.WriteLine("Found 2 byte"); - TempUtf16CodeUnitCountAdjustment -= 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - Console.WriteLine("Found 3 byte"); - TempUtf16CodeUnitCountAdjustment -= 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + if (foundLeadingBytes) { - Console.WriteLine("Found 4 byte"); - TempUtf16CodeUnitCountAdjustment -= 2; - TempScalarCountAdjustment -= 1; + Console.WriteLine("Double counting.Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // adjustment to avoid double counting + if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + { + Console.WriteLine("Found 2 byte"); + TempUtf16CodeUnitCountAdjustment -= 1; + } + if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + { + Console.WriteLine("Found 3 byte"); + TempUtf16CodeUnitCountAdjustment -= 2; + } + if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + { + Console.WriteLine("Found 4 byte"); + TempUtf16CodeUnitCountAdjustment -= 2; + TempScalarCountAdjustment -= 1; + } + break; } - break; } } - for (int i = 0; i <= howFarBack; i++) { Console.WriteLine("backup stat:" + i); @@ -501,6 +504,7 @@ public static class UTF8 int TailUtf16CodeUnitCountAdjustment = 0; bool prevWasSimd = false; + bool prevWasUnterminated = false; if (pInputBuffer == null || inputLength <= 0) @@ -836,39 +840,42 @@ public static class UTF8 Console.WriteLine("Backed up " + backedup +" bytes"); - for(int k = backedup; k < 3 ; k++) - { - int candidateByte = pInputBuffer[processedLength - k]; - if ((candidateByte & 0b11000000) == 0b11000000) - { - // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3 - // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error - // if (k != 0) - { - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - TempUtf16CodeUnitCountAdjustment += 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - Console.WriteLine("Found 4-byte"); - TempUtf16CodeUnitCountAdjustment += 2; - TempScalarCountAdjustment += 1; - } - // break; + // for(int k = backedup; k < 3 ; k++) + // { + // int candidateByte = pInputBuffer[processedLength - k]; + // if ((candidateByte & 0b11000000) == 0b11000000) + // { + // // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3 + // // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error + // // if (k != 0) + // { + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // TempUtf16CodeUnitCountAdjustment += 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // Console.WriteLine("Found 4-byte"); + // TempUtf16CodeUnitCountAdjustment += 2; + // TempScalarCountAdjustment += 1; + // } + // // break; - } - } - } + // } + // } + // } Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); Console.WriteLine("-----------------"); + + prevWasUnterminated = true; + prevWasSimd = true; } } } @@ -927,7 +934,7 @@ public static class UTF8 // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment,prevWasUnterminated); if (invalidBytePointer != pInputBuffer + inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; @@ -939,28 +946,38 @@ public static class UTF8 Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + // prevWasSimd = false; + + } + else + if (processedLength == inputLength && prevWasSimd){ // without this there is a 3 byte gap at the end + Console.Write("Closing in the gap\n"); - } else if (processedLength == inputLength && prevWasSimd){ - for(int k = 0; k < 3; k++) + for(int k = 0; k <= 3; k++) { - // There is no error here hence the loop is straigthforward and we avoid double counting every byte + + // There is no error here hence the loop is straigthforward and we avoid double counting every byte int candidateByte = pInputBuffer[processedLength - k]; if ((candidateByte & 0b11000000) == 0b11000000) { if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence { + Console.Write("Found 2 byte \n"); + TempUtf16CodeUnitCountAdjustment -= 1; } if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence { + Console.Write("Found 3 byte \n"); TempUtf16CodeUnitCountAdjustment -= 2; } if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence { + Console.Write("Found 4 byte \n"); TempUtf16CodeUnitCountAdjustment -= 2; TempScalarCountAdjustment -= 1; } - break; + // break; } } } From 73ecbf05620a0357f99fb191258e72e991155bea Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 1 May 2024 16:33:57 -0400 Subject: [PATCH 51/75] Some buggy attempts in comments --- src/UTF8.cs | 66 +++++++++++++++++++++++++++++++------ test/UTF8ValidationTests.cs | 21 ++++++++++-- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index fdc9c90..ef3c770 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -13,7 +13,7 @@ public static class UTF8 static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0'); - public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment,bool prevWasSimd=false) { Console.WriteLine("--Rewind Validate with Errors"); Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); @@ -23,12 +23,50 @@ public static class UTF8 int extraLen = 0; bool foundLeadingBytes = false; + // Console.WriteLine(prevWasSimd); + + // adjust for filling in gap + // If an error is found, since we start counting tho adjustments on prev3, a gap is left that needs to be counted in case the previous operation was using SIMD + if (prevWasSimd) + { + // Console.WriteLine("Triggering Negative adjustment!"); + // for (int i = 0; i <= 3; i++) + // { + // if (i == 0){continue;}; // we dont want to dbouble count current byte + // byte candidateByte = buf[0 - i]; + // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + // // if (i==0 & foundLeadingBytes){break;};// We dont want to + // // TODO: written like this for readability, I know its ugly so this needs to be rewritten + + // if (foundLeadingBytes) + // { + + // Console.WriteLine("Negative adjstment:Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // // adjustment to avoid double counting + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // // Console.WriteLine("Found 2 byte"); + // TempUtf16CodeUnitCountAdjustment -= 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // // Console.WriteLine("Found 3 byte"); + // TempUtf16CodeUnitCountAdjustment -= 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // // Console.WriteLine("Found 4 byte"); + // TempUtf16CodeUnitCountAdjustment -= 2; + // TempScalarCountAdjustment -= 1; + // } + // // break; + // } + // } + } + - // this is the generic function called when there is an error: - // TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length - // Even with no errors, it sometime double counts, why.. ? because it goes back even further - // even though the scalar doesnt thread - // adjust for double counting // for (int i = 0; i <= howFarBack; i++) // { // if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior @@ -713,7 +751,7 @@ public static class UTF8 else // Contains non-ASCII characters, we need to do non-trivial processing { Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); - prevWasSimd = true; + prevWasSimd = true; // consider moving this somewhere else // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. // TODO:integrate this better with the rest of the code @@ -766,11 +804,19 @@ public static class UTF8 Console.WriteLine("-----Error path!!"); TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; + int off= 32; + + // if (processedLength <32) // not enough bytes to load into SIMD! + // { + // // off = 0; + // prevWasSimd = false; // there was no previous op at all, let alone SIMD one + // } - // TODO :I cant remember why I pu an off that does the same thing here but look intit - // int off = processedLength >= 32 ? processedLength: processedLength; + + // int off = processedLength >= 32 ? processedLength: 0; // we check if there + // without this there is an overflow if // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment,prevWasSimd); utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 87d5193..ff66ced 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -11,6 +11,7 @@ namespace tests; // TODO: refine test for unterminated sequeqce happening at SIMD transition // TODO: The various tests do not formally take into account the scenario where vector is all ASCII +// TODO?: Test if the error is in the first vector? public unsafe class Utf8SIMDValidationTests { @@ -475,9 +476,23 @@ public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { byte oldByte = utf8[i]; utf8[i] = 0b11111000; // Forcing a header bits error - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + // Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + // Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + // ValidateCount(utf8,utf8ValidationDelegate); + try + { + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling. + } + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Assertion failed at index: {i}"); + PrintHexAndBinary(utf8, i); + utf8[i] = oldByte; // Restore the original byte + throw; // Rethrow the exception to fail the test. + } + utf8[i] = oldByte; // Restore the original byte } } From 520951fc778256ddc08def6162d311360769e49e Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 4 May 2024 09:54:13 -0400 Subject: [PATCH 52/75] save game --- src/UTF8.cs | 96 ++++++++++++++++++------------------- test/UTF8ValidationTests.cs | 1 + 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index ef3c770..783a7c5 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -25,48 +25,6 @@ public static class UTF8 bool foundLeadingBytes = false; // Console.WriteLine(prevWasSimd); - // adjust for filling in gap - // If an error is found, since we start counting tho adjustments on prev3, a gap is left that needs to be counted in case the previous operation was using SIMD - if (prevWasSimd) - { - // Console.WriteLine("Triggering Negative adjustment!"); - // for (int i = 0; i <= 3; i++) - // { - // if (i == 0){continue;}; // we dont want to dbouble count current byte - // byte candidateByte = buf[0 - i]; - // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - // // if (i==0 & foundLeadingBytes){break;};// We dont want to - // // TODO: written like this for readability, I know its ugly so this needs to be rewritten - - // if (foundLeadingBytes) - // { - - // Console.WriteLine("Negative adjstment:Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // // adjustment to avoid double counting - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // // Console.WriteLine("Found 2 byte"); - // TempUtf16CodeUnitCountAdjustment -= 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // // Console.WriteLine("Found 3 byte"); - // TempUtf16CodeUnitCountAdjustment -= 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // // Console.WriteLine("Found 4 byte"); - // TempUtf16CodeUnitCountAdjustment -= 2; - // TempScalarCountAdjustment -= 1; - // } - // // break; - // } - // } - } - - // for (int i = 0; i <= howFarBack; i++) // { // if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior @@ -102,13 +60,13 @@ public static class UTF8 for (int i = 0; i <= howFarBack; i++) { - Console.WriteLine("backup stat:" + i); + Console.WriteLine("Activiting main backup:" + i); byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) { buf -= i; - extraLen = i; + extraLen = i; // a measure of how far we've backed up Console.WriteLine(howFarBack); Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); @@ -117,6 +75,49 @@ public static class UTF8 } } + // adjust for filling in gap + // If an error is found, since we start counting tho adjustments on prev3, a gap is left that needs to be counted in case the previous operation was using SIMD + // if (prevWasSimd) + // { + // Console.WriteLine("Triggering Negative adjustment!"); + // for (int i = extraLen + 1; i <= extraLen + 3; i++) + // { + // // if (i == 0){continue;}; // we dont want to double count current byte + // byte candidateByte = buf[0 - i]; + // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + // // Console.WriteLine("Exmining byte...:" + candidateByte.ToString("X2")); + + // // if (i==0 & foundLeadingBytes){break;};// We dont want to + // // TODO: written like this for readability, I know its ugly so this needs to be rewritten + + // if (foundLeadingBytes) + // { + + // Console.WriteLine("Negative adjstment:Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); + // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // // adjustment to avoid double counting + // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence + // { + // // Console.WriteLine("Found 2 byte"); + // TempUtf16CodeUnitCountAdjustment -= 1; + // } + // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence + // { + // // Console.WriteLine("Found 3 byte"); + // TempUtf16CodeUnitCountAdjustment -= 2; + // } + // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence + // { + // // Console.WriteLine("Found 4 byte"); + // TempUtf16CodeUnitCountAdjustment -= 2; + // TempScalarCountAdjustment -= 1; + // } + // // break; + // } + // } + // } + if (!foundLeadingBytes) { @@ -804,14 +805,13 @@ public static class UTF8 Console.WriteLine("-----Error path!!"); TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; - int off= 32; + // int off= 32; - // if (processedLength <32) // not enough bytes to load into SIMD! + // if (processedLength <32) // // { // // off = 0; - // prevWasSimd = false; // there was no previous op at all, let alone SIMD one + // prevWasSimd = false; // not enough bytes to load into SIMD! there was no previous op at all, let alone SIMD one // } - // int off = processedLength >= 32 ? processedLength: 0; // we check if there // without this there is an overflow if diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index ff66ced..a17bedc 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -12,6 +12,7 @@ namespace tests; // TODO: refine test for unterminated sequeqce happening at SIMD transition // TODO: The various tests do not formally take into account the scenario where vector is all ASCII // TODO?: Test if the error is in the first vector? +// TODO:fix NoError,Ingomplete (some of the tests are wrong) public unsafe class Utf8SIMDValidationTests { From 3b0cb73da6027057b4283d8f65c2ebdba44352ee Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 7 May 2024 09:09:01 -0400 Subject: [PATCH 53/75] Main algo is working .this is the baseline --- src/UTF8.cs | 648 +++++----------------------------------------------- 1 file changed, 63 insertions(+), 585 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 783a7c5..5e3059f 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -3,262 +3,53 @@ using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.Arm; using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; namespace SimdUnicode { public static class UTF8 { - static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0'); - - - public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment,bool prevWasSimd=false) + public unsafe static byte* RewindAndValidateWithErrors(int priorBytes, byte* buf, int len) { - Console.WriteLine("--Rewind Validate with Errors"); - Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); - - int TempUtf16CodeUnitCountAdjustment = 0; - int TempScalarCountAdjustment = 0; - + int howFarBack = priorBytes; int extraLen = 0; bool foundLeadingBytes = false; - // Console.WriteLine(prevWasSimd); - - // for (int i = 0; i <= howFarBack; i++) - // { - // if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior - // // TODO: written like this for readability, I know its ugly so this needs to be rewritten - // byte candidateByte = buf[0 - i]; - // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - // if (foundLeadingBytes) - // { - - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // // adjustment to avoid double counting - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // // Console.WriteLine("Found 2 byte"); - // TempUtf16CodeUnitCountAdjustment += 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // // Console.WriteLine("Found 3 byte"); - // TempUtf16CodeUnitCountAdjustment += 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // // Console.WriteLine("Found 4 byte"); - // TempUtf16CodeUnitCountAdjustment += 2; - // TempScalarCountAdjustment += 1; - // } - // break; - // } - // } - for (int i = 0; i <= howFarBack; i++) { - Console.WriteLine("Activiting main backup:" + i); - byte candidateByte = buf[0 - i]; - foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + byte b = buf[0 - i]; + foundLeadingBytes = ((b & 0b11000000) != 0b10000000); if (foundLeadingBytes) - { - buf -= i; - extraLen = i; // a measure of how far we've backed up - Console.WriteLine(howFarBack); - Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); - break; - } - } - - // adjust for filling in gap - // If an error is found, since we start counting tho adjustments on prev3, a gap is left that needs to be counted in case the previous operation was using SIMD - // if (prevWasSimd) - // { - // Console.WriteLine("Triggering Negative adjustment!"); - // for (int i = extraLen + 1; i <= extraLen + 3; i++) - // { - // // if (i == 0){continue;}; // we dont want to double count current byte - // byte candidateByte = buf[0 - i]; - // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - // // Console.WriteLine("Exmining byte...:" + candidateByte.ToString("X2")); - - // // if (i==0 & foundLeadingBytes){break;};// We dont want to - // // TODO: written like this for readability, I know its ugly so this needs to be rewritten - - // if (foundLeadingBytes) - // { - - // Console.WriteLine("Negative adjstment:Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // // adjustment to avoid double counting - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // // Console.WriteLine("Found 2 byte"); - // TempUtf16CodeUnitCountAdjustment -= 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // // Console.WriteLine("Found 3 byte"); - // TempUtf16CodeUnitCountAdjustment -= 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // // Console.WriteLine("Found 4 byte"); - // TempUtf16CodeUnitCountAdjustment -= 2; - // TempScalarCountAdjustment -= 1; - // } - // // break; - // } - // } - // } - - - if (!foundLeadingBytes) - { - return buf - howFarBack; - } - - utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TempScalarCountAdjustment; - - int TailUtf16CodeUnitCountAdjustment = 0; - int TailScalarCountAdjustment = 0; - - // Now buf points to the start of a UTF-8 sequence or the start of the buffer. - // Validate from this new start point with the adjusted length. - - // TODO:figure out why calling SIMD here breaks the tests filter.This just breaks stuff?!?!?! - byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - - utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TailScalarCountAdjustment; - - Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment); - Console.WriteLine(" "); - Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment); - - return invalidBytePointer; - } - - // I seperate this function as for the tail, we know that there has been no error thus far: but remember the SIMD - // function calculates - public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment,bool prevWasUnterminated = false) - { - Console.WriteLine("--Rewind Validate with Errors Remaining"); - Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); - - int TempUtf16CodeUnitCountAdjustment = 0; - int TempScalarCountAdjustment = 0; - - int extraLen = 0; - bool foundLeadingBytes = false; - - // This was created in the context of incomplete tests: namely a gap is created when the SIMD vector is followed by a processremainingscalar - if (!prevWasUnterminated) // - { - for (int i = 0; i <= 3; i++) { - if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior - // TODO: written like this for readability, I know its ugly so this needs to be rewritten - byte candidateByte = buf[0 - i]; - Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2")); - - foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - if (foundLeadingBytes) - { - Console.WriteLine("Double counting.Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2")); - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // adjustment to avoid double counting - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - Console.WriteLine("Found 2 byte"); - TempUtf16CodeUnitCountAdjustment -= 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - Console.WriteLine("Found 3 byte"); - TempUtf16CodeUnitCountAdjustment -= 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - Console.WriteLine("Found 4 byte"); - TempUtf16CodeUnitCountAdjustment -= 2; - TempScalarCountAdjustment -= 1; - } - break; - } - } - } - for (int i = 0; i <= howFarBack; i++) - { - Console.WriteLine("backup stat:" + i); - byte candidateByte = buf[0 - i]; - foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - if (foundLeadingBytes) - { buf -= i; extraLen = i; - Console.WriteLine(howFarBack); - Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); break; } } - - if (!foundLeadingBytes) { return buf - howFarBack; } - utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TempScalarCountAdjustment; - - int TailUtf16CodeUnitCountAdjustment = 0; - int TailScalarCountAdjustment = 0; // Now buf points to the start of a UTF-8 sequence or the start of the buffer. // Validate from this new start point with the adjusted length. + byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); - // TODO:figure out why calling SIMD here breaks the tests filter.This just breaks stuff?!?!?! - byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - - utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TailScalarCountAdjustment; - - Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment); - Console.WriteLine(" "); - Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment); - Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment); - - return invalidBytePointer; + return invalidByte; } - public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { + int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; int pos = 0; int nextPos; uint codePoint = 0; - while (pos < inputLength) { - byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { @@ -268,6 +59,7 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + inputLength; } firstByte = pInputBuffer[pos]; + // TempUtf16CodeUnitCountAdjustment -= 1; } if ((firstByte & 0b11100000) == 0b11000000) @@ -317,10 +109,16 @@ public static class UTF8 utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } // Too short + // if (pInputBuffer[pos + 3] < 0b10000000) { + // TempUtf16CodeUnitCountAdjustment -= 1; + // } else { + // TempUtf16CodeUnitCountAdjustment -= 2; + // } TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11111000) == 0b11110000) - { + { // 0b11110000 + nextPos = pos + 4; if (nextPos > inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; @@ -347,6 +145,9 @@ public static class UTF8 return pInputBuffer + pos; } TempUtf16CodeUnitCountAdjustment -= 2; TempScalarCountAdjustment -= 1; + + + } else { @@ -377,8 +178,6 @@ public static class UTF8 { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; - int TempScalarCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) { @@ -468,7 +267,7 @@ public static class UTF8 // return pInputBuffer + processedLength; // Console.WriteLine("not ascii"); - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); } prevIncomplete = Vector128.Zero; } @@ -490,7 +289,7 @@ public static class UTF8 Vector128 error = Sse2.Xor(must23As80, sc); if (Sse2.MoveMask(error) != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); } prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue); } @@ -531,25 +330,12 @@ public static class UTF8 } - public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength) { - Console.WriteLine("--------------------------Calling function----------------------------------"); - Console.WriteLine("Length: " + inputLength); int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; - int TempScalarCountAdjustment = 0; - - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - - bool prevWasSimd = false; - bool prevWasUnterminated = false; - if (pInputBuffer == null || inputLength <= 0) { - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer; } if (inputLength > 128) @@ -565,13 +351,9 @@ public static class UTF8 { break; } - } - Console.WriteLine("asciirun bytes: ", asciirun); processedLength = asciirun; - - if (processedLength + 32 < inputLength) { // We still have work to do! @@ -645,24 +427,15 @@ public static class UTF8 TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - Vector256 secondByte = Vector256.Create((byte)(0b11000000u - 0x80)); Vector256 thirdByte = Vector256.Create((byte)(0b11100000u - 0x80)); Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); - Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); + for (; processedLength + 32 <= inputLength; processedLength += 32) { - - - - // TODO: there is a problem with the fastpath : namely that if it is followed by a vector with all ascii, - // there is a gap where - // this is because - // Now we have 2 choices : either still use prev3 to count dutf and check if there is a gap here OR - // Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); int mask = Avx2.MoveMask(currentBlock); @@ -670,106 +443,16 @@ public static class UTF8 { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. - - if (prevWasSimd){ // recall that the non ascii simd checks counts the adjustment on prev3, hence we need to backtrack in case the - // it was called - Console.WriteLine("--prev was simd!"); - for(int k = 1; k <= 3 ; k++) // we dont want to double count the current byte - { - int candidateByte = pInputBuffer[processedLength - k]; - if ((candidateByte & 0b11000000) == 0b11000000) - { - { - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - TempUtf16CodeUnitCountAdjustment -= 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - TempUtf16CodeUnitCountAdjustment -= 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - Console.WriteLine("Found 4-byte"); - TempUtf16CodeUnitCountAdjustment -= 2; - TempScalarCountAdjustment -= 1; - } - // break; - - } - } - } - - } - if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - - // TODO/think about : this path iss not explicitly tested - Console.WriteLine("----All ASCII need rewind"); - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; - - // int off = processedLength >= 3 ? processedLength - 3 : processedLength; - int off = processedLength; - - - // No need to count - - // if (processedLength >= 32 + 3){ - // off = processedLength -32 - 3; - // int overlapCount =3; - - // for(int k = 0; k < overlapCount; k++) - // { - - // int candidateByte = pInputBuffer[processedLength + k]; - // if ((candidateByte & 0b11000000) == 0b11000000) - // { - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 2; - // TempScalarCountAdjustment += 1; - // } - // } - // } - // } - - - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + int off = processedLength >= 3 ? processedLength - 3 : processedLength; + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } prevIncomplete = Vector256.Zero; - prevWasSimd = false; } - else // Contains non-ASCII characters, we need to do non-trivial processing + else { - Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); - prevWasSimd = true; // consider moving this somewhere else - - // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. - // TODO:integrate this better with the rest of the code - // Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); - // Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); - // Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); - - // uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); - // uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); - // uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); - - // Calculate counts by isolating each type. - // uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. - // uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. - - - + // Contains non-ASCII characters, we need to do non-trivial processing Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; Vector256 prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1)); @@ -781,256 +464,55 @@ public static class UTF8 Vector256 sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high); Vector256 prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2)); Vector256 prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3)); - - Vector256 isSecondByte = Avx2.SubtractSaturate(prev3, secondByte); Vector256 isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte); - - Vector256 isThirdByteAdjustment = Avx2.SubtractSaturate(prev3, thirdByte); - Vector256 isFourthByte = Avx2.SubtractSaturate(prev3, fourthByte); - - uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isSecondByte)); - uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isThirdByteAdjustment)); - uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isFourthByte)); - - uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts. - uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts. - Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); - - if (!Avx2.TestZ(error, error)) //context: we are dealing with a 32 bit + if (!Avx2.TestZ(error, error)) { - Console.WriteLine("-----Error path!!"); - TailScalarCodeUnitCountAdjustment =0; - TailUtf16CodeUnitCountAdjustment =0; - // int off= 32; - - // if (processedLength <32) // - // { - // // off = 0; - // prevWasSimd = false; // not enough bytes to load into SIMD! there was no previous op at all, let alone SIMD one - // } - - // int off = processedLength >= 32 ? processedLength: 0; // we check if there - // without this there is an overflow if - // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment,prevWasSimd); - - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; - - - Console.WriteLine("--------"); - Console.WriteLine("TempUTF16 after error rewind:"+ utf16CodeUnitCountAdjustment); - Console.WriteLine("TempScalar '' '' '':"+ scalarCountAdjustment); - - return invalidBytePointer; + int off = processedLength >= 32 ? processedLength - 32 : processedLength; + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); } - // Adjustments - TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; - TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; - TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; - TempScalarCountAdjustment -= (int)fourByteCount; - - Console.WriteLine("Doublecount(Temp) after SIMD processing:" + TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("Scalarcount after SIMD processing:" + TempScalarCountAdjustment); - - prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); + } + } - if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) + { + // We have an unterminated sequence. + processedLength -= 3; + for(int k = 0; k < 3; k++) + { + if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000) { - // We have an unterminated sequence. - Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); - processedLength -= 3; - - // int incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting = 0; - // int incompleteUtf16CodeUnitPreventDoubleCounting = 0; - - // SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, 3,out incompleteUtf16CodeUnitPreventDoubleCounting,out incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting); - - // incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting *= -1; - // incompleteUtf16CodeUnitPreventDoubleCounting *= -1; - - // TempUtf16CodeUnitCountAdjustment+= incompleteUtf16CodeUnitPreventDoubleCounting; - // TempScalarCountAdjustment+= incompleteScalarCodeUnitCountAdjustmentPreventDoubleCounting; - - - - // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting); - int backedup= 0; - - int currentByte = pInputBuffer[processedLength]; - Console.WriteLine("CurrentByte:" + Convert.ToString(currentByte, 2).PadLeft(8, '0')); - - for(int k = 0; k < 3; k++) - { - int candidateByte = pInputBuffer[processedLength + 32 + k]; - Console.WriteLine("Backing up " + k +" bytes"); - Console.WriteLine("Byte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - backedup = 3-k +1; - // TODO: - // the weird + 1 is so I dont have to put an else to the conditional below - // less readable, there might be a more elegant way to rewrite it but I am taking the path of convenience for now - - if ((candidateByte & 0b11000000) == 0b11000000) - { - // Whatever you do, do not delete this - processedLength += k; - break; - } - } - - Console.WriteLine("Backed up " + backedup +" bytes"); - - // for(int k = backedup; k < 3 ; k++) - // { - // int candidateByte = pInputBuffer[processedLength - k]; - // if ((candidateByte & 0b11000000) == 0b11000000) - // { - // // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3 - // // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error - // // if (k != 0) - // { - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // Console.WriteLine("Found 4-byte"); - // TempUtf16CodeUnitCountAdjustment += 2; - // TempScalarCountAdjustment += 1; - // } - // // break; - - // } - // } - // } - - - Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); - Console.WriteLine("-----------------"); - - prevWasUnterminated = true; - prevWasSimd = true; + processedLength += k; + break; } } } } } - Console.WriteLine("-Done with SIMD part!"); - // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function - // worst possible case is 4 bytes, where we need to backtrack 3 bytes - // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { - - Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes"); - // Console.WriteLine("processed length before:" + processedLength); - int overlapCount = 0; - - // // We need to possibly backtrack to the start of the last code point - // while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) - // { - // processedLength -= 1; - // overlapCount +=1; - // } - - Console.WriteLine("processed length after backtrack:" + processedLength); - - - // PERFORMANCE TOCHECK:See if rewind is better here - // for(int k = 0; k < overlapCount; k++) - // { - // // There is no error here hence the loop is straigthforward and we avoid double counting every byte - // int candidateByte = pInputBuffer[processedLength + k]; - // if ((candidateByte & 0b11000000) == 0b11000000) - // { - // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 1; - // } - // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 2; - // } - // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - // { - // TempUtf16CodeUnitCountAdjustment += 2; - // TempScalarCountAdjustment += 1; - // } - // break; - // } - // } - - Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); - - - // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment,prevWasUnterminated); + // We need to possibly backtrack to the start of the last code point + while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) + { + processedLength -= 1; + } + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; - // An invalid byte was found by the scalar function return invalidBytePointer; } - - Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); - Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); - // prevWasSimd = false; - - } - else - if (processedLength == inputLength && prevWasSimd){ // without this there is a 3 byte gap at the end - Console.Write("Closing in the gap\n"); - - for(int k = 0; k <= 3; k++) - { - - // There is no error here hence the loop is straigthforward and we avoid double counting every byte - int candidateByte = pInputBuffer[processedLength - k]; - if ((candidateByte & 0b11000000) == 0b11000000) - { - if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence - { - Console.Write("Found 2 byte \n"); - - TempUtf16CodeUnitCountAdjustment -= 1; - } - if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence - { - Console.Write("Found 3 byte \n"); - TempUtf16CodeUnitCountAdjustment -= 2; - } - if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence - { - Console.Write("Found 4 byte \n"); - TempUtf16CodeUnitCountAdjustment -= 2; - TempScalarCountAdjustment -= 1; - } - // break; - } - } } - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; - return pInputBuffer + inputLength; } @@ -1038,11 +520,6 @@ public static class UTF8 { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; - int TempScalarCountAdjustment = 0; - - int utf16CodeUnitCountAdjustment=0, scalarCountAdjustment=0; - if (pInputBuffer == null || inputLength <= 0) { return pInputBuffer; @@ -1124,7 +601,7 @@ public static class UTF8 // we need to check if the previous block was incomplete. if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); } prevIncomplete = Vector128.Zero; } @@ -1146,7 +623,7 @@ public static class UTF8 Vector128 error = AdvSimd.Xor(must23As80, sc); if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); } prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue); } @@ -1185,28 +662,29 @@ public static class UTF8 return pInputBuffer + inputLength; } - public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength,out int Utf16CodeUnitCountAdjustment,out int ScalarCodeUnitCountAdjustment) + public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength) { - - // if (AdvSimd.Arm64.IsSupported) - // { - // return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); - // } + if (AdvSimd.Arm64.IsSupported) + { + return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); + } if (Avx2.IsSupported) { - return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); + return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength); } /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) { return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength); }*/ - // if (Ssse3.IsSupported) - // { - // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); - // } + if (Ssse3.IsSupported) + { + return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); + } // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); - return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; + return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); } From 631195e7fd478435cf664c52fdfe866cd688ee2d Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 7 May 2024 19:17:06 -0400 Subject: [PATCH 54/75] Adjustmentfactor fx integration @ unterminated path (part 1) + minor tests fixes --- src/UTF8.cs | 324 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 252 insertions(+), 72 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 5e3059f..df5cf5a 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -3,53 +3,81 @@ using System.Runtime.Intrinsics.X86; using System.Runtime.Intrinsics.Arm; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace SimdUnicode { public static class UTF8 { - public unsafe static byte* RewindAndValidateWithErrors(int priorBytes, byte* buf, int len) + static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');//for debugging + + public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { - int howFarBack = priorBytes; + // Console.WriteLine("--Rewind Validate with Errors"); + // Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); + + int TempUtf16CodeUnitCountAdjustment = 0; + int TempScalarCountAdjustment = 0; + int extraLen = 0; bool foundLeadingBytes = false; + for (int i = 0; i <= howFarBack; i++) { - byte b = buf[0 - i]; - foundLeadingBytes = ((b & 0b11000000) != 0b10000000); + // Console.WriteLine("Activiting main backup:" + i); + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) - { + { buf -= i; - extraLen = i; + // extraLen = i; // a measure of how far we've backed up, only useful for debugging + // Console.WriteLine(howFarBack); + // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); break; } } + if (!foundLeadingBytes) { return buf - howFarBack; } + utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TempScalarCountAdjustment; + + int TailUtf16CodeUnitCountAdjustment = 0; + int TailScalarCountAdjustment = 0; + + byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - // Now buf points to the start of a UTF-8 sequence or the start of the buffer. - // Validate from this new start point with the adjusted length. - byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TailScalarCountAdjustment; - return invalidByte; + // Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment); + // Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment); + // Console.WriteLine(" "); + // Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment); + // Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment); + + return invalidBytePointer; } public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; int pos = 0; int nextPos; uint codePoint = 0; + while (pos < inputLength) { + byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { @@ -59,7 +87,6 @@ public static class UTF8 scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + inputLength; } firstByte = pInputBuffer[pos]; - // TempUtf16CodeUnitCountAdjustment -= 1; } if ((firstByte & 0b11100000) == 0b11000000) @@ -109,16 +136,10 @@ public static class UTF8 utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } // Too short - // if (pInputBuffer[pos + 3] < 0b10000000) { - // TempUtf16CodeUnitCountAdjustment -= 1; - // } else { - // TempUtf16CodeUnitCountAdjustment -= 2; - // } TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11111000) == 0b11110000) - { // 0b11110000 - + { nextPos = pos + 4; if (nextPos > inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; @@ -145,9 +166,6 @@ public static class UTF8 return pInputBuffer + pos; } TempUtf16CodeUnitCountAdjustment -= 2; TempScalarCountAdjustment -= 1; - - - } else { @@ -174,10 +192,70 @@ public static class UTF8 const byte OVERLONG_4 = 1 << 6; const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; + // Assuming that a valid UTF-8 sequence ends at pInputBuffer, + // computes how many bytes are needed (eg what type of byte) to complete the last character. also counts the number of n4, n2 and ascii affected + // This will return 1, 2, 3. If the whole byte sequence is valid UTF-8, + // and this function returns returnedvalue>0, then the bytes at pInputBuffer[0], + // ... pInputBuffer[returnedvalue - 1] should be continuation bytes. + // Note that this function is unsafe, and it is the caller's responsibility + // to ensure that we can read at least 4 bytes before pInputBuffer. + // (Nick Nuon added 7th may) there is an addenum labeled important in the mock PR however I think we can treat unterminated as + public unsafe static (int totalbyteadjustment,int i,int ascii,int n2,int n4) adjustmentFactor(byte* pInputBuffer) { + // Find the first non-continuation byte, working backward. + int i = 1; + for (; i <= 4; i++) + { + if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) + { + break; + } + } + if ((pInputBuffer[-i] & 0b10000000) == 0) { + return (0,i,-1,0,0); // We must have that i == 1 + } + if ((pInputBuffer[-i] & 0b11100000) == 0b11000000) { + return (2 - i,i,0,-1,0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. + } + if ((pInputBuffer[-i] & 0b11110000) == 0b11100000) { + return (3 - i,i,0,0,0); // We have that i == 1 or i == 2 or i == 3, if i == 1, we are missing two bytes, if i == 2, we are missing one byte. + } + // We must have that (pInputBuffer[-i] & 0b11111000) == 0b11110000 + return (4 - i,i,0,0,-1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. + } + + public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) + { + // Calculate the total bytes from start_point to processedLength + int totalbyte = processedLength - start_point; + + // Adjust the length to include a complete character, if necessary + if (totalbyte > 0) + { + var (temptotalbyte,i ,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength); + } + + // Calculate n3 based on provided formula + int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; + + // Calculate n2 based on provided formula + int n2 = -2 * asciibytes + n4 - 4 * contbytes + 2 * totalbyte; + + // TODO add them all up + + int utfadjust = -2 * n4 - 2* n3 - n2; + int scalaradjust = n4; + + // Return the calculated n2 and n3 + return (utfadjust, scalaradjust); + } + + public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength) { int processedLength = 0; + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) { @@ -258,16 +336,11 @@ public static class UTF8 int mask = Sse2.MoveMask(currentBlock); if (mask == 0) { - // Console.WriteLine("ascii"); - // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. if (Sse2.MoveMask(prevIncomplete) != 0) { - // return pInputBuffer + processedLength; - - // Console.WriteLine("not ascii"); - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); } prevIncomplete = Vector128.Zero; } @@ -289,7 +362,7 @@ public static class UTF8 Vector128 error = Sse2.Xor(must23As80, sc); if (Sse2.MoveMask(error) != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); } prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue); } @@ -330,12 +403,21 @@ public static class UTF8 } - public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength) + public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { + // Console.WriteLine("--------------------------Calling function----------------------------------"); + // Console.WriteLine("Length: " + inputLength); int processedLength = 0; + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; + + int TailScalarCodeUnitCountAdjustment = 0; + int TailUtf16CodeUnitCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer; } if (inputLength > 128) @@ -352,6 +434,7 @@ public static class UTF8 break; } } + // Console.WriteLine("asciirun bytes: ", asciirun); // debugging processedLength = asciirun; if (processedLength + 32 < inputLength) @@ -431,11 +514,43 @@ public static class UTF8 Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); - - + /**** + * So we want to count the number of 4-byte sequences, + * the number of 4-byte sequences, 3-byte sequences, and + * the number of 2-byte sequences. + * We can do it indirectly. We know how many bytes in total + * we have (length). Let us assume that the length covers + * only complete sequences (we need to adjust otherwise). + * We have that + * length = 4 * n4 + 3 * n3 + 2 * n2 + n1 + * where n1 is the number of 1-byte sequences (ASCII), + * n2 is the number of 2-byte sequences, n3 is the number + * of 3-byte sequences, and n4 is the number of 4-byte sequences. + * + * Let ncon be the number of continuation bytes, then we have + * length = n4 + n3 + n2 + ncon + n1 + * + * We can solve for n2 and n3 in terms of the other variables: + * n3 = n1 - 2 * n4 + 2 * ncon - length + * n2 = -2 * n1 + n4 - 4 * ncon + 2 * length + * Thus we only need to count the number of continuation bytes, + * the number of ASCII bytes and the number of 4-byte sequences. + */ + //////////// + // The *block* here is what begins at processedLength and ends + // at processedLength/16*16 or when an error occurs. + /////////// + int start_point = processedLength; + + // The block goes from processedLength to processedLength/16*16. + int asciibytes = 0; // number of ascii bytes in the block (could also be called n1) + int contbytes = 0; // number of continuation bytes in the block + int n4 = 0; // number of 4-byte sequences that start in this block + int totalbyte, n3, n2; for (; processedLength + 32 <= inputLength; processedLength += 32) { + Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); int mask = Avx2.MoveMask(currentBlock); @@ -445,14 +560,22 @@ public static class UTF8 // we need to check if the previous block was incomplete. if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { + + // TODO/think about : this path iss not explicitly tested + // Console.WriteLine("----All ASCII need rewind"); + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment; + int off = processedLength >= 3 ? processedLength - 3 : processedLength; - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + // int off = processedLength; + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; } - else + else // Contains non-ASCII characters, we need to do non-trivial processing { - // Contains non-ASCII characters, we need to do non-trivial processing + // Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); //debug + // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; Vector256 prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1)); @@ -471,47 +594,100 @@ public static class UTF8 Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { + // Console.WriteLine("-----Error path!!"); + TailScalarCodeUnitCountAdjustment =0; + TailUtf16CodeUnitCountAdjustment =0; + int off = processedLength >= 32 ? processedLength - 32 : processedLength; - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; + + // Console.WriteLine("--------"); //debug + // Console.WriteLine("TempUTF16 after error rewind:"+ utf16CodeUnitCountAdjustment); + // Console.WriteLine("TempScalar '' '' '':"+ scalarCountAdjustment); + + return invalidBytePointer; } + // Adjustments :TODO: + // TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; + // TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; + // TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; + // TempScalarCountAdjustment -= (int)fourByteCount; + + // Console.WriteLine("Doublecount(Temp) after SIMD processing:" + TempUtf16CodeUnitCountAdjustment); debug + // Console.WriteLine("Scalarcount after SIMD processing:" + TempScalarCountAdjustment); prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); - } - } - if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) - { - // We have an unterminated sequence. - processedLength -= 3; - for(int k = 0; k < 3; k++) - { - if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000) + if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - processedLength += k; - break; + // We have an unterminated sequence. + // Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); + // processedLength -= 3; + + // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting); + // int backedup= 0; + + // int currentByte = pInputBuffer[processedLength]; + // Console.WriteLine("CurrentByte:" + Convert.ToString(currentByte, 2).PadLeft(8, '0')); + + var (totalbyteadjustment, i,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); + processedLength -= i; + + + // for(int k = 0; k < 3; k++) + // { + // // TODO:I do not remember why I put +32 here but the compiler complains if I remeve it + // int candidateByte = pInputBuffer[processedLength + 32 + k]; + // // Console.WriteLine("Backing up " + k +" bytes"); + // // Console.WriteLine("Byte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + + // // backedup = 3-k +1; + + // if ((candidateByte & 0b11000000) == 0b11000000) + // { + // // Whatever you do, do not delete this + // processedLength += k; + // break; + // } + // } + + // // Console.WriteLine("Backed up " + backedup +" bytes"); + // // Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); + // // Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); + // // Console.WriteLine("-----------------"); + } } } } } - + // Console.WriteLine("-Done with SIMD part!"); //debug // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function + // worst possible case is 4 bytes, where we need to backtrack 3 bytes + // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { - // We need to possibly backtrack to the start of the last code point - while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) - { - processedLength -= 1; - } - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + // Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes"); + // int overlapCount = 0; + // Console.WriteLine("processed length after backtrack:" + processedLength); + // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); + // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; // An invalid byte was found by the scalar function return invalidBytePointer; } - } + // Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); + // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); + } + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; return pInputBuffer + inputLength; } @@ -520,6 +696,11 @@ public static class UTF8 { int processedLength = 0; + int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempScalarCountAdjustment = 0; + + int utf16CodeUnitCountAdjustment=0, scalarCountAdjustment=0; + if (pInputBuffer == null || inputLength <= 0) { return pInputBuffer; @@ -601,7 +782,7 @@ public static class UTF8 // we need to check if the previous block was incomplete. if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector128.Zero; } @@ -623,7 +804,7 @@ public static class UTF8 Vector128 error = AdvSimd.Xor(must23As80, sc); if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue); } @@ -662,29 +843,28 @@ public static class UTF8 return pInputBuffer + inputLength; } - public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength) + public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength,out int Utf16CodeUnitCountAdjustment,out int ScalarCodeUnitCountAdjustment) { - if (AdvSimd.Arm64.IsSupported) - { - return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); - } + + // if (AdvSimd.Arm64.IsSupported) + // { + // return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); + // } if (Avx2.IsSupported) { - return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength); + return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); } /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) { return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength); }*/ - if (Ssse3.IsSupported) - { - return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); - } + // if (Ssse3.IsSupported) + // { + // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); + // } // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); } From 0949b938bcbc0b355695e5f1210b81b9d476106c Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 8 May 2024 20:32:41 -0400 Subject: [PATCH 55/75] Added updating of counts if no error --- src/UTF8.cs | 105 +++++++++++++++++++++++++++++----------------------- 1 file changed, 59 insertions(+), 46 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index df5cf5a..d22d0cc 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -200,7 +200,7 @@ public static class UTF8 // Note that this function is unsafe, and it is the caller's responsibility // to ensure that we can read at least 4 bytes before pInputBuffer. // (Nick Nuon added 7th may) there is an addenum labeled important in the mock PR however I think we can treat unterminated as - public unsafe static (int totalbyteadjustment,int i,int ascii,int n2,int n4) adjustmentFactor(byte* pInputBuffer) { + public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int n2,int n4) adjustmentFactor(byte* pInputBuffer) { // Find the first non-continuation byte, working backward. int i = 1; for (; i <= 4; i++) @@ -223,27 +223,41 @@ public unsafe static (int totalbyteadjustment,int i,int ascii,int n2,int n4) adj return (4 - i,i,0,0,-1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. } - public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) + public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) + { + // Calculate n3 based on the provided formula + int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; + + // Calculate n2 based on the provided formula + int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; + + // Calculate utfadjust by adding them all up + int utfadjust = -2 * n4 - 2 * n3 - n2; + + // Calculate scalaradjust based on n4 + int scalaradjust = -n4; + + // Return the calculated utfadjust and scalaradjust + return (utfadjust, scalaradjust); + } + + + + + + public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int n2, int contbytes) { // Calculate the total bytes from start_point to processedLength int totalbyte = processedLength - start_point; + int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustn2 = 0, adjustn4 = 0; // Adjust the length to include a complete character, if necessary if (totalbyte > 0) { - var (temptotalbyte,i ,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength); + (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustn2, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - // Calculate n3 based on provided formula - int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; - - // Calculate n2 based on provided formula - int n2 = -2 * asciibytes + n4 - 4 * contbytes + 2 * totalbyte; - - // TODO add them all up - - int utfadjust = -2 * n4 - 2* n3 - n2; - int scalaradjust = n4; + var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustn2, totalbyte + adjusttotalbyte); // Return the calculated n2 and n3 return (utfadjust, scalaradjust); @@ -405,7 +419,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - // Console.WriteLine("--------------------------Calling function----------------------------------"); + Console.WriteLine("--------------------------Calling function----------------------------------"); // Console.WriteLine("Length: " + inputLength); int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; @@ -546,7 +560,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int int asciibytes = 0; // number of ascii bytes in the block (could also be called n1) int contbytes = 0; // number of continuation bytes in the block int n4 = 0; // number of 4-byte sequences that start in this block - int totalbyte, n3, n2; + // int totalbyte = 0, n3 = 0, n2 = 0; + + for (; processedLength + 32 <= inputLength; processedLength += 32) { @@ -560,21 +576,18 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int // we need to check if the previous block was incomplete. if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - - // TODO/think about : this path iss not explicitly tested - // Console.WriteLine("----All ASCII need rewind"); + // TODO? : this path iss not explicitly tested utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; int off = processedLength >= 3 ? processedLength - 3 : processedLength; - // int off = processedLength; return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; } else // Contains non-ASCII characters, we need to do non-trivial processing { - // Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); //debug + Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); //debug // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; @@ -594,7 +607,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { - // Console.WriteLine("-----Error path!!"); + Console.WriteLine("-----Error path!!"); TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; @@ -610,11 +623,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int return invalidBytePointer; } - // Adjustments :TODO: - // TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2; - // TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount; - // TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2; - // TempScalarCountAdjustment -= (int)fourByteCount; // Console.WriteLine("Doublecount(Temp) after SIMD processing:" + TempUtf16CodeUnitCountAdjustment); debug // Console.WriteLine("Scalarcount after SIMD processing:" + TempScalarCountAdjustment); @@ -623,7 +631,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { // We have an unterminated sequence. - // Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); + Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); // processedLength -= 3; // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting); @@ -634,24 +642,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int var (totalbyteadjustment, i,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); processedLength -= i; - - - // for(int k = 0; k < 3; k++) - // { - // // TODO:I do not remember why I put +32 here but the compiler complains if I remeve it - // int candidateByte = pInputBuffer[processedLength + 32 + k]; - // // Console.WriteLine("Backing up " + k +" bytes"); - // // Console.WriteLine("Byte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // // backedup = 3-k +1; - - // if ((candidateByte & 0b11000000) == 0b11000000) - // { - // // Whatever you do, do not delete this - // processedLength += k; - // break; - // } - // } + // totalbyte -= totalbyteadjustment; + asciibytes +=tempascii; + n4 += tempn4; + contbytes +=tempn2; // // Console.WriteLine("Backed up " + backedup +" bytes"); // // Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); @@ -659,18 +653,37 @@ public unsafe static (int utfadjust, int scalaradjust) calculatefinaladjust(int // // Console.WriteLine("-----------------"); } + + // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. + contbytes += Avx2.MoveMask(sc); + + // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. + n4 += Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)); } } + // There are 2 possible scenarios here : either + // A) it arrives flush en the border. eg it doesnt need to be processed further + // B) There is some bytes remaining in which case we need to call the scalar functien + // Either way we need to calculate n2,n3 and update the utf16adjust and scalar adjust + int totalbyte = processedLength - start_point; + var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4, contbytes, totalbyte); + + utf16CodeUnitCountAdjustment = utf16adjust; + scalarCountAdjustment = scalaradjust; } + + } // Console.WriteLine("-Done with SIMD part!"); //debug // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function + + // worst possible case is 4 bytes, where we need to backtrack 3 bytes // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { - // Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes"); + Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes"); // int overlapCount = 0; // Console.WriteLine("processed length after backtrack:" + processedLength); // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); From 7b77dc1efb01ad690ac8b4965f159ce39872a19d Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 10 May 2024 12:42:29 -0400 Subject: [PATCH 56/75] correct types --- src/UTF8.cs | 62 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index d22d0cc..043204d 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -225,6 +225,8 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { + + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte); // Calculate n3 based on the provided formula int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; @@ -237,6 +239,9 @@ public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustment // Calculate scalaradjust based on n4 int scalaradjust = -n4; + + + // Return the calculated utfadjust and scalaradjust return (utfadjust, scalaradjust); } @@ -576,9 +581,15 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // we need to check if the previous block was incomplete. if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // TODO? : this path iss not explicitly tested - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; + // TODO? : this path is not explicitly tested + Console.WriteLine("---------All ascii need rewind"); + + + int totalbyteasciierror = processedLength - start_point; + var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); + + utf16CodeUnitCountAdjustment = utfadjustasciierror; + scalarCountAdjustment = scalaradjustasciierror; int off = processedLength >= 3 ? processedLength - 3 : processedLength; return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); @@ -608,18 +619,28 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(error, error)) { Console.WriteLine("-----Error path!!"); + + int totalbyteasciierror = processedLength - start_point; + var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes, contbytes); + + Console.WriteLine("calculateErrorPathadjust utf16 adjustment:"+ utfadjustasciierror); + Console.WriteLine("calculateErrorPathadjust scalar adjustment:"+ scalaradjustasciierror); + + utf16CodeUnitCountAdjustment = utfadjustasciierror; + scalarCountAdjustment = scalaradjustasciierror; + TailScalarCodeUnitCountAdjustment =0; TailUtf16CodeUnitCountAdjustment =0; int off = processedLength >= 32 ? processedLength - 32 : processedLength; byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment += TailScalarCodeUnitCountAdjustment; // Console.WriteLine("--------"); //debug - // Console.WriteLine("TempUTF16 after error rewind:"+ utf16CodeUnitCountAdjustment); - // Console.WriteLine("TempScalar '' '' '':"+ scalarCountAdjustment); + Console.WriteLine("TempUTF16 after error rewind:"+ utf16CodeUnitCountAdjustment); + Console.WriteLine("TempScalar '' '' '':"+ scalarCountAdjustment); return invalidBytePointer; } @@ -632,35 +653,34 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // We have an unterminated sequence. Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); - // processedLength -= 3; - - // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting); - // int backedup= 0; - - // int currentByte = pInputBuffer[processedLength]; - // Console.WriteLine("CurrentByte:" + Convert.ToString(currentByte, 2).PadLeft(8, '0')); var (totalbyteadjustment, i,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); processedLength -= i; - // totalbyte -= totalbyteadjustment; - asciibytes +=tempascii; n4 += tempn4; contbytes +=tempn2; - // // Console.WriteLine("Backed up " + backedup +" bytes"); // // Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); // // Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); - // // Console.WriteLine("-----------------"); } - // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. - contbytes += Avx2.MoveMask(sc); + // No errors! Updating the variables we keep track of + // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. + contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); + Console.WriteLine("this is contbytes" + contbytes) ; // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. - n4 += Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)); + n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))); } + asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));// TODO(Nick Nuon): simplify this expression + } + + // important: we just update asciibytes if there was no error. + // We count the number of ascii bytes in the block using just some simple arithmetic + // and no expensive operation: + + // There are 2 possible scenarios here : either // A) it arrives flush en the border. eg it doesnt need to be processed further // B) There is some bytes remaining in which case we need to call the scalar functien From b9ff7c369372f9acf1a82df7094383cbbd9cccb3 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 12 May 2024 18:04:07 -0400 Subject: [PATCH 57/75] save game --- src/UTF8.cs | 52 +++++++++++++++++++++++++------------ test/UTF8ValidationTests.cs | 10 +++---- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 043204d..0ea8f1c 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -14,8 +14,8 @@ public static class UTF8 public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { - // Console.WriteLine("--Rewind Validate with Errors"); - // Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); + Console.WriteLine("-Rewind Validate with Errors"); + Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; @@ -33,7 +33,7 @@ public static class UTF8 buf -= i; // extraLen = i; // a measure of how far we've backed up, only useful for debugging // Console.WriteLine(howFarBack); - // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); + Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); break; @@ -56,12 +56,6 @@ public static class UTF8 utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; - // Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment); - // Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment); - // Console.WriteLine(" "); - // Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment); - // Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment); - return invalidBytePointer; } @@ -199,8 +193,7 @@ public static class UTF8 // ... pInputBuffer[returnedvalue - 1] should be continuation bytes. // Note that this function is unsafe, and it is the caller's responsibility // to ensure that we can read at least 4 bytes before pInputBuffer. - // (Nick Nuon added 7th may) there is an addenum labeled important in the mock PR however I think we can treat unterminated as - public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int n2,int n4) adjustmentFactor(byte* pInputBuffer) { + public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int contbyte,int n4) adjustmentFactor(byte* pInputBuffer) { // Find the first non-continuation byte, working backward. int i = 1; for (; i <= 4; i++) @@ -254,15 +247,15 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // Calculate the total bytes from start_point to processedLength int totalbyte = processedLength - start_point; - int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustn2 = 0, adjustn4 = 0; + int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0; // Adjust the length to include a complete character, if necessary if (totalbyte > 0) { - (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustn2, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); + (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustn2, totalbyte + adjusttotalbyte); + var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte); // Return the calculated n2 and n3 return (utfadjust, scalaradjust); @@ -432,6 +425,19 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; + bool lastSIMDisIncomplete = false; + // This is to solve a specific problem, where we have an unterminated SIMD vector followed by a call to the scaral rewind function: + // as an example say I have this sequence of byte where every line represents 16 bytes: + // 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 + // 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 11101100 10001001 10011000 11001011 <=== This SIMD vector is unterminated,thus it has to backup + // 10100100 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 + // 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 + // By default , if there is an unterminated SIMD vector, it assumes that the next vector is SIMD, + // dont count the backed up bytes(in this case the "11101100 10001001 10011000") + // however in case there isnt enough bytes to fill in, a gap is created as (??????) + // A call to the adjustment vector has to be made and this is the value that holds whether this call is made or not. + // It is somewhat questionable to create one extra variable just for that but I felt that I needed to separate what worked and what was tacked on later as clearly as possible + if (pInputBuffer == null || inputLength <= 0) { @@ -566,6 +572,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( int contbytes = 0; // number of continuation bytes in the block int n4 = 0; // number of 4-byte sequences that start in this block // int totalbyte = 0, n3 = 0, n2 = 0; + @@ -654,10 +661,16 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // We have an unterminated sequence. Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); - var (totalbyteadjustment, i,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); + + var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); + + Console.WriteLine("this is n4 adjusted by the adjustmentfactor function :" + tempn4 + " contbyte: " + contbytes); +6 processedLength -= i; n4 += tempn4; - contbytes +=tempn2; + contbytes +=tempcont; + + lastSIMDisIncomplete = true; // // Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); // // Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); @@ -667,13 +680,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // No errors! Updating the variables we keep track of // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); - Console.WriteLine("this is contbytes" + contbytes) ; + + // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))); + Console.WriteLine("No error has been detected! Adding contbytes: " + (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)) + "Adding n4: " + (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)))); + Console.WriteLine(" this is the accumulated contbytes" + contbytes + " and n4:" + n4) ; // debug } asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));// TODO(Nick Nuon): simplify this expression + } // important: we just update asciibytes if there was no error. @@ -690,6 +707,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( utf16CodeUnitCountAdjustment = utf16adjust; scalarCountAdjustment = scalaradjust; + } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index a17bedc..a469d23 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -9,10 +9,7 @@ namespace tests; using BenchmarkDotNet.Disassemblers; using Iced.Intel; -// TODO: refine test for unterminated sequeqce happening at SIMD transition // TODO: The various tests do not formally take into account the scenario where vector is all ASCII -// TODO?: Test if the error is in the first vector? -// TODO:fix NoError,Ingomplete (some of the tests are wrong) public unsafe class Utf8SIMDValidationTests { @@ -258,7 +255,7 @@ public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) try { Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); - Assert.True(InvalidateUtf8(utf8, outputLength,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate)); ValidateCount(utf8,utf8ValidationDelegate); } catch (Xunit.Sdk.XunitException) @@ -398,11 +395,10 @@ public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDe List secondbyte = generator.Generate(1,secondcodeLength); singlebytes.AddRange(secondbyte); - int incompleteLocation = 127 - rand.Next(1,firstcodeLength + secondcodeLength); + int incompleteLocation = 127 - rand.Next(1,firstcodeLength + secondcodeLength - 1); allAscii.InsertRange(incompleteLocation,singlebytes); var utf8 = allAscii.ToArray(); - Console.WriteLine("---------------New trial"); // PrintHexAndBinary(utf8,incompleteLocation); bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); @@ -410,7 +406,7 @@ public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDe try { Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); - Assert.True(InvalidateUtf8(utf8, outputLength,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate)); ValidateCount(utf8,utf8ValidationDelegate); } catch (Xunit.Sdk.XunitException) From 8e84627abc836ef5ce9e73595b1f1a48c780880a Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 12 May 2024 20:27:19 -0400 Subject: [PATCH 58/75] cleanup --- src/UTF8.cs | 84 +++++------------------------------------------------ 1 file changed, 7 insertions(+), 77 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 0ea8f1c..721e6b2 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -14,8 +14,6 @@ public static class UTF8 public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { - Console.WriteLine("-Rewind Validate with Errors"); - Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0')); int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; @@ -31,11 +29,6 @@ public static class UTF8 if (foundLeadingBytes) { buf -= i; - // extraLen = i; // a measure of how far we've backed up, only useful for debugging - // Console.WriteLine(howFarBack); - Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0')); - - // Console.WriteLine("Backed up " + extraLen + 1 + " bytes"); break; } } @@ -218,32 +211,16 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte); - // Calculate n3 based on the provided formula + // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte); int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; - - // Calculate n2 based on the provided formula int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; - - // Calculate utfadjust by adding them all up int utfadjust = -2 * n4 - 2 * n3 - n2; - - // Calculate scalaradjust based on n4 int scalaradjust = -n4; - - - - // Return the calculated utfadjust and scalaradjust return (utfadjust, scalaradjust); } - - - - - public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int n2, int contbytes) + public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) { // Calculate the total bytes from start_point to processedLength int totalbyte = processedLength - start_point; @@ -257,7 +234,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte); - // Return the calculated n2 and n3 return (utfadjust, scalaradjust); } @@ -339,7 +315,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector128 fourthByte = Vector128.Create((byte)(0b11110000u - 0x80)); Vector128 v0f = Vector128.Create((byte)0x0F); Vector128 v80 = Vector128.Create((byte)0x80); - for (; processedLength + 16 <= inputLength; processedLength += 16) { @@ -417,8 +392,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - Console.WriteLine("--------------------------Calling function----------------------------------"); - // Console.WriteLine("Length: " + inputLength); int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -570,11 +543,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // The block goes from processedLength to processedLength/16*16. int asciibytes = 0; // number of ascii bytes in the block (could also be called n1) int contbytes = 0; // number of continuation bytes in the block - int n4 = 0; // number of 4-byte sequences that start in this block - // int totalbyte = 0, n3 = 0, n2 = 0; - - - + int n4 = 0; // number of 4-byte sequences that start in this block for (; processedLength + 32 <= inputLength; processedLength += 32) { @@ -586,12 +555,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. + // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // TODO? : this path is not explicitly tested - Console.WriteLine("---------All ascii need rewind"); - - + // TODO? : this path is not explicitly tested, write tests int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); @@ -605,7 +572,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( } else // Contains non-ASCII characters, we need to do non-trivial processing { - Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes"); //debug // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; @@ -625,13 +591,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { - Console.WriteLine("-----Error path!!"); - int totalbyteasciierror = processedLength - start_point; - var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes, contbytes); - - Console.WriteLine("calculateErrorPathadjust utf16 adjustment:"+ utfadjustasciierror); - Console.WriteLine("calculateErrorPathadjust scalar adjustment:"+ scalaradjustasciierror); + var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); utf16CodeUnitCountAdjustment = utfadjustasciierror; scalarCountAdjustment = scalaradjustasciierror; @@ -645,52 +606,30 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCodeUnitCountAdjustment; - // Console.WriteLine("--------"); //debug - Console.WriteLine("TempUTF16 after error rewind:"+ utf16CodeUnitCountAdjustment); - Console.WriteLine("TempScalar '' '' '':"+ scalarCountAdjustment); - return invalidBytePointer; } - // Console.WriteLine("Doublecount(Temp) after SIMD processing:" + TempUtf16CodeUnitCountAdjustment); debug - // Console.WriteLine("Scalarcount after SIMD processing:" + TempScalarCountAdjustment); prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { // We have an unterminated sequence. - Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes"); - - var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); - Console.WriteLine("this is n4 adjusted by the adjustmentfactor function :" + tempn4 + " contbyte: " + contbytes); -6 processedLength -= i; n4 += tempn4; contbytes +=tempcont; - lastSIMDisIncomplete = true; - - // // Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment); - // // Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment); - } // No errors! Updating the variables we keep track of // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); - - // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))); - Console.WriteLine("No error has been detected! Adding contbytes: " + (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)) + "Adding n4: " + (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)))); - Console.WriteLine(" this is the accumulated contbytes" + contbytes + " and n4:" + n4) ; // debug } - asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));// TODO(Nick Nuon): simplify this expression - - + asciibytes += (int)(32 - Popcnt.PopCount((uint)mask)); } // important: we just update asciibytes if there was no error. @@ -712,7 +651,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( } - // Console.WriteLine("-Done with SIMD part!"); //debug // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function @@ -721,11 +659,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { - Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes"); - // int overlapCount = 0; - // Console.WriteLine("processed length after backtrack:" + processedLength); - // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); - // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { @@ -734,8 +667,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // An invalid byte was found by the scalar function return invalidBytePointer; } - // Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment); - // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment); } utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; @@ -746,7 +677,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength) { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; From 202a996c0c85376f7f22d72872b56a708ab3df45 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 13 May 2024 09:52:37 -0400 Subject: [PATCH 59/75] further clean up --- test/UTF8ValidationTests.cs | 75 +------------------------------------ 1 file changed, 2 insertions(+), 73 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index a469d23..7d6371a 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -260,7 +260,6 @@ public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) } catch (Xunit.Sdk.XunitException) { - // Console.WriteLine($"Assertion failed at index: "); PrintHexAndBinary(utf8); throw; // Rethrow the exception to fail the test. } @@ -390,16 +389,11 @@ public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDe // var allAscii = generator.Generate(outputLength,1); var allAscii = new List(Enumerable.Repeat((byte)0, 256)); int firstcodeLength = rand.Next(2,5); - int secondcodeLength = rand.Next(2,5); List singlebytes = generator.Generate(1,firstcodeLength);//recall:generate a utf8 code between 2 and 4 bytes - List secondbyte = generator.Generate(1,secondcodeLength); - singlebytes.AddRange(secondbyte); - - int incompleteLocation = 127 - rand.Next(1,firstcodeLength + secondcodeLength - 1); + int incompleteLocation = 128 - rand.Next(1,firstcodeLength - 1); allAscii.InsertRange(incompleteLocation,singlebytes); var utf8 = allAscii.ToArray(); - // PrintHexAndBinary(utf8,incompleteLocation); bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); @@ -411,7 +405,6 @@ public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDe } catch (Xunit.Sdk.XunitException) { - // Console.WriteLine($"Assertion failed at index: "); PrintHexAndBinary(utf8,incompleteLocation); throw; // Rethrow the exception to fail the test. } @@ -473,9 +466,6 @@ public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { byte oldByte = utf8[i]; utf8[i] = 0b11111000; // Forcing a header bits error - // Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - // Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - // ValidateCount(utf8,utf8ValidationDelegate); try { Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); @@ -615,10 +605,6 @@ public void TooShortErrorAVX() public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { - // int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths -// int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths - - foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) @@ -769,7 +755,6 @@ public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { - // List oneUTFunit = generator.Generate( howManyUnits:1 ,byteCountInUnit: 2); byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1).ToArray(); unsafe @@ -779,7 +764,6 @@ public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) for (int i = 0; i < utf8.Length; i++) { - // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; byte currentByte = utf8[i]; int offset = 0; @@ -789,7 +773,6 @@ public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) } if ((currentByte & 0b11110000) == 0b11100000) { // This is a header byte of a 3-byte sequence - offset = rand.Next(0, 3); } if ((currentByte & 0b11111000) == 0b11110000) { @@ -872,8 +855,6 @@ public void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) foreach (var invalidByte in invalidBytes) { utf8[position] = invalidByte; - // PrintHexAndBinary(utf8); - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte Assert.True(InvalidateUtf8(utf8,position,utf8ValidationDelegate)); ValidateCount(utf8,utf8ValidationDelegate); @@ -927,6 +908,7 @@ public void Invalid0xf50xffAvx2() Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } +// helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { int chunkSize = 16; // 128 bits = 16 bytes @@ -1441,7 +1423,6 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele } catch (Exception) { - // Upon failure, print the utf8 array for inspection Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); // PrintHexAndBinary(utf8,failureIndex); throw; // Re-throw the exception to preserve the failure state @@ -1450,58 +1431,6 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele } } - - // [Fact] - // [Trait("Category", "Scalar")] - // public void DotnetUTF16Count() - // { - // int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 }; - // int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; - // int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - - - // foreach (int outputLength in outputLengths) - // { - // // Generate a UTF-8 sequence with 3 units, each 2 bytes long, presumed to be valid. - // // byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray(); - // byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray(); - // PrintHexAndBinary(utf8); - // var (offset, length) = (0, utf8.Length); - - // unsafe - // { - // fixed (byte* pInput = utf8) - // { - // byte* startPtr = pInput + offset; - // // Invoke the method under test. - - // DotnetUtf16Adjustment= 0; - // DotnetScalarCountAdjustment= 0; - // DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - - // SimdUnicodeUtf16Adjustment= 0; - // SimdUnicodeScalarCountAdjustment= 0; - // SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - - // Console.WriteLine("Lenght:" + utf8.Length); - - // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment); - // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment); - - // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment); - // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment); - // Console.WriteLine("___________________________________________________"); - - - // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); - // Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - - // } - // } - // } - // } - - } From f5ed30a8823d4a0b932bb3be41f363369938df53 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 13 May 2024 10:54:16 -0400 Subject: [PATCH 60/75] clean up blank lines --- src/UTF8.cs | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 721e6b2..16e68de 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -398,19 +398,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; - bool lastSIMDisIncomplete = false; - // This is to solve a specific problem, where we have an unterminated SIMD vector followed by a call to the scaral rewind function: - // as an example say I have this sequence of byte where every line represents 16 bytes: - // 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 - // 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 11101100 10001001 10011000 11001011 <=== This SIMD vector is unterminated,thus it has to backup - // 10100100 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 - // 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 - // By default , if there is an unterminated SIMD vector, it assumes that the next vector is SIMD, - // dont count the backed up bytes(in this case the "11101100 10001001 10011000") - // however in case there isnt enough bytes to fill in, a gap is created as (??????) - // A call to the adjustment vector has to be made and this is the value that holds whether this call is made or not. - // It is somewhat questionable to create one extra variable just for that but I felt that I needed to separate what worked and what was tacked on later as clearly as possible - if (pInputBuffer == null || inputLength <= 0) { @@ -432,7 +419,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( break; } } - // Console.WriteLine("asciirun bytes: ", asciirun); // debugging processedLength = asciirun; if (processedLength + 32 < inputLength) From 124c6fd127faf7d28e3b191c4bf2496c034ba471 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 13 May 2024 11:05:06 -0400 Subject: [PATCH 61/75] cleanup --- src/UTF8.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 16e68de..0445d69 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -544,7 +544,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // TODO? : this path is not explicitly tested, write tests + // TODO : this path is not explicitly tested, write tests int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); @@ -605,7 +605,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( processedLength -= i; n4 += tempn4; contbytes +=tempcont; - } // No errors! Updating the variables we keep track of From d73ffc359103f85ba1d370101cbaad2aff5408f6 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Tue, 14 May 2024 14:01:18 -0400 Subject: [PATCH 62/75] No error incomplete working --- src/UTF8.cs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 0445d69..82de9ed 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -23,7 +23,6 @@ public static class UTF8 for (int i = 0; i <= howFarBack; i++) { - // Console.WriteLine("Activiting main backup:" + i); byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) @@ -200,7 +199,7 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in return (0,i,-1,0,0); // We must have that i == 1 } if ((pInputBuffer[-i] & 0b11100000) == 0b11000000) { - return (2 - i,i,0,-1,0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. + return (2 - i,i,0,0,0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. } if ((pInputBuffer[-i] & 0b11110000) == 0b11100000) { return (3 - i,i,0,0,0); // We have that i == 1 or i == 2 or i == 3, if i == 1, we are missing two bytes, if i == 2, we are missing one byte. @@ -211,12 +210,16 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte); + Console.WriteLine("---------"); + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte); int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust); + + return (utfadjust, scalaradjust); } @@ -392,6 +395,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { + Console.WriteLine("-------------------------------------"); int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -544,7 +548,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // TODO : this path is not explicitly tested, write tests + // TODO : this path is not explicitly tested, write tests int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); @@ -577,6 +581,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { + Console.WriteLine("--Error!"); int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); @@ -605,6 +610,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( processedLength -= i; n4 += tempn4; contbytes +=tempcont; + Console.WriteLine($"Unterminated! Backing up by {i}"); + } // No errors! Updating the variables we keep track of @@ -631,7 +638,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( utf16CodeUnitCountAdjustment = utf16adjust; scalarCountAdjustment = scalaradjust; - } From 0c758c91fe18f969fead47225fd628dae8342199 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Fri, 17 May 2024 22:48:09 -0400 Subject: [PATCH 63/75] temporary cont byte fix --- src/UTF8.cs | 144 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 124 insertions(+), 20 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 82de9ed..4d91eae 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,6 +10,70 @@ namespace SimdUnicode public static class UTF8 { + // helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index +static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) +{ + int chunkSize = 16; // 128 bits = 16 bytes + + // Process each chunk for hexadecimal + Console.Write("Hex: "); + for (int i = 0; i < bytes.Length; i++) + { + if (i > 0 && i % chunkSize == 0) + Console.WriteLine(); // New line after every 16 bytes + + if (i == highlightIndex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.Write($"{bytes[i]:X2} "); + Console.ResetColor(); + } + else if (i % (chunkSize * 2) == 0) // print green every 256 bytes + { + Console.ForegroundColor = ConsoleColor.Green; + Console.Write($"{bytes[i]:X2} "); + Console.ResetColor(); + } + else + { + Console.Write($"{bytes[i]:X2} "); + } + + if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line + } + Console.WriteLine("\n"); // New line for readability and to separate hex from binary + + // Process each chunk for binary + Console.Write("Binary: "); + for (int i = 0; i < bytes.Length; i++) + { + if (i > 0 && i % chunkSize == 0) + Console.WriteLine(); // New line after every 16 bytes + + string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); + if (i == highlightIndex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.Write($"{binaryString} "); + Console.ResetColor(); + } + else if (i % (chunkSize * 2) == 0) // print green every 256 bytes + { + Console.ForegroundColor = ConsoleColor.Green; + Console.Write($"{binaryString} "); + Console.ResetColor(); + } + else + { + Console.Write($"{binaryString} "); + } + + if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line + } + Console.WriteLine(); // New line for readability +} + + static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');//for debugging public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) @@ -188,38 +252,40 @@ public static class UTF8 public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int contbyte,int n4) adjustmentFactor(byte* pInputBuffer) { // Find the first non-continuation byte, working backward. int i = 1; + int contbyteadjust = 0; for (; i <= 4; i++) { if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) { break; } + contbyteadjust -= 1; + } if ((pInputBuffer[-i] & 0b10000000) == 0) { - return (0,i,-1,0,0); // We must have that i == 1 + return (0,i,-1,contbyteadjust,0); // We must have that i == 1 } if ((pInputBuffer[-i] & 0b11100000) == 0b11000000) { - return (2 - i,i,0,0,0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. + return (2 - i,i,0,contbyteadjust,0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. } if ((pInputBuffer[-i] & 0b11110000) == 0b11100000) { - return (3 - i,i,0,0,0); // We have that i == 1 or i == 2 or i == 3, if i == 1, we are missing two bytes, if i == 2, we are missing one byte. + return (3 - i,i,0,contbyteadjust,0); // We have that i == 1 or i == 2 or i == 3, if i == 1, we are missing two bytes, if i == 2, we are missing one byte. } // We must have that (pInputBuffer[-i] & 0b11111000) == 0b11110000 - return (4 - i,i,0,0,-1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. + return (4 - i,i,0,contbyteadjust,-1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. } public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - Console.WriteLine("---------"); - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte); + Console.WriteLine("---------"); //debug + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust); + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug - return (utfadjust, scalaradjust); } @@ -395,7 +461,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - Console.WriteLine("-------------------------------------"); + Console.ForegroundColor = ConsoleColor.Blue; //debug + Console.WriteLine("-------------------------------------");//debug + Console.ResetColor();//debug + int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -568,10 +637,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1)); // Vector256.Shuffle vs Avx2.Shuffle // https://github.com/dotnet/runtime/blob/1400c1e7a888ea1e710e5c08d55c800e0b04bf8a/docs/coding-guidelines/vectorization-guidelines.md#vector256shuffle-vs-avx2shuffle - Vector256 byte_1_high = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); - Vector256 byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f)); - Vector256 byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); + Vector256 byte_1_high = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);// takes the XXXX 0000 part of the previous byte + Vector256 byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f)); // takes the 0000 XXXX part of the previous part + Vector256 byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte Vector256 sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high); + + // Create a span from the Vector256 + // Console.WriteLine(""); + // Span byteSpan = MemoryMarshal.Cast, byte>(MemoryMarshal.CreateSpan(ref sc, 1)); + // byte[] scbytes = byteSpan.ToArray(); + // PrintHexAndBinary(scbytes);55555555555555555 + Vector256 prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2)); Vector256 prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3)); Vector256 isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte); @@ -581,7 +657,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { - Console.WriteLine("--Error!"); + Console.WriteLine($"--Error! @ {processedLength} bytes");//debug int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); @@ -608,25 +684,53 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); processedLength -= i; - n4 += tempn4; + n4 += tempn4;// this is + because the adjustment function returns something negative already contbytes +=tempcont; - Console.WriteLine($"Unterminated! Backing up by {i}"); - + Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug } + + + + + // Vector256 contbyto = Vector256.Create((byte)(0b11000000u - 0x80)); + // Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); + // Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); + // Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); + + // uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); + // uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); + // uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); + + // No errors! Updating the variables we keep track of // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. - contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); + + // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); // this actually counts the number of 2 consecutive continuous bytes + // Placeholder until andether way to do with contbyte is found + + Vector256 top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits + Vector256 contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx + // Apply the mask and compare + Vector256 maskedData = Avx2.And(currentBlock, top2bits); + Vector256 compareResult = Avx2.CompareEqual(maskedData, contbytemask); + // Move mask to get integer representation + contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(compareResult)); + + + // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))); } + + // important: we just update asciibytes if there was no error. + // We count the number of ascii bytes in the block using just some simple arithmetic + // and no expensive operation: asciibytes += (int)(32 - Popcnt.PopCount((uint)mask)); } - // important: we just update asciibytes if there was no error. - // We count the number of ascii bytes in the block using just some simple arithmetic - // and no expensive operation: + // There are 2 possible scenarios here : either From f6e40c8b359d63727f28a5a335af646d02e1e250 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sat, 18 May 2024 13:59:29 -0400 Subject: [PATCH 64/75] All tests working save Bruteforce/toolong error --- src/UTF8.cs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 4d91eae..db7ef8d 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -257,10 +257,12 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in { if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) { + string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0'); + // Console.WriteLine($"Stopping at byte {binaryString}"); //debug break; } contbyteadjust -= 1; - + } if ((pInputBuffer[-i] & 0b10000000) == 0) { return (0,i,-1,contbyteadjust,0); // We must have that i == 1 @@ -277,14 +279,14 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - Console.WriteLine("---------"); //debug - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug + // Console.WriteLine("---------"); //debug + // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug + // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug return (utfadjust, scalaradjust); } @@ -301,7 +303,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte); + // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte); + var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte); + return (utfadjust, scalaradjust); } @@ -461,9 +465,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - Console.ForegroundColor = ConsoleColor.Blue; //debug - Console.WriteLine("-------------------------------------");//debug - Console.ResetColor();//debug + // Console.ForegroundColor = ConsoleColor.Blue; //debug + // Console.WriteLine("-------------------------------------");//debug + // Console.ResetColor();//debug int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; @@ -657,7 +661,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 error = Avx2.Xor(must23As80, sc); if (!Avx2.TestZ(error, error)) { - Console.WriteLine($"--Error! @ {processedLength} bytes");//debug + // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); @@ -686,7 +690,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( processedLength -= i; n4 += tempn4;// this is + because the adjustment function returns something negative already contbytes +=tempcont; - Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug + // Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug } @@ -740,8 +744,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( int totalbyte = processedLength - start_point; var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4, contbytes, totalbyte); - utf16CodeUnitCountAdjustment = utf16adjust; - scalarCountAdjustment = scalaradjust; + TempUtf16CodeUnitCountAdjustment = utf16adjust; + TempScalarCountAdjustment = scalaradjust; } From 1cee9d621ea2220e829d6e61dc7e9c6ad5dd5750 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 19 May 2024 23:04:18 -0400 Subject: [PATCH 65/75] save game + longerror fix attempt + error around rewind length(I thing) --- src/UTF8.cs | 177 +++++++++++++++++++++++++++++------- test/UTF8ValidationTests.cs | 34 +++++-- 2 files changed, 168 insertions(+), 43 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index db7ef8d..bee9eef 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,7 +10,7 @@ namespace SimdUnicode public static class UTF8 { - // helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index + //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { int chunkSize = 16; // 128 bits = 16 bytes @@ -78,10 +78,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { - - int TempUtf16CodeUnitCountAdjustment = 0; - int TempScalarCountAdjustment = 0; - + // Console.WriteLine("CALLING REWIND"); int extraLen = 0; bool foundLeadingBytes = false; @@ -89,8 +86,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}"); + Console.WriteLine(foundLeadingBytes); + if (foundLeadingBytes) - { + { + Console.WriteLine("Found leading byte"); buf -= i; break; } @@ -101,13 +102,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) return buf - howFarBack; } - utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TempScalarCountAdjustment; - int TailUtf16CodeUnitCountAdjustment = 0; int TailScalarCountAdjustment = 0; byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); + // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}"); + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; @@ -219,7 +219,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) } else { - // we may have a continuation + // we may have a continuation/too long error utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; @@ -257,12 +257,11 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in { if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) { - string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0'); - // Console.WriteLine($"Stopping at byte {binaryString}"); //debug + string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug + Console.WriteLine($"Stopping at byte {binaryString}"); //debug break; } contbyteadjust -= 1; - } if ((pInputBuffer[-i] & 0b10000000) == 0) { return (0,i,-1,contbyteadjust,0); // We must have that i == 1 @@ -279,19 +278,41 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - // Console.WriteLine("---------"); //debug - // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug + Console.WriteLine("---------"); //debug + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; - // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug + Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug return (utfadjust, scalaradjust); } - public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) + // public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false + // { + // // Calculate the total bytes from start_point to processedLength + // int totalbyte = processedLength - start_point; + // int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0; + + // // Adjust the length to include a complete character, if necessary + // if (totalbyte > 0) + // { + // (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); + // } + + // // Pseudocode: + // // if 'TooLongErroronEdge' bool is true then + // // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function: + + // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte); + + + // return (utfadjust, scalaradjust); + // } + + public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false) { // Calculate the total bytes from start_point to processedLength int totalbyte = processedLength - start_point; @@ -300,17 +321,25 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // Adjust the length to include a complete character, if necessary if (totalbyte > 0) { - (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); + (adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte); - var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte); + // Adjust the counters if 'TooLongErroronEdge' is true + if (TooLongErroronEdge) + { + // If you can figure out why this makes a difference,youre golden + asciibytes += adjustascii; + contbytes += adjustcont; + n4 += adjustn4; + } + var (utfadjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte + adjusttotalbyte); return (utfadjust, scalaradjust); } + public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength) { @@ -465,9 +494,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - // Console.ForegroundColor = ConsoleColor.Blue; //debug - // Console.WriteLine("-------------------------------------");//debug - // Console.ResetColor();//debug + Console.ForegroundColor = ConsoleColor.Blue; //debug + Console.WriteLine("-------------------------------------");//debug + Console.ResetColor();//debug int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; @@ -659,23 +688,100 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); - if (!Avx2.TestZ(error, error)) - { - // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug - int totalbyteasciierror = processedLength - start_point; - var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); + // if (!Avx2.TestZ(error, error)) + // { + // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug - utf16CodeUnitCountAdjustment = utfadjustasciierror; - scalarCountAdjustment = scalaradjustasciierror; + // int off = processedLength >= 32 ? processedLength - 32 : processedLength; + // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + + // utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; + // scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; + + // // We need to take care of eg + // // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 + // // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011 + // // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge + // // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100 + // // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function + // // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted + // // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10----- + // // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much + // // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup + + // // so in short , we want to solve this error while at the same time not disturbing anything else + // // we know that there is a continuation on the edge eg at the 64 byte, we need te check that + // // *TODO:Fill code here * + // // Peudocode for now + // // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then + // // pass on true to the + + + // int totalbyteasciierror = processedLength - start_point; + // var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); + + // utf16CodeUnitCountAdjustment += utfadjustasciierror; + // scalarCountAdjustment += scalaradjustasciierror; + + // TailScalarCodeUnitCountAdjustment =0; + // TailUtf16CodeUnitCountAdjustment =0; - TailScalarCodeUnitCountAdjustment =0; - TailUtf16CodeUnitCountAdjustment =0; + + + // return invalidBytePointer; + // } + + if (!Avx2.TestZ(error, error)) + { + Console.WriteLine($"--Error! @ {processedLength} bytes");//debug int off = processedLength >= 32 ? processedLength - 32 : processedLength; byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + bool TooLongErroronEdge = false; + + utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; + scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; + + Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}"); + + // We need to take care of eg + // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 + // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011 + // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge + // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100 + // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function + // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted + // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10----- + // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much + // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup + + // so in short , we want to solve this error while at the same time not disturbing anything else + // we know that there is a continuation on the edge eg at the 64 byte, we need te check that + // *TODO:Fill code here * + // Peudocode for now + // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then + // pass on true to the + + // Calculate the offset of the invalid byte pointer from the start of the input buffer + ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer); + + // Debugging output + bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80; + bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength); + + // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives + if (isContinuationByte && isOneByteAfterProcessedLength) + { + Console.WriteLine("Triggering TooLongErrorOnEdge adjustment"); + TooLongErroronEdge = true; + } - utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment += TailScalarCodeUnitCountAdjustment; + + int totalbyteasciierror = processedLength - start_point; + var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes,TooLongErroronEdge); + + utf16CodeUnitCountAdjustment += utfadjustasciierror; + scalarCountAdjustment += scalaradjustasciierror; return invalidBytePointer; } @@ -690,7 +796,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( processedLength -= i; n4 += tempn4;// this is + because the adjustment function returns something negative already contbytes +=tempcont; - // Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug + Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug } @@ -763,6 +869,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; + // An invalid byte was found by the scalar function return invalidBytePointer; } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 7d6371a..3e3ebfa 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -476,7 +476,6 @@ public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { Console.WriteLine($"Assertion failed at index: {i}"); PrintHexAndBinary(utf8, i); - utf8[i] = oldByte; // Restore the original byte throw; // Rethrow the exception to fail the test. } @@ -552,7 +551,6 @@ public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) { Console.WriteLine($"Assertion failed at index: {i}"); PrintHexAndBinary(utf8, i); - utf8[i] = oldByte; // Restore the original byte throw; // Rethrow the exception to fail the test. } utf8[i] = oldByte; // Restore the original byte @@ -617,10 +615,23 @@ public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { byte oldByte = utf8[i]; utf8[i] = 0b10000000; // Forcing a too long error - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); - utf8[i] = oldByte; // Restore the original byte + // Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + // Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + // ValidateCount(utf8,utf8ValidationDelegate); + // utf8[i] = oldByte; // Restore the original byte + try + { + Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); + ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling. + utf8[i] = oldByte; // Restore the original byte + } + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Assertion failed at index: {i}"); + PrintHexAndBinary(utf8, i); + throw; // Rethrow the exception to fail the test. + } } } } @@ -1213,7 +1224,14 @@ public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) ValidateCount(modifiedUtf8,utf8ValidationDelegate); // Ensure both methods agree on the validation result - Assert.Equal(isValidPrimary, isValidFuschia); + try{ Assert.Equal(isValidPrimary, isValidFuschia);} + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Assertion failed at index: {byteIndex}"); + PrintHexAndBinary(utf8, byteIndex); + throw; // Rethrow the exception to fail the test. + } + } } } @@ -1424,7 +1442,7 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele catch (Exception) { Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); - // PrintHexAndBinary(utf8,failureIndex); + // PrintHexAndBinary(utf8); throw; // Re-throw the exception to preserve the failure state } } From d784815c98419a73be217af433689ecc65984728 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Sun, 19 May 2024 23:51:39 -0400 Subject: [PATCH 66/75] save game --- src/UTF8.cs | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index bee9eef..71d1bfb 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,7 +10,7 @@ namespace SimdUnicode public static class UTF8 { - //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index +// //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { int chunkSize = 16; // 128 bits = 16 bytes @@ -78,7 +78,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { - // Console.WriteLine("CALLING REWIND"); +// // Console.WriteLine("CALLING REWIND");//debug int extraLen = 0; bool foundLeadingBytes = false; @@ -86,12 +86,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}"); - Console.WriteLine(foundLeadingBytes); +// Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}");//debug +// Console.WriteLine(foundLeadingBytes);//debug if (foundLeadingBytes) { - Console.WriteLine("Found leading byte"); +// Console.WriteLine("Found leading byte");//debug buf -= i; break; } @@ -257,8 +257,8 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in { if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) { - string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug - Console.WriteLine($"Stopping at byte {binaryString}"); //debug +// string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug +// Console.WriteLine($"Stopping at byte {binaryString}"); //debug break; } contbyteadjust -= 1; @@ -278,14 +278,14 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - Console.WriteLine("---------"); //debug - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug +// Console.WriteLine("---------"); //debug +// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; - Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug +// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug return (utfadjust, scalaradjust); } @@ -494,9 +494,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - Console.ForegroundColor = ConsoleColor.Blue; //debug - Console.WriteLine("-------------------------------------");//debug - Console.ResetColor();//debug +// Console.ForegroundColor = ConsoleColor.Blue; //debug +// Console.WriteLine("-------------------------------------");//debug +// Console.ResetColor();//debug int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; @@ -690,7 +690,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 error = Avx2.Xor(must23As80, sc); // if (!Avx2.TestZ(error, error)) // { - // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug +// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug // int off = processedLength >= 32 ? processedLength - 32 : processedLength; // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); @@ -733,16 +733,19 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(error, error)) { - Console.WriteLine($"--Error! @ {processedLength} bytes");//debug +// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug - int off = processedLength >= 32 ? processedLength - 32 : processedLength; + int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32 + // int off = processedLength >= 32 ? processedLength - 32 : processedLength; original/main algorithm working + +// Console.WriteLine($"Offset backup by: {off}");//debug byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); bool TooLongErroronEdge = false; utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; - Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}"); +// Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}");//debug // We need to take care of eg // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 @@ -772,7 +775,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives if (isContinuationByte && isOneByteAfterProcessedLength) { - Console.WriteLine("Triggering TooLongErrorOnEdge adjustment"); +// Console.WriteLine("Triggering TooLongErrorOnEdge adjustment");//debug TooLongErroronEdge = true; } @@ -796,7 +799,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( processedLength -= i; n4 += tempn4;// this is + because the adjustment function returns something negative already contbytes +=tempcont; - Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug +// Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug } From e27c85faf06022c597223b6255dadd3873b4b1e3 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 20 May 2024 09:40:47 -0400 Subject: [PATCH 67/75] cleanup + more expressive tests --- src/UTF8.cs | 149 ++++++------------------------------ test/UTF8ValidationTests.cs | 20 +++-- 2 files changed, 37 insertions(+), 132 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 71d1bfb..e4506f4 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,7 +10,7 @@ namespace SimdUnicode public static class UTF8 { -// //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index + static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { int chunkSize = 16; // 128 bits = 16 bytes @@ -78,7 +78,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { -// // Console.WriteLine("CALLING REWIND");//debug + int extraLen = 0; bool foundLeadingBytes = false; @@ -86,12 +86,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; -// Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}");//debug -// Console.WriteLine(foundLeadingBytes);//debug + + if (foundLeadingBytes) { -// Console.WriteLine("Found leading byte");//debug + buf -= i; break; } @@ -257,8 +257,8 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in { if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) { -// string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug -// Console.WriteLine($"Stopping at byte {binaryString}"); //debug + + break; } contbyteadjust -= 1; @@ -278,40 +278,18 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { -// Console.WriteLine("---------"); //debug -// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug + + int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; -// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug + return (utfadjust, scalaradjust); } - // public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false - // { - // // Calculate the total bytes from start_point to processedLength - // int totalbyte = processedLength - start_point; - // int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0; - - // // Adjust the length to include a complete character, if necessary - // if (totalbyte > 0) - // { - // (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); - // } - - // // Pseudocode: - // // if 'TooLongErroronEdge' bool is true then - // // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function: - - // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte); - - - // return (utfadjust, scalaradjust); - // } - public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false) { // Calculate the total bytes from start_point to processedLength @@ -324,10 +302,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( (adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - // Adjust the counters if 'TooLongErroronEdge' is true if (TooLongErroronEdge) { - // If you can figure out why this makes a difference,youre golden asciibytes += adjustascii; contbytes += adjustcont; n4 += adjustn4; @@ -494,9 +470,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { -// Console.ForegroundColor = ConsoleColor.Blue; //debug -// Console.WriteLine("-------------------------------------");//debug -// Console.ResetColor();//debug + + + int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; @@ -674,13 +650,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f)); // takes the 0000 XXXX part of the previous part Vector256 byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte Vector256 sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high); - - // Create a span from the Vector256 - // Console.WriteLine(""); - // Span byteSpan = MemoryMarshal.Cast, byte>(MemoryMarshal.CreateSpan(ref sc, 1)); - // byte[] scbytes = byteSpan.ToArray(); - // PrintHexAndBinary(scbytes);55555555555555555 - Vector256 prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2)); Vector256 prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3)); Vector256 isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte); @@ -688,64 +657,22 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector256 must23 = Avx2.Or(isThirdByte, isFourthByte); Vector256 must23As80 = Avx2.And(must23, v80); Vector256 error = Avx2.Xor(must23As80, sc); - // if (!Avx2.TestZ(error, error)) - // { -// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug - - // int off = processedLength >= 32 ? processedLength - 32 : processedLength; - // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - - // utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; - // scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; - - // // We need to take care of eg - // // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 - // // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011 - // // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge - // // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100 - // // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function - // // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted - // // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10----- - // // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much - // // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup - - // // so in short , we want to solve this error while at the same time not disturbing anything else - // // we know that there is a continuation on the edge eg at the 64 byte, we need te check that - // // *TODO:Fill code here * - // // Peudocode for now - // // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then - // // pass on true to the - // int totalbyteasciierror = processedLength - start_point; - // var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); - - // utf16CodeUnitCountAdjustment += utfadjustasciierror; - // scalarCountAdjustment += scalaradjustasciierror; - - // TailScalarCodeUnitCountAdjustment =0; - // TailUtf16CodeUnitCountAdjustment =0; - - - - // return invalidBytePointer; - // } - if (!Avx2.TestZ(error, error)) { -// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug + int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32 - // int off = processedLength >= 32 ? processedLength - 32 : processedLength; original/main algorithm working -// Console.WriteLine($"Offset backup by: {off}");//debug + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); bool TooLongErroronEdge = false; utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; -// Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}");//debug + // We need to take care of eg // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 @@ -756,26 +683,21 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10----- // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much - // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup - - // so in short , we want to solve this error while at the same time not disturbing anything else - // we know that there is a continuation on the edge eg at the 64 byte, we need te check that - // *TODO:Fill code here * - // Peudocode for now - // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then - // pass on true to the + // If this error arrive at the edge of 2 simd vector, that is where problem abound // Calculate the offset of the invalid byte pointer from the start of the input buffer ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer); // Debugging output + bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80; + bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength); - // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives + if (isContinuationByte && isOneByteAfterProcessedLength) { -// Console.WriteLine("Triggering TooLongErrorOnEdge adjustment");//debug + TooLongErroronEdge = true; } @@ -797,31 +719,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); processedLength -= i; - n4 += tempn4;// this is + because the adjustment function returns something negative already + n4 += tempn4; contbytes +=tempcont; -// Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug - } - - - - - - // Vector256 contbyto = Vector256.Create((byte)(0b11000000u - 0x80)); - // Vector256 isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte); - // Vector256 isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte); - // Vector256 isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte); - - // uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence)); - // uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence)); - // uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence)); + } - // No errors! Updating the variables we keep track of - // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation. - - // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); // this actually counts the number of 2 consecutive continuous bytes - // Placeholder until andether way to do with contbyte is found - Vector256 top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits Vector256 contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx @@ -843,9 +745,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( asciibytes += (int)(32 - Popcnt.PopCount((uint)mask)); } - - - // There are 2 possible scenarios here : either // A) it arrives flush en the border. eg it doesnt need to be processed further // B) There is some bytes remaining in which case we need to call the scalar functien @@ -862,11 +761,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // We have processed all the blocks using SIMD, we need to process the remaining bytes. // Process the remaining bytes with the scalar function - // worst possible case is 4 bytes, where we need to backtrack 3 bytes // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 3e3ebfa..94d05c2 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -615,10 +615,6 @@ public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { byte oldByte = utf8[i]; utf8[i] = 0b10000000; // Forcing a too long error - // Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - // Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - // ValidateCount(utf8,utf8ValidationDelegate); - // utf8[i] = oldByte; // Restore the original byte try { Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); @@ -1221,7 +1217,7 @@ public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) // Validate the modified sequence with both methods bool isValidPrimary = ValidateUtf8(modifiedUtf8,utf8ValidationDelegate); bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); - ValidateCount(modifiedUtf8,utf8ValidationDelegate); + ValidateCount(modifiedUtf8,utf8ValidationDelegate,default,byteIndex); // Ensure both methods agree on the validation result try{ Assert.Equal(isValidPrimary, isValidFuschia);} @@ -1409,7 +1405,8 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg // Define a delegate that matches the signature of the methods you want to test public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); - public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) + // public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) + public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default, int? index = null) { int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; @@ -1432,7 +1429,8 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele byte* simdResult = utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); // Determine the index of the invalid byte if simdResult doesn't point to the end. - int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1; + // int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1; + try { @@ -1443,6 +1441,14 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele { Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); // PrintHexAndBinary(utf8); + if (index.HasValue) + { + PrintHexAndBinary(utf8, index.Value); + } + else + { + PrintHexAndBinary(utf8); + } throw; // Re-throw the exception to preserve the failure state } } From 1d27d6fa8462eb5223b1aadb94756fa5fcb8b5ea Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Mon, 20 May 2024 21:12:47 -0400 Subject: [PATCH 68/75] all tests working need cleanup --- src/UTF8.cs | 66 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index e4506f4..858c8a8 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -76,12 +76,44 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');//for debugging +// prevents double counting in case there is a toolong error on the edge + public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte) + { + // Check if the header byte belongs to a 2-byte UTF-8 character + if ((headerByte & 0b11100000) == 0b11000000) + { + return (1, 0); + } + // Check if the header byte belongs to a 3-byte UTF-8 character + else if ((headerByte & 0b11110000) == 0b11100000) + { + return (2, 0); + } + // Check if the header byte belongs to a 4-byte UTF-8 character + else if ((headerByte & 0b11111000) == 0b11110000) + { + + return (2, 1); + } + // Otherwise, it's a 1-byte character or continuation byte + return (0, 0); + } + + public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { int extraLen = 0; bool foundLeadingBytes = false; + // Print the byte value at the buf pointer + byte* PinputPlusProcessedlength = buf; + + + + int TooLongErroronEdgeUtfadjust = 0; + int TooLongErroronEdgeScalaradjust = 0; + for (int i = 0; i <= howFarBack; i++) { byte candidateByte = buf[0 - i]; @@ -92,6 +124,8 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) if (foundLeadingBytes) { + (TooLongErroronEdgeUtfadjust,TooLongErroronEdgeScalaradjust) = GetFinalScalarUtfAdjustments(candidateByte); + buf -= i; break; } @@ -108,6 +142,24 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}"); + bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80; + bool isOneByteAfterProcessedLength = (invalidBytePointer == PinputPlusProcessedlength); + + + + // // Print the byte value at the invalidBytePointer + + + + + if (isContinuationByte && isOneByteAfterProcessedLength) + { + + utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust; + scalarCountAdjustment += TooLongErroronEdgeScalaradjust; + + } + utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; @@ -302,12 +354,12 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( (adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - if (TooLongErroronEdge) - { - asciibytes += adjustascii; - contbytes += adjustcont; - n4 += adjustn4; - } + // if (TooLongErroronEdge) + // { + // asciibytes += adjustascii; + // contbytes += adjustcont; + // n4 += adjustn4; + // } var (utfadjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte + adjusttotalbyte); @@ -698,7 +750,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (isContinuationByte && isOneByteAfterProcessedLength) { - TooLongErroronEdge = true; + // TooLongErroronEdge = true; } From c22c649441d75568976e1a82d844319380eb3f4c Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Wed, 22 May 2024 12:17:11 -0400 Subject: [PATCH 69/75] cleanup --- README.md | 6 -- src/UTF8.cs | 176 +++++------------------------------- test/UTF8ValidationTests.cs | 142 ++++++++++++++++++++--------- 3 files changed, 120 insertions(+), 204 deletions(-) diff --git a/README.md b/README.md index 6b682b1..1138cee 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,6 @@ dotnet test To get a list of available tests, enter the command: -``` -dotnet test --list-tests | cut -d '(' -f 1 | uniq -``` - -For a far more verbose output: - ``` dotnet test --list-tests ``` diff --git a/src/UTF8.cs b/src/UTF8.cs index 858c8a8..502b7c3 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,72 +10,6 @@ namespace SimdUnicode public static class UTF8 { - -static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) -{ - int chunkSize = 16; // 128 bits = 16 bytes - - // Process each chunk for hexadecimal - Console.Write("Hex: "); - for (int i = 0; i < bytes.Length; i++) - { - if (i > 0 && i % chunkSize == 0) - Console.WriteLine(); // New line after every 16 bytes - - if (i == highlightIndex) - { - Console.ForegroundColor = ConsoleColor.Red; - Console.Write($"{bytes[i]:X2} "); - Console.ResetColor(); - } - else if (i % (chunkSize * 2) == 0) // print green every 256 bytes - { - Console.ForegroundColor = ConsoleColor.Green; - Console.Write($"{bytes[i]:X2} "); - Console.ResetColor(); - } - else - { - Console.Write($"{bytes[i]:X2} "); - } - - if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line - } - Console.WriteLine("\n"); // New line for readability and to separate hex from binary - - // Process each chunk for binary - Console.Write("Binary: "); - for (int i = 0; i < bytes.Length; i++) - { - if (i > 0 && i % chunkSize == 0) - Console.WriteLine(); // New line after every 16 bytes - - string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); - if (i == highlightIndex) - { - Console.ForegroundColor = ConsoleColor.Red; - Console.Write($"{binaryString} "); - Console.ResetColor(); - } - else if (i % (chunkSize * 2) == 0) // print green every 256 bytes - { - Console.ForegroundColor = ConsoleColor.Green; - Console.Write($"{binaryString} "); - Console.ResetColor(); - } - else - { - Console.Write($"{binaryString} "); - } - - if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line - } - Console.WriteLine(); // New line for readability -} - - - static Func byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');//for debugging - // prevents double counting in case there is a toolong error on the edge public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte) { @@ -92,7 +26,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt // Check if the header byte belongs to a 4-byte UTF-8 character else if ((headerByte & 0b11111000) == 0b11110000) { - return (2, 1); } // Otherwise, it's a 1-byte character or continuation byte @@ -107,10 +40,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt bool foundLeadingBytes = false; // Print the byte value at the buf pointer - byte* PinputPlusProcessedlength = buf; - - - + byte* PinputPlusProcessedlength = buf; int TooLongErroronEdgeUtfadjust = 0; int TooLongErroronEdgeScalaradjust = 0; @@ -119,8 +49,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt byte candidateByte = buf[0 - i]; foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; - - if (foundLeadingBytes) { @@ -140,27 +68,26 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt int TailScalarCountAdjustment = 0; byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); - // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}"); - - bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80; - bool isOneByteAfterProcessedLength = (invalidBytePointer == PinputPlusProcessedlength); - - - - // // Print the byte value at the invalidBytePointer - + // We need to take care of eg + // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 + // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 *11110000* 10011001 10101011 10000011 + // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge + // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100 + // Without the following check, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function + // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted + // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10----- + // the part between parentheses will be counted as valid and thus scalaradjust/utfadjust will be incremented once too much + bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80; + bool isOnEdge = (invalidBytePointer == PinputPlusProcessedlength); - if (isContinuationByte && isOneByteAfterProcessedLength) + if (isContinuationByte && isOnEdge) { - utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust; scalarCountAdjustment += TooLongErroronEdgeScalaradjust; - } - utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment += TailScalarCountAdjustment; @@ -295,7 +222,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // Assuming that a valid UTF-8 sequence ends at pInputBuffer, - // computes how many bytes are needed (eg what type of byte) to complete the last character. also counts the number of n4, n2 and ascii affected + // computes how many bytes are needed to complete the last character. also counts the number of n4, n2 and ascii affected // This will return 1, 2, 3. If the whole byte sequence is valid UTF-8, // and this function returns returnedvalue>0, then the bytes at pInputBuffer[0], // ... pInputBuffer[returnedvalue - 1] should be continuation bytes. @@ -309,8 +236,6 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in { if ((pInputBuffer[-i] & 0b11000000) != 0b10000000) { - - break; } contbyteadjust -= 1; @@ -330,19 +255,15 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { - - int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; int utfadjust = -2 * n4 - 2 * n3 - n2; int scalaradjust = -n4; - - return (utfadjust, scalaradjust); } - public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false) + public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) { // Calculate the total bytes from start_point to processedLength int totalbyte = processedLength - start_point; @@ -353,21 +274,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { (adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength); } - - // if (TooLongErroronEdge) - // { - // asciibytes += adjustascii; - // contbytes += adjustcont; - // n4 += adjustn4; - // } - var (utfadjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte + adjusttotalbyte); - return (utfadjust, scalaradjust); } - - public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength) { @@ -522,10 +432,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - - - - int processedLength = 0; int TempUtf16CodeUnitCountAdjustment= 0 ; int TempScalarCountAdjustment = 0; @@ -678,7 +584,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // TODO : this path is not explicitly tested, write tests + // Note/todo : this path is not yet explicitly tested int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); @@ -713,49 +619,13 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(error, error)) { - - int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32 - - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - bool TooLongErroronEdge = false; - utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; - - - // We need to take care of eg - // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 - // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011 - // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge - // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100 - // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function - // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted - // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10----- - // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much - // If this error arrive at the edge of 2 simd vector, that is where problem abound - - // Calculate the offset of the invalid byte pointer from the start of the input buffer - ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer); - - // Debugging output - - bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80; - - bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength); - - - if (isContinuationByte && isOneByteAfterProcessedLength) - { - - // TooLongErroronEdge = true; - } - - int totalbyteasciierror = processedLength - start_point; - var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes,TooLongErroronEdge); + var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); utf16CodeUnitCountAdjustment += utfadjustasciierror; scalarCountAdjustment += scalaradjustasciierror; @@ -769,13 +639,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // We have an unterminated sequence. var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); - processedLength -= i; n4 += tempn4; contbytes +=tempcont; - } + // (Nick Nuon)The counts for continuous bytes can probably be optimized: + // The draft had something like this line: + // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); + // this actually counts the number of 2 consecutive continuous bytes + // I put something that was bound to be working regardless as a slow but temporary fix: + Vector256 top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits Vector256 contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx @@ -797,10 +671,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( asciibytes += (int)(32 - Popcnt.PopCount((uint)mask)); } - // There are 2 possible scenarios here : either - // A) it arrives flush en the border. eg it doesnt need to be processed further - // B) There is some bytes remaining in which case we need to call the scalar functien - // Either way we need to calculate n2,n3 and update the utf16adjust and scalar adjust int totalbyte = processedLength - start_point; var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4, contbytes, totalbyte); diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 94d05c2..5389bf7 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -103,7 +103,7 @@ public void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) Assert.True(ValidateUtf8(input,utf8ValidationDelegate), $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - ValidateCount(input,utf8ValidationDelegate); + Assert.True(ValidateCount(input,utf8ValidationDelegate)); } } } @@ -191,7 +191,7 @@ public void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) fixed (byte* pInput = input) { ValidateUtf8(input,utf8ValidationDelegate); - ValidateCount(input,utf8ValidationDelegate); + Assert.True(ValidateCount(input,utf8ValidationDelegate)); } } } @@ -236,7 +236,7 @@ public void BadSequencesAVX() BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - + // this was in the C++ code public void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) { byte[] bad = new byte[] { 0x80 }; @@ -256,7 +256,7 @@ public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) { Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -326,7 +326,7 @@ private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8Vali try { Assert.True(isValidUtf8, $"Failure NoErrorTest. "); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -401,7 +401,7 @@ public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDe { Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -470,7 +470,7 @@ public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling. + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -545,7 +545,7 @@ public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) { Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling. + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -619,8 +619,7 @@ public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling. - utf8[i] = oldByte; // Restore the original byte + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -628,6 +627,7 @@ public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) PrintHexAndBinary(utf8, i); throw; // Rethrow the exception to fail the test. } + utf8[i] = oldByte; // Restore the original byte } } } @@ -706,7 +706,7 @@ public void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); utf8[i] = old; utf8[i + 1] = secondOld; @@ -793,7 +793,7 @@ public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); Assert.True(dotnetResult == pInput + i + offset); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } } @@ -864,7 +864,7 @@ public void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) utf8[position] = invalidByte; Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte Assert.True(InvalidateUtf8(utf8,position,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } } } @@ -996,7 +996,7 @@ public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i+1,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); utf8[i] = old; } } @@ -1059,7 +1059,7 @@ public void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8Validatio Assert.False(ValidateUtf8(filler,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(filler, filler.Length - 1,utf8ValidationDelegate)); - ValidateCount(filler,utf8ValidationDelegate); + Assert.True(ValidateCount(filler,utf8ValidationDelegate)); } @@ -1135,7 +1135,7 @@ public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - ValidateCount(utf8,utf8ValidationDelegate); + Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); } utf8[i] = old; @@ -1217,10 +1217,11 @@ public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) // Validate the modified sequence with both methods bool isValidPrimary = ValidateUtf8(modifiedUtf8,utf8ValidationDelegate); bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); - ValidateCount(modifiedUtf8,utf8ValidationDelegate,default,byteIndex); // Ensure both methods agree on the validation result - try{ Assert.Equal(isValidPrimary, isValidFuschia);} + try{ Assert.Equal(isValidPrimary, isValidFuschia); + Assert.True(ValidateCount(modifiedUtf8,utf8ValidationDelegate)); + } catch (Xunit.Sdk.XunitException) { Console.WriteLine($"Assertion failed at index: {byteIndex}"); @@ -1406,10 +1407,59 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); // public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) - public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default, int? index = null) +// public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default, int? index = null) +// { +// int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; +// int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + +// var isDefaultRange = range.Equals(default(Range)); +// var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); + +// unsafe +// { +// fixed (byte* pInput = utf8) +// { +// byte* startPtr = pInput + offset; + +// DotnetUtf16Adjustment = 0; +// DotnetScalarCountAdjustment = 0; +// DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + +// SimdUnicodeUtf16Adjustment = 0; +// SimdUnicodeScalarCountAdjustment = 0; +// byte* simdResult = utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + +// // Determine the index of the invalid byte if simdResult doesn't point to the end. +// // int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1; + + +// try +// { +// Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); +// Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); +// } +// catch (Exception) +// { +// Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); +// // PrintHexAndBinary(utf8); +// if (index.HasValue) +// { +// PrintHexAndBinary(utf8, index.Value); +// } +// else +// { +// PrintHexAndBinary(utf8); +// } +// throw; // Re-throw the exception to preserve the failure state +// } +// } +// } +// } + +public bool ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) { - int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; - int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + int dotnetUtf16Adjustment, dotnetScalarCountAdjustment; + int simdUnicodeUtf16Adjustment, simdUnicodeScalarCountAdjustment; var isDefaultRange = range.Equals(default(Range)); var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); @@ -1420,41 +1470,43 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele { byte* startPtr = pInput + offset; - DotnetUtf16Adjustment = 0; - DotnetScalarCountAdjustment = 0; - DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); + // Initialize adjustments + dotnetUtf16Adjustment = 0; + dotnetScalarCountAdjustment = 0; + DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out dotnetUtf16Adjustment, out dotnetScalarCountAdjustment); - SimdUnicodeUtf16Adjustment = 0; - SimdUnicodeScalarCountAdjustment = 0; - byte* simdResult = utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); + simdUnicodeUtf16Adjustment = 0; + simdUnicodeScalarCountAdjustment = 0; + byte* simdResult = utf8ValidationDelegate(pInput, length, out simdUnicodeUtf16Adjustment, out simdUnicodeScalarCountAdjustment); - // Determine the index of the invalid byte if simdResult doesn't point to the end. - // int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1; - + // Check for discrepancies and report them in one combined message + bool adjustmentsMatch = true; + string errorMessage = "Error: Adjustments mismatch - "; - try + if (dotnetScalarCountAdjustment != simdUnicodeScalarCountAdjustment) { - Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); - Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); + errorMessage += $"Expected Scalar Count Adjustment: {dotnetScalarCountAdjustment}, but got: {simdUnicodeScalarCountAdjustment}. "; + adjustmentsMatch = false; } - catch (Exception) + + if (dotnetUtf16Adjustment != simdUnicodeUtf16Adjustment) { - Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); - // PrintHexAndBinary(utf8); - if (index.HasValue) - { - PrintHexAndBinary(utf8, index.Value); - } - else - { - PrintHexAndBinary(utf8); - } - throw; // Re-throw the exception to preserve the failure state + errorMessage += $"Expected UTF16 Adjustment: {dotnetUtf16Adjustment}, but got: {simdUnicodeUtf16Adjustment}."; + adjustmentsMatch = false; } + + if (!adjustmentsMatch) + { + Console.WriteLine(errorMessage); + return false; + } + + return true; } } } + } From ad7dad3e750719df4a924bc68d04561be3995af2 Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Thu, 23 May 2024 00:24:37 -0400 Subject: [PATCH 70/75] dotnet format + ingempletethenascii test --- src/Ascii.cs | 2 +- src/UTF8.cs | 218 ++++++++++++++++++++---------------- test/UTF8ValidationTests.cs | 136 +++++++++++++--------- test/helpers/randomutf8.cs | 10 +- 4 files changed, 209 insertions(+), 157 deletions(-) diff --git a/src/Ascii.cs b/src/Ascii.cs index c256c20..e92845b 100644 --- a/src/Ascii.cs +++ b/src/Ascii.cs @@ -157,7 +157,7 @@ public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint buff } - return GetIndexOfFirstNonAsciiByteScalar(pBuffer, bufferLength); + return GetIndexOfFirstNonAsciiByteScalar(pBuffer, bufferLength); } diff --git a/src/UTF8.cs b/src/UTF8.cs index 502b7c3..fdbe1f2 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,36 +10,36 @@ namespace SimdUnicode public static class UTF8 { -// prevents double counting in case there is a toolong error on the edge - public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte) - { - // Check if the header byte belongs to a 2-byte UTF-8 character - if ((headerByte & 0b11100000) == 0b11000000) - { - return (1, 0); - } - // Check if the header byte belongs to a 3-byte UTF-8 character - else if ((headerByte & 0b11110000) == 0b11100000) + // prevents double counting in case there is a toolong error on the edge + public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte) { - return (2, 0); - } - // Check if the header byte belongs to a 4-byte UTF-8 character - else if ((headerByte & 0b11111000) == 0b11110000) - { - return (2, 1); + // Check if the header byte belongs to a 2-byte UTF-8 character + if ((headerByte & 0b11100000) == 0b11000000) + { + return (1, 0); + } + // Check if the header byte belongs to a 3-byte UTF-8 character + else if ((headerByte & 0b11110000) == 0b11100000) + { + return (2, 0); + } + // Check if the header byte belongs to a 4-byte UTF-8 character + else if ((headerByte & 0b11111000) == 0b11110000) + { + return (2, 1); + } + // Otherwise, it's a 1-byte character or continuation byte + return (0, 0); } - // Otherwise, it's a 1-byte character or continuation byte - return (0, 0); - } - public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) { int extraLen = 0; bool foundLeadingBytes = false; - // Print the byte value at the buf pointer + // Print the byte value at the buf pointer byte* PinputPlusProcessedlength = buf; int TooLongErroronEdgeUtfadjust = 0; int TooLongErroronEdgeScalaradjust = 0; @@ -50,9 +50,9 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; if (foundLeadingBytes) - { + { - (TooLongErroronEdgeUtfadjust,TooLongErroronEdgeScalaradjust) = GetFinalScalarUtfAdjustments(candidateByte); + (TooLongErroronEdgeUtfadjust, TooLongErroronEdgeScalaradjust) = GetFinalScalarUtfAdjustments(candidateByte); buf -= i; break; @@ -67,7 +67,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt int TailUtf16CodeUnitCountAdjustment = 0; int TailScalarCountAdjustment = 0; - byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); + byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen, out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment); // We need to take care of eg // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011 @@ -94,10 +94,10 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt return invalidBytePointer; } - public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; int pos = 0; @@ -110,41 +110,51 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt byte firstByte = pInputBuffer[pos]; while (firstByte < 0b10000000) { - if (++pos == inputLength) { + if (++pos == inputLength) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + inputLength; } + return pInputBuffer + inputLength; + } firstByte = pInputBuffer[pos]; } if ((firstByte & 0b11100000) == 0b11000000) { nextPos = pos + 2; - if (nextPos > inputLength) { + if (nextPos > inputLength) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + return pInputBuffer + pos; + } // Too short + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short + return pInputBuffer + pos; + } // Too short // range check codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(pInputBuffer[pos + 1] & 0b00111111); - if ((codePoint < 0x80) || (0x7ff < codePoint)) { + if ((codePoint < 0x80) || (0x7ff < codePoint)) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Overlong + return pInputBuffer + pos; + } // Overlong TempUtf16CodeUnitCountAdjustment -= 1; } else if ((firstByte & 0b11110000) == 0b11100000) { nextPos = pos + 3; - if (nextPos > inputLength) { - + if (nextPos > inputLength) + { + utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short + return pInputBuffer + pos; + } // Too short // range check codePoint = (uint)(firstByte & 0b00001111) << 12 | (uint)(pInputBuffer[pos + 1] & 0b00111111) << 6 | @@ -157,42 +167,56 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; } - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short - if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { + return pInputBuffer + pos; + } // Too short + if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } // Too short + return pInputBuffer + pos; + } // Too short TempUtf16CodeUnitCountAdjustment -= 2; } else if ((firstByte & 0b11111000) == 0b11110000) - { + { nextPos = pos + 4; - if (nextPos > inputLength) { + if (nextPos > inputLength) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment;return pInputBuffer + pos; } - if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) { + scalarCountAdjustment = TempScalarCountAdjustment; return pInputBuffer + pos; + } + if ((pInputBuffer[pos + 1] & 0b11000000) != 0b10000000) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } - if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) { + return pInputBuffer + pos; + } + if ((pInputBuffer[pos + 2] & 0b11000000) != 0b10000000) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } - if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000) { + return pInputBuffer + pos; + } + if ((pInputBuffer[pos + 3] & 0b11000000) != 0b10000000) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } + return pInputBuffer + pos; + } // range check codePoint = (uint)(firstByte & 0b00000111) << 18 | (uint)(pInputBuffer[pos + 1] & 0b00111111) << 12 | (uint)(pInputBuffer[pos + 2] & 0b00111111) << 6 | (uint)(pInputBuffer[pos + 3] & 0b00111111); - if (codePoint <= 0xffff || 0x10ffff < codePoint) { + if (codePoint <= 0xffff || 0x10ffff < codePoint) + { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment; - return pInputBuffer + pos; } + return pInputBuffer + pos; + } TempUtf16CodeUnitCountAdjustment -= 2; TempScalarCountAdjustment -= 1; } @@ -228,7 +252,8 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt // ... pInputBuffer[returnedvalue - 1] should be continuation bytes. // Note that this function is unsafe, and it is the caller's responsibility // to ensure that we can read at least 4 bytes before pInputBuffer. - public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int contbyte,int n4) adjustmentFactor(byte* pInputBuffer) { + public unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii, int contbyte, int n4) adjustmentFactor(byte* pInputBuffer) + { // Find the first non-continuation byte, working backward. int i = 1; int contbyteadjust = 0; @@ -240,17 +265,20 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in } contbyteadjust -= 1; } - if ((pInputBuffer[-i] & 0b10000000) == 0) { - return (0,i,-1,contbyteadjust,0); // We must have that i == 1 + if ((pInputBuffer[-i] & 0b10000000) == 0) + { + return (0, i, -1, contbyteadjust, 0); // We must have that i == 1 } - if ((pInputBuffer[-i] & 0b11100000) == 0b11000000) { - return (2 - i,i,0,contbyteadjust,0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. + if ((pInputBuffer[-i] & 0b11100000) == 0b11000000) + { + return (2 - i, i, 0, contbyteadjust, 0); // We have that i == 1 or i == 2, if i == 1, we are missing one byte. } - if ((pInputBuffer[-i] & 0b11110000) == 0b11100000) { - return (3 - i,i,0,contbyteadjust,0); // We have that i == 1 or i == 2 or i == 3, if i == 1, we are missing two bytes, if i == 2, we are missing one byte. + if ((pInputBuffer[-i] & 0b11110000) == 0b11100000) + { + return (3 - i, i, 0, contbyteadjust, 0); // We have that i == 1 or i == 2 or i == 3, if i == 1, we are missing two bytes, if i == 2, we are missing one byte. } // We must have that (pInputBuffer[-i] & 0b11111000) == 0b11110000 - return (4 - i,i,0,contbyteadjust,-1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. + return (4 - i, i, 0, contbyteadjust, -1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. } public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) @@ -282,7 +310,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) @@ -293,7 +321,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // We skip any ASCII characters at the start of the buffer int asciirun = 0; - for(; asciirun + 64 <= inputLength; asciirun += 64) + for (; asciirun + 64 <= inputLength; asciirun += 64) { Vector128 block1 = Avx.LoadVector128(pInputBuffer + asciirun); Vector128 block2 = Avx.LoadVector128(pInputBuffer + asciirun + 16); @@ -367,7 +395,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // we need to check if the previous block was incomplete. if (Sse2.MoveMask(prevIncomplete) != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref TempUtf16CodeUnitCountAdjustment, ref TempScalarCountAdjustment); } prevIncomplete = Vector128.Zero; } @@ -379,8 +407,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector128 byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f)); Vector128 byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); Vector128 sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high); - Vector128 prev2 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 2)); - Vector128 prev3 = Ssse3.AlignRight (currentBlock, prevInputBlock, (byte)(16 - 3)); + Vector128 prev2 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 2)); + Vector128 prev3 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 3)); prevInputBlock = currentBlock; Vector128 isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte); Vector128 isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte); @@ -389,7 +417,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector128 error = Sse2.Xor(must23As80, sc); if (Sse2.MoveMask(error) != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref TempUtf16CodeUnitCountAdjustment, ref TempScalarCountAdjustment); } prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue); } @@ -418,7 +446,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( } int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out TailUtf16CodeUnitCountAdjustment, out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { // An invalid byte was found by the scalar function @@ -430,10 +458,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( } - public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; int TailScalarCodeUnitCountAdjustment = 0; @@ -449,7 +477,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // We skip any ASCII characters at the start of the buffer int asciirun = 0; - for(; asciirun + 64 <= inputLength; asciirun += 64) + for (; asciirun + 64 <= inputLength; asciirun += 64) { Vector256 block1 = Avx.LoadVector256(pInputBuffer + asciirun); Vector256 block2 = Avx.LoadVector256(pInputBuffer + asciirun + 32); @@ -565,7 +593,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // at processedLength/16*16 or when an error occurs. /////////// int start_point = processedLength; - + // The block goes from processedLength to processedLength/16*16. int asciibytes = 0; // number of ascii bytes in the block (could also be called n1) int contbytes = 0; // number of continuation bytes in the block @@ -585,14 +613,14 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { // Note/todo : this path is not yet explicitly tested - int totalbyteasciierror = processedLength - start_point; - var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); + int totalbyteasciierror = processedLength - start_point; + var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); utf16CodeUnitCountAdjustment = utfadjustasciierror; scalarCountAdjustment = scalaradjustasciierror; int off = processedLength >= 3 ? processedLength - 3 : processedLength; - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment); } prevIncomplete = Vector256.Zero; } @@ -620,11 +648,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(error, error)) { int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32 - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); - utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment); + utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; - int totalbyteasciierror = processedLength - start_point; + int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); utf16CodeUnitCountAdjustment += utfadjustasciierror; @@ -638,10 +666,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { // We have an unterminated sequence. - var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); + var (totalbyteadjustment, i, tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); processedLength -= i; n4 += tempn4; - contbytes +=tempcont; + contbytes += tempcont; } // (Nick Nuon)The counts for continuous bytes can probably be optimized: @@ -660,7 +688,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(compareResult)); - + // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))); } @@ -672,7 +700,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( } int totalbyte = processedLength - start_point; - var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4, contbytes, totalbyte); + var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte); TempUtf16CodeUnitCountAdjustment = utf16adjust; TempScalarCountAdjustment = scalaradjust; @@ -687,8 +715,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte if (processedLength < inputLength) { - - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment); + + byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; @@ -697,7 +725,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // An invalid byte was found by the scalar function return invalidBytePointer; } - } + } utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; @@ -707,10 +735,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength) { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment= 0 ; + int TempUtf16CodeUnitCountAdjustment = 0; int TempScalarCountAdjustment = 0; - int utf16CodeUnitCountAdjustment=0, scalarCountAdjustment=0; + int utf16CodeUnitCountAdjustment = 0, scalarCountAdjustment = 0; if (pInputBuffer == null || inputLength <= 0) { @@ -720,7 +748,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // We skip any ASCII characters at the start of the buffer int asciirun = 0; - for(; asciirun + 64 <= inputLength; asciirun += 64) + for (; asciirun + 64 <= inputLength; asciirun += 64) { Vector128 block1 = AdvSimd.LoadVector128(pInputBuffer + asciirun); Vector128 block2 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 16); @@ -793,7 +821,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // we need to check if the previous block was incomplete. if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment); } prevIncomplete = Vector128.Zero; } @@ -805,8 +833,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector128 byte_1_low = Vector128.Shuffle(shuf2, (prev1 & v0f)); Vector128 byte_2_high = Vector128.Shuffle(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); Vector128 sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high); - Vector128 prev2 = AdvSimd.ExtractVector128 (prevInputBlock, currentBlock, (byte)(16 - 2)); - Vector128 prev3 = AdvSimd.ExtractVector128 (prevInputBlock, currentBlock, (byte)(16 - 3)); + Vector128 prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2)); + Vector128 prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3)); prevInputBlock = currentBlock; Vector128 isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte); Vector128 isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte); @@ -815,7 +843,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector128 error = AdvSimd.Xor(must23As80, sc); if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment); + return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment); } prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue); } @@ -844,7 +872,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( } int TailScalarCodeUnitCountAdjustment = 0; int TailUtf16CodeUnitCountAdjustment = 0; - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out TailUtf16CodeUnitCountAdjustment, out TailScalarCodeUnitCountAdjustment); if (invalidBytePointer != pInputBuffer + inputLength) { // An invalid byte was found by the scalar function @@ -854,7 +882,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( return pInputBuffer + inputLength; } - public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength,out int Utf16CodeUnitCountAdjustment,out int ScalarCodeUnitCountAdjustment) + public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment) { // if (AdvSimd.Arm64.IsSupported) @@ -863,7 +891,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // } if (Avx2.IsSupported) { - return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); + return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); } /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) { @@ -875,7 +903,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // } // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); - return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment,out ScalarCodeUnitCountAdjustment); + return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 5389bf7..1b77ca0 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -9,8 +9,6 @@ namespace tests; using BenchmarkDotNet.Disassemblers; using Iced.Intel; -// TODO: The various tests do not formally take into account the scenario where vector is all ASCII - public unsafe class Utf8SIMDValidationTests { @@ -20,7 +18,7 @@ public unsafe class Utf8SIMDValidationTests private static readonly Random rand = new Random(); // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; - static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths + static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; [Flags] public enum TestSystemRequirements @@ -68,8 +66,6 @@ public class TestIfCondition : FactAttribute { public TestIfCondition(Func condition, string skipReason) { - // if (condition == null) throw new ArgumentNullException(nameof(condition)); - // Only set the Skip property if the condition evaluates to false if (!condition.Invoke()) { @@ -377,6 +373,84 @@ public void NoErrorSpecificByteCountAVX() NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } +public void NoErrorIncompleteThenASCII(Utf8ValidationDelegate utf8ValidationDelegate) +{ + foreach (int outputLength in outputLengths){ + for (int trial = 0; trial < NumTrials; trial++) + { + var allAscii = new List(Enumerable.Repeat((byte)0, outputLength)); + int firstCodeLength = rand.Next(2, 5); + List singleBytes = generator.Generate(1, firstCodeLength); + + int incompleteLocation = 128 - rand.Next(1, firstCodeLength - 1); + allAscii.InsertRange(incompleteLocation, singleBytes); + + var utf8 = allAscii.ToArray(); + int cutOffLength = 128;//utf8.Length - rand.Next(1, firstCodeLength); + cutOffLength = Math.Min(cutOffLength, outputLength); // Ensure it doesn't exceed the length of truncatedUtf8 + byte[] truncatedUtf8 = new byte[outputLength]; // Initialized to zero + + Array.Copy(utf8, 0, truncatedUtf8, 0, cutOffLength); + + bool isValidUtf8 = ValidateUtf8(truncatedUtf8, utf8ValidationDelegate); + // string utf8HexString = BitConverter.ToString(truncatedUtf8).Replace("-", " "); + try + { + Assert.False(isValidUtf8); + Assert.True(InvalidateUtf8(truncatedUtf8, truncatedUtf8.Length, utf8ValidationDelegate)); + Assert.True(ValidateCount(truncatedUtf8, utf8ValidationDelegate)); + } + catch (Xunit.Sdk.XunitException) + { + PrintHexAndBinary(truncatedUtf8, incompleteLocation); + throw; + } + } + } +} + + + [Fact] + [Trait("Category", "scalar")] + public void NoErrorIncompleteThenASCIIScalar() + { + NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); + } + + // TODO:Uncomment when SSE is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] + // [Fact] + // [Trait("Category", "sse")] + // public void NoErrorIncompleteThenASCIISse() + // { + // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + // } + + // TODO:Uncomment when AVX512 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] + // [Trait("Category", "avx512")] + // public void NoErrorIncompleteThenASCIIAvx512() + // { + // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + // } + + // TODO:Uncomment when Arm64 is updated + // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + // [Trait("Category", "arm64")] + // public void NoErrorIncompleteThenASCIIArm64() + // { + // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + // } + + [Fact] + [Trait("Category", "avx")] + public void NoErrorIncompleteThenASCIIAVX() + { + NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + } + + + public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate) { // foreach (int outputLength in outputLengths) @@ -1224,7 +1298,7 @@ public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) } catch (Xunit.Sdk.XunitException) { - Console.WriteLine($"Assertion failed at index: {byteIndex}"); + Console.WriteLine($"Assertion failed. Byte randomly changed at index: {byteIndex}"); PrintHexAndBinary(utf8, byteIndex); throw; // Rethrow the exception to fail the test. } @@ -1406,56 +1480,6 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg // Define a delegate that matches the signature of the methods you want to test public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); - // public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) -// public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default, int? index = null) -// { -// int DotnetUtf16Adjustment, DotnetScalarCountAdjustment; -// int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - -// var isDefaultRange = range.Equals(default(Range)); -// var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); - -// unsafe -// { -// fixed (byte* pInput = utf8) -// { -// byte* startPtr = pInput + offset; - -// DotnetUtf16Adjustment = 0; -// DotnetScalarCountAdjustment = 0; -// DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment); - -// SimdUnicodeUtf16Adjustment = 0; -// SimdUnicodeScalarCountAdjustment = 0; -// byte* simdResult = utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); - -// // Determine the index of the invalid byte if simdResult doesn't point to the end. -// // int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1; - - -// try -// { -// Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}."); -// Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}."); -// } -// catch (Exception) -// { -// Console.WriteLine("ValidateCount Assertion failed. Inspecting utf8 array:"); -// // PrintHexAndBinary(utf8); -// if (index.HasValue) -// { -// PrintHexAndBinary(utf8, index.Value); -// } -// else -// { -// PrintHexAndBinary(utf8); -// } -// throw; // Re-throw the exception to preserve the failure state -// } -// } -// } -// } - public bool ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) { int dotnetUtf16Adjustment, dotnetScalarCountAdjustment; diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 89c39ee..5b30cd0 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -31,7 +31,7 @@ public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, i // return result.ToArray(); // } - public List Generate(int howManyUnits, int? byteCountInUnit = null) + public List Generate(int howManyUnits, int? byteCountInUnit = null) { var result = new List(); while (result.Count < howManyUnits) @@ -127,22 +127,22 @@ private int GenerateCodePoint(int byteCount) } } - public List AppendContinuationByte(List utf8Bytes) => - utf8Bytes.Concat(new byte[] {(byte)gen.Next(0x80, 0xBF + 1)}).ToList(); + public List AppendContinuationByte(List utf8Bytes) => + utf8Bytes.Concat(new byte[] { (byte)gen.Next(0x80, 0xBF + 1) }).ToList(); public void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex) { - // Calculate the start index for replacement + // Calculate the start index for replacement int startIndex = original.Length - replacement.Length; // Copy the replacement array into the original starting at startIndex Array.Copy(replacement, 0, original, startIndex, Math.Min(replacement.Length, original.Length - startIndex)); } - + private int PickRandomByteCount() From 2e2c0d4a848a539704b9897617a5c14d9a7f2f6c Mon Sep 17 00:00:00 2001 From: Nick Nuon Date: Thu, 23 May 2024 01:14:49 -0400 Subject: [PATCH 71/75] remove superfuos comment --- src/UTF8.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index fdbe1f2..2b9668d 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -612,7 +612,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - // Note/todo : this path is not yet explicitly tested int totalbyteasciierror = processedLength - start_point; var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); From 900472e029a548275cb85604f6be83d4b036c5c8 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 23 May 2024 17:45:37 -0400 Subject: [PATCH 72/75] fix: enable avx2 only when avx2 is available --- test/UTF8ValidationTests.cs | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 1b77ca0..9e2011d 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -137,8 +137,7 @@ public void simpleGoodSequencesScalar() // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void simpleGoodSequencesAVX() { simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -225,8 +224,7 @@ public void BadSequencesScalar() // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void BadSequencesAVX() { BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -295,8 +293,7 @@ public void NoErrorScalar() // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorAVX() { NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -366,8 +363,7 @@ public void NoErrorSpecificByteCountScalar() // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorSpecificByteCountAVX() { NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -909,8 +905,8 @@ public void TooShortErrorAtEndScalar() // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooShortErrorAtEndAVX() { TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); From 58d67cc929b6bb2864768eb33f617a751471dc2a Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 23 May 2024 19:55:05 -0400 Subject: [PATCH 73/75] fix: complete correction --- test/UTF8ValidationTests.cs | 80 ++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 9e2011d..8500233 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -13,12 +13,12 @@ public unsafe class Utf8SIMDValidationTests { - private const int NumTrials = 1000; + private const int NumTrials = 100; private static readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1); private static readonly Random rand = new Random(); // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; - static int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; + static int[] outputLengths = { 128, 345, 1000 }; [Flags] public enum TestSystemRequirements @@ -76,7 +76,7 @@ public TestIfCondition(Func condition, string skipReason) - public void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) + private void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { string[] goodSequences = { "a", @@ -144,7 +144,7 @@ public void simpleGoodSequencesAVX() } - public void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) + private void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) { string[] badSequences = { "\xC3\x28", @@ -231,13 +231,13 @@ public void BadSequencesAVX() } // this was in the C++ code - public void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) + private void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) { byte[] bad = new byte[] { 0x80 }; Assert.False(ValidateUtf8(bad,utf8ValidationDelegate)); } - public void NoError(Utf8ValidationDelegate utf8ValidationDelegate) + private void NoError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -299,7 +299,7 @@ public void NoErrorAVX() NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void NoErrorSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate) + private void NoErrorSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate) { RunTestForByteLength(1,utf8ValidationDelegate); RunTestForByteLength(2,utf8ValidationDelegate); @@ -369,7 +369,7 @@ public void NoErrorSpecificByteCountAVX() NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } -public void NoErrorIncompleteThenASCII(Utf8ValidationDelegate utf8ValidationDelegate) +private void NoErrorIncompleteThenASCII(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths){ for (int trial = 0; trial < NumTrials; trial++) @@ -438,8 +438,7 @@ public void NoErrorIncompleteThenASCIIScalar() // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorIncompleteThenASCIIAVX() { NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -447,11 +446,10 @@ public void NoErrorIncompleteThenASCIIAVX() - public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate) + private void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate) { - // foreach (int outputLength in outputLengths) + foreach (int outputLength in outputLengths) { - int outputLength = 256; for (int trial = 0; trial < NumTrials; trial++) { @@ -515,14 +513,13 @@ public void NoErrorIncompleteAt256VectorScalar() // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorIncompleteAt256VectorAVX() { NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) + private void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -590,14 +587,13 @@ public void BadHeaderBitsScalar() // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void BadHeaderBitsAVX() { BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) + private void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -663,14 +659,13 @@ public void TooShortErrorScalar() // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooShortErrorAVX() { TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) + private void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) @@ -736,14 +731,13 @@ public void TooLongErrorScalar() // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooLongErrorAVX() { TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) + private void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -818,15 +812,14 @@ public void OverlongErrorScalar() // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void OverlongErrorAVX() { OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) + private void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -912,7 +905,7 @@ public void TooShortErrorAtEndAVX() TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - [Fact] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooShortErrorAtEndAvx2() { TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -920,7 +913,7 @@ public void TooShortErrorAtEndAvx2() //corresponds to condition 5.4.1 in the paper - public void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) + private void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) { var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF @@ -972,14 +965,13 @@ public void Invalid0xf50xffScalar() // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void Invalid0xf50xffAVX() { Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - [Fact] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void Invalid0xf50xffAvx2() { Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); @@ -1049,7 +1041,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) } - public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) + private void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1106,15 +1098,14 @@ public void TooLargeErrorScalar() // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooLargeErrorAvx() { TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8ValidationDelegate) + private void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1169,20 +1160,19 @@ public void AsciiPlusContinuationAtEndErrorScalar() // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void AsciiPlusContinuationAtEndErrorAVX() { AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - [Fact] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void AsciiPlusContinuationAtEndErrorAvx2() { AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + private void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1249,15 +1239,14 @@ public void SurrogateErrorTestScalar() // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void SurrogateErrorTestAVX() { SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - public void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) + private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1336,8 +1325,7 @@ public void BruteForceTestScalar() // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - [Fact] - [Trait("Category", "avx")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void BruteForceTestAVX() { BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); From 5a6899acc96fd07b40de4162912cdd3d56f6873a Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Thu, 23 May 2024 19:56:54 -0400 Subject: [PATCH 74/75] comment --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 1138cee..2d91f64 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,12 @@ cd test dotnet test ``` +To see which tests are running, we recommend setting the verbosity level: + +``` +dotnet test -v d +``` + To get a list of available tests, enter the command: ``` From 0f4d294479cd6ea75f99856a5bc90140a96b48fc Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Fri, 24 May 2024 13:37:17 -0400 Subject: [PATCH 75/75] fix: add [Trait("Category", "avx")] --- test/UTF8ValidationTests.cs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index 8500233..d30e6e1 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -137,6 +137,7 @@ public void simpleGoodSequencesScalar() // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void simpleGoodSequencesAVX() { @@ -224,6 +225,7 @@ public void BadSequencesScalar() // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void BadSequencesAVX() { @@ -293,6 +295,7 @@ public void NoErrorScalar() // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorAVX() { @@ -363,6 +366,7 @@ public void NoErrorSpecificByteCountScalar() // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorSpecificByteCountAVX() { @@ -438,6 +442,7 @@ public void NoErrorIncompleteThenASCIIScalar() // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorIncompleteThenASCIIAVX() { @@ -513,6 +518,7 @@ public void NoErrorIncompleteAt256VectorScalar() // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void NoErrorIncompleteAt256VectorAVX() { @@ -587,6 +593,7 @@ public void BadHeaderBitsScalar() // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void BadHeaderBitsAVX() { @@ -659,6 +666,7 @@ public void TooShortErrorScalar() // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooShortErrorAVX() { @@ -731,6 +739,7 @@ public void TooLongErrorScalar() // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooLongErrorAVX() { @@ -812,6 +821,7 @@ public void OverlongErrorScalar() // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void OverlongErrorAVX() { @@ -898,13 +908,14 @@ public void TooShortErrorAtEndScalar() // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } - + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooShortErrorAtEndAVX() { TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooShortErrorAtEndAvx2() { @@ -965,12 +976,14 @@ public void Invalid0xf50xffScalar() // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void Invalid0xf50xffAVX() { Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void Invalid0xf50xffAvx2() { @@ -1098,6 +1111,7 @@ public void TooLargeErrorScalar() // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooLargeErrorAvx() { @@ -1160,12 +1174,14 @@ public void AsciiPlusContinuationAtEndErrorScalar() // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void AsciiPlusContinuationAtEndErrorAVX() { AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void AsciiPlusContinuationAtEndErrorAvx2() { @@ -1239,6 +1255,7 @@ public void SurrogateErrorTestScalar() // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void SurrogateErrorTestAVX() { @@ -1325,6 +1342,7 @@ public void BruteForceTestScalar() // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); // } + [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void BruteForceTestAVX() {