Skip to content

Commit

Permalink
added sse and avx results
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Lemire committed Jun 19, 2024
1 parent 9f2f80f commit ce39243
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 29 deletions.
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,28 @@ sudo dotnet run -c Release

## Results (x64)

To be completed.
On x64 systems, we offer several functions: a fallback function for legacy systems,
an SSE42 function for older CPUs, an AVX2 function for current x64 systems, and an
AVX-512 function for the most recent systems (AMD Zen 4, Intel Ice Lake, etc.).

On an Intel Ice Lake system, our validation function is up to several times
faster than the standard library when using at least the AVX2 routines. Only on pure
ASCII inputs (Latin-Lipsum) is the standard library seemingly faster, but all functions
are effectively at "memory speed", so the difference is unlikely to matter in practice.
A more realistic input is Twitter.json, which is mostly ASCII with some Unicode content.

| data set | SimdUnicode SSE42 (GB/s) | SimdUnicode AVX2 (GB/s) | .NET speed (GB/s) |
|:----------------|:-------------------------|:------------------------|-------------------|
| Twitter.json | 15 | 24 | 12 |
| Arabic-Lipsum | 4.5 | 6.2 | 2.3 |
| Chinese-Lipsum | 4.5 | 8.1 | 3.9 |
| Emoji-Lipsum | 4.3 | 7.1 | 0.9 |
| Hebrew-Lipsum | 4.5 | 8.0 | 2.3 |
| Hindi-Lipsum | 4.3 | 8.0 | 2.1 |
| Japanese-Lipsum | 4.5 | 8.0  | 3.5 |
| Korean-Lipsum | 4.5 | 8.0 | 1.3 |
| Latin-Lipsum | 50 | 76 | 96 |
| Russian-Lipsum | 4.3 | 8.0 | 1.2 |

## Results (ARM)

Expand Down
32 changes: 27 additions & 5 deletions benchmark/Benchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
[Config(typeof(Config))]
public class RealDataBenchmark
{
// We only inform the user once about the SIMD support of the system.
private static bool printed = false;

Check warning on line 66 in benchmark/Benchmark.cs

View workflow job for this annotation

GitHub Actions / Build and test on ubuntu-latest

Member 'printed' is explicitly initialized to its default value (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1805)

Check warning on line 66 in benchmark/Benchmark.cs

View workflow job for this annotation

GitHub Actions / Build and test on ubuntu-latest

Member 'printed' is explicitly initialized to its default value (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1805)

Check warning on line 66 in benchmark/Benchmark.cs

View workflow job for this annotation

GitHub Actions / Build and test on windows-latest

Member 'printed' is explicitly initialized to its default value (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1805)

Check warning on line 66 in benchmark/Benchmark.cs

View workflow job for this annotation

GitHub Actions / Build and test on windows-latest

Member 'printed' is explicitly initialized to its default value (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1805)

Check warning on line 66 in benchmark/Benchmark.cs

View workflow job for this annotation

GitHub Actions / Build and test on macos-latest

Member 'printed' is explicitly initialized to its default value (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1805)

Check warning on line 66 in benchmark/Benchmark.cs

View workflow job for this annotation

GitHub Actions / Build and test on macos-latest

Member 'printed' is explicitly initialized to its default value (https://learn.microsoft.com/dotnet/fundamentals/code-analysis/quality-rules/ca1805)
#pragma warning disable CA1812
private sealed class Config : ManualConfig
{
Expand All @@ -72,35 +74,55 @@ public Config()

if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
if (!printed)
{
#pragma warning disable CA1303
Console.WriteLine("ARM64 system detected.");
Console.WriteLine("ARM64 system detected.");
printed = true;
}
AddFilter(new AnyCategoriesFilter(["arm64", "scalar", "runtime"]));

}
else if (RuntimeInformation.ProcessArchitecture == Architecture.X64)
{
if (Vector512.IsHardwareAccelerated && System.Runtime.Intrinsics.X86.Avx512Vbmi.IsSupported)
{
if (!printed)
{
#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX-512 support.");
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX-512 support.");
printed = true;
}
AddFilter(new AnyCategoriesFilter(["avx512", "avx", "sse", "scalar", "runtime"]));
}
else if (Avx2.IsSupported)
{
if (!printed)
{
#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX2 support.");
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX2 support.");
printed = true;
}
AddFilter(new AnyCategoriesFilter(["avx", "sse", "scalar", "runtime"]));
}
else if (Ssse3.IsSupported)
{
if (!printed)
{
#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) with Sse4.2 support.");
Console.WriteLine("X64 system detected (Intel, AMD,...) with Sse4.2 support.");
printed = true;
}
AddFilter(new AnyCategoriesFilter(["sse", "scalar", "runtime"]));
}
else
{
if (!printed)
{
#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) without relevant SIMD support.");
Console.WriteLine("X64 system detected (Intel, AMD,...) without relevant SIMD support.");
printed = true;
}
AddFilter(new AnyCategoriesFilter(["scalar", "runtime"]));
}
}
Expand Down
76 changes: 59 additions & 17 deletions src/UTF8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public static class UTF8
}*/
if (Ssse3.IsSupported)
{
return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength,out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
}

return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
Expand Down Expand Up @@ -486,10 +486,10 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
int asciirun = 0;
for (; asciirun + 64 <= inputLength; asciirun += 64)
{
Vector128<byte> block1 = Avx.LoadVector128(pInputBuffer + asciirun);
Vector128<byte> block2 = Avx.LoadVector128(pInputBuffer + asciirun + 16);
Vector128<byte> block3 = Avx.LoadVector128(pInputBuffer + asciirun + 32);
Vector128<byte> block4 = Avx.LoadVector128(pInputBuffer + asciirun + 48);
Vector128<byte> block1 = Sse2.LoadVector128(pInputBuffer + asciirun);
Vector128<byte> block2 = Sse2.LoadVector128(pInputBuffer + asciirun + 16);
Vector128<byte> block3 = Sse2.LoadVector128(pInputBuffer + asciirun + 32);
Vector128<byte> block4 = Sse2.LoadVector128(pInputBuffer + asciirun + 48);

Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
if (Sse2.MoveMask(or) != 0)
Expand Down Expand Up @@ -582,7 +582,7 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
for (; processedLength + 16 <= inputLength; processedLength += 16)
{

Vector128<byte> currentBlock = Avx.LoadVector128(pInputBuffer + processedLength);
Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength);
int mask = Sse42.MoveMask(currentBlock);
if (mask == 0)
{
Expand All @@ -608,6 +608,27 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
return invalidBytePointer;
}
prevIncomplete = Vector128<byte>.Zero;

// Often, we have a lot of ASCII characters in a row.
int localasciirun = 16;
if (processedLength + localasciirun + 64 <= inputLength)
{
for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
{
Vector128<byte> block1 = Sse2.LoadVector128(pInputBuffer + processedLength + localasciirun);
Vector128<byte> block2 = Sse2.LoadVector128(pInputBuffer + processedLength + localasciirun + 16);
Vector128<byte> block3 = Sse2.LoadVector128(pInputBuffer + processedLength + localasciirun + 32);
Vector128<byte> block4 = Sse2.LoadVector128(pInputBuffer + processedLength + localasciirun + 48);

Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
if (Sse2.MoveMask(or) != 0)
{
break;
}
}
processedLength += localasciirun - 16;
}
asciibytes += localasciirun;
}
else // Contains non-ASCII characters, we need to do non-trivial processing
{
Expand Down Expand Up @@ -654,16 +675,16 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
}

prevIncomplete = Sse3.SubtractSaturate(currentBlock, maxValue);

contbytes += (int)Popcnt.PopCount((uint)Sse42.MoveMask(byte_2_high));
// We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
n4 += (int)Popcnt.PopCount((uint)Sse42.MoveMask(Sse42.SubtractSaturate(currentBlock, fourthByte)));
// important: we just update asciibytes if there was no error.
// We count the number of ascii bytes in the block using just some simple arithmetic
// and no expensive operation:
asciibytes += (int)(16 - Popcnt.PopCount((uint)mask));
}

// important: we just update asciibytes if there was no error.
// We count the number of ascii bytes in the block using just some simple arithmetic
// and no expensive operation:
asciibytes += (int)(16 - Popcnt.PopCount((uint)mask));
}


Expand Down Expand Up @@ -868,6 +889,27 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
return invalidBytePointer;
}
prevIncomplete = Vector256<byte>.Zero;

// Often, we have a lot of ASCII characters in a row.
int localasciirun = 32;
if (processedLength + localasciirun + 64 <= inputLength)
{
for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
{
Vector256<byte> block1 = Avx.LoadVector256(pInputBuffer + processedLength + localasciirun);
Vector256<byte> block2 = Avx.LoadVector256(pInputBuffer + processedLength + localasciirun + 32);
Vector256<byte> or = Avx2.Or(block1, block2);
if (Avx2.MoveMask(or) != 0)
{
break;
}
}
processedLength += localasciirun - 32;
}
asciibytes += localasciirun;

asciibytes += (int)32;

}
else // Contains non-ASCII characters, we need to do non-trivial processing
{
Expand Down Expand Up @@ -918,12 +960,11 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(byte_2_high));
// We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)));
// important: we just update asciibytes if there was no error.
// We count the number of ascii bytes in the block using just some simple arithmetic
// and no expensive operation:
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
}

// important: we just update asciibytes if there was no error.
// We count the number of ascii bytes in the block using just some simple arithmetic
// and no expensive operation:
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
}
// We may still have an error.
if (processedLength < inputLength || !Avx2.TestZ(prevIncomplete, prevIncomplete))
Expand Down Expand Up @@ -1075,7 +1116,8 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
prevIncomplete = Vector128<byte>.Zero;
// Often, we have a lot of ASCII characters in a row.
int localasciirun = 16;
if(processedLength + localasciirun + 64 <= inputLength) {
if (processedLength + localasciirun + 64 <= inputLength)
{
for (; processedLength + localasciirun + 64 <= inputLength; localasciirun += 64)
{
Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun);
Expand Down
13 changes: 7 additions & 6 deletions test/UTF8ValidationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ public TestIfCondition(Func<bool> condition, string skipReason)
// Only set the Skip property if the condition evaluates to false
if (!condition.Invoke())
{
if(skipReason == null) {
if (skipReason == null)
{
throw new ArgumentNullException(nameof(skipReason), "skipReason cannot be null when condition is false.");
}
Skip = skipReason;
Expand Down Expand Up @@ -821,7 +822,7 @@ public void Invalid0xf50xffScalar()
public void Invalid0xf50xffSse()
{
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
}
}


[Trait("Category", "avx")]
Expand Down Expand Up @@ -943,7 +944,7 @@ public void TooLargeErrorScalar()
public void TooLargeErrorSse()
{
TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
}
}

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
Expand Down Expand Up @@ -995,7 +996,7 @@ public void AsciiPlusContinuationAtEndErrorScalar()
public void AsciiPlusContinuationAtEndErrorSse()
{
AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
}
}


[Trait("Category", "arm64")]
Expand Down Expand Up @@ -1059,7 +1060,7 @@ public void SurrogateErrorTestScalar()
public void SurrogateErrorTestSse()
{
SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
}
}


[Trait("Category", "avx")]
Expand Down Expand Up @@ -1141,7 +1142,7 @@ public void BruteForceTestScalar()
public void BruteForceTestSse()
{
BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
}
}

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
Expand Down

0 comments on commit ce39243

Please sign in to comment.