Skip to content

Commit

Permalink
Merge pull request #36 from simdutf/optimize_avx2
Browse files Browse the repository at this point in the history
fix: optimize the avx2 validator (shaving one SIMD instruction).
  • Loading branch information
lemire authored May 28, 2024
2 parents e5f5b39 + 71b6fe2 commit 5a99bb2
Showing 1 changed file with 1 addition and 17 deletions.
18 changes: 1 addition & 17 deletions src/UTF8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -671,23 +671,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
contbytes += tempcont;
}

// (Nick Nuon) The count of continuation bytes can probably be optimized.
// The draft had something like this line:
// contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
// but this actually counts the occurrences of two consecutive continuation bytes,
// so I put in something that was bound to work regardless, as a slow but temporary fix:

Vector256<byte> top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits
Vector256<byte> contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx

// Apply the mask and compare
Vector256<byte> maskedData = Avx2.And(currentBlock, top2bits);
Vector256<byte> compareResult = Avx2.CompareEqual(maskedData, contbytemask);
// Move mask to get integer representation
contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(compareResult));



contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(byte_2_high));
// We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)));
}
Expand Down

0 comments on commit 5a99bb2

Please sign in to comment.