From e5fb0e4341e0631a5416c896780c97ca5f3dcac2 Mon Sep 17 00:00:00 2001 From: Ryan Lamansky <13633345+RyanLamansky@users.noreply.github.com> Date: Sat, 27 Jul 2024 08:12:46 -0500 Subject: [PATCH] Added pure-stack UTF-8 decoder via enumerable syntax, moved UTF-16 decoder to a separate file. --- HtmlUtilities/CodePoint.cs | 65 +----------------- HtmlUtilities/Utf16DecoderEnumerable.cs | 20 ++++++ HtmlUtilities/Utf16DecoderEnumerator.cs | 50 ++++++++++++++ HtmlUtilities/Utf8DecoderEnumerable.cs | 20 ++++++ HtmlUtilities/Utf8DecoderEnumerator.cs | 90 +++++++++++++++++++++++++ 5 files changed, 182 insertions(+), 63 deletions(-) create mode 100644 HtmlUtilities/Utf16DecoderEnumerable.cs create mode 100644 HtmlUtilities/Utf16DecoderEnumerator.cs create mode 100644 HtmlUtilities/Utf8DecoderEnumerable.cs create mode 100644 HtmlUtilities/Utf8DecoderEnumerator.cs diff --git a/HtmlUtilities/CodePoint.cs b/HtmlUtilities/CodePoint.cs index 2b7a0cb..2191f01 100644 --- a/HtmlUtilities/CodePoint.cs +++ b/HtmlUtilities/CodePoint.cs @@ -703,70 +703,9 @@ public static IEnumerable EncodeUtf8(IEnumerable? source) } /// - /// Enumerates s from a of type without allocating heap memory. + /// Gets an enumerable for s from a of type without allocating heap memory. /// - public ref struct Utf16DecoderEnumerator - { - private ReadOnlySpan.Enumerator enumerator; - - /// - /// The current value. Not valid until has been called at least once. - /// - public CodePoint Current { readonly get; private set; } - - internal Utf16DecoderEnumerator(ReadOnlySpan source) - { - this.enumerator = source.GetEnumerator(); - this.Current = default; - } - - /// - /// Reads the next from the source. - /// - /// True if a value was found, false if the end of the source has been reached. 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - public bool MoveNext() - { - if (!enumerator.MoveNext()) - return false; - - var high = (int)enumerator.Current; - - if (high <= 0xD7FF || (high >= 0xE000 && high <= 0xFFFF)) - { - Current = high; - return true; - } - else if (!enumerator.MoveNext()) - { - return false; - } - - var low = (int)enumerator.Current; - Current = (high - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000; - - return true; - } - } - - /// - /// Wraps a of type for on-demand enumeration into s. - /// - public readonly ref struct Utf16DecoderEnumerable - { - private readonly ReadOnlySpan source; - - internal Utf16DecoderEnumerable(ReadOnlySpan source) - { - this.source = source; - } - - /// - /// Gets an enumerator to produce from the source. - /// - /// The enumerator. - public Utf16DecoderEnumerator GetEnumerator() => new(source); - } + public static Utf8DecoderEnumerable GetEnumerable(ReadOnlySpan source) => new(source); /// /// Gets an enumerable for s from a of type without allocating heap memory. diff --git a/HtmlUtilities/Utf16DecoderEnumerable.cs b/HtmlUtilities/Utf16DecoderEnumerable.cs new file mode 100644 index 0000000..ecdc38a --- /dev/null +++ b/HtmlUtilities/Utf16DecoderEnumerable.cs @@ -0,0 +1,20 @@ +namespace HtmlUtilities; + +/// +/// Wraps a of type for on-demand enumeration into s. +/// +public readonly ref struct Utf16DecoderEnumerable +{ + private readonly ReadOnlySpan source; + + internal Utf16DecoderEnumerable(ReadOnlySpan source) + { + this.source = source; + } + + /// + /// Gets an enumerator to produce from the source. + /// + /// The enumerator. 
+    public Utf16DecoderEnumerator GetEnumerator() => new(source);
+}
diff --git a/HtmlUtilities/Utf16DecoderEnumerator.cs b/HtmlUtilities/Utf16DecoderEnumerator.cs
new file mode 100644
index 0000000..8a0e725
--- /dev/null
+++ b/HtmlUtilities/Utf16DecoderEnumerator.cs
@@ -0,0 +1,61 @@
+using System.Runtime.CompilerServices;
+
+namespace HtmlUtilities;
+
+/// <summary>
+/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> without allocating heap memory.
+/// Unpaired surrogate code units are skipped instead of being combined into a meaningless value.
+/// </summary>
+public ref struct Utf16DecoderEnumerator
+{
+    private readonly ReadOnlySpan<char> source;
+    private int index;
+
+    /// <summary>
+    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
+    /// </summary>
+    public CodePoint Current { readonly get; private set; }
+
+    internal Utf16DecoderEnumerator(ReadOnlySpan<char> source)
+    {
+        this.source = source;
+        this.index = 0;
+        this.Current = default;
+    }
+
+    /// <summary>
+    /// Reads the next <see cref="CodePoint"/> from the source.
+    /// </summary>
+    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public bool MoveNext()
+    {
+        while (index < source.Length)
+        {
+            int unit = source[index++];
+
+            if (unit is < 0xD800 or > 0xDFFF)
+            {
+                // Not a surrogate: the code unit is the code point.
+                Current = unit;
+                return true;
+            }
+
+            // A pair is only valid as a high surrogate immediately followed by a low surrogate.
+            if (unit <= 0xDBFF && index < source.Length)
+            {
+                int low = source[index];
+                if (low is >= 0xDC00 and <= 0xDFFF)
+                {
+                    index++;
+                    Current = (unit - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
+                    return true;
+                }
+            }
+
+            // Unpaired surrogate: skip it. The following unit was not consumed, so it is decoded on its own.
+        }
+
+        return false;
+    }
+}
diff --git a/HtmlUtilities/Utf8DecoderEnumerable.cs b/HtmlUtilities/Utf8DecoderEnumerable.cs
new file mode 100644
index 0000000..a685d89
--- /dev/null
+++ b/HtmlUtilities/Utf8DecoderEnumerable.cs
@@ -0,0 +1,20 @@
+namespace HtmlUtilities;
+
+/// <summary>
+/// Wraps a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> for on-demand enumeration into <see cref="CodePoint"/>s.
+/// </summary>
+public readonly ref struct Utf8DecoderEnumerable
+{
+    private readonly ReadOnlySpan<byte> source;
+
+    internal Utf8DecoderEnumerable(ReadOnlySpan<byte> source)
+    {
+        this.source = source;
+    }
+
+    /// <summary>
+    /// Gets an enumerator to produce <see cref="CodePoint"/> values from the source.
+    /// </summary>
+    /// <returns>The enumerator.</returns>
+    public Utf8DecoderEnumerator GetEnumerator() => new(source);
+}
diff --git a/HtmlUtilities/Utf8DecoderEnumerator.cs b/HtmlUtilities/Utf8DecoderEnumerator.cs
new file mode 100644
index 0000000..341d193
--- /dev/null
+++ b/HtmlUtilities/Utf8DecoderEnumerator.cs
@@ -0,0 +1,80 @@
+namespace HtmlUtilities;
+
+/// <summary>
+/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> without allocating heap memory.
+/// Bytes that do not form a complete sequence are skipped.
+/// </summary>
+public ref struct Utf8DecoderEnumerator
+{
+    private readonly ReadOnlySpan<byte> source;
+    private int index;
+
+    /// <summary>
+    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
+    /// </summary>
+    public CodePoint Current { readonly get; private set; }
+
+    internal Utf8DecoderEnumerator(ReadOnlySpan<byte> source)
+    {
+        this.source = source;
+        this.index = 0;
+        this.Current = default;
+    }
+
+    /// <summary>
+    /// Reads the next <see cref="CodePoint"/> from the source.
+    /// </summary>
+    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
+    public bool MoveNext()
+    {
+        while (index < source.Length)
+        {
+            int lead = source[index++];
+
+            if (lead <= 0x7F)
+            {
+                Current = lead;
+                return true;
+            }
+
+            // The lead byte announces how many continuation bytes follow and carries the top payload bits.
+            int value, pending;
+            if ((lead >> 5) == 0b110)
+            {
+                value = lead & 0b00011111;
+                pending = 1;
+            }
+            else if ((lead >> 4) == 0b1110)
+            {
+                value = lead & 0b00001111;
+                pending = 2;
+            }
+            else if ((lead >> 3) == 0b11110)
+            {
+                value = lead & 0b00000111;
+                pending = 3;
+            }
+            else
+            {
+                continue; // Stray continuation byte or invalid lead byte: skip it rather than report a stale Current.
+            }
+
+            // The position is a field of this struct (not a by-value enumerator copy), so consumed bytes stay consumed.
+            // A byte that is not a continuation byte is left in place so it is decoded as the start of the next sequence.
+            while (pending > 0 && index < source.Length && (source[index] >> 6) == 0b10)
+            {
+                value = (value << 6) | (source[index++] & 0b00111111);
+                pending--;
+            }
+
+            if (pending != 0)
+                continue; // Invalid sequence: truncated by the end of input or interrupted by a non-continuation byte.
+
+            // NOTE(review): overlong forms, encoded surrogates and values above U+10FFFF are not rejected here - confirm whether CodePoint validates them.
+            Current = value;
+            return true;
+        }
+
+        return false;
+    }
+}