Skip to content

Commit

Permalink
Added pure-stack UTF-8 decoder via enumerable syntax, moved UTF-16 de…
Browse files Browse the repository at this point in the history
…coder to a separate file.
  • Loading branch information
RyanLamansky committed Jul 27, 2024
1 parent 21a524b commit e5fb0e4
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 63 deletions.
65 changes: 2 additions & 63 deletions HtmlUtilities/CodePoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -703,70 +703,9 @@ public static IEnumerable<byte> EncodeUtf8(IEnumerable<CodePoint>? source)
}

/// <summary>
/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> without allocating heap memory.
/// Gets an enumerable for <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> without allocating heap memory.
/// </summary>
public ref struct Utf16DecoderEnumerator
{
    // Walks the UTF-16 code units of the source span.
    private ReadOnlySpan<char>.Enumerator enumerator;

    /// <summary>
    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
    /// </summary>
    public CodePoint Current { readonly get; private set; }

    internal Utf16DecoderEnumerator(ReadOnlySpan<char> source)
    {
        this.enumerator = source.GetEnumerator();
        this.Current = default;
    }

    /// <summary>
    /// Reads the next <see cref="CodePoint"/> from the source.
    /// </summary>
    /// <remarks>
    /// Unpaired surrogates are skipped rather than decoded. A code unit that follows an unpaired
    /// high surrogate is not consumed with it, so it is decoded normally on its own.
    /// </remarks>
    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool MoveNext()
    {
        while (enumerator.MoveNext())
        {
            var unit = enumerator.Current;

            if (!char.IsSurrogate(unit))
            {
                // Basic Multilingual Plane: the code unit is the code point.
                Current = (int)unit;
                return true;
            }

            if (!char.IsHighSurrogate(unit))
                continue; // Low surrogate with no preceding high surrogate: invalid, skip it.

            // Look ahead on a copy so a non-matching code unit stays available for the next pass.
            var lookahead = enumerator;
            if (!lookahead.MoveNext())
                return false; // Source ended in the middle of a surrogate pair.

            var low = lookahead.Current;
            if (!char.IsLowSurrogate(low))
                continue; // High surrogate without its partner: skip only the high surrogate.

            // Commit the look-ahead now that the pair is known to be valid.
            enumerator = lookahead;
            Current = char.ConvertToUtf32(unit, low);
            return true;
        }

        return false;
    }
}

/// <summary>
/// Wraps a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> for on-demand enumeration into <see cref="CodePoint"/>s.
/// </summary>
public readonly ref struct Utf16DecoderEnumerable
{
    // UTF-16 code units handed to each enumerator created from this wrapper.
    private readonly ReadOnlySpan<char> codeUnits;

    internal Utf16DecoderEnumerable(ReadOnlySpan<char> source) => this.codeUnits = source;

    /// <summary>
    /// Creates a new enumerator that decodes the wrapped span into <see cref="CodePoint"/> values.
    /// </summary>
    /// <returns>An enumerator positioned before the first value.</returns>
    public Utf16DecoderEnumerator GetEnumerator()
    {
        return new Utf16DecoderEnumerator(codeUnits);
    }
}
public static Utf8DecoderEnumerable GetEnumerable(ReadOnlySpan<byte> source)
{
    // Only wraps the span; nothing is decoded until the result is enumerated.
    return new Utf8DecoderEnumerable(source);
}

/// <summary>
/// Gets an enumerable for <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> without allocating heap memory.
Expand Down
20 changes: 20 additions & 0 deletions HtmlUtilities/Utf16DecoderEnumerable.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
namespace HtmlUtilities;

/// <summary>
/// Wraps a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> for on-demand enumeration into <see cref="CodePoint"/>s.
/// </summary>
public readonly ref struct Utf16DecoderEnumerable
{
    // UTF-16 code units handed to each enumerator created from this wrapper.
    private readonly ReadOnlySpan<char> codeUnits;

    internal Utf16DecoderEnumerable(ReadOnlySpan<char> source) => this.codeUnits = source;

    /// <summary>
    /// Creates a new enumerator that decodes the wrapped span into <see cref="CodePoint"/> values.
    /// </summary>
    /// <returns>An enumerator positioned before the first value.</returns>
    public Utf16DecoderEnumerator GetEnumerator()
    {
        return new Utf16DecoderEnumerator(codeUnits);
    }
}
50 changes: 50 additions & 0 deletions HtmlUtilities/Utf16DecoderEnumerator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
using System.Runtime.CompilerServices;

namespace HtmlUtilities;

/// <summary>
/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> without allocating heap memory.
/// </summary>
public ref struct Utf16DecoderEnumerator
{
    // Walks the UTF-16 code units of the source span.
    private ReadOnlySpan<char>.Enumerator enumerator;

    /// <summary>
    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
    /// </summary>
    public CodePoint Current { readonly get; private set; }

    internal Utf16DecoderEnumerator(ReadOnlySpan<char> source)
    {
        this.enumerator = source.GetEnumerator();
        this.Current = default;
    }

    /// <summary>
    /// Reads the next <see cref="CodePoint"/> from the source.
    /// </summary>
    /// <remarks>
    /// Unpaired surrogates are skipped rather than decoded. A code unit that follows an unpaired
    /// high surrogate is not consumed with it, so it is decoded normally on its own.
    /// </remarks>
    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool MoveNext()
    {
        while (enumerator.MoveNext())
        {
            var unit = enumerator.Current;

            if (!char.IsSurrogate(unit))
            {
                // Basic Multilingual Plane: the code unit is the code point.
                Current = (int)unit;
                return true;
            }

            if (!char.IsHighSurrogate(unit))
                continue; // Low surrogate with no preceding high surrogate: invalid, skip it.

            // Look ahead on a copy so a non-matching code unit stays available for the next pass.
            var lookahead = enumerator;
            if (!lookahead.MoveNext())
                return false; // Source ended in the middle of a surrogate pair.

            var low = lookahead.Current;
            if (!char.IsLowSurrogate(low))
                continue; // High surrogate without its partner: skip only the high surrogate.

            // Commit the look-ahead now that the pair is known to be valid.
            enumerator = lookahead;
            Current = char.ConvertToUtf32(unit, low);
            return true;
        }

        return false;
    }
}
20 changes: 20 additions & 0 deletions HtmlUtilities/Utf8DecoderEnumerable.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
namespace HtmlUtilities;

/// <summary>
/// Wraps a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> for on-demand enumeration into <see cref="CodePoint"/>s.
/// </summary>
public readonly ref struct Utf8DecoderEnumerable
{
    // UTF-8 bytes handed to each enumerator created from this wrapper.
    private readonly ReadOnlySpan<byte> bytes;

    internal Utf8DecoderEnumerable(ReadOnlySpan<byte> source) => this.bytes = source;

    /// <summary>
    /// Creates a new enumerator that decodes the wrapped bytes into <see cref="CodePoint"/> values.
    /// </summary>
    /// <returns>An enumerator positioned before the first value.</returns>
    public Utf8DecoderEnumerator GetEnumerator()
    {
        return new Utf8DecoderEnumerator(bytes);
    }
}
90 changes: 90 additions & 0 deletions HtmlUtilities/Utf8DecoderEnumerator.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
namespace HtmlUtilities;

/// <summary>
/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> without allocating heap memory.
/// </summary>
public ref struct Utf8DecoderEnumerator
{
    // Walks the raw UTF-8 bytes of the source span.
    private ReadOnlySpan<byte>.Enumerator enumerator;

    /// <summary>
    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
    /// </summary>
    public CodePoint Current { readonly get; private set; }

    internal Utf8DecoderEnumerator(ReadOnlySpan<byte> source)
    {
        this.enumerator = source.GetEnumerator();
        this.Current = default;
    }

    /// <summary>
    /// Reads the next <see cref="CodePoint"/> from the source, skipping malformed input.
    /// </summary>
    /// <remarks>
    /// Stray continuation bytes, invalid lead bytes and truncated multi-byte sequences are skipped.
    /// A byte that interrupts a multi-byte sequence is not consumed with it and is decoded on its own.
    /// Overlong encodings, encoded surrogates and values above U+10FFFF are not rejected here.
    /// </remarks>
    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
    public bool MoveNext()
    {
        while (enumerator.MoveNext())
        {
            var lead = enumerator.Current;

            if (lead <= 0x7f)
            {
                // Single-byte (ASCII) sequence.
                Current = lead;
                return true;
            }

            // Classify the lead byte: payload bits it carries and how many continuation bytes follow.
            int value;
            int continuationCount;
            if ((lead >> 5) == 0b110)
            {
                value = lead & 0b00011111;
                continuationCount = 1;
            }
            else if ((lead >> 4) == 0b1110)
            {
                value = lead & 0b00001111;
                continuationCount = 2;
            }
            else if ((lead >> 3) == 0b11110)
            {
                value = lead & 0b00000111;
                continuationCount = 3;
            }
            else
            {
                // Stray continuation byte or invalid lead byte: skip it instead of
                // reporting success with a stale Current value.
                continue;
            }

            var complete = true;
            for (var i = 0; i < continuationCount; i++)
            {
                var payload = ReadContinuationPayload();
                if (payload < 0)
                {
                    complete = false;
                    break;
                }

                value = (value << 6) | payload;
            }

            if (!complete)
                continue; // Invalid sequence.

            Current = value;
            return true;
        }

        return false;
    }

    /// <summary>
    /// Consumes the next byte only if it is a UTF-8 continuation byte (binary 10xxxxxx).
    /// </summary>
    /// <returns>The six payload bits of the continuation byte, or -1 if the source ended or the next byte is not a continuation byte.</returns>
    private int ReadContinuationPayload()
    {
        // Advance a copy first so a non-continuation byte is left in place for the caller's next pass.
        var lookahead = enumerator;
        if (!lookahead.MoveNext())
            return -1;

        var next = lookahead.Current;
        if ((next >> 6) != 0b10)
            return -1;

        // Commit the advance on the real enumerator; the original passed it by value and lost it.
        enumerator = lookahead;
        return next & 0b00111111;
    }
}

0 comments on commit e5fb0e4

Please sign in to comment.