-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added pure-stack UTF-8 decoder via enumerable syntax, moved UTF-16 de…
…coder to a separate file.
- Loading branch information
1 parent
21a524b
commit e5fb0e4
Showing
5 changed files
with
182 additions
and
63 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
namespace HtmlUtilities; | ||
|
||
/// <summary>
/// A lightweight, stack-only view over a <see cref="ReadOnlySpan{T}"/> of <see cref="char"/> that can be used
/// in a <c>foreach</c> statement to decode its UTF-16 code units into <see cref="CodePoint"/>s on demand.
/// </summary>
public readonly ref struct Utf16DecoderEnumerable
{
    // The UTF-16 code units handed to every enumerator created by this instance.
    private readonly ReadOnlySpan<char> chars;

    internal Utf16DecoderEnumerable(ReadOnlySpan<char> source) => chars = source;

    /// <summary>
    /// Creates a new enumerator positioned before the first code unit of the wrapped span.
    /// </summary>
    /// <returns>An enumerator that yields <see cref="CodePoint"/> values.</returns>
    public Utf16DecoderEnumerator GetEnumerator()
    {
        return new Utf16DecoderEnumerator(chars);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
using System.Runtime.CompilerServices; | ||
|
||
namespace HtmlUtilities; | ||
|
||
/// <summary>
/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="char"/> without allocating heap memory.
/// </summary>
/// <remarks>
/// Code units outside the surrogate range (U+D800..U+DFFF) are yielded as-is. A high surrogate immediately
/// followed by a low surrogate is combined into a single supplementary code point. Unpaired surrogates are
/// ill-formed UTF-16 and are skipped; the code unit that follows an unpaired surrogate is still decoded.
/// </remarks>
public ref struct Utf16DecoderEnumerator
{
    private readonly ReadOnlySpan<char> source;

    // Index of the next code unit to read. Indexing (instead of a span enumerator) allows peeking at the
    // second half of a surrogate pair without consuming it when it turns out not to be a low surrogate.
    private int index;

    /// <summary>
    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
    /// </summary>
    public CodePoint Current { readonly get; private set; }

    internal Utf16DecoderEnumerator(ReadOnlySpan<char> source)
    {
        this.source = source;
        this.index = 0;
        this.Current = default;
    }

    /// <summary>
    /// Reads the next <see cref="CodePoint"/> from the source.
    /// </summary>
    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool MoveNext()
    {
        while (index < source.Length)
        {
            int unit = source[index++];

            // Anything outside the surrogate block is a complete code point on its own.
            if (unit < 0xD800 || unit > 0xDFFF)
            {
                Current = unit;
                return true;
            }

            // Only a high surrogate (D800..DBFF) may start a pair, and it must be followed by a low
            // surrogate (DC00..DFFF). The follower is peeked so that it is not lost when the pair is invalid.
            if (unit <= 0xDBFF && index < source.Length)
            {
                int low = source[index];
                if (low >= 0xDC00 && low <= 0xDFFF)
                {
                    index++;
                    Current = (unit - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
                    return true;
                }
            }

            // Unpaired surrogate (lone low surrogate, or high surrogate without a valid follower): skip it.
        }

        return false;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
namespace HtmlUtilities; | ||
|
||
/// <summary>
/// Wraps a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> containing UTF-8 data for on-demand enumeration into <see cref="CodePoint"/>s.
/// </summary>
public readonly ref struct Utf8DecoderEnumerable
{
    // The UTF-8 bytes handed to every enumerator created by this instance.
    private readonly ReadOnlySpan<byte> source;

    internal Utf8DecoderEnumerable(ReadOnlySpan<byte> source)
    {
        this.source = source;
    }

    /// <summary>
    /// Gets an enumerator to produce <see cref="CodePoint"/> from the source.
    /// </summary>
    /// <returns>The enumerator.</returns>
    public Utf8DecoderEnumerator GetEnumerator() => new(source);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
namespace HtmlUtilities; | ||
|
||
/// <summary>
/// Enumerates <see cref="CodePoint"/>s from a <see cref="ReadOnlySpan{T}"/> of type <see cref="byte"/> without allocating heap memory.
/// </summary>
/// <remarks>
/// Invalid sequences are skipped rather than reported: a byte that cannot start a sequence is dropped, and a
/// multi-byte sequence that is truncated or interrupted by a non-continuation byte is dropped up to (but not
/// including) the interrupting byte, which is then decoded as the start of a new sequence.
/// NOTE(review): overlong encodings, encoded surrogates and values above U+10FFFF are not rejected here;
/// confirm whether <see cref="CodePoint"/> or the callers are expected to validate them.
/// </remarks>
public ref struct Utf8DecoderEnumerator
{
    private readonly ReadOnlySpan<byte> source;

    // Index of the next byte to read. Indexing (instead of a span enumerator) keeps a single shared position
    // for the helper below and allows peeking at a byte without consuming it.
    private int index;

    /// <summary>
    /// The current <see cref="CodePoint"/> value. Not valid until <see cref="MoveNext"/> has been called at least once.
    /// </summary>
    public CodePoint Current { readonly get; private set; }

    internal Utf8DecoderEnumerator(ReadOnlySpan<byte> source)
    {
        this.source = source;
        this.index = 0;
        this.Current = default;
    }

    /// <summary>
    /// Reads the next <see cref="CodePoint"/> from the source.
    /// </summary>
    /// <returns>True if a value was found, false if the end of the source has been reached.</returns>
    public bool MoveNext()
    {
        while (index < source.Length)
        {
            int lead = source[index++];

            // Single-byte (ASCII) sequence: 0xxxxxxx.
            if (lead <= 0x7F)
            {
                Current = lead;
                return true;
            }

            int value;
            int continuationCount;

            if ((lead >> 5) == 0b110)
            {
                // Two-byte sequence: 110xxxxx 10xxxxxx.
                value = lead & 0b00011111;
                continuationCount = 1;
            }
            else if ((lead >> 4) == 0b1110)
            {
                // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
                value = lead & 0b00001111;
                continuationCount = 2;
            }
            else if ((lead >> 3) == 0b11110)
            {
                // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
                value = lead & 0b00000111;
                continuationCount = 3;
            }
            else
            {
                // Stray continuation byte (10xxxxxx) or invalid lead (11111xxx): skip it instead of
                // reporting the previous value a second time.
                continue;
            }

            if (TryReadContinuations(continuationCount, ref value))
            {
                Current = value;
                return true;
            }

            // Invalid or truncated sequence: drop it and resume decoding at the current position.
        }

        return false;
    }

    /// <summary>
    /// Consumes <paramref name="count"/> continuation bytes, shifting their payload bits into <paramref name="value"/>.
    /// </summary>
    /// <param name="count">The number of continuation bytes required by the lead byte.</param>
    /// <param name="value">The payload accumulated so far; updated with six bits per continuation byte.</param>
    /// <returns>
    /// True if all continuation bytes were present; false if the source ended or a non-continuation byte was
    /// found, in which case that byte is left unconsumed.
    /// </returns>
    private bool TryReadContinuations(int count, ref int value)
    {
        for (; count > 0; count--)
        {
            if (index >= source.Length)
            {
                return false;
            }

            int next = source[index];
            if ((next >> 6) != 0b10)
            {
                return false;
            }

            value = (value << 6) | (next & 0b00111111);
            index++;
        }

        return true;
    }
}