Skip to content

Commit

Permalink
Add bytes decoding from UTF-16.
Browse files Browse the repository at this point in the history
Closes #1788.
  • Loading branch information
bbannier committed Dec 10, 2024
1 parent d28c275 commit 7cd01bb
Show file tree
Hide file tree
Showing 16 changed files with 236 additions and 109 deletions.
2 changes: 1 addition & 1 deletion hilti/lib/hilti.hlt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::unicode::Charset";
public type Charset = enum { ASCII, UTF8, UTF16 } &cxxname="hilti::rt::unicode::Charset";
public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
public type Captures = vector<bytes>;
public type Profiler = __library_type("hilti::rt::Profiler");
Expand Down
2 changes: 1 addition & 1 deletion hilti/runtime/include/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
);

/** For bytes decoding, which character set to use. */
HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16, ASCII);

constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;

Expand Down
9 changes: 9 additions & 0 deletions hilti/runtime/src/tests/bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ TEST_CASE("construct") {
CHECK_EQ(Bytes("123", Enum(unicode::Charset::ASCII)).str(), "123");
CHECK_EQ(Bytes("abc", Enum(unicode::Charset::ASCII)).str(), "abc");
CHECK_EQ(Bytes("abc", Enum(unicode::Charset::UTF8)).str(), "abc");
CHECK_EQ(Bytes("abc", Enum(unicode::Charset::UTF16)).str(), "abc");

CHECK_EQ(Bytes("\xF0\x9F\x98\x85", Enum(unicode::Charset::UTF8)).str(), "\xF0\x9F\x98\x85");
CHECK_EQ(Bytes("\xc3\x28", Enum(unicode::Charset::UTF8), unicode::DecodeErrorStrategy::REPLACE).str(), "\ufffd(");
Expand All @@ -60,6 +61,8 @@ TEST_CASE("construct") {
// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS(Bytes("123", Enum(unicode::Charset::Undef)), "unknown character set for encoding",
const RuntimeError&);

// FIXME(bbannier): add test case for invalid UTF-16.
}

TEST_CASE("decode") {
Expand All @@ -79,6 +82,8 @@ TEST_CASE("decode") {
CHECK_THROWS_WITH_AS("\xc3\x28"_b.decode(unicode::Charset::UTF8, unicode::DecodeErrorStrategy::STRICT),
"illegal UTF8 sequence in string", const RuntimeError&);

// FIXME(bbannier): add test cases decoding from valid and invalid UTF-16.

CHECK_THROWS_WITH_AS("123"_b.decode(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);
}
Expand Down Expand Up @@ -215,6 +220,8 @@ TEST_CASE("lower") {
// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS("123"_b.lower(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);

// FIXME(bbannier): check that lower-casing UTF-16 bytes produces lowercase UTF-16 bytes.
}

TEST_CASE("match") {
Expand Down Expand Up @@ -514,6 +521,8 @@ TEST_CASE("upper") {
// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS("123"_b.upper(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);

// FIXME(bbannier): check that upper-casing UTF-16 bytes produces uppercase UTF-16 bytes.
}

TEST_CASE("append") {
Expand Down
1 change: 1 addition & 0 deletions hilti/runtime/src/tests/to_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ TEST_CASE("integer::BitOrder") {
// Each `unicode::Charset` value is expected to render as "Charset::<label>".
TEST_CASE("bytes::Charset") {
    // The unset value.
    CHECK_EQ(to_string(Enum(unicode::Charset::Undef)), "Charset::Undef");

    // The concrete character sets.
    CHECK_EQ(to_string(Enum(unicode::Charset::UTF16)), "Charset::UTF16");
    CHECK_EQ(to_string(Enum(unicode::Charset::UTF8)), "Charset::UTF8");
    CHECK_EQ(to_string(Enum(unicode::Charset::ASCII)), "Charset::ASCII");
}

Expand Down
54 changes: 54 additions & 0 deletions hilti/runtime/src/types/bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <string>
#include <string_view>

#include <hilti/rt/types/bytes.h>
#include <hilti/rt/types/integer.h>
Expand Down Expand Up @@ -79,6 +82,34 @@ Bytes::Bytes(std::string s, unicode::Charset cs, unicode::DecodeErrorStrategy er
return;
}

case unicode::Charset::UTF16: {
    // Run `s` through `utf8::next16`, collect the resulting code points in
    // `t`, and store `t` converted to UTF-8. Positions that `utf8::next16`
    // rejects are handled according to `errors`.
    // FIXME(bbannier): this has a lot of copy/paste from above case, clean up.
    std::u16string t;

    auto p = s.begin();
    auto e = s.end();

    while ( p < e ) {
        try {
            auto cp = utf8::next16(p, e);
            utf8::append16(cp, t);
        } catch ( const utf8::exception& ) {
            switch ( errors.value() ) {
                case unicode::DecodeErrorStrategy::IGNORE: break;
                case unicode::DecodeErrorStrategy::REPLACE: {
                    // The replacement goes into the output `t`. Appending to the
                    // input `s` would invalidate `p` and `e` while they are in use.
                    utf8::append16(unicode::REPLACEMENT_CHARACTER, t);
                    break;
                }
                case unicode::DecodeErrorStrategy::STRICT:
                    throw RuntimeError("illegal UTF-16 character in string");
            }

            // Move past the rejected position so that IGNORE and REPLACE make
            // progress; this matches the UTF-16 branch of `decode()`.
            // NOTE(review): assumes `utf8::next16` leaves `p` unchanged when it
            // throws -- confirm against the bundled utfcpp version.
            p = std::next(p);
        }
    }

    *this = utf8::utf16to8(t);
    return;
}

case unicode::Charset::ASCII: {
std::string t;
for ( const auto& c : s ) {
Expand Down Expand Up @@ -109,6 +140,29 @@ std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy erro
case unicode::Charset::UTF8:
// Data is already in UTF-8, but let's validate it.
return Bytes(str(), cs, errors).str();
case unicode::Charset::UTF16: {
    // Read the stored data through `utf8::next16`, collect the code points
    // in `t`, and return `t` converted to UTF-8. Positions that
    // `utf8::next16` rejects are handled according to `errors`.
    std::u16string t;

    // Bind the data once so both iterators refer to the same string object.
    const auto& data = this->str();
    auto p = data.begin();
    auto e = data.end();
    while ( p < e ) {
        try {
            auto cp = utf8::next16(p, e);
            utf8::append16(cp, t);
        } catch ( const utf8::exception& ) {
            switch ( errors.value() ) {
                case unicode::DecodeErrorStrategy::IGNORE: break;
                case unicode::DecodeErrorStrategy::REPLACE:
                    utf8::append16(unicode::REPLACEMENT_CHARACTER, t);
                    // Without this `break`, REPLACE fell through into STRICT
                    // and threw instead of substituting.
                    break;
                case unicode::DecodeErrorStrategy::STRICT:
                    throw RuntimeError("illegal UTF-16 character in string");
            }

            // Move past the rejected position before trying again.
            p = std::next(p);
        }
    }

    return utf8::utf16to8(t);
}

case unicode::Charset::ASCII: {
std::string s;
Expand Down
1 change: 1 addition & 0 deletions hilti/runtime/src/unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ std::string to_string(const unicode::Charset& x, tag /*unused*/) {
switch ( x.value() ) {
case unicode::Charset::ASCII: return "Charset::ASCII";
case unicode::Charset::UTF8: return "Charset::UTF8";
case unicode::Charset::UTF16: return "Charset::UTF16";
case unicode::Charset::Undef: return "Charset::Undef";
}

Expand Down
3 changes: 2 additions & 1 deletion spicy/lib/spicy.spicy
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ public type ByteOrder = enum {
## Specifies the character set for bytes encoding/decoding.
public type Charset = enum {
ASCII,
UTF8
UTF8,
UTF16,
} &cxxname="hilti::rt::unicode::Charset";

## Specifies how data is handled that's not representable in a specified character set.
Expand Down
33 changes: 21 additions & 12 deletions tests/Baseline/hilti.ast.basic-module/debug.log
Original file line number Diff line number Diff line change
Expand Up @@ -323,38 +323,47 @@
[debug/ast-final] - ctor::String <is_literal="true" value="::hilti::rt::Protocol"> [parent @e:XXX] [@c:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @c:XXX] [@q:XXX]
[debug/ast-final] - type::String <declaration="-" type="-" unified="string" wildcard="false"> [parent @q:XXX] (resolved) [@t:XXX]
[debug/ast-final] - declaration::Type <canonical-id="hilti::Charset" declaration="D9" fqid="hilti::Charset" id="Charset" linkage="public"> [parent @d:XXX] (hilti.hlt:14:1-14:82) [@d:XXX]
[debug/ast-final] - declaration::Type <canonical-id="hilti::Charset" declaration="D9" fqid="hilti::Charset" id="Charset" linkage="public"> [parent @d:XXX] (hilti.hlt:14:1-14:89) [@d:XXX]
[debug/ast-final] | ASCII -> declaration::Constant <canonical-id="hilti::ASCII" declaration="-" fqid="hilti::Charset::ASCII" id="ASCII" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] | UTF16 -> declaration::Constant <canonical-id="hilti::UTF16" declaration="-" fqid="hilti::Charset::UTF16" id="UTF16" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] | UTF8 -> declaration::Constant <canonical-id="hilti::UTF8" declaration="D21" fqid="hilti::Charset::UTF8" id="UTF8" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] | Undef -> declaration::Constant <canonical-id="hilti::Undef_7" declaration="-" fqid="hilti::Charset::Undef" id="Undef" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @d:XXX] (hilti.hlt:14:23-14:42) [@q:XXX]
[debug/ast-final] - type::Enum <declaration="D9" type="T7" unified="name(hilti::Charset)" wildcard="false"> [parent @q:XXX] (hilti.hlt:14:23-14:42) (resolved) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @d:XXX] (hilti.hlt:14:23-14:49) [@q:XXX]
[debug/ast-final] - type::Enum <declaration="D9" type="T7" unified="name(hilti::Charset)" wildcard="false"> [parent @q:XXX] (hilti.hlt:14:23-14:49) (resolved) [@t:XXX]
[debug/ast-final] | ASCII -> declaration::Constant <canonical-id="hilti::ASCII" declaration="-" fqid="hilti::Charset::ASCII" id="ASCII" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] | UTF16 -> declaration::Constant <canonical-id="hilti::UTF16" declaration="-" fqid="hilti::Charset::UTF16" id="UTF16" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] | UTF8 -> declaration::Constant <canonical-id="hilti::UTF8" declaration="D21" fqid="hilti::Charset::UTF8" id="UTF8" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] | Undef -> declaration::Constant <canonical-id="hilti::Undef_7" declaration="-" fqid="hilti::Charset::Undef" id="Undef" linkage="private"> [parent @t:XXX] [@d:XXX] ([@d:XXX])
[debug/ast-final] - declaration::Constant <canonical-id="hilti::ASCII" declaration="-" fqid="hilti::Charset::ASCII" id="ASCII" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:42) [@d:XXX]
[debug/ast-final] - declaration::Constant <canonical-id="hilti::ASCII" declaration="-" fqid="hilti::Charset::ASCII" id="ASCII" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:49) [@d:XXX]
[debug/ast-final] - <empty>
[debug/ast-final] - expression::Ctor [parent @d:XXX] (const) (resolved) [@e:XXX]
[debug/ast-final] - ctor::Enum [parent @e:XXX] [@c:XXX]
[debug/ast-final] - type::enum_::Label <id="ASCII" value="0"> [parent @c:XXX] (hilti.hlt:14:30-14:34) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:14:30-14:34) [@q:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @c:XXX] [@q:XXX]
[debug/ast-final] - declaration::Constant <canonical-id="hilti::UTF8" declaration="D21" fqid="hilti::Charset::UTF8" id="UTF8" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:42) [@d:XXX]
[debug/ast-final] - declaration::Constant <canonical-id="hilti::UTF8" declaration="D21" fqid="hilti::Charset::UTF8" id="UTF8" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:49) [@d:XXX]
[debug/ast-final] - <empty>
[debug/ast-final] - expression::Ctor [parent @d:XXX] (const) (resolved) [@e:XXX]
[debug/ast-final] - ctor::Enum [parent @e:XXX] [@c:XXX]
[debug/ast-final] - type::enum_::Label <id="UTF8" value="1"> [parent @c:XXX] (hilti.hlt:14:37-14:40) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:14:30-14:34) [@q:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @c:XXX] [@q:XXX]
[debug/ast-final] - declaration::Constant <canonical-id="hilti::Undef_7" declaration="-" fqid="hilti::Charset::Undef" id="Undef" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:42) [@d:XXX]
[debug/ast-final] - declaration::Constant <canonical-id="hilti::UTF16" declaration="-" fqid="hilti::Charset::UTF16" id="UTF16" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:49) [@d:XXX]
[debug/ast-final] - <empty>
[debug/ast-final] - expression::Ctor [parent @d:XXX] (const) (resolved) [@e:XXX]
[debug/ast-final] - ctor::Enum [parent @e:XXX] [@c:XXX]
[debug/ast-final] - type::enum_::Label <id="Undef" value="-1"> [parent @c:XXX] (hilti.hlt:14:23-14:42) [@t:XXX]
[debug/ast-final] - type::enum_::Label <id="UTF16" value="2"> [parent @c:XXX] (hilti.hlt:14:43-14:47) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:14:30-14:34) [@q:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @c:XXX] [@q:XXX]
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:14:43-14:81) [@a:XXX]
[debug/ast-final] - Attribute <tag="&cxxname"> [parent @a:XXX] (hilti.hlt:14:43-14:81) [@a:XXX]
[debug/ast-final] - declaration::Constant <canonical-id="hilti::Undef_7" declaration="-" fqid="hilti::Charset::Undef" id="Undef" linkage="private"> [parent @t:XXX] (hilti.hlt:14:23-14:49) [@d:XXX]
[debug/ast-final] - <empty>
[debug/ast-final] - expression::Ctor [parent @d:XXX] (const) (resolved) [@e:XXX]
[debug/ast-final] - ctor::Enum [parent @e:XXX] [@c:XXX]
[debug/ast-final] - type::enum_::Label <id="Undef" value="-1"> [parent @c:XXX] (hilti.hlt:14:23-14:49) [@t:XXX]
[debug/ast-final] - QualifiedType <const="false" side="rhs"> [parent @t:XXX] (hilti.hlt:14:30-14:34) [@q:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @c:XXX] [@q:XXX]
[debug/ast-final] - AttributeSet [parent @d:XXX] (hilti.hlt:14:50-14:88) [@a:XXX]
[debug/ast-final] - Attribute <tag="&cxxname"> [parent @a:XXX] (hilti.hlt:14:50-14:88) [@a:XXX]
[debug/ast-final] - expression::Ctor [parent @a:XXX] (const) (resolved) [@e:XXX]
[debug/ast-final] - ctor::String <is_literal="true" value="::hilti::rt::unicode::Charset"> [parent @e:XXX] [@c:XXX]
[debug/ast-final] - QualifiedType <const="true" side="rhs"> [parent @c:XXX] [@q:XXX]
Expand Down Expand Up @@ -928,7 +937,7 @@
[debug/ast-final] [D6] hilti::AddressFamily [declaration::Type] (hilti.hlt:11:1-11:84)
[debug/ast-final] [D7] hilti::RealType [declaration::Type] (hilti.hlt:12:1-12:96)
[debug/ast-final] [D8] hilti::Protocol [declaration::Type] (hilti.hlt:13:1-13:78)
[debug/ast-final] [D9] hilti::Charset [declaration::Type] (hilti.hlt:14:1-14:82)
[debug/ast-final] [D9] hilti::Charset [declaration::Type] (hilti.hlt:14:1-14:89)
[debug/ast-final] [D10] hilti::DecodeErrorStrategy [declaration::Type] (hilti.hlt:15:1-15:118)
[debug/ast-final] [D11] hilti::Captures [declaration::Type] (hilti.hlt:16:1-16:37)
[debug/ast-final] [D12] hilti::Profiler [declaration::Type] (hilti.hlt:17:1-17:61)
Expand All @@ -940,15 +949,15 @@
[debug/ast-final] [D18] hilti::RecoverableFailure [declaration::Type] (hilti.hlt:56:1-56:98)
[debug/ast-final] [D19] hilti::MissingData [declaration::Type] (hilti.hlt:59:1-59:84)
[debug/ast-final] [D20] hilti [declaration::Module] (hilti.hlt:3:1-68:1)
[debug/ast-final] [D21] hilti::UTF8 [declaration::Constant] (hilti.hlt:14:23-14:42)
[debug/ast-final] [D21] hilti::UTF8 [declaration::Constant] (hilti.hlt:14:23-14:49)
[debug/ast-final] [D22] hilti::REPLACE [declaration::Constant] (hilti.hlt:15:35-15:66)
[debug/ast-final] [T1] hilti::BitOrder [type::Enum] (hilti.hlt:8:24-8:42)
[debug/ast-final] [T2] hilti::ByteOrder [type::Enum] (hilti.hlt:9:25-9:59)
[debug/ast-final] [T3] hilti::Side [type::Enum] (hilti.hlt:10:20-10:45)
[debug/ast-final] [T4] hilti::AddressFamily [type::Enum] (hilti.hlt:11:29-11:47)
[debug/ast-final] [T5] hilti::RealType [type::Enum] (hilti.hlt:12:24-12:62)
[debug/ast-final] [T6] hilti::Protocol [type::Enum] (hilti.hlt:13:24-13:46)
[debug/ast-final] [T7] hilti::Charset [type::Enum] (hilti.hlt:14:23-14:42)
[debug/ast-final] [T7] hilti::Charset [type::Enum] (hilti.hlt:14:23-14:49)
[debug/ast-final] [T8] hilti::DecodeErrorStrategy [type::Enum] (hilti.hlt:15:35-15:66)
[debug/ast-final] [T9] hilti::MatchState [type::Struct] (hilti.hlt:19:26-21:1)
[debug/ast-final] [T10] hilti::StreamStatistics [type::Struct] (hilti.hlt:23:32-28:1)
Expand Down
Loading

0 comments on commit 7cd01bb

Please sign in to comment.