Skip to content

Commit

Permalink
Implement decoding and encoding UTF16 Bytes.
Browse files Browse the repository at this point in the history
This adds two new charsets `UTF16LE` and `UTF16BE` for little and big
endian UTF16 respectively.

We also clean up use of the Unicode replacement character to make it
work consistently between UTF16 and UTF8.

Closes #1788.
  • Loading branch information
bbannier committed Dec 20, 2024
1 parent c15cad5 commit 4d4e0cf
Show file tree
Hide file tree
Showing 21 changed files with 515 additions and 163 deletions.
12 changes: 7 additions & 5 deletions doc/autogen/spicy-types.spicy
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@ Specifies the character set for bytes encoding/decoding.
.. spicy-code::

type Charset = enum {
ASCII,
UTF8
ASCII, # ASCII encoding
UTF8, # UTF8 encoding
UTF16LE, # UTF16 little endian encoding
UTF16BE, # UTF16 big endian encoding
};

.. _spicy_decodeerrorstrategy:
Expand All @@ -67,9 +69,9 @@ Specifies how data is handled that's not representable in a specified character
.. spicy-code::

type DecodeErrorStrategy = enum {
IGNORE, # data is skipped but processing continues
REPLACE, # data is replaced with a valid place-holder and processing continues
STRICT # runtime error is triggered
IGNORE, # data is skipped but processing continues
REPLACE, # data is replaced with a valid place-holder and processing continues
STRICT # runtime error is triggered
};

.. _spicy_matchstate:
Expand Down
2 changes: 1 addition & 1 deletion hilti/lib/hilti.hlt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::unicode::Charset";
public type Charset = enum { ASCII, UTF8, UTF16LE, UTF16BE } &cxxname="hilti::rt::unicode::Charset";
public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
public type Captures = vector<bytes>;
public type Profiler = __library_type("hilti::rt::Profiler");
Expand Down
4 changes: 3 additions & 1 deletion hilti/runtime/include/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
);

/** For bytes decoding, which character set to use. */
HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16LE, UTF16BE, ASCII);

/**
 * Code point of the Unicode replacement character (U+FFFD).
 *
 * Declared `inline` so that every translation unit including this header
 * refers to the same entity instead of getting its own internal-linkage copy.
 */
inline constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;

} // namespace unicode

Expand Down
45 changes: 45 additions & 0 deletions hilti/runtime/src/tests/bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <hilti/rt/types/integer.h>
#include <hilti/rt/types/regexp.h>

using namespace std::string_literals;
using namespace hilti::rt;
using namespace hilti::rt::bytes;

Expand Down Expand Up @@ -56,6 +57,33 @@ TEST_CASE("decode") {
CHECK_THROWS_WITH_AS("\xc3\x28"_b.decode(unicode::Charset::UTF8, unicode::DecodeErrorStrategy::STRICT),
"illegal UTF8 sequence in string", const RuntimeError&);

CHECK_EQ(Bytes("\0a\0b\0c"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT), "abc");
CHECK_EQ(Bytes("a\0b\0c\0"s).decode(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "abc");

// Our `decode` of UTF-16 bytes returns UTF8 string with BOM if they do not fit into ASCII, see e.g.,
// https://stackoverflow.com/questions/2223882/whats-the-difference-between-utf-8-and-utf-8-with-bom.
// To compute the expected results in Python encode with `utf_8_sig` encoding.
//
// LHS is a UTF16LE encoding of '東京' with BOM, RHS is the same text as UTF8 with BOM.
CHECK_EQ("\xff\xfeqg\xacN"_b.decode(unicode::Charset ::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
"\ufeff東京");

// Decoding of UTF16 with BOM. The byte order in the charset is just a hint, but we still decode as UTF16.
CHECK_EQ("\xff\xfeqg\xacN"_b.decode(unicode::Charset ::UTF16BE, unicode::DecodeErrorStrategy::STRICT),
"\ufeff東京");

// Decoding of too few bytes for UTF16 (expected an even number of bytes, provided an odd number).
CHECK_THROWS_WITH_AS(Bytes("\0a\0b\0"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT),
"illegal UTF16 character in string", const RuntimeError&);
CHECK_EQ(Bytes("\0a\0b\0"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::IGNORE), "ab");
CHECK_EQ(Bytes("\0a\0b\0"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::REPLACE), "ab\ufffd");

// Our UTF16 implementation seems to differ in what it considers invalid encodings, e.g., `\x00\xd8` is rejected by
// python-3.1[1-3], but accepted by us.
//
// TODO(bbannier): Test rejection of invalid UTF16 (but with even length).
CHECK_EQ(Bytes("\x00\xd8").decode(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "");

CHECK_THROWS_WITH_AS("123"_b.decode(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);
}
Expand Down Expand Up @@ -192,6 +220,13 @@ TEST_CASE("lower") {
// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS("123"_b.lower(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);

// No case change expected for these Japanese codepoints.
const auto tokio8 = "東京"_b;
CHECK_EQ(tokio8.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);

const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
CHECK_EQ(tokio16.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
}

TEST_CASE("match") {
Expand Down Expand Up @@ -488,9 +523,19 @@ TEST_CASE("upper") {
CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::UTF8).str(), "GÄNSEFÜẞCHEN");
CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::ASCII).str(), "G??NSEF????CHEN");

CHECK_EQ(Bytes("a\0b\0c\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
Bytes("A\0B\0C\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT));

// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS("123"_b.upper(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);

// No case change expected for these Japanese codepoints.
const auto tokio8 = "東京"_b;
CHECK_EQ(tokio8.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);

const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
CHECK_EQ(tokio16.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
}

TEST_CASE("append") {
Expand Down
6 changes: 6 additions & 0 deletions hilti/runtime/src/tests/string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ using namespace hilti::rt::bytes::literals;
TEST_SUITE_BEGIN("string");

TEST_CASE("encode") {
CHECK_EQ(string::encode("", unicode::Charset::ASCII), ""_b);
CHECK_EQ(string::encode("123", unicode::Charset::ASCII), "123"_b);
CHECK_EQ(string::encode("abc", unicode::Charset::ASCII), "abc"_b);
CHECK_EQ(string::encode("abc", unicode::Charset::UTF8), "abc"_b);
Expand All @@ -30,6 +31,11 @@ TEST_CASE("encode") {
unicode::DecodeErrorStrategy::STRICT),
"illegal ASCII character in string", const RuntimeError&);

CHECK_EQ(string::encode("abc", unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "a\0b\0c\0"_b);
CHECK_EQ(string::encode("abc", unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT), "\0a\0b\0c"_b);
CHECK_EQ(string::encode("東京", unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "qg\xacN"_b);
CHECK_EQ(string::encode("東京", unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT), "gqN\xac"_b);

// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS(string::encode("123", unicode::Charset::Undef), "unknown character set for encoding",
const RuntimeError&);
Expand Down
2 changes: 2 additions & 0 deletions hilti/runtime/src/tests/to_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ TEST_CASE("integer::BitOrder") {
// Checks that `to_string` renders every `unicode::Charset` enumerator
// (including the UTF16 variants and `Undef`) as `Charset::<name>`.
TEST_CASE("bytes::Charset") {
CHECK_EQ(to_string(Enum(unicode::Charset::ASCII)), "Charset::ASCII");
CHECK_EQ(to_string(Enum(unicode::Charset::UTF8)), "Charset::UTF8");
CHECK_EQ(to_string(Enum(unicode::Charset::UTF16BE)), "Charset::UTF16BE");
CHECK_EQ(to_string(Enum(unicode::Charset::UTF16LE)), "Charset::UTF16LE");
CHECK_EQ(to_string(Enum(unicode::Charset::Undef)), "Charset::Undef");
}

Expand Down
149 changes: 135 additions & 14 deletions hilti/runtime/src/types/bytes.cc
Original file line number Diff line number Diff line change
@@ -1,19 +1,78 @@
// Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.

#include <utf8proc/utf8proc.h>
#include <utf8.h>

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <string>
#include <string_view>
#include <utility>

#include <hilti/rt/types/bytes.h>
#include <hilti/rt/types/integer.h>
#include <hilti/rt/types/regexp.h>
#include <hilti/rt/types/stream.h>
#include <hilti/rt/unicode.h>
#include <hilti/rt/util.h>

using namespace hilti::rt;
using namespace hilti::rt::bytes;

namespace {

// A forward iterator over UTF-16 code units that are stored in memory with a
// selectable byte order.
//
// Dereferencing yields the code unit as a `char16_t` value, independent of the
// byte order of the host: the unit is assembled from its two bytes explicitly
// rather than loaded natively and conditionally swapped. Reading bytewise also
// avoids a native 16-bit load through `cur`, which may not be suitably aligned
// if it was obtained by casting a byte buffer.
struct U16Iterator {
    // Standard iterator member types. Note that `operator*` returns by value,
    // not `reference`.
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = const char16_t;
    using pointer = value_type*;
    using reference = value_type&;

    // Byte order of the stored code units.
    //
    // `Detected` is decoded exactly like `LE`.
    // NOTE(review): a caller selecting `Detected` after seeing a big-endian
    // BOM (`FE FF`) would therefore get byte-swapped units -- confirm whether
    // that is intended.
    enum Order { LE, BE, Detected };

    // Position of the code unit the iterator currently refers to.
    pointer cur = nullptr;

    // Byte order used when dereferencing.
    Order order;

    // Creates an iterator at `ptr` which interprets units with byte order `order`.
    U16Iterator(pointer ptr, Order order) : cur(ptr), order(order) {}

    // Pre-increment: advances by one code unit.
    U16Iterator& operator++() {
        ++cur;
        return *this;
    }

    // Post-increment: advances by one code unit, returns the previous position.
    U16Iterator operator++(int) {
        auto previous = *this;
        ++cur;
        return previous;
    }

    // Iterators compare equal if they refer to the same position; the byte
    // order is not taken into account.
    friend bool operator==(const U16Iterator& a, const U16Iterator& b) { return a.cur == b.cur; }
    friend bool operator!=(const U16Iterator& a, const U16Iterator& b) { return ! (a == b); }

    // Returns the current code unit, decoded according to `order`.
    char16_t operator*() const {
        // Inspecting the object representation through `unsigned char` is
        // always permitted and makes the result host-endianness independent.
        const auto* raw = reinterpret_cast<const unsigned char*>(cur);
        const auto first = static_cast<unsigned int>(raw[0]);
        const auto second = static_cast<unsigned int>(raw[1]);

        switch ( order ) {
            case Detected: [[fallthrough]];
            case LE: return static_cast<char16_t>(first | (second << 8U));
            case BE: return static_cast<char16_t>((first << 8U) | second);
        }

        cannot_be_reached();
    }
};

} // namespace

std::tuple<bool, Bytes::const_iterator> Bytes::find(const Bytes& needle, const const_iterator& start) const {
auto b = begin();

Expand Down Expand Up @@ -46,35 +105,91 @@ std::tuple<bool, Bytes::const_iterator> Bytes::find(const Bytes& needle, const c
}
}

std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy errors) const {
std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy errors) const try {
if ( Base::empty() )
return "";

switch ( cs.value() ) {
case unicode::Charset::UTF8: {
std::string t;

auto p = reinterpret_cast<const unsigned char*>(Base::data());
auto e = p + Base::size();
auto p = Base::begin();
auto e = Base::end();

while ( p < e ) {
utf8proc_int32_t cp;
auto n = utf8proc_iterate(p, e - p, &cp);

if ( n < 0 ) {
try {
auto cp = utf8::next(p, e);
utf8::append(cp, t);
} catch ( const utf8::invalid_utf8& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: t += "\ufffd"; break;
case unicode::DecodeErrorStrategy::REPLACE: {
utf8::append(unicode::REPLACEMENT_CHARACTER, t);
break;
}
case unicode::DecodeErrorStrategy::STRICT:
throw RuntimeError("illegal UTF8 sequence in string");
}

p += 1;
continue;
++p;
}
}

return t;
}

case unicode::Charset::UTF16BE: [[fallthrough]];
case unicode::Charset::UTF16LE: {
if ( Base::size() % 2 != 0 ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF16 character in string");
case unicode::DecodeErrorStrategy::IGNORE: {
// Ignore the last byte.
return Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
}
case unicode::DecodeErrorStrategy::REPLACE: {
// Convert everything but the last byte, and append replacement.
auto dec = Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
utf8::append(unicode::REPLACEMENT_CHARACTER, dec);
return dec;
}
}
}

// We can assume an even number of bytes.

t += std::string(reinterpret_cast<const char*>(p), n);
p += n;
std::u16string t;

// utfcpp expects to iterate a `u16string` or `u16string_view`.
auto v16 = std::u16string_view{reinterpret_cast<const char16_t*>(Base::data()), Base::size() / 2};

// We prefer to use the byte order from a BOM if present. If none is found use the passed byte order.
U16Iterator::Order order = U16Iterator::Detected;
if ( ! startsWith("\xFF\xFE") && ! startsWith("\xFE\xFF") )
order = (cs.value() == unicode::Charset::UTF16LE ? U16Iterator::LE : U16Iterator::BE);

auto p = U16Iterator(v16.begin(), order);
auto e = U16Iterator(v16.end(), order);

while ( p != e ) {
try {
auto cp = utf8::next16(p, e);
utf8::append16(cp, t);
} catch ( const utf8::invalid_utf16& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE:
utf8::append16(unicode::REPLACEMENT_CHARACTER, t);
break;
case unicode::DecodeErrorStrategy::STRICT:
throw RuntimeError("illegal UTF16 character in string");
}

++p;
}
}

return {t};
return {utf8::utf16to8(t)};
}

case unicode::Charset::ASCII: {
Expand All @@ -99,6 +214,12 @@ std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy erro
}

cannot_be_reached();
} catch ( const RuntimeError& ) {
// Directly propagate already correctly wrapped exceptions.
throw;
} catch ( ... ) {
// Throw a new `RuntimeError` for any other exception which has made it out of the function.
throw RuntimeError("could not decode bytes");
}

Bytes Bytes::strip(const Bytes& set, bytes::Side side) const {
Expand Down
Loading

0 comments on commit 4d4e0cf

Please sign in to comment.