Skip to content

Commit

Permalink
Implement decoding of UTF16 Bytes.
Browse files Browse the repository at this point in the history
We also clean up use of the Unicode replacement character to make it
work consistently between UTF16 and UTF8.

Closes #1788.
  • Loading branch information
bbannier committed Dec 16, 2024
1 parent 802a0a4 commit 6daf2f6
Show file tree
Hide file tree
Showing 18 changed files with 335 additions and 154 deletions.
3 changes: 2 additions & 1 deletion doc/autogen/spicy-types.spicy
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ Specifies the character set for bytes encoding/decoding.

type Charset = enum {
ASCII,
UTF8
UTF8,
UTF16LE,
};

.. _spicy_decodeerrorstrategy:
Expand Down
2 changes: 1 addition & 1 deletion hilti/lib/hilti.hlt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::unicode::Charset";
public type Charset = enum { ASCII, UTF8, UTF16LE } &cxxname="hilti::rt::unicode::Charset";
public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
public type Captures = vector<bytes>;
public type Profiler = __library_type("hilti::rt::Profiler");
Expand Down
4 changes: 3 additions & 1 deletion hilti/runtime/include/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
);

/** For bytes decoding, which character set to use. */
HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16LE, ASCII);

constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;

} // namespace unicode

Expand Down
28 changes: 28 additions & 0 deletions hilti/runtime/src/tests/bytes.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <hilti/rt/types/integer.h>
#include <hilti/rt/types/regexp.h>

using namespace std::string_literals;
using namespace hilti::rt;
using namespace hilti::rt::bytes;

Expand Down Expand Up @@ -56,6 +57,16 @@ TEST_CASE("decode") {
CHECK_THROWS_WITH_AS("\xc3\x28"_b.decode(unicode::Charset::UTF8, unicode::DecodeErrorStrategy::STRICT),
"illegal UTF8 sequence in string", const RuntimeError&);

CHECK_EQ(Bytes("a\0b\0c\0"s).decode(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "abc");

// Our `decode` of UTF-16 bytes returns a UTF-8 string with a BOM if the characters do not fit into ASCII, see e.g.,
// https://stackoverflow.com/questions/2223882/whats-the-difference-between-utf-8-and-utf-8-with-bom.
// To compute the expected results in Python, encode with the `utf_8_sig` encoding.
//
// LHS is a UTF-16 encoding of '東京', RHS is UTF-8 with a BOM.
CHECK_EQ("\xff\xfeqg\xacN"_b.decode(unicode::Charset ::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
"\ufeff東京");

CHECK_THROWS_WITH_AS("123"_b.decode(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);
}
Expand Down Expand Up @@ -192,6 +203,13 @@ TEST_CASE("lower") {
// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS("123"_b.lower(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);

// No case change expected for these Japanese codepoints.
const auto tokio8 = "東京"_b;
CHECK_EQ(tokio8.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);

const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
CHECK_EQ(tokio16.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
}

TEST_CASE("match") {
Expand Down Expand Up @@ -488,9 +506,19 @@ TEST_CASE("upper") {
CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::UTF8).str(), "GÄNSEFÜẞCHEN");
CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::ASCII).str(), "G??NSEF????CHEN");

CHECK_EQ(Bytes("a\0b\0c\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
Bytes("A\0B\0C\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT));

// NOLINTNEXTLINE(bugprone-throw-keyword-missing)
CHECK_THROWS_WITH_AS("123"_b.upper(unicode::Charset::Undef), "unknown character set for decoding",
const RuntimeError&);

// No case change expected for these Japanese codepoints.
const auto tokio8 = "東京"_b;
CHECK_EQ(tokio8.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);

const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
CHECK_EQ(tokio16.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
}

TEST_CASE("append") {
Expand Down
1 change: 1 addition & 0 deletions hilti/runtime/src/tests/to_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ TEST_CASE("integer::BitOrder") {
// Checks the string rendering of `unicode::Charset` values: each enumerator,
// wrapped in `Enum`, is expected to print as "Charset::<name>" via `to_string`.
TEST_CASE("bytes::Charset") {
CHECK_EQ(to_string(Enum(unicode::Charset::ASCII)), "Charset::ASCII");
CHECK_EQ(to_string(Enum(unicode::Charset::UTF8)), "Charset::UTF8");
// UTF16LE is the charset added alongside UTF-16 decoding support.
CHECK_EQ(to_string(Enum(unicode::Charset::UTF16LE)), "Charset::UTF16LE");
// The `Undef` value is expected to have a printable name as well.
CHECK_EQ(to_string(Enum(unicode::Charset::Undef)), "Charset::Undef");
}

Expand Down
77 changes: 64 additions & 13 deletions hilti/runtime/src/types/bytes.cc
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
// Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.

#include <utf8proc/utf8proc.h>
#include <utf8.h>

#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <string>
#include <string_view>

#include <hilti/rt/types/bytes.h>
#include <hilti/rt/types/integer.h>
#include <hilti/rt/types/regexp.h>
#include <hilti/rt/types/stream.h>
#include <hilti/rt/unicode.h>
#include <hilti/rt/util.h>

using namespace hilti::rt;
Expand Down Expand Up @@ -47,34 +51,81 @@ std::tuple<bool, Bytes::const_iterator> Bytes::find(const Bytes& needle, const c
}

std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy errors) const {
if ( Base::empty() )
return "";

switch ( cs.value() ) {
case unicode::Charset::UTF8: {
std::string t;

auto p = reinterpret_cast<const unsigned char*>(Base::data());
auto e = p + Base::size();
auto p = Base::begin();
auto e = Base::end();

while ( p < e ) {
utf8proc_int32_t cp;
auto n = utf8proc_iterate(p, e - p, &cp);

if ( n < 0 ) {
try {
auto cp = utf8::next(p, e);
utf8::append(cp, t);
} catch ( const utf8::exception& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: t += "\ufffd"; break;
case unicode::DecodeErrorStrategy::REPLACE: {
utf8::append(unicode::REPLACEMENT_CHARACTER, t);
break;
}
case unicode::DecodeErrorStrategy::STRICT:
throw RuntimeError("illegal UTF8 sequence in string");
}

p += 1;
continue;
p = std::next(p);
}
}

return t;
}

case unicode::Charset::UTF16LE: {
if ( Base::size() % 2 != 0 ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF16 character in string");
case unicode::DecodeErrorStrategy::IGNORE: {
// Ignore the last byte.
return Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
}
case unicode::DecodeErrorStrategy::REPLACE: {
// Convert everything but the last byte, and append replacement.
auto dec = Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
utf8::append(unicode::REPLACEMENT_CHARACTER, dec);
return dec;
}
}
}

// We can assume an even number of bytes.
std::u16string t;

t += std::string(reinterpret_cast<const char*>(p), n);
p += n;
// utfcpp expects to iterate a `u16string` or `u16string_view`.
auto v16 = std::u16string_view{reinterpret_cast<const char16_t*>(Base::data()), Base::size() / 2};
auto p = v16.begin();
auto e = v16.end();
while ( p < e ) {
try {
auto cp = utf8::next16(p, e);
utf8::append16(cp, t);
} catch ( const utf8::exception& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE:
utf8::append16(unicode::REPLACEMENT_CHARACTER, t);
break;
case unicode::DecodeErrorStrategy::STRICT:
throw RuntimeError("illegal UTF16 character in string");
}

p = std::next(p);
}
}

return {t};
return {utf8::utf16to8(t)};
}

case unicode::Charset::ASCII: {
Expand Down
97 changes: 66 additions & 31 deletions hilti/runtime/src/types/string.cc
Original file line number Diff line number Diff line change
@@ -1,37 +1,40 @@
// Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.

#include <utf8.h>
#include <utf8proc/utf8proc.h>

#include <iterator>

#include <hilti/rt/exception.h>
#include <hilti/rt/types/bytes.h>
#include <hilti/rt/types/string.h>
#include <hilti/rt/util.h>

using namespace hilti::rt;

integer::safe<uint64_t> string::size(const std::string& s, unicode::DecodeErrorStrategy errors) {
auto p = reinterpret_cast<const unsigned char*>(s.data());
auto e = p + s.size();
auto p = s.begin();
auto e = s.end();

uint64_t len = 0;

while ( p < e ) {
utf8proc_int32_t cp;
auto n = utf8proc_iterate(p, e - p, &cp);

if ( n < 0 ) {
try {
// `utf8::next` is for iterating UTF-8 strings.
utf8::next(p, s.end());
++len;
} catch ( const utf8::exception& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: ++len; break;
case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF8 sequence in string");
case unicode::DecodeErrorStrategy::REPLACE: {
++len;
}
[[fallthrough]];
case unicode::DecodeErrorStrategy::IGNORE: {
p = std::next(p);
break;
}
}

p += 1;
continue;
}

++len;
p += n;
}

return len;
Expand All @@ -51,7 +54,7 @@ std::string string::upper(std::string_view s, unicode::DecodeErrorStrategy error
if ( n < 0 ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: rval += "\ufffd"; break;
case unicode::DecodeErrorStrategy::REPLACE: utf8::append(unicode::REPLACEMENT_CHARACTER, rval); break;
case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF8 sequence in string");
}

Expand Down Expand Up @@ -81,7 +84,7 @@ std::string string::lower(std::string_view s, unicode::DecodeErrorStrategy error
if ( n < 0 ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: rval += "\ufffd"; break;
case unicode::DecodeErrorStrategy::REPLACE: utf8::append(unicode::REPLACEMENT_CHARACTER, rval); break;
case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF8 sequence in string");
}

Expand Down Expand Up @@ -132,35 +135,67 @@ std::tuple<std::string, std::string> string::split1(const std::string& s, const
}

Bytes string::encode(std::string s, unicode::Charset cs, unicode::DecodeErrorStrategy errors) {
if ( s.empty() )
return {std::move(s)};

switch ( cs.value() ) {
case unicode::Charset::UTF8: {
// Data supposedly is already in UTF-8, but let's validate it.
// HILTI `string` is always UTF-8, but we could be invoked with raw bags of bytes here as well, so validate.
std::string t;

auto p = reinterpret_cast<const unsigned char*>(s.data());
auto e = p + s.size();
auto p = s.begin();
auto e = s.end();

while ( p < e ) {
utf8proc_int32_t cp;
auto n = utf8proc_iterate(p, e - p, &cp);

if ( n < 0 ) {
try {
auto cp = utf8::next(p, e);
utf8::append(cp, t);
} catch ( const utf8::exception& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: t += "\ufffd"; break;
case unicode::DecodeErrorStrategy::REPLACE: {
utf8::append(unicode::REPLACEMENT_CHARACTER, t);
break;
}
case unicode::DecodeErrorStrategy::STRICT:
throw RuntimeError("illegal UTF8 sequence in string");
}

p += 1;
continue;
p = std::next(p);
}
}

return Bytes(std::move(t));
}

case unicode::Charset::UTF16LE: {
std::string t8;

t += std::string(reinterpret_cast<const char*>(p), n);
p += n;
auto p = s.begin();
auto e = s.end();

while ( p < e ) {
try {
auto cp = utf8::next(p, e);
utf8::append(cp, t8);
} catch ( const utf8::exception& ) {
switch ( errors.value() ) {
case unicode::DecodeErrorStrategy::IGNORE: break;
case unicode::DecodeErrorStrategy::REPLACE: {
utf8::append(unicode::REPLACEMENT_CHARACTER, t8);
break;
}
case unicode::DecodeErrorStrategy::STRICT:
throw RuntimeError("illegal UTF8 sequence in string");
}

p = std::next(p);
}
}

return {std::move(t)};
auto t = utf8::utf8to16(t8);
auto data = reinterpret_cast<char*>(t.data());
return {std::string{data, data + (t.size() * 2)}};
}

case unicode::Charset::ASCII: {
Expand All @@ -178,7 +213,7 @@ Bytes string::encode(std::string s, unicode::Charset cs, unicode::DecodeErrorStr
}
}

return {std::move(t)};
return Bytes(std::move(t));
}

case unicode::Charset::Undef: throw RuntimeError("unknown character set for encoding");
Expand Down
1 change: 1 addition & 0 deletions hilti/runtime/src/unicode.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ std::string to_string(const unicode::Charset& x, tag /*unused*/) {
switch ( x.value() ) {
case unicode::Charset::ASCII: return "Charset::ASCII";
case unicode::Charset::UTF8: return "Charset::UTF8";
case unicode::Charset::UTF16LE: return "Charset::UTF16LE";
case unicode::Charset::Undef: return "Charset::Undef";
}

Expand Down
3 changes: 2 additions & 1 deletion spicy/lib/spicy.spicy
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ public type ByteOrder = enum {
## Specifies the character set for bytes encoding/decoding.
public type Charset = enum {
ASCII,
UTF8
UTF8,
UTF16LE,
} &cxxname="hilti::rt::unicode::Charset";

## Specifies how data is handled that's not representable in a specified character set.
Expand Down
Loading

0 comments on commit 6daf2f6

Please sign in to comment.