Implement decoding and encoding of UTF-16 bytes.

This adds two new charsets `UTF16LE` and `UTF16BE` for little and big endian UTF16 respectively. We also clean up use of the Unicode replacement character to make it work consistently between UTF16 and UTF8. Closes #1788.
zeek · Dec 20, 2024 · 4d4e0cf · 4d4e0cf
1 parent c15cad5
commit 4d4e0cf
Show file tree

Hide file tree

Showing 21 changed files with 515 additions and 163 deletions.
diff --git a/doc/autogen/spicy-types.spicy b/doc/autogen/spicy-types.spicy
@@ -54,8 +54,10 @@ Specifies the character set for bytes encoding/decoding.
 .. spicy-code::
 
     type Charset = enum {
-        ASCII,
-        UTF8
+        ASCII,    # ASCII encoding
+        UTF8,     # UTF8 encoding
+        UTF16LE,  # UTF16 little endian encoding
+        UTF16BE,  # UTF16 big endian encoding
     };
 
 .. _spicy_decodeerrorstrategy:
@@ -67,9 +69,9 @@ Specifies how data is handled that's not representable in a specified character
 .. spicy-code::
 
     type DecodeErrorStrategy = enum {
-        IGNORE,  # data is skipped but processing continues
-        REPLACE, # data is replaced with a valid place-holder and processing continues
-        STRICT   # runtime error is triggered
+        IGNORE,   # data is skipped but processing continues
+        REPLACE,  # data is replaced with a valid place-holder and processing continues
+        STRICT    # runtime error is triggered
     };
 
 .. _spicy_matchstate:

diff --git a/hilti/lib/hilti.hlt b/hilti/lib/hilti.hlt
@@ -11,7 +11,7 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
 public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
 public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
 public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
-public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::unicode::Charset";
+public type Charset = enum { ASCII, UTF8, UTF16LE, UTF16BE } &cxxname="hilti::rt::unicode::Charset";
 public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
 public type Captures = vector<bytes>;
 public type Profiler = __library_type("hilti::rt::Profiler");

diff --git a/hilti/runtime/include/unicode.h b/hilti/runtime/include/unicode.h
@@ -19,7 +19,9 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
 );
 
 /** For bytes decoding, which character set to use. */
-HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
+HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16LE, UTF16BE, ASCII);
+
+constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;
 
 } // namespace unicode
 

diff --git a/hilti/runtime/src/tests/bytes.cc b/hilti/runtime/src/tests/bytes.cc
@@ -10,6 +10,7 @@
 #include <hilti/rt/types/integer.h>
 #include <hilti/rt/types/regexp.h>
 
+using namespace std::string_literals;
 using namespace hilti::rt;
 using namespace hilti::rt::bytes;
 
@@ -56,6 +57,33 @@ TEST_CASE("decode") {
     CHECK_THROWS_WITH_AS("\xc3\x28"_b.decode(unicode::Charset::UTF8, unicode::DecodeErrorStrategy::STRICT),
                          "illegal UTF8 sequence in string", const RuntimeError&);
 
+    CHECK_EQ(Bytes("\0a\0b\0c"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT), "abc");
+    CHECK_EQ(Bytes("a\0b\0c\0"s).decode(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "abc");
+
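+    // Our `decode` of UTF-16 bytes returns a UTF8 string with BOM if they do not fit into ASCII, see e.g.,
+    // https://stackoverflow.com/questions/2223882/whats-the-difference-between-utf-8-and-utf-8-with-bom.
+    // To compute the expected results in Python, encode with the `utf_8_sig` encoding.
+    //
+    // NOTE(review): the BOM in the result appears to stem from the BOM at the start of the input, which `decode`
+    // converts like any other code unit, rather than from the non-ASCII content -- confirm.
+    //
+    // LHS is a UTF16 encoding of '東京' (with leading BOM), RHS is UTF8 with BOM.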
+    CHECK_EQ("\xff\xfeqg\xacN"_b.decode(unicode::Charset ::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
+             "\ufeff東京");
+
+    // Decoding of UTF16 with BOM. The byte order in the charset is just a hint, but we still decode as UTF16.
+    CHECK_EQ("\xff\xfeqg\xacN"_b.decode(unicode::Charset ::UTF16BE, unicode::DecodeErrorStrategy::STRICT),
+             "\ufeff東京");
+
+    // Decoding of too few bytes for UTF16 (expected an even number of bytes, provided an odd number).
+    CHECK_THROWS_WITH_AS(Bytes("\0a\0b\0"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT),
+                         "illegal UTF16 character in string", const RuntimeError&);
+    CHECK_EQ(Bytes("\0a\0b\0"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::IGNORE), "ab");
+    CHECK_EQ(Bytes("\0a\0b\0"s).decode(unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::REPLACE), "ab\ufffd");
+
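+    // Our UTF16 implementation seems to differ in what it considers invalid encodings, e.g., `\x00\xd8` is rejected by
+    // python-3.1[1-3], but accepted by us.
+    //
+    // NOTE(review): the literal below has no `s` suffix, so it presumably ends at its leading NUL byte and this check
+    // only exercises empty input -- confirm.
+    //
+    // TODO(bbannier): Test rejection of invalid UTF16 (but with even length).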
+    CHECK_EQ(Bytes("\x00\xd8").decode(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "");
+
     CHECK_THROWS_WITH_AS("123"_b.decode(unicode::Charset::Undef), "unknown character set for decoding",
                          const RuntimeError&);
 }
@@ -192,6 +220,13 @@ TEST_CASE("lower") {
     // NOLINTNEXTLINE(bugprone-throw-keyword-missing)
     CHECK_THROWS_WITH_AS("123"_b.lower(unicode::Charset::Undef), "unknown character set for decoding",
                          const RuntimeError&);
+
+    // No case change expected for these Japanese codepoints.
+    const auto tokio8 = "東京"_b;
+    CHECK_EQ(tokio8.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);
+
+    const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
+    CHECK_EQ(tokio16.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
 }
 
 TEST_CASE("match") {
@@ -488,9 +523,19 @@ TEST_CASE("upper") {
     CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::UTF8).str(), "GÄNSEFÜẞCHEN");
     CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::ASCII).str(), "G??NSEF????CHEN");
 
+    CHECK_EQ(Bytes("a\0b\0c\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
+             Bytes("A\0B\0C\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT));
+
     // NOLINTNEXTLINE(bugprone-throw-keyword-missing)
     CHECK_THROWS_WITH_AS("123"_b.upper(unicode::Charset::Undef), "unknown character set for decoding",
                          const RuntimeError&);
+
+    // No case change expected for these Japanese codepoints.
+    const auto tokio8 = "東京"_b;
+    CHECK_EQ(tokio8.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);
+
+    const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
+    CHECK_EQ(tokio16.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
 }
 
 TEST_CASE("append") {

diff --git a/hilti/runtime/src/tests/string.cc b/hilti/runtime/src/tests/string.cc
@@ -12,6 +12,7 @@ using namespace hilti::rt::bytes::literals;
 TEST_SUITE_BEGIN("string");
 
 TEST_CASE("encode") {
+    CHECK_EQ(string::encode("", unicode::Charset::ASCII), ""_b);
     CHECK_EQ(string::encode("123", unicode::Charset::ASCII), "123"_b);
     CHECK_EQ(string::encode("abc", unicode::Charset::ASCII), "abc"_b);
     CHECK_EQ(string::encode("abc", unicode::Charset::UTF8), "abc"_b);
@@ -30,6 +31,11 @@ TEST_CASE("encode") {
                                         unicode::DecodeErrorStrategy::STRICT),
                          "illegal ASCII character in string", const RuntimeError&);
 
+    CHECK_EQ(string::encode("abc", unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "a\0b\0c\0"_b);
+    CHECK_EQ(string::encode("abc", unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT), "\0a\0b\0c"_b);
+    CHECK_EQ(string::encode("東京", unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "qg\xacN"_b);
+    CHECK_EQ(string::encode("東京", unicode::Charset::UTF16BE, unicode::DecodeErrorStrategy::STRICT), "gqN\xac"_b);
+
     // NOLINTNEXTLINE(bugprone-throw-keyword-missing)
     CHECK_THROWS_WITH_AS(string::encode("123", unicode::Charset::Undef), "unknown character set for encoding",
                          const RuntimeError&);

diff --git a/hilti/runtime/src/tests/to_string.cc b/hilti/runtime/src/tests/to_string.cc
@@ -110,6 +110,8 @@ TEST_CASE("integer::BitOrder") {
 TEST_CASE("bytes::Charset") {
     CHECK_EQ(to_string(Enum(unicode::Charset::ASCII)), "Charset::ASCII");
     CHECK_EQ(to_string(Enum(unicode::Charset::UTF8)), "Charset::UTF8");
+    CHECK_EQ(to_string(Enum(unicode::Charset::UTF16BE)), "Charset::UTF16BE");
+    CHECK_EQ(to_string(Enum(unicode::Charset::UTF16LE)), "Charset::UTF16LE");
     CHECK_EQ(to_string(Enum(unicode::Charset::Undef)), "Charset::Undef");
 }
 

diff --git a/hilti/runtime/src/types/bytes.cc b/hilti/runtime/src/types/bytes.cc
@@ -1,19 +1,78 @@
 // Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.
 
-#include <utf8proc/utf8proc.h>
+#include <utf8.h>
 
+#include <cstddef>
 #include <cstdint>
 #include <cstdlib>
+#include <string>
+#include <string_view>
+#include <utility>
 
 #include <hilti/rt/types/bytes.h>
 #include <hilti/rt/types/integer.h>
 #include <hilti/rt/types/regexp.h>
 #include <hilti/rt/types/stream.h>
+#include <hilti/rt/unicode.h>
 #include <hilti/rt/util.h>
 
 using namespace hilti::rt;
 using namespace hilti::rt::bytes;
 
+namespace {
+
+// A forward iterator over the 16-bit code units of a UTF-16 encoded buffer
+// which assembles each unit according to a selectable byte order.
+//
+// The iterator only holds a raw pointer into the buffer; it neither owns nor
+// copies the data it walks.
+struct U16Iterator {
+    // Standard iterator member types.
+    using iterator_category = std::forward_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = const char16_t;
+    using pointer = value_type*;
+    using reference = value_type&;
+
+    // Current position; advanced one `char16_t` at a time.
+    pointer cur = nullptr;
+
+    // Pre-increment: moves to the next code unit.
+    U16Iterator& operator++() {
+        ++cur;
+        return *this;
+    }
+
+    // Post-increment: moves to the next code unit and returns the previous position.
+    U16Iterator operator++(int) {
+        auto tmp = *this;
+        ++(*this);
+        return tmp;
+    }
+
+    // Iterators compare by position only; the byte order is not taken into account.
+    friend bool operator==(const U16Iterator& a, const U16Iterator& b) { return a.cur == b.cur; }
+    friend bool operator!=(const U16Iterator& a, const U16Iterator& b) { return ! (a == b); }
+
+    // Byte order used to assemble a code unit from its two bytes. `Detected`
+    // is decoded exactly like `LE`.
+    //
+    // NOTE(review): since `Detected` behaves like `LE`, a buffer which starts
+    // with a big-endian BOM is not byte-swapped by this iterator -- confirm
+    // whether callers should map the BOM they found to `LE`/`BE` instead.
+    enum Order { LE, BE, Detected };
+
+    // Creates an iterator positioned at `ptr` which decodes code units in the given `order`.
+    U16Iterator(pointer ptr, Order order) : cur(ptr), order(order) {}
+
+    Order order;
+
+    // Returns the code unit at the current position.
+    //
+    // The value is assembled from the two underlying bytes instead of reading
+    // a `char16_t` directly. This keeps `LE` and `BE` correct regardless of
+    // the host's endianness, and avoids dereferencing a `char16_t` pointer
+    // which was derived from byte storage and hence need not be suitably
+    // aligned.
+    auto operator*() const {
+        const auto* bytes = reinterpret_cast<const unsigned char*>(cur);
+        const auto first = static_cast<unsigned int>(bytes[0]);
+        const auto second = static_cast<unsigned int>(bytes[1]);
+
+        switch ( order ) {
+            case Detected: [[fallthrough]];
+            case LE: return static_cast<char16_t>(first | (second << 8U));
+            case BE: return static_cast<char16_t>((first << 8U) | second);
+        }
+
+        cannot_be_reached();
+    }
+};
+
+} // namespace
+
 std::tuple<bool, Bytes::const_iterator> Bytes::find(const Bytes& needle, const const_iterator& start) const {
     auto b = begin();
 
@@ -46,35 +105,91 @@ std::tuple<bool, Bytes::const_iterator> Bytes::find(const Bytes& needle, const c
     }
 }
 
-std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy errors) const {
+std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy errors) const try {
+    if ( Base::empty() )
+        return "";
+
     switch ( cs.value() ) {
         case unicode::Charset::UTF8: {
             std::string t;
 
-            auto p = reinterpret_cast<const unsigned char*>(Base::data());
-            auto e = p + Base::size();
+            auto p = Base::begin();
+            auto e = Base::end();
 
             while ( p < e ) {
-                utf8proc_int32_t cp;
-                auto n = utf8proc_iterate(p, e - p, &cp);
-
-                if ( n < 0 ) {
+                try {
+                    auto cp = utf8::next(p, e);
+                    utf8::append(cp, t);
+                } catch ( const utf8::invalid_utf8& ) {
                     switch ( errors.value() ) {
                         case unicode::DecodeErrorStrategy::IGNORE: break;
-                        case unicode::DecodeErrorStrategy::REPLACE: t += "\ufffd"; break;
+                        case unicode::DecodeErrorStrategy::REPLACE: {
+                            utf8::append(unicode::REPLACEMENT_CHARACTER, t);
+                            break;
+                        }
                         case unicode::DecodeErrorStrategy::STRICT:
                             throw RuntimeError("illegal UTF8 sequence in string");
                     }
 
-                    p += 1;
-                    continue;
+                    ++p;
+                }
+            }
+
+            return t;
+        }
+
+        case unicode::Charset::UTF16BE: [[fallthrough]];
+        case unicode::Charset::UTF16LE: {
+            if ( Base::size() % 2 != 0 ) {
+                switch ( errors.value() ) {
+                    case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF16 character in string");
+                    case unicode::DecodeErrorStrategy::IGNORE: {
+                        // Ignore the last byte.
+                        return Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
+                    }
+                    case unicode::DecodeErrorStrategy::REPLACE: {
+                        // Convert everything but the last byte, and append replacement.
+                        auto dec = Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
+                        utf8::append(unicode::REPLACEMENT_CHARACTER, dec);
+                        return dec;
+                    }
                 }
+            }
+
+            // We can assume an even number of bytes.
 
-                t += std::string(reinterpret_cast<const char*>(p), n);
-                p += n;
+            std::u16string t;
+
+            // utfcpp expects to iterate a `u16string` or `u16string_view`.
+            auto v16 = std::u16string_view{reinterpret_cast<const char16_t*>(Base::data()), Base::size() / 2};
+
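+            // We prefer to use the byte order from a BOM if present. If none is found, use the passed byte order.
+            //
+            // NOTE(review): `U16Iterator` decodes `Detected` exactly like `LE`, so a big-endian BOM (`\xFE\xFF`) does
+            // not appear to switch decoding to big endian -- confirm.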
+            U16Iterator::Order order = U16Iterator::Detected;
+            if ( ! startsWith("\xFF\xFE") && ! startsWith("\xFE\xFF") )
+                order = (cs.value() == unicode::Charset::UTF16LE ? U16Iterator::LE : U16Iterator::BE);
+
+            auto p = U16Iterator(v16.begin(), order);
+            auto e = U16Iterator(v16.end(), order);
+
+            while ( p != e ) {
+                try {
+                    auto cp = utf8::next16(p, e);
+                    utf8::append16(cp, t);
+                } catch ( const utf8::invalid_utf16& ) {
+                    switch ( errors.value() ) {
+                        case unicode::DecodeErrorStrategy::IGNORE: break;
+                        case unicode::DecodeErrorStrategy::REPLACE:
+                            utf8::append16(unicode::REPLACEMENT_CHARACTER, t);
+                            break;
+                        case unicode::DecodeErrorStrategy::STRICT:
+                            throw RuntimeError("illegal UTF16 character in string");
+                    }
+
+                    ++p;
+                }
             }
 
-            return {t};
+            return {utf8::utf16to8(t)};
         }
 
         case unicode::Charset::ASCII: {
@@ -99,6 +214,12 @@ std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy erro
     }
 
     cannot_be_reached();
+} catch ( const RuntimeError& ) {
+    // Directly propagate already correctly wrapped exceptions.
+    throw;
+} catch ( ... ) {
+    // Throw a new `RuntimeError` for any other exception which has made it out of the function.
+    throw RuntimeError("could not decode bytes");
 }
 
 Bytes Bytes::strip(const Bytes& set, bytes::Side side) const {