Implement decoding of UTF16 Bytes.

We also clean up use of the Unicode replacement character to make it work consistently between UTF16 and UTF8. Closes #1788.
zeek · Dec 16, 2024 · 6daf2f6 · 6daf2f6
1 parent 802a0a4
commit 6daf2f6
Show file tree

Hide file tree

Showing 18 changed files with 335 additions and 154 deletions.
diff --git a/doc/autogen/spicy-types.spicy b/doc/autogen/spicy-types.spicy
@@ -55,7 +55,8 @@ Specifies the character set for bytes encoding/decoding.
 
     type Charset = enum {
         ASCII,
-        UTF8
+        UTF8,
+        UTF16LE,
     };
 
 .. _spicy_decodeerrorstrategy:

diff --git a/hilti/lib/hilti.hlt b/hilti/lib/hilti.hlt
@@ -11,7 +11,7 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
 public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
 public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
 public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
-public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::unicode::Charset";
+public type Charset = enum { ASCII, UTF8, UTF16LE } &cxxname="hilti::rt::unicode::Charset";
 public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
 public type Captures = vector<bytes>;
 public type Profiler = __library_type("hilti::rt::Profiler");

diff --git a/hilti/runtime/include/unicode.h b/hilti/runtime/include/unicode.h
@@ -19,7 +19,9 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
 );
 
 /** For bytes decoding, which character set to use. */
-HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
+HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16LE, ASCII);
+
+constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;
 
 } // namespace unicode
 

diff --git a/hilti/runtime/src/tests/bytes.cc b/hilti/runtime/src/tests/bytes.cc
@@ -10,6 +10,7 @@
 #include <hilti/rt/types/integer.h>
 #include <hilti/rt/types/regexp.h>
 
+using namespace std::string_literals;
 using namespace hilti::rt;
 using namespace hilti::rt::bytes;
 
@@ -56,6 +57,16 @@ TEST_CASE("decode") {
     CHECK_THROWS_WITH_AS("\xc3\x28"_b.decode(unicode::Charset::UTF8, unicode::DecodeErrorStrategy::STRICT),
                          "illegal UTF8 sequence in string", const RuntimeError&);
 
+    CHECK_EQ(Bytes("a\0b\0c\0"s).decode(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), "abc");
+
+    // Our `decode` of UTF-16 bytes returns a UTF-8 string with a BOM if they do not fit into ASCII; see, e.g.,
+    // https://stackoverflow.com/questions/2223882/whats-the-difference-between-utf-8-and-utf-8-with-bom.
+    // To compute the expected results in Python, encode with the `utf_8_sig` encoding.
+    //
+    // LHS is a UTF-16 encoding of '東京'; RHS is UTF-8 with a BOM.
+    CHECK_EQ("\xff\xfeqg\xacN"_b.decode(unicode::Charset ::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
+             "\ufeff東京");
+
     CHECK_THROWS_WITH_AS("123"_b.decode(unicode::Charset::Undef), "unknown character set for decoding",
                          const RuntimeError&);
 }
@@ -192,6 +203,13 @@ TEST_CASE("lower") {
     // NOLINTNEXTLINE(bugprone-throw-keyword-missing)
     CHECK_THROWS_WITH_AS("123"_b.lower(unicode::Charset::Undef), "unknown character set for decoding",
                          const RuntimeError&);
+
+    // No case change expected for these Japanese codepoints.
+    const auto tokio8 = "東京"_b;
+    CHECK_EQ(tokio8.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);
+
+    const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
+    CHECK_EQ(tokio16.lower(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
 }
 
 TEST_CASE("match") {
@@ -488,9 +506,19 @@ TEST_CASE("upper") {
     CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::UTF8).str(), "GÄNSEFÜẞCHEN");
     CHECK_EQ("Gänsefüßchen"_b.upper(unicode::Charset::ASCII).str(), "G??NSEF????CHEN");
 
+    CHECK_EQ(Bytes("a\0b\0c\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT),
+             Bytes("A\0B\0C\0"s).upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT));
+
     // NOLINTNEXTLINE(bugprone-throw-keyword-missing)
     CHECK_THROWS_WITH_AS("123"_b.upper(unicode::Charset::Undef), "unknown character set for decoding",
                          const RuntimeError&);
+
+    // No case change expected for these Japanese codepoints.
+    const auto tokio8 = "東京"_b;
+    CHECK_EQ(tokio8.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio8);
+
+    const auto tokio16 = "\xff\xfeqg\xacN"_b; // 東京 in UTF16LE.
+    CHECK_EQ(tokio16.upper(unicode::Charset::UTF16LE, unicode::DecodeErrorStrategy::STRICT), tokio16);
 }
 
 TEST_CASE("append") {

diff --git a/hilti/runtime/src/tests/to_string.cc b/hilti/runtime/src/tests/to_string.cc
@@ -110,6 +110,7 @@ TEST_CASE("integer::BitOrder") {
 TEST_CASE("bytes::Charset") {
     CHECK_EQ(to_string(Enum(unicode::Charset::ASCII)), "Charset::ASCII");
     CHECK_EQ(to_string(Enum(unicode::Charset::UTF8)), "Charset::UTF8");
+    CHECK_EQ(to_string(Enum(unicode::Charset::UTF16LE)), "Charset::UTF16LE");
     CHECK_EQ(to_string(Enum(unicode::Charset::Undef)), "Charset::Undef");
 }
 

diff --git a/hilti/runtime/src/types/bytes.cc b/hilti/runtime/src/types/bytes.cc
@@ -1,14 +1,18 @@
 // Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.
 
-#include <utf8proc/utf8proc.h>
+#include <utf8.h>
 
 #include <cstdint>
 #include <cstdlib>
+#include <iterator>
+#include <string>
+#include <string_view>
 
 #include <hilti/rt/types/bytes.h>
 #include <hilti/rt/types/integer.h>
 #include <hilti/rt/types/regexp.h>
 #include <hilti/rt/types/stream.h>
+#include <hilti/rt/unicode.h>
 #include <hilti/rt/util.h>
 
 using namespace hilti::rt;
@@ -47,34 +51,81 @@ std::tuple<bool, Bytes::const_iterator> Bytes::find(const Bytes& needle, const c
 }
 
 std::string Bytes::decode(unicode::Charset cs, unicode::DecodeErrorStrategy errors) const {
+    if ( Base::empty() )
+        return "";
+
     switch ( cs.value() ) {
         case unicode::Charset::UTF8: {
             std::string t;
 
-            auto p = reinterpret_cast<const unsigned char*>(Base::data());
-            auto e = p + Base::size();
+            auto p = Base::begin();
+            auto e = Base::end();
 
             while ( p < e ) {
-                utf8proc_int32_t cp;
-                auto n = utf8proc_iterate(p, e - p, &cp);
-
-                if ( n < 0 ) {
+                try {
+                    auto cp = utf8::next(p, e);
+                    utf8::append(cp, t);
+                } catch ( const utf8::exception& ) {
                     switch ( errors.value() ) {
                         case unicode::DecodeErrorStrategy::IGNORE: break;
-                        case unicode::DecodeErrorStrategy::REPLACE: t += "\ufffd"; break;
+                        case unicode::DecodeErrorStrategy::REPLACE: {
+                            utf8::append(unicode::REPLACEMENT_CHARACTER, t);
+                            break;
+                        }
                         case unicode::DecodeErrorStrategy::STRICT:
                             throw RuntimeError("illegal UTF8 sequence in string");
                     }
 
-                    p += 1;
-                    continue;
+                    p = std::next(p);
+                }
+            }
+
+            return t;
+        }
+
+        case unicode::Charset::UTF16LE: {
+            if ( Base::size() % 2 != 0 ) {
+                switch ( errors.value() ) {
+                    case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF16 character in string");
+                    case unicode::DecodeErrorStrategy::IGNORE: {
+                        // Ignore the last byte.
+                        return Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
+                    }
+                    case unicode::DecodeErrorStrategy::REPLACE: {
+                        // Convert everything but the last byte, and append replacement.
+                        auto dec = Bytes(str().substr(0, Base::size() / 2 * 2)).decode(cs, errors);
+                        utf8::append(unicode::REPLACEMENT_CHARACTER, dec);
+                        return dec;
+                    }
                 }
+            }
+
+            // We can assume an even number of bytes.
+            std::u16string t;
 
-                t += std::string(reinterpret_cast<const char*>(p), n);
-                p += n;
+            // utfcpp expects to iterate a `u16string` or `u16string_view`.
+            auto v16 = std::u16string_view{reinterpret_cast<const char16_t*>(Base::data()), Base::size() / 2};
+            auto p = v16.begin();
+            auto e = v16.end();
+            while ( p < e ) {
+                try {
+                    auto cp = utf8::next16(p, e);
+                    utf8::append16(cp, t);
+                } catch ( const utf8::exception& ) {
+                    switch ( errors.value() ) {
+                        case unicode::DecodeErrorStrategy::IGNORE: break;
+                        case unicode::DecodeErrorStrategy::REPLACE:
+                            utf8::append16(unicode::REPLACEMENT_CHARACTER, t);
+                            break;
+                        case unicode::DecodeErrorStrategy::STRICT:
+                            throw RuntimeError("illegal UTF16 character in string");
+                    }
+
+                    p = std::next(p);
+                }
             }
 
-            return {t};
+            return {utf8::utf16to8(t)};
         }
 
         case unicode::Charset::ASCII: {

diff --git a/hilti/runtime/src/types/string.cc b/hilti/runtime/src/types/string.cc
@@ -1,37 +1,40 @@
 // Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.
 
+#include <utf8.h>
 #include <utf8proc/utf8proc.h>
 
+#include <iterator>
+
 #include <hilti/rt/exception.h>
 #include <hilti/rt/types/bytes.h>
 #include <hilti/rt/types/string.h>
-#include <hilti/rt/util.h>
 
 using namespace hilti::rt;
 
 integer::safe<uint64_t> string::size(const std::string& s, unicode::DecodeErrorStrategy errors) {
-    auto p = reinterpret_cast<const unsigned char*>(s.data());
-    auto e = p + s.size();
+    auto p = s.begin();
+    auto e = s.end();
 
     uint64_t len = 0;
 
     while ( p < e ) {
-        utf8proc_int32_t cp;
-        auto n = utf8proc_iterate(p, e - p, &cp);
-
-        if ( n < 0 ) {
+        try {
+            // `utf8::next` is for iterating UTF-8 strings.
+            utf8::next(p, s.end());
+            ++len;
+        } catch ( const utf8::exception& ) {
             switch ( errors.value() ) {
-                case unicode::DecodeErrorStrategy::IGNORE: break;
-                case unicode::DecodeErrorStrategy::REPLACE: ++len; break;
                 case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF8 sequence in string");
+                case unicode::DecodeErrorStrategy::REPLACE: {
+                    ++len;
+                }
+                    [[fallthrough]];
+                case unicode::DecodeErrorStrategy::IGNORE: {
+                    p = std::next(p);
+                    break;
+                }
             }
-
-            p += 1;
-            continue;
         }
-
-        ++len;
-        p += n;
     }
 
     return len;
@@ -51,7 +54,7 @@ std::string string::upper(std::string_view s, unicode::DecodeErrorStrategy error
         if ( n < 0 ) {
             switch ( errors.value() ) {
                 case unicode::DecodeErrorStrategy::IGNORE: break;
-                case unicode::DecodeErrorStrategy::REPLACE: rval += "\ufffd"; break;
+                case unicode::DecodeErrorStrategy::REPLACE: utf8::append(unicode::REPLACEMENT_CHARACTER, rval); break;
                 case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF8 sequence in string");
             }
 
@@ -81,7 +84,7 @@ std::string string::lower(std::string_view s, unicode::DecodeErrorStrategy error
         if ( n < 0 ) {
             switch ( errors.value() ) {
                 case unicode::DecodeErrorStrategy::IGNORE: break;
-                case unicode::DecodeErrorStrategy::REPLACE: rval += "\ufffd"; break;
+                case unicode::DecodeErrorStrategy::REPLACE: utf8::append(unicode::REPLACEMENT_CHARACTER, rval); break;
                 case unicode::DecodeErrorStrategy::STRICT: throw RuntimeError("illegal UTF8 sequence in string");
             }
 
@@ -132,35 +135,67 @@ std::tuple<std::string, std::string> string::split1(const std::string& s, const
 }
 
 Bytes string::encode(std::string s, unicode::Charset cs, unicode::DecodeErrorStrategy errors) {
+    if ( s.empty() )
+        return {std::move(s)};
+
     switch ( cs.value() ) {
         case unicode::Charset::UTF8: {
-            // Data supposedly is already in UTF-8, but let's validate it.
+            // HILTI `string` is always UTF-8, but we could be invoked with raw bags of bytes here as well, so validate.
             std::string t;
 
-            auto p = reinterpret_cast<const unsigned char*>(s.data());
-            auto e = p + s.size();
+            auto p = s.begin();
+            auto e = s.end();
 
             while ( p < e ) {
-                utf8proc_int32_t cp;
-                auto n = utf8proc_iterate(p, e - p, &cp);
-
-                if ( n < 0 ) {
+                try {
+                    auto cp = utf8::next(p, e);
+                    utf8::append(cp, t);
+                } catch ( const utf8::exception& ) {
                     switch ( errors.value() ) {
                         case unicode::DecodeErrorStrategy::IGNORE: break;
-                        case unicode::DecodeErrorStrategy::REPLACE: t += "\ufffd"; break;
+                        case unicode::DecodeErrorStrategy::REPLACE: {
+                            utf8::append(unicode::REPLACEMENT_CHARACTER, t);
+                            break;
+                        }
                         case unicode::DecodeErrorStrategy::STRICT:
                             throw RuntimeError("illegal UTF8 sequence in string");
                     }
 
-                    p += 1;
-                    continue;
+                    p = std::next(p);
                 }
+            }
+
+            return Bytes(std::move(t));
+        }
+
+        case unicode::Charset::UTF16LE: {
+            std::string t8;
 
-                t += std::string(reinterpret_cast<const char*>(p), n);
-                p += n;
+            auto p = s.begin();
+            auto e = s.end();
+
+            while ( p < e ) {
+                try {
+                    auto cp = utf8::next(p, e);
+                    utf8::append(cp, t8);
+                } catch ( const utf8::exception& ) {
+                    switch ( errors.value() ) {
+                        case unicode::DecodeErrorStrategy::IGNORE: break;
+                        case unicode::DecodeErrorStrategy::REPLACE: {
+                            utf8::append(unicode::REPLACEMENT_CHARACTER, t8);
+                            break;
+                        }
+                        case unicode::DecodeErrorStrategy::STRICT:
+                            throw RuntimeError("illegal UTF8 sequence in string");
+                    }
+
+                    p = std::next(p);
+                }
             }
 
-            return {std::move(t)};
+            auto t = utf8::utf8to16(t8);
+            auto data = reinterpret_cast<char*>(t.data());
+            return {std::string{data, data + (t.size() * 2)}};
         }
 
         case unicode::Charset::ASCII: {
@@ -178,7 +213,7 @@ Bytes string::encode(std::string s, unicode::Charset cs, unicode::DecodeErrorStr
                 }
             }
 
-            return {std::move(t)};
+            return Bytes(std::move(t));
         }
 
         case unicode::Charset::Undef: throw RuntimeError("unknown character set for encoding");

diff --git a/hilti/runtime/src/unicode.cc b/hilti/runtime/src/unicode.cc
@@ -18,6 +18,7 @@ std::string to_string(const unicode::Charset& x, tag /*unused*/) {
     switch ( x.value() ) {
         case unicode::Charset::ASCII: return "Charset::ASCII";
         case unicode::Charset::UTF8: return "Charset::UTF8";
+        case unicode::Charset::UTF16LE: return "Charset::UTF16LE";
         case unicode::Charset::Undef: return "Charset::Undef";
     }
 

diff --git a/spicy/lib/spicy.spicy b/spicy/lib/spicy.spicy
@@ -33,7 +33,8 @@ public type ByteOrder = enum {
 ## Specifies the character set for bytes encoding/decoding.
 public type Charset = enum {
     ASCII,
-    UTF8
+    UTF8,
+    UTF16LE,
 } &cxxname="hilti::rt::unicode::Charset";
 
 ## Specifies how data is handled that's not representable in a specified character set.
-Original file line number
+Diff line change
@@ Expand Up / @@ -55,7 +55,8 @@ Specifies the character set for bytes encoding/decoding. @@
         type Charset = enum {
             ASCII,
-            UTF8
+            UTF8,
+            UTF16LE,
         };
     .. _spicy_decodeerrorstrategy:
@@ Expand Down @@