zeek · bbannier · Dec 10, 2024 · Dec 16, 2024 · Dec 11, 2024 · Dec 16, 2024
diff --git a/.gitmodules b/.gitmodules
@@ -37,3 +37,6 @@
 [submodule "3rdparty/any"]
 	path = 3rdparty/any
 	url = https://github.com/thelink2012/any.git
+[submodule "3rdparty/utfcpp"]
+	path = 3rdparty/utfcpp
+	url = https://github.com/nemtrif/utfcpp.git
diff --git a/3rdparty/LICENSE.3rdparty b/3rdparty/LICENSE.3rdparty
@@ -352,6 +352,34 @@ permanent authorization for you to choose that version for the
 Library.
 
 
+================================================================================
+utfcpp
+================================================================================
+
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
 ================================================================================
 utf8proc
 ================================================================================

diff --git a/3rdparty/utfcpp b/3rdparty/utfcpp
diff --git a/doc/autogen/spicy-types.spicy b/doc/autogen/spicy-types.spicy
@@ -54,8 +54,10 @@ Specifies the character set for bytes encoding/decoding.
 .. spicy-code::
 
     type Charset = enum {
-        ASCII,
-        UTF8
+        ASCII,    # ASCII encoding
+        UTF8,     # UTF8 encoding
+        UTF16LE,  # UTF16 little endian encoding
+        UTF16BE,  # UTF16 big endian encoding
     };
 
 .. _spicy_decodeerrorstrategy:
@@ -67,9 +69,9 @@ Specifies how data is handled that's not representable in a specified character
 .. spicy-code::
 
     type DecodeErrorStrategy = enum {
-        IGNORE,  # data is skipped but processing continues
-        REPLACE, # data is replaced with a valid place-holder and processing continues
-        STRICT   # runtime error is triggered
+        IGNORE,   # data is skipped but processing continues
+        REPLACE,  # data is replaced with a valid place-holder and processing continues
+        STRICT    # runtime error is triggered
     };
 
 .. _spicy_matchstate:

diff --git a/doc/autogen/types/string.rst b/doc/autogen/types/string.rst
@@ -1,6 +1,6 @@
 .. rubric:: Methods
 
-.. spicy:method:: string::encode string encode False bytes ([ charset: spicy::Charset = spicy::Charset::UTF8 ])
+.. spicy:method:: string::encode string encode False bytes ([ charset: spicy::Charset = spicy::Charset::UTF8 ], [ errors: spicy::DecodeErrorStrategy = spicy::DecodeErrorStrategy::REPLACE ])
 
     Converts the string into a binary representation encoded with the
     given character set.

diff --git a/hilti/lib/hilti.hlt b/hilti/lib/hilti.hlt
@@ -11,8 +11,8 @@ public type Side = enum { Left, Right, Both } &cxxname="hilti::rt::bytes::Side";
 public type AddressFamily = enum { IPv4, IPv6 } &cxxname="hilti::rt::AddressFamily";
 public type RealType = enum { IEEE754_Single, IEEE754_Double } &cxxname="hilti::rt::real::Type";
 public type Protocol = enum { TCP, UDP, ICMP } &cxxname="hilti::rt::Protocol";
-public type Charset = enum { ASCII, UTF8 } &cxxname="hilti::rt::bytes::Charset";
-public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::bytes::DecodeErrorStrategy";
+public type Charset = enum { ASCII, UTF8, UTF16LE, UTF16BE } &cxxname="hilti::rt::unicode::Charset";
+public type DecodeErrorStrategy = enum { IGNORE, REPLACE, STRICT } &cxxname="hilti::rt::unicode::DecodeErrorStrategy";
 public type Captures = vector<bytes>;
 public type Profiler = __library_type("hilti::rt::Profiler");
 

diff --git a/hilti/runtime/CMakeLists.txt b/hilti/runtime/CMakeLists.txt
@@ -46,6 +46,7 @@ set(SOURCES
     src/types/stream.cc
     src/types/string.cc
     src/types/time.cc
+    src/unicode.cc
     src/util.cc
     src/version.cc
     ${PROJECT_SOURCE_DIR}/3rdparty/utf8proc/utf8proc.c)
@@ -67,8 +68,10 @@ foreach (lib hilti-rt hilti-rt-debug)
 
     add_dependencies(${lib}-objects fiber)
     target_include_directories(
-        ${lib}-objects PRIVATE ${PROJECT_SOURCE_DIR}/3rdparty/fiber/include
-                               ${PROJECT_SOURCE_DIR}/3rdparty/fiber/deps/cxx-header-utils/include)
+        ${lib}-objects
+        PRIVATE ${PROJECT_SOURCE_DIR}/3rdparty/fiber/include
+                ${PROJECT_SOURCE_DIR}/3rdparty/fiber/deps/cxx-header-utils/include
+                ${PROJECT_SOURCE_DIR}/3rdparty/utfcpp/source)
 
     add_library(${lib} STATIC)
     target_link_libraries(${lib} ${lib}-objects)

diff --git a/hilti/runtime/include/types/bytes.h b/hilti/runtime/include/types/bytes.h
@@ -15,9 +15,9 @@
 #include <hilti/rt/json-fwd.h>
 #include <hilti/rt/result.h>
 #include <hilti/rt/safe-int.h>
-#include <hilti/rt/types/string.h>
 #include <hilti/rt/types/time.h>
 #include <hilti/rt/types/vector.h>
+#include <hilti/rt/unicode.h>
 #include <hilti/rt/util.h>
 
 namespace hilti::rt {
@@ -38,12 +38,6 @@ HILTI_RT_ENUM_WITH_DEFAULT(Side, Left,
                            Both   // left and right side
 );
 
-/** For bytes decoding, which character set to use. */
-HILTI_RT_ENUM(Charset, Undef, UTF8, ASCII);
-
-/** For bytes decoding, how to handle decoding errors. */
-using DecodeErrorStrategy = string::DecodeErrorStrategy;
-
 /**
  * Safe bytes iterator traversing the content of an instance.
  *
@@ -257,17 +251,10 @@ class Bytes : protected std::string {
     using C = std::shared_ptr<const Base*>;
 
     /**
-     * Creates a bytes instance from a raw string representation
-     * encoded in a specified character set.
-     *
-     * @param s raw data
-     * @param cs character set the raw data is assumed to be encoded in
-     * @param errors how to handle errors when decoding the data
-     * @return bytes instances encoding *s* in character set *cs*
+     * Creates a bytes instance from a raw string representation.
      */
-    Bytes(std::string s, bytes::Charset cs, bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE);
+    Bytes(Base s) : Base(std::move(s)) {}
 
-    Bytes(Base&& str) : Base(std::move(str)) {}
     Bytes(const Bytes& xs) : Base(xs) {}
     Bytes(Bytes&& xs) noexcept : Base(std::move(xs)) {}
 
@@ -442,8 +429,8 @@ class Bytes : protected std::string {
      * @param errors how to handle errors when decoding the data
      * @return UTF8 string
      */
-    std::string decode(bytes::Charset cs,
-                       bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE) const;
+    std::string decode(unicode::Charset cs,
+                       unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE) const;
 
     /** Returns true if the data begins with a given, other bytes instance. */
     bool startsWith(const Bytes& b) const { return hilti::rt::startsWith(*this, b); }
@@ -457,9 +444,7 @@ class Bytes : protected std::string {
      * @param errors how to handle errors when decoding/encoding the data
      * @return an upper case version of the instance
      */
-    Bytes upper(bytes::Charset cs, bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE) const {
-        return Bytes(hilti::rt::string::upper(decode(cs, errors), errors), cs, errors);
-    }
+    Bytes upper(unicode::Charset cs, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE) const;
 
     /**
      * Returns an upper-case version of the instance.
@@ -468,9 +453,7 @@ class Bytes : protected std::string {
      * @param errors how to handle errors when decoding/encoding the data
      * @return a lower case version of the instance
      */
-    Bytes lower(bytes::Charset cs, bytes::DecodeErrorStrategy errors = bytes::DecodeErrorStrategy::REPLACE) const {
-        return Bytes(hilti::rt::string::lower(decode(cs, errors), errors), cs, errors);
-    }
+    Bytes lower(unicode::Charset cs, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE) const;
 
     /**
      * Removes leading and/or trailing sequences of all characters of a set
@@ -685,8 +668,6 @@ inline std::string detail::to_string_for_print<Bytes>(const Bytes& x) {
 namespace detail::adl {
 std::string to_string(const Bytes& x, adl::tag /*unused*/);
 std::string to_string(const bytes::Side& x, adl::tag /*unused*/);
-std::string to_string(const bytes::Charset& x, adl::tag /*unused*/);
-std::string to_string(const bytes::DecodeErrorStrategy& x, adl::tag /*unused*/);
 } // namespace detail::adl
 
 } // namespace hilti::rt

diff --git a/hilti/runtime/include/types/string.h b/hilti/runtime/include/types/string.h
@@ -9,18 +9,14 @@
 #include <hilti/rt/extension-points.h>
 #include <hilti/rt/safe-int.h>
 #include <hilti/rt/types/vector.h>
+#include <hilti/rt/unicode.h>
 #include <hilti/rt/util.h>
 
 namespace hilti::rt {
 
-namespace string {
+class Bytes;
 
-/* When processing UTF8, how to handle invalid data not representing UTF8 codepoints. */
-HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
-                           IGNORE,  // skip data
-                           REPLACE, // replace with a place-holder
-                           STRICT   // throw a runtime error
-);
+namespace string {
 
 /**
  * Computes the length of a UTF8 string in number of codepoints.
@@ -30,7 +26,8 @@ HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
  * @return the length of the input string
  * @throws RuntimeError if the input is not a valid UTF8 string
  */
-integer::safe<uint64_t> size(const std::string& s, DecodeErrorStrategy errors = DecodeErrorStrategy::REPLACE);
+integer::safe<uint64_t> size(const std::string& s,
+                             unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
 
 /**
  * Computes a lower-case version of an UTF8 string.
@@ -40,7 +37,7 @@ integer::safe<uint64_t> size(const std::string& s, DecodeErrorStrategy errors =
  * @return a lower-case version of the input string
  * @throws RuntimeError if the input is not a valid UTF8 string
  */
-std::string lower(std::string_view s, DecodeErrorStrategy errors = DecodeErrorStrategy::REPLACE);
+std::string lower(std::string_view s, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
 
 /**
  * Computes a upper-case version of an UTF8 string.
@@ -50,7 +47,7 @@ std::string lower(std::string_view s, DecodeErrorStrategy errors = DecodeErrorSt
  * @return a upper-case version of the input string
  * @throws RuntimeError if the input is not a valid UTF8 string
  */
-std::string upper(std::string_view s, DecodeErrorStrategy errors = DecodeErrorStrategy::REPLACE);
+std::string upper(std::string_view s, unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
 
 /**
  * Splits the string at sequences of whitespace.
@@ -88,6 +85,18 @@ std::tuple<std::string, std::string> split1(const std::string& s);
  */
 std::tuple<std::string, std::string> split1(const std::string& s, const std::string& sep);
 
+/**
+ * Creates a bytes instance holding the binary representation of a
+ * string, encoded in a specified character set.
+ *
+ * @param s string to encode
+ * @param cs character set to encode the string into
+ * @param errors how to handle errors when decoding/encoding the data
+ * @return bytes instance encoding *s* in character set *cs*
+ */
+rt::Bytes encode(std::string s, unicode::Charset cs,
+                 unicode::DecodeErrorStrategy errors = unicode::DecodeErrorStrategy::REPLACE);
+
 } // namespace string
 
 namespace detail::adl {

diff --git a/hilti/runtime/include/unicode.h b/hilti/runtime/include/unicode.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2020-2023 by the Zeek Project. See LICENSE for details.
+
+#pragma once
+
+#include <cstdint>
+
+#include <hilti/rt/extension-points.h>
+#include <hilti/rt/util.h>
+
+namespace hilti::rt {
+
+namespace unicode {
+
+/** When processing Unicode, how to handle invalid data not representing Unicode codepoints. */
+HILTI_RT_ENUM_WITH_DEFAULT(DecodeErrorStrategy, IGNORE,
+                           IGNORE,  // skip data
+                           REPLACE, // replace with a place-holder
+                           STRICT   // throw a runtime error
+);
+
+/** Character set to use when decoding bytes or encoding strings. */
+HILTI_RT_ENUM(Charset, Undef, UTF8, UTF16LE, UTF16BE, ASCII);
+
+constexpr uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD; // U+FFFD, the Unicode replacement character
+
+} // namespace unicode
+
+namespace detail::adl {
+std::string to_string(const unicode::DecodeErrorStrategy& x, adl::tag /*unused*/);
+std::string to_string(const unicode::Charset& x, adl::tag /*unused*/);
+} // namespace detail::adl
+
+} // namespace hilti::rt
diff --git a/hilti/runtime/include/util.h b/hilti/runtime/include/util.h
@@ -240,7 +240,7 @@ std::string replace(std::string s, std::string_view o, std::string_view n);
  *
  * \note This function is not UTF8-aware.
  */
-bool startsWith(const std::string& s, const std::string& prefix);
+bool startsWith(std::string_view s, std::string_view prefix);
 
 /**
  * Python-style enumerate() that returns an iterable yielding pairs `(index,