From fa30d2d87dd425473361e9811b180df8b03e9c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Kurucz?= Date: Thu, 31 Oct 2024 14:34:29 +0100 Subject: [PATCH] Update nixpkgs This requires two main changes: - An update to the nixpkgs patch. It is now based on this draft PR: https://github.com/NixOS/nixpkgs/pull/352629 - An update to the Lean runtime, as Lean is updated to v4.10.0. The changes come from Lean commit 0a1a855ba80e51515570439f3d73d3d9414ac053. --- flake.lock | 6 +- flake.nix | 2 +- patches/nixpkgs.patch | 288 +++++++++++++++++++++++++----------- platform/lean/lean.h | 5 + platform/runtime/object.cpp | 96 ++++++++---- platform/runtime/object.h | 1 + platform/runtime/utf8.cpp | 106 ++++++------- platform/runtime/utf8.h | 7 +- 8 files changed, 343 insertions(+), 168 deletions(-) diff --git a/flake.lock b/flake.lock index 3cf12c3..e95e006 100644 --- a/flake.lock +++ b/flake.lock @@ -2,11 +2,11 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1719361614, - "narHash": "sha256-D6Up3ZCRso9U2Byj1PIJdrD3gziB4wbvu9hbhga57FQ=", + "lastModified": 1730400794, + "narHash": "sha256-+JePAVBbwtdrC5mIB+BNW90ZK8E2A5wwMZQY8wpu+yE=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "e55c4339a418a17723420447ec5bda0f7e2f76c6", + "rev": "365b440a1fb432ca58518dbc7125a1ab46cec9ed", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index c83f691..baec7d7 100644 --- a/flake.nix +++ b/flake.nix @@ -185,7 +185,7 @@ ''; hash = "sha256-yJ54S+HB3CH5/wb+GFwNkz3/990J8Z6GbuttDhWqN5Q="; }; - patches = finalAttrs.patches ++ [ ./patches/qemu.patch ]; + patches = [ ./patches/qemu.patch ]; buildInputs = finalAttrs.buildInputs ++ [ pkgsNative.libgcrypt ]; configureFlags = finalAttrs.configureFlags ++ [ "--enable-gcrypt" ]; }); diff --git a/patches/nixpkgs.patch b/patches/nixpkgs.patch index d910188..b3a883d 100644 --- a/patches/nixpkgs.patch +++ b/patches/nixpkgs.patch @@ -1,113 +1,235 @@ -diff --git a/pkgs/development/compilers/llvm/18/default.nix b/pkgs/development/compilers/llvm/18/default.nix -index 878f1e3a8e7e..fde993d67b70 100644 ---- a/pkgs/development/compilers/llvm/18/default.nix -+++ b/pkgs/development/compilers/llvm/18/default.nix -@@ -263,7 +263,7 @@ in let - bintools = bintools'; - extraPackages = [ - targetLlvmLibraries.compiler-rt -- ] ++ lib.optionals (!stdenv.targetPlatform.isWasm) [ -+ ] ++ lib.optionals (!stdenv.targetPlatform.isNone) [ - targetLlvmLibraries.libunwind - ]; - extraBuildCommands = mkExtraBuildCommands cc; -@@ -272,12 +272,12 @@ in let - "-Wno-unused-command-line-argument" - "-B${targetLlvmLibraries.compiler-rt}/lib" - ] -- ++ lib.optional (!stdenv.targetPlatform.isWasm) "--unwindlib=libunwind" -+ ++ lib.optional (!stdenv.targetPlatform.isNone) "--unwindlib=libunwind" - ++ lib.optional -- (!stdenv.targetPlatform.isWasm && stdenv.targetPlatform.useLLVM or false) -+ (!stdenv.targetPlatform.isNone && stdenv.targetPlatform.useLLVM or false) - "-lunwind" -- ++ lib.optional stdenv.targetPlatform.isWasm "-fno-exceptions"; -- nixSupport.cc-ldflags = lib.optionals (!stdenv.targetPlatform.isWasm) [ "-L${targetLlvmLibraries.libunwind}/lib" ]; -+ ++ lib.optional stdenv.targetPlatform.isNone "-fno-exceptions"; -+ nixSupport.cc-ldflags = lib.optionals (!stdenv.targetPlatform.isNone) [ "-L${targetLlvmLibraries.libunwind}/lib" ]; - }; +diff --git a/lib/systems/default.nix b/lib/systems/default.nix +index a6ceef2cc3a1..d9718c9f3dc9 100644 +--- a/lib/systems/default.nix ++++ b/lib/systems/default.nix +@@ -183,6 +183,26 @@ let + || isWasm # WASM + ) && !isStatic; - clangNoLibcxx = wrapCCWith rec { -@@ -294,7 +294,7 @@ in let - "-B${targetLlvmLibraries.compiler-rt}/lib" - "-nostdlib++" - ] -- ++ lib.optional stdenv.targetPlatform.isWasm "-fno-exceptions"; -+ ++ lib.optional stdenv.targetPlatform.isNone "-fno-exceptions"; - }; - - clangNoLibc = wrapCCWith rec { -@@ -310,7 +310,7 @@ in let - "-rtlib=compiler-rt" - "-B${targetLlvmLibraries.compiler-rt}/lib" - ] -- ++ lib.optional stdenv.targetPlatform.isWasm "-fno-exceptions"; -+ ++ lib.optional stdenv.targetPlatform.isNone "-fno-exceptions"; - }; ++ # Controls whether libunwind is enabled. ++ hasStackUnwinding = !(final.isWasm || final.isFreeBSD || final.isDarwin || final.isNone); ++ ++ # You often don't want excepetions in an embedded setting, disable them by default. ++ hasExceptions = !(final.isNone || final.isWasm); ++ ++ # If there is no kernel (`isNone`) we assume no threads. WASM can ++ # supposedly support threads in certain cases, so this might change in ++ # the future. ++ hasThreads = !(final.isNone || final.isWasm); ++ ++ hasFilesystem = !(final.isNone || final.isWasm); ++ ++ # Wanting localization in embedded settings is probably a rare edge-case, ++ # so disable it by default. ++ hasLocalization = !final.isNone; ++ ++ # Without a kernel there is no cross-platform way for libc to obtain a clock. ++ hasMonotonicClock = !final.isNone; ++ + # The difference between `isStatic` and `hasSharedLibraries` is mainly the + # addition of the `staticMarker` (see make-derivation.nix). Some + # platforms, like embedded machines without a libc (e.g. arm-none-eabi) +diff --git a/pkgs/development/compilers/llvm/common/default.nix b/pkgs/development/compilers/llvm/common/default.nix +index eaba6f45edfe..1028ee197c6a 100644 +--- a/pkgs/development/compilers/llvm/common/default.nix ++++ b/pkgs/development/compilers/llvm/common/default.nix +@@ -736,7 +736,7 @@ let + bintools = bintools'; + extraPackages = + [ targetLlvmLibraries.compiler-rt ] +- ++ lib.optionals (!stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isFreeBSD) [ ++ ++ lib.optionals stdenv.targetPlatform.hasStackUnwinding [ + targetLlvmLibraries.libunwind + ]; + extraBuildCommands = +@@ -745,14 +745,14 @@ let + echo "-rtlib=compiler-rt -Wno-unused-command-line-argument" >> $out/nix-support/cc-cflags + echo "-B${targetLlvmLibraries.compiler-rt}/lib" >> $out/nix-support/cc-cflags + '' +- + lib.optionalString (!stdenv.targetPlatform.isWasm) '' ++ + lib.optionalString stdenv.targetPlatform.hasStackUnwinding '' + echo "--unwindlib=libunwind" >> $out/nix-support/cc-cflags + echo "-L${targetLlvmLibraries.libunwind}/lib" >> $out/nix-support/cc-ldflags + '' +- + lib.optionalString (!stdenv.targetPlatform.isWasm && stdenv.targetPlatform.useLLVM or false) '' ++ + lib.optionalString (stdenv.targetPlatform.hasStackUnwinding && stdenv.targetPlatform.useLLVM or false) '' + echo "-lunwind" >> $out/nix-support/cc-ldflags + '' +- + lib.optionalString stdenv.targetPlatform.isWasm '' ++ + lib.optionalString (!stdenv.targetPlatform.hasExceptions) '' + echo "-fno-exceptions" >> $out/nix-support/cc-cflags + '' + ) +@@ -765,18 +765,13 @@ let + "-Wno-unused-command-line-argument" + "-B${targetLlvmLibraries.compiler-rt}/lib" + ] ++ ++ lib.optional stdenv.targetPlatform.hasStackUnwinding "--unwindlib=libunwind" + ++ lib.optional ( +- !stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isFreeBSD +- ) "--unwindlib=libunwind" +- ++ lib.optional ( +- !stdenv.targetPlatform.isWasm +- && !stdenv.targetPlatform.isFreeBSD ++ stdenv.targetPlatform.hasStackUnwinding + && stdenv.targetPlatform.useLLVM or false + ) "-lunwind" +- ++ lib.optional stdenv.targetPlatform.isWasm "-fno-exceptions"; +- nixSupport.cc-ldflags = lib.optionals ( +- !stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isFreeBSD +- ) [ "-L${targetLlvmLibraries.libunwind}/lib" ]; ++ ++ lib.optional (!stdenv.targetPlatform.hasExceptions) "-fno-exceptions"; ++ nixSupport.cc-ldflags = lib.optionals stdenv.targetPlatform.hasStackUnwinding [ "-L${targetLlvmLibraries.libunwind}/lib" ]; + } + ); - clangNoCompilerRt = wrapCCWith rec { -@@ -323,7 +323,7 @@ in let - [ - "-nostartfiles" - ] -- ++ lib.optional stdenv.targetPlatform.isWasm "-fno-exceptions"; -+ ++ lib.optional stdenv.targetPlatform.isNone "-fno-exceptions"; - }; +@@ -787,27 +782,23 @@ let + bintools = bintools'; + extraPackages = + [ targetLlvmLibraries.compiler-rt-no-libc ] +- ++ lib.optionals +- ( +- !stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isFreeBSD && !stdenv.targetPlatform.isDarwin +- ) +- [ +- targetLlvmLibraries.libunwind +- ]; ++ ++ lib.optionals stdenv.targetPlatform.hasStackUnwinding [ ++ targetLlvmLibraries.libunwind ++ ]; + extraBuildCommands = + lib.optionalString (lib.versions.major metadata.release_version == "13") ( + '' + echo "-rtlib=compiler-rt -Wno-unused-command-line-argument" >> $out/nix-support/cc-cflags + echo "-B${targetLlvmLibraries.compiler-rt-no-libc}/lib" >> $out/nix-support/cc-cflags + '' +- + lib.optionalString (!stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isDarwin) '' ++ + lib.optionalString stdenv.targetPlatform.hasStackUnwinding '' + echo "--unwindlib=libunwind" >> $out/nix-support/cc-cflags + echo "-L${targetLlvmLibraries.libunwind}/lib" >> $out/nix-support/cc-ldflags + '' +- + lib.optionalString (!stdenv.targetPlatform.isWasm && stdenv.targetPlatform.useLLVM or false) '' ++ + lib.optionalString (stdenv.targetPlatform.hasStackUnwinding && stdenv.targetPlatform.useLLVM or false) '' + echo "-lunwind" >> $out/nix-support/cc-ldflags + '' +- + lib.optionalString stdenv.targetPlatform.isWasm '' ++ + lib.optionalString (!stdenv.targetPlatform.hasExceptions) '' + echo "-fno-exceptions" >> $out/nix-support/cc-cflags + '' + ) +@@ -820,18 +811,13 @@ let + "-Wno-unused-command-line-argument" + "-B${targetLlvmLibraries.compiler-rt-no-libc}/lib" + ] ++ ++ lib.optional stdenv.targetPlatform.hasStackUnwinding "--unwindlib=libunwind" + ++ lib.optional ( +- !stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isFreeBSD && !stdenv.targetPlatform.isDarwin +- ) "--unwindlib=libunwind" +- ++ lib.optional ( +- !stdenv.targetPlatform.isWasm +- && !stdenv.targetPlatform.isFreeBSD ++ stdenv.targetPlatform.hasStackUnwinding + && stdenv.targetPlatform.useLLVM or false + ) "-lunwind" +- ++ lib.optional stdenv.targetPlatform.isWasm "-fno-exceptions"; +- nixSupport.cc-ldflags = lib.optionals ( +- !stdenv.targetPlatform.isWasm && !stdenv.targetPlatform.isFreeBSD && !stdenv.targetPlatform.isDarwin +- ) [ "-L${targetLlvmLibraries.libunwind}/lib" ]; ++ ++ lib.optional (!stdenv.targetPlatform.hasExceptions) "-fno-exceptions"; ++ nixSupport.cc-ldflags = lib.optionals stdenv.targetPlatform.hasStackUnwinding [ "-L${targetLlvmLibraries.libunwind}/lib" ]; + } + ); - clangNoCompilerRtWithLibc = wrapCCWith (rec { -@@ -332,7 +332,7 @@ in let - bintools = bintools'; - extraPackages = [ ]; - extraBuildCommands = mkExtraBuildCommands0 cc; -- } // lib.optionalAttrs stdenv.targetPlatform.isWasm { -+ } // lib.optionalAttrs stdenv.targetPlatform.isNone { - nixSupport.cc-cflags = [ "-fno-exceptions" ]; - }); +@@ -857,7 +843,7 @@ let + "-nostdlib++" + ] + ++ lib.optional ( +- lib.versionAtLeast metadata.release_version "15" && stdenv.targetPlatform.isWasm ++ lib.versionAtLeast metadata.release_version "15" && !stdenv.targetPlatform.hasExceptions + ) "-fno-exceptions"; + } + ); +@@ -882,7 +868,7 @@ let + "-B${targetLlvmLibraries.compiler-rt-no-libc}/lib" + ] + ++ lib.optional ( +- lib.versionAtLeast metadata.release_version "15" && stdenv.targetPlatform.isWasm ++ lib.versionAtLeast metadata.release_version "15" && !stdenv.targetPlatform.hasExceptions + ) "-fno-exceptions"; + } + ); +@@ -903,7 +889,7 @@ let + nixSupport.cc-cflags = + [ "-nostartfiles" ] + ++ lib.optional ( +- lib.versionAtLeast metadata.release_version "15" && stdenv.targetPlatform.isWasm ++ lib.versionAtLeast metadata.release_version "15" && !stdenv.targetPlatform.hasExceptions + ) "-fno-exceptions"; + } + ); +@@ -919,7 +905,7 @@ let + extraBuildCommands = mkExtraBuildCommands0 cc; + } + // lib.optionalAttrs ( +- lib.versionAtLeast metadata.release_version "15" && stdenv.targetPlatform.isWasm ++ lib.versionAtLeast metadata.release_version "15" && !stdenv.targetPlatform.hasExceptions + ) { nixSupport.cc-cflags = [ "-fno-exceptions" ]; }; + # Aliases diff --git a/pkgs/development/compilers/llvm/common/libcxx/default.nix b/pkgs/development/compilers/llvm/common/libcxx/default.nix -index b2c23f35f0a1..5d2ff5c1b6ce 100644 +index 17c687f88394..c8271d1365ef 100644 --- a/pkgs/development/compilers/llvm/common/libcxx/default.nix +++ b/pkgs/development/compilers/llvm/common/libcxx/default.nix -@@ -16,7 +16,7 @@ - , cxxabi ? null - , libcxxrt +@@ -15,7 +15,7 @@ + , freebsd + , cxxabi ? if stdenv.hostPlatform.isFreeBSD then freebsd.libcxxrt else null , libunwind -, enableShared ? !stdenv.hostPlatform.isStatic -+, enableShared ? !stdenv.hostPlatform.isStatic && !stdenv.hostPlatform.isNone ++, enableShared ? stdenv.hostPlatform.hasSharedLibraries + , devExtraCmakeFlags ? [] }: - # note: our setup using libcxxabi instead of libcxxrt on FreeBSD diverges from -@@ -56,13 +56,13 @@ let +@@ -52,17 +52,22 @@ let cxxabiCMakeFlags = lib.optionals (lib.versionAtLeast release_version "18") [ "-DLIBCXXABI_USE_LLVM_UNWINDER=OFF" - ] ++ lib.optionals (useLLVM && !stdenv.hostPlatform.isWasm) (if lib.versionAtLeast release_version "18" then [ -+ ] ++ lib.optionals (useLLVM && !stdenv.hostPlatform.isNone) (if lib.versionAtLeast release_version "18" then [ ++ ] ++ lib.optionals (useLLVM && stdenv.targetPlatform.hasStackUnwinding) (if lib.versionAtLeast release_version "18" then [ "-DLIBCXXABI_ADDITIONAL_LIBRARIES=unwind" - "-DLIBCXXABI_USE_COMPILER_RT=ON" +- "-DLIBCXXABI_USE_COMPILER_RT=ON" ] else [ - "-DLIBCXXABI_USE_COMPILER_RT=ON" +- "-DLIBCXXABI_USE_COMPILER_RT=ON" "-DLIBCXXABI_USE_LLVM_UNWINDER=ON" - ]) ++ lib.optionals stdenv.hostPlatform.isWasm [ -+ ]) ++ lib.optionals stdenv.hostPlatform.isNone [ ++ ]) ++ lib.optionals (useLLVM && !stdenv.targetPlatform.isWasm) [ ++ "-DLIBCXXABI_USE_COMPILER_RT=ON" ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasThreads) [ "-DLIBCXXABI_ENABLE_THREADS=OFF" ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasExceptions) [ "-DLIBCXXABI_ENABLE_EXCEPTIONS=OFF" ] ++ lib.optionals (!enableShared) [ -@@ -87,10 +87,12 @@ let + "-DLIBCXXABI_ENABLE_SHARED=OFF" ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasLocalization) [ ++ "-DLIBCXX_ENABLE_LOCALIZATION=OFF" ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasMonotonicClock) [ ++ "-DLIBCXX_ENABLE_MONOTONIC_CLOCK=OFF" + ]; + + cxxCMakeFlags = [ +@@ -87,9 +92,11 @@ let "-DLIBCXX_USE_COMPILER_RT=ON" - ] ++ lib.optionals (useLLVM && lib.versionAtLeast release_version "16") [ + ] ++ lib.optionals (useLLVM && !stdenv.hostPlatform.isFreeBSD && lib.versionAtLeast release_version "16") [ "-DLIBCXX_ADDITIONAL_LIBRARIES=unwind" - ] ++ lib.optionals stdenv.hostPlatform.isWasm [ -+ ] ++ lib.optionals stdenv.hostPlatform.isNone [ ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasThreads) [ "-DLIBCXX_ENABLE_THREADS=OFF" ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasStackUnwinding) [ "-DLIBCXX_ENABLE_FILESYSTEM=OFF" ++ ] ++ lib.optionals (!stdenv.targetPlatform.hasExceptions) [ "-DLIBCXX_ENABLE_EXCEPTIONS=OFF" -+ "-DLIBCXX_ENABLE_LOCALIZATION=OFF" -+ "-DLIBCXX_ENABLE_MONOTONIC_CLOCK=OFF" ] ++ lib.optionals (!enableShared) [ "-DLIBCXX_ENABLE_SHARED=OFF" - ]; -@@ -109,6 +111,8 @@ in +@@ -112,6 +119,8 @@ in stdenv.mkDerivation (rec { inherit pname version cmakeFlags patches; @@ -116,12 +238,12 @@ index b2c23f35f0a1..5d2ff5c1b6ce 100644 src = src'; outputs = [ "out" "dev" ]; -@@ -122,7 +126,7 @@ stdenv.mkDerivation (rec { +@@ -125,7 +134,7 @@ stdenv.mkDerivation (rec { ++ lib.optional (cxxabi != null) lndir; buildInputs = [ cxxabi ] -- ++ lib.optionals (useLLVM && !stdenv.hostPlatform.isWasm) [ libunwind ]; -+ ++ lib.optionals (useLLVM && !stdenv.hostPlatform.isNone) [ libunwind ]; +- ++ lib.optionals (useLLVM && !stdenv.hostPlatform.isWasm && !stdenv.hostPlatform.isFreeBSD) [ libunwind ]; ++ ++ lib.optionals (stdenv.targetPlatform.hasStackUnwinding && useLLVM) [ libunwind ]; # libc++.so is a linker script which expands to multiple libraries, # libc++.so.1 and libc++abi.so or the external cxxabi. ld-wrapper doesn't diff --git a/platform/lean/lean.h b/platform/lean/lean.h index 21c1408..0e54253 100644 --- a/platform/lean/lean.h +++ b/platform/lean/lean.h @@ -1119,7 +1119,12 @@ static inline size_t lean_string_byte_size(lean_object *o) { } /* instance : inhabited char := ⟨'A'⟩ */ static inline uint32_t lean_char_default_value() { return 'A'; } +LEAN_EXPORT lean_obj_res lean_mk_string_unchecked(char const *s, size_t sz, + size_t len); LEAN_EXPORT lean_obj_res lean_mk_string_from_bytes(char const *s, size_t sz); +LEAN_EXPORT lean_obj_res lean_mk_string_from_bytes_unchecked(char const *s, + size_t sz); +LEAN_EXPORT lean_obj_res lean_mk_ascii_string_unchecked(char const *s); LEAN_EXPORT lean_obj_res lean_mk_string(char const *s); static inline char const *lean_string_cstr(b_lean_obj_arg o) { assert(lean_is_string(o)); diff --git a/platform/runtime/object.cpp b/platform/runtime/object.cpp index 53f1a1a..d423d5b 100644 --- a/platform/runtime/object.cpp +++ b/platform/runtime/object.cpp @@ -221,13 +221,15 @@ extern "C" LEAN_EXPORT lean_object *lean_array_data(lean_obj_arg a) { } extern "C" LEAN_EXPORT lean_obj_res lean_array_get_panic(lean_obj_arg def_val) { - return lean_panic_fn(def_val, lean_mk_string("Error: index out of bounds")); + return lean_panic_fn( + def_val, lean_mk_ascii_string_unchecked("Error: index out of bounds")); } extern "C" LEAN_EXPORT lean_obj_res lean_array_set_panic(lean_obj_arg a, lean_obj_arg v) { lean_dec(v); - return lean_panic_fn(a, lean_mk_string("Error: index out of bounds")); + return lean_panic_fn( + a, lean_mk_ascii_string_unchecked("Error: index out of bounds")); } // ======================================= @@ -825,9 +827,9 @@ extern "C" LEAN_EXPORT lean_obj_res lean_float_to_string(double a) { if (isnan(a)) // override NaN because we don't want NaNs to be distinguishable // because the sign bit / payload bits can be architecture-dependent - return mk_string("NaN"); + return mk_ascii_string_unchecked("NaN"); else - return mk_string(std::to_string(a)); + return mk_ascii_string_unchecked(std::to_string(a)); } extern "C" LEAN_EXPORT double lean_float_scaleb(double a, b_lean_obj_arg b) { @@ -880,8 +882,8 @@ static object *string_ensure_capacity(object *o, size_t extra) { } } -extern "C" LEAN_EXPORT object *lean_mk_string_core(char const *s, size_t sz, - size_t len) { +extern "C" LEAN_EXPORT object *lean_mk_string_unchecked(char const *s, + size_t sz, size_t len) { size_t rsz = sz + 1; object *r = lean_alloc_string(rsz, rsz, len); memcpy(w_string_cstr(r), s, sz); @@ -889,22 +891,57 @@ extern "C" LEAN_EXPORT object *lean_mk_string_core(char const *s, size_t sz, return r; } +object *lean_mk_string_lossy_recover(char const *s, size_t sz, size_t pos, + size_t i) { + std::string str(s, pos); + size_t start = pos; + while (pos < sz) { + if (!validate_utf8_one((const uint8_t *)s, sz, pos)) { + str.append(s + start, pos - start); + str.append("\ufffd"); // U+FFFD REPLACEMENT CHARACTER + do + pos++; + while (pos < sz && (s[pos] & 0xc0) == 0x80); + start = pos; + } + i++; + } + str.append(s + start, pos - start); + return lean_mk_string_unchecked(str.data(), str.size(), i); +} + extern "C" LEAN_EXPORT object *lean_mk_string_from_bytes(char const *s, size_t sz) { - return lean_mk_string_core(s, sz, utf8_strlen(s, sz)); + size_t pos = 0, i = 0; + if (validate_utf8((const uint8_t *)s, sz, pos, i)) { + return lean_mk_string_unchecked(s, pos, i); + } else { + return lean_mk_string_lossy_recover(s, sz, pos, i); + } +} + +extern "C" LEAN_EXPORT object * +lean_mk_string_from_bytes_unchecked(char const *s, size_t sz) { + return lean_mk_string_unchecked(s, sz, utf8_strlen(s, sz)); } extern "C" LEAN_EXPORT object *lean_mk_string(char const *s) { return lean_mk_string_from_bytes(s, strlen(s)); } -extern "C" LEAN_EXPORT obj_res lean_string_from_utf8(b_obj_arg a) { - return lean_mk_string_from_bytes( +extern "C" LEAN_EXPORT object *lean_mk_ascii_string_unchecked(char const *s) { + size_t len = strlen(s); + return lean_mk_string_unchecked(s, len, len); +} + +extern "C" LEAN_EXPORT obj_res lean_string_from_utf8_unchecked(b_obj_arg a) { + return lean_mk_string_from_bytes_unchecked( reinterpret_cast(lean_sarray_cptr(a)), lean_sarray_size(a)); } extern "C" LEAN_EXPORT uint8 lean_string_validate_utf8(b_obj_arg a) { - return validate_utf8(lean_sarray_cptr(a), lean_sarray_size(a)); + size_t pos = 0, i = 0; + return validate_utf8(lean_sarray_cptr(a), lean_sarray_size(a), pos, i); } extern "C" LEAN_EXPORT obj_res lean_string_to_utf8(b_obj_arg s) { @@ -918,8 +955,8 @@ object *mk_string(std::string const &s) { return lean_mk_string_from_bytes(s.data(), s.size()); } -object *mk_ascii_string(std::string const &s) { - return lean_mk_string_core(s.data(), s.size(), s.size()); +object *mk_ascii_string_unchecked(std::string const &s) { + return lean_mk_string_unchecked(s.data(), s.size(), s.size()); } std::string string_to_std(b_obj_arg o) { @@ -990,16 +1027,6 @@ extern "C" LEAN_EXPORT bool lean_string_lt(object *s1, object *s2) { return r < 0 || (r == 0 && sz1 < sz2); } -static std::string list_as_string(b_obj_arg lst) { - std::string s; - b_obj_arg o = lst; - while (!lean_is_scalar(o)) { - push_unicode_scalar(s, lean_unbox_uint32(lean_ctor_get(o, 0))); - o = lean_ctor_get(o, 1); - } - return s; -} - static obj_res string_to_list_core(std::string const &s, bool reverse = false) { std::vector tmp; utf8_decode(s, tmp); @@ -1018,9 +1045,16 @@ static obj_res string_to_list_core(std::string const &s, bool reverse = false) { } extern "C" LEAN_EXPORT obj_res lean_string_mk(obj_arg cs) { - std::string s = list_as_string(cs); + std::string s; + b_obj_arg o = cs; + size_t len = 0; + while (!lean_is_scalar(o)) { + push_unicode_scalar(s, lean_unbox_uint32(lean_ctor_get(o, 0))); + o = lean_ctor_get(o, 1); + len++; + } lean_dec(cs); - return mk_string(s); + return lean_mk_string_unchecked(s.data(), s.size(), len); } extern "C" LEAN_EXPORT obj_res lean_string_data(obj_arg s) { @@ -1159,7 +1193,8 @@ extern "C" LEAN_EXPORT obj_res lean_string_utf8_get_opt(b_obj_arg s, static uint32 lean_string_utf8_get_panic() { lean_panic_fn(lean_box(0), - lean_mk_string("Error: invalid `String.Pos` at `String.get!`")); + lean_mk_ascii_string_unchecked( + "Error: invalid `String.Pos` at `String.get!`")); return lean_char_default_value(); } @@ -1258,11 +1293,11 @@ extern "C" LEAN_EXPORT obj_res lean_string_utf8_extract(b_obj_arg s, char const *str = lean_string_cstr(s); usize sz = lean_string_size(s) - 1; if (b >= e || b >= sz) - return lean_mk_string(""); + return lean_mk_string_unchecked("", 0, 0); /* In the reference implementation if `b` is not pointing to a valid UTF8 character start position, the result is the empty string. */ if (!is_utf8_first_byte(str[b])) - return lean_mk_string(""); + return lean_mk_string_unchecked("", 0, 0); if (e > sz) e = sz; lean_assert(b < e); @@ -1273,7 +1308,7 @@ extern "C" LEAN_EXPORT obj_res lean_string_utf8_extract(b_obj_arg s, e = sz; usize new_sz = e - b; lean_assert(new_sz > 0); - return lean_mk_string_from_bytes(lean_string_cstr(s) + b, new_sz); + return lean_mk_string_from_bytes_unchecked(lean_string_cstr(s) + b, new_sz); } extern "C" LEAN_EXPORT obj_res lean_string_utf8_prev(b_obj_arg s, @@ -1347,9 +1382,10 @@ extern "C" LEAN_EXPORT obj_res lean_string_utf8_set(obj_arg s, b_obj_arg i0, std::string tmp; push_unicode_scalar(tmp, c); std::string new_s = string_to_std(s); + usize len = lean_string_len(s); dec(s); new_s.replace(i, get_utf8_char_size_at(new_s, i), tmp); - return mk_string(new_s); + return lean_mk_string_unchecked(new_s.data(), new_s.size(), len); } extern "C" LEAN_EXPORT uint64 lean_string_hash(b_obj_arg s) { @@ -1359,7 +1395,7 @@ extern "C" LEAN_EXPORT uint64 lean_string_hash(b_obj_arg s) { } extern "C" LEAN_EXPORT obj_res lean_string_of_usize(size_t n) { - return mk_ascii_string(std::to_string(n)); + return mk_ascii_string_unchecked(std::to_string(n)); } // ======================================= diff --git a/platform/runtime/object.h b/platform/runtime/object.h index 3422108..3f4dfc7 100644 --- a/platform/runtime/object.h +++ b/platform/runtime/object.h @@ -390,6 +390,7 @@ inline obj_res alloc_string(size_t size, size_t capacity, size_t len) { return lean_alloc_string(size, capacity, len); } inline obj_res mk_string(char const *s) { return lean_mk_string(s); } +LEAN_EXPORT obj_res mk_ascii_string_unchecked(std::string const &s); LEAN_EXPORT obj_res mk_string(std::string const &s); LEAN_EXPORT std::string string_to_std(b_obj_arg o); inline char const *string_cstr(b_obj_arg o) { return lean_string_cstr(o); } diff --git a/platform/runtime/utf8.cpp b/platform/runtime/utf8.cpp index 1073479..d0af1ce 100644 --- a/platform/runtime/utf8.cpp +++ b/platform/runtime/utf8.cpp @@ -118,62 +118,68 @@ void utf8_decode(std::string const &str, std::vector &out) { } } -bool validate_utf8(uint8_t const *str, size_t size) { - size_t i = 0; - while (i < size) { - unsigned c = str[i]; - if ((c & 0x80) == 0) { - /* zero continuation (0 to 0x7F) */ - i++; - } else if ((c & 0xe0) == 0xc0) { - /* one continuation (0x80 to 0x7FF) */ - if (i + 1 >= size) - return false; - - unsigned c1 = str[i + 1]; - if ((c1 & 0xc0) != 0x80) - return false; - - unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f); - if (r < 0x80) - return false; +bool validate_utf8_one(uint8_t const *str, size_t size, size_t &pos) { + unsigned c = str[pos]; + if ((c & 0x80) == 0) { + /* zero continuation (0 to 0x7F) */ + pos++; + } else if ((c & 0xe0) == 0xc0) { + /* one continuation (0x80 to 0x7FF) */ + if (pos + 1 >= size) + return false; - i += 2; - } else if ((c & 0xf0) == 0xe0) { - /* two continuations (0x800 to 0xD7FF and 0xE000 to 0xFFFF) */ - if (i + 2 >= size) - return false; + unsigned c1 = str[pos + 1]; + if ((c1 & 0xc0) != 0x80) + return false; - unsigned c1 = str[i + 1]; - unsigned c2 = str[i + 2]; - if ((c1 & 0xc0) != 0x80 || (c2 & 0xc0) != 0x80) - return false; + unsigned r = ((c & 0x1f) << 6) | (c1 & 0x3f); + if (r < 0x80) + return false; - unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f); - if (r < 0x800 || (r >= 0xD800 && r <= 0xDFFF)) - return false; + pos += 2; + } else if ((c & 0xf0) == 0xe0) { + /* two continuations (0x800 to 0xD7FF and 0xE000 to 0xFFFF) */ + if (pos + 2 >= size) + return false; - i += 3; - } else if ((c & 0xf8) == 0xf0) { - /* three continuations (0x10000 to 0x10FFFF) */ - if (i + 3 >= size) - return false; - - unsigned c1 = str[i + 1]; - unsigned c2 = str[i + 2]; - unsigned c3 = str[i + 3]; - if ((c1 & 0xc0) != 0x80 || (c2 & 0xc0) != 0x80 || (c3 & 0xc0) != 0x80) - return false; - - unsigned r = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | - ((c2 & 0x3f) << 6) | (c3 & 0x3f); - if (r < 0x10000 || r > 0x10FFFF) - return false; + unsigned c1 = str[pos + 1]; + unsigned c2 = str[pos + 2]; + if ((c1 & 0xc0) != 0x80 || (c2 & 0xc0) != 0x80) + return false; - i += 4; - } else { + unsigned r = ((c & 0x0f) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f); + if (r < 0x800 || (r >= 0xD800 && r <= 0xDFFF)) return false; - } + + pos += 3; + } else if ((c & 0xf8) == 0xf0) { + /* three continuations (0x10000 to 0x10FFFF) */ + if (pos + 3 >= size) + return false; + + unsigned c1 = str[pos + 1]; + unsigned c2 = str[pos + 2]; + unsigned c3 = str[pos + 3]; + if ((c1 & 0xc0) != 0x80 || (c2 & 0xc0) != 0x80 || (c3 & 0xc0) != 0x80) + return false; + + unsigned r = ((c & 0x07) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) | + (c3 & 0x3f); + if (r < 0x10000 || r > 0x10FFFF) + return false; + + pos += 4; + } else { + return false; + } + return true; +} + +bool validate_utf8(uint8_t const *str, size_t size, size_t &pos, size_t &i) { + while (pos < size) { + if (!validate_utf8_one(str, size, pos)) + return false; + i++; } return true; } diff --git a/platform/runtime/utf8.h b/platform/runtime/utf8.h index 54603a6..f2076ab 100644 --- a/platform/runtime/utf8.h +++ b/platform/runtime/utf8.h @@ -25,8 +25,13 @@ LEAN_EXPORT size_t utf8_strlen(char const *str, size_t sz); LEAN_EXPORT void utf8_decode(std::string const &str, std::vector &out); +/* Returns true if the given character is valid UTF-8 */ +LEAN_EXPORT bool validate_utf8_one(uint8_t const *str, size_t size, + size_t &pos); + /* Returns true if the provided string is valid UTF-8 */ -LEAN_EXPORT bool validate_utf8(uint8_t const *str, size_t size); +LEAN_EXPORT bool validate_utf8(uint8_t const *str, size_t size, size_t &pos, + size_t &i); /* Push a unicode scalar value into a utf-8 encoded string */ LEAN_EXPORT void push_unicode_scalar(std::string &s, unsigned code);