From 987eeb4865cde6ff6c5f57967fa2ffe0c9f0d066 Mon Sep 17 00:00:00 2001
From: Philip Withnall <pwithnall@gnome.org>
Date: Tue, 19 Nov 2024 14:05:04 +0000
Subject: [PATCH] c-utf8: Always call strlen() on strings of unknown length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There was provision in the validation loop for continuing to validate
until a nul byte was hit, but unfortunately that often results in buffer
overflow reads when used with heap allocated strings.

This is because the SIMD code reads 4 bytes at a time. If the string’s
terminal nul byte is in bytes 0–3, this results in a 3–1 byte overread.
This is detected and warned about by valgrind and asan, even though it’s
harmless because:
 - it’s only a read,
 - the control logic terminates the loop after detecting the nul byte,
   without basing any decisions on the following bytes,
 - the overread is guaranteed to not cross a page boundary (as they are 4
   byte aligned), so will not cause a page fault.

Unfortunately, there’s no easy way to tell valgrind or asan about this.
We could use the valgrind client API to tell it that the overread is
harmless, but using the API is a significant runtime overhead. Asan
doesn’t allow any warnings to be suppressed apart from telling it to not
instrument the entire function, which would mean that basically all the
code in c-utf8 would be uninstrumented. So we’d never be told about
other, true positive, errors, if they exist.

So, the least bad option seems to be to explicitly calculate the string
length with `strlen()` before validating, if the length is not
specified. Many callers into c-utf8 will already specify the length, in
which case this commit is a no-op.

See also: https://gitlab.gnome.org/GNOME/glib/-/issues/3493

Signed-off-by: Philip Withnall <pwithnall@gnome.org>

Fixes: https://github.com/c-util/c-utf8/issues/6
---
 src/c-utf8.c   | 4 ++--
 src/test-api.c | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/c-utf8.c b/src/c-utf8.c
index dcd2dbc..c6ed627 100644
--- a/src/c-utf8.c
+++ b/src/c-utf8.c
@@ -38,7 +38,7 @@ static inline int c_utf8_word_is_ascii(size_t word) {
  */
 _c_public_ void c_utf8_verify_ascii(const char **strp, size_t *lenp) {
         const char *str = *strp;
-        size_t len = lenp ? *lenp : (size_t)-1;
+        size_t len = lenp ? *lenp : strlen (str);
 
         while (len > 0 && c_load_8(str, 0) < 128) {
                 if ((void*)c_align_to((unsigned long)str, sizeof(size_t)) == str) {
@@ -105,7 +105,7 @@ _c_public_ void c_utf8_verify_ascii(const char **strp, size_t *lenp) {
  */
 _c_public_ void c_utf8_verify(const char **strp, size_t *lenp) {
         const char *str = *strp;
-        size_t len = lenp ? *lenp : (size_t)-1;
+        size_t len = lenp ? *lenp : strlen (str);
 
         /* See Unicode 10.0.0, Chapter 3, Section D92 */
 
diff --git a/src/test-api.c b/src/test-api.c
index 599b476..f10c295 100644
--- a/src/test-api.c
+++ b/src/test-api.c
@@ -19,6 +19,9 @@ static void test_api(void) {
 
         c_utf8_verify_ascii(&string1, &n_string1);
         c_utf8_verify(&string2, &n_string2);
+
+        c_utf8_verify_ascii(&string1, NULL);
+        c_utf8_verify(&string2, NULL);
 }
 
 int main(int argc, char **argv) {