From b770786f59b5f2ea790639920efbb853abea44a3 Mon Sep 17 00:00:00 2001
From: Eduardo Silva <eduardo@calyptia.com>
Date: Tue, 3 Dec 2024 20:22:02 -0600
Subject: [PATCH] utf8: enhance handling of multibyte sequences

This patch refactors how UTF-8 decoding works by replacing the old lookup table
for special characters/codepoints with a new routine and an optional lookup table
that is only used when building with a GNU-compatible compiler (GCC/Clang).

It also supports proper encoding of multibyte sequences.

Signed-off-by: Eduardo Silva <eduardo@calyptia.com>
---
 include/fluent-bit/flb_utf8.h |  82 ++------------
 src/flb_utf8.c                | 201 ++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+), 75 deletions(-)
 create mode 100644 src/flb_utf8.c

diff --git a/include/fluent-bit/flb_utf8.h b/include/fluent-bit/flb_utf8.h
index 00cb08d066f..b883ff0f78c 100644
--- a/include/fluent-bit/flb_utf8.h
+++ b/include/fluent-bit/flb_utf8.h
@@ -20,84 +20,16 @@
 #ifndef FLB_UTF8_H
 #define FLB_UTF8_H
 
+#define FLB_UTF8_ACCEPT   0
+#define FLB_UTF8_REJECT   1
+#define FLB_UTF8_CONTINUE 2
+
 #include <fluent-bit/flb_info.h>
 #include <inttypes.h>
 
-/* is the start of a UTF-8 string ? */
-#define flb_utf8_check(c) (((c) & 0xC0) != 0x80)
-
-static const char trailingBytesForUTF8[256] = {
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
-};
-
 /* returns length of next utf-8 sequence */
-static inline int flb_utf8_len(const char *s)
-{
-    return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
-}
-
-/*
- * UTF-8 Decoding routines are originally written by Bjoern Hoehrmann
- * <bjoern@hoehrmann.de> and taken from the following web site:
- *
- *   http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
- *
- * They have been siglhy renamed to follow Fluent Bit naming requirements.
- */
-
-#define FLB_UTF8_ACCEPT 0
-#define FLB_UTF8_REJECT 1
-
-static const uint8_t utf8d[] = {
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
-    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
-    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
-    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
-    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
-    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
-    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
-    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
-    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
-};
-
-static inline uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep,
-                                       uint32_t byte)
-{
-    uint32_t type = utf8d[byte];
-
-    *codep = (*state != FLB_UTF8_ACCEPT) ?
-        (byte & 0x3fu) | (*codep << 6) :
-        (0xff >> type) & (byte);
-
-    *state = utf8d[256 + *state*16 + type];
-    return *state;
-}
-
-
-static inline void flb_utf8_print(const uint8_t *s) {
-    uint32_t codepoint;
-    uint32_t state = 0;
-
-    for (; *s; ++s)
-        if (!flb_utf8_decode(&state, &codepoint, *s)) {
-            printf("\\u%04x\n", codepoint);
-        }
-
-    if (state != FLB_UTF8_ACCEPT) {
-        printf("The string is not well-formed\n");
-    }
-}
+int flb_utf8_len(const char *s);
+uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte);
+void flb_utf8_print(char *input);
 
 #endif
diff --git a/src/flb_utf8.c b/src/flb_utf8.c
new file mode 100644
index 00000000000..ba8b4696415
--- /dev/null
+++ b/src/flb_utf8.c
@@ -0,0 +1,201 @@
+/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+
+/*  Fluent Bit
+ *  ==========
+ *  Copyright (C) 2015-2024 The Fluent Bit Authors
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <fluent-bit/flb_info.h>
+#include <fluent-bit/flb_utf8.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static const char trailing_bytes_for_utf8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/* returns length of next utf-8 sequence */
+int flb_utf8_len(const char *s)
+{
+    return trailing_bytes_for_utf8[(unsigned int)(unsigned char)s[0]] + 1;
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+/*
+ * If we are compiling with the GNU or Clang compiler, the range designated
+ * initializer extension ([first ... last]) is available, so we can drive
+ * our decoder with a lookup table.
+ *
+ * Lookup table for byte classification and state transitions:
+ *
+ *  Format: {initial_state, bitmask, expected_continuation_bytes}
+ *  ASCII: state 0, no continuation bytes
+ *  Start of multi-byte sequence: state X, continuation byte count
+ *  Invalid: reject state
+ */
+static const uint8_t utf8_lookup[256][3] = {
+    [0x00 ... 0x7F] = {0, 0x7F, 0},            /* ASCII */
+    [0xC0 ... 0xDF] = {1, 0x1F, 1},            /* Start of 2-byte sequence */
+    [0xE0 ... 0xEF] = {2, 0x0F, 2},            /* Start of 3-byte sequence */
+    [0xF0 ... 0xF7] = {3, 0x07, 3},            /* Start of 4-byte sequence */
+    [0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0}, /* Continuation bytes */
+    [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0}, /* Invalid bytes */
+};
+
+uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
+{
+    const uint8_t *entry = utf8_lookup[byte];
+
+    if (*state == FLB_UTF8_ACCEPT) {
+        /* starting a new character */
+        *state = entry[0];
+        if (*state == FLB_UTF8_REJECT) {
+            /* invalid start byte */
+            return FLB_UTF8_REJECT;
+        }
+        *codep = byte & entry[1];
+    }
+    else {
+        /* continuation byte */
+        if ((byte & 0xC0) == 0x80) {
+            *codep = (*codep << 6) | (byte & 0x3F);
+            /* decrement continuation bytes */
+            (*state)--;
+        }
+        else {
+            /* invalid continuation byte */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+    }
+
+    /* check if the sequence is complete */
+    if (*state == 0) {
+        if (*codep >= 0xD800 && *codep <= 0xDFFF) {
+            /* surrogate code point (not valid in UTF-8) */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+        else if (*codep > 0x10FFFF) {
+            /* out of range codepoint */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+        /* valid and complete sequence */
+        return FLB_UTF8_ACCEPT;
+    }
+
+    /* we are still processing the current sequence */
+    return FLB_UTF8_CONTINUE;
+}
+
+#else
+
+/* fallback decoder: no lookup table */
+uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
+{
+    /* Start of a new character */
+    if (*state == 0) {
+        if (byte <= 0x7F) {
+            /* ASCII */
+            *codep = byte;
+            return FLB_UTF8_ACCEPT;
+        }
+        else if ((byte & 0xE0) == 0xC0) {
+            /* start of a 2-byte sequence */
+            *codep = byte & 0x1F;
+            *state = 1;
+        }
+        else if ((byte & 0xF0) == 0xE0) {
+            /* start of a 3-byte sequence */
+            *codep = byte & 0x0F;
+            *state = 2;
+        }
+        else if ((byte & 0xF8) == 0xF0) {
+            /* start of a 4-byte sequence */
+            *codep = byte & 0x07;
+            *state = 3;
+        }
+        else {
+            /* invalid first byte */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+    }
+    else {
+        /* continuation byte */
+        if ((byte & 0xC0) == 0x80) {
+            *codep = (*codep << 6) | (byte & 0x3F);
+
+            /* reduce the expected continuation bytes */
+            (*state)--;
+        }
+        else {
+            /* invalid continuation byte */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+    }
+
+    if (*state == 0) {
+        /* sequence complete */
+        if (*codep >= 0xD800 && *codep <= 0xDFFF) {
+            /* surrogate code point (not valid in UTF-8) */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+        else if (*codep > 0x10FFFF) {
+            /* codepoint is out of range */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+        return FLB_UTF8_ACCEPT;
+    }
+
+    /* we are still processing the current sequence */
+    return FLB_UTF8_CONTINUE;
+}
+
+#endif
+
+void flb_utf8_print(char *input)
+{
+    int i;
+    int ret;
+    int len;
+    uint32_t state = 0;
+    uint32_t codepoint = 0;
+
+    len = strlen(input);
+    for (i = 0; i < len; i++) {
+        ret = flb_utf8_decode(&state, &codepoint, (uint8_t) input[i]);
+        if (ret == FLB_UTF8_ACCEPT) {
+            printf("Valid Codepoint: U+%04X\n", codepoint);
+        }
+        else if (ret == FLB_UTF8_REJECT) {
+            printf("Invalid UTF-8 sequence detected.\n");
+            break;
+        }
+    }
+}