From b770786f59b5f2ea790639920efbb853abea44a3 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Tue, 3 Dec 2024 20:22:02 -0600 Subject: [PATCH] utf8: enhance handling of multibyte sequences This patch refactors how UTF-8 decoding works by replacing the old lookup table for special characters/codepoints with a new routine and an optional lookup table that is used when building with a GNU or Clang compiler. It also supports proper encoding of multibyte sequences. Signed-off-by: Eduardo Silva --- include/fluent-bit/flb_utf8.h | 82 ++------------ src/flb_utf8.c | 201 ++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 75 deletions(-) create mode 100644 src/flb_utf8.c diff --git a/include/fluent-bit/flb_utf8.h b/include/fluent-bit/flb_utf8.h index 00cb08d066f..b883ff0f78c 100644 --- a/include/fluent-bit/flb_utf8.h +++ b/include/fluent-bit/flb_utf8.h @@ -20,84 +20,16 @@ #ifndef FLB_UTF8_H #define FLB_UTF8_H +#define FLB_UTF8_ACCEPT 0 +#define FLB_UTF8_REJECT 1 +#define FLB_UTF8_CONTINUE 2 + #include #include -/* is the start of a UTF-8 string ? 
*/ -#define flb_utf8_check(c) (((c) & 0xC0) != 0x80) - -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - /* returns length of next utf-8 sequence */ -static inline int flb_utf8_len(const char *s) -{ - return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; -} - -/* - * UTF-8 Decoding routines are originally written by Bjoern Hoehrmann - * and taken from the following web site: - * - * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ - * - * They have been siglhy renamed to follow Fluent Bit naming requirements. 
- */ - -#define FLB_UTF8_ACCEPT 0 -#define FLB_UTF8_REJECT 1 - -static const uint8_t utf8d[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 -}; - -static inline uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, - uint32_t byte) -{ - uint32_t type = utf8d[byte]; - - *codep = (*state != FLB_UTF8_ACCEPT) ? 
- (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - *state = utf8d[256 + *state*16 + type]; - return *state; -} - - -static inline void flb_utf8_print(const uint8_t *s) { - uint32_t codepoint; - uint32_t state = 0; - - for (; *s; ++s) - if (!flb_utf8_decode(&state, &codepoint, *s)) { - printf("\\u%04x\n", codepoint); - } - - if (state != FLB_UTF8_ACCEPT) { - printf("The string is not well-formed\n"); - } -} +int flb_utf8_len(const char *s); +uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte); +void flb_utf8_print(char *input); #endif diff --git a/src/flb_utf8.c b/src/flb_utf8.c new file mode 100644 index 00000000000..ba8b4696415 --- /dev/null +++ b/src/flb_utf8.c @@ -0,0 +1,201 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ + +/* Fluent Bit + * ========== + * Copyright (C) 2015-2024 The Fluent Bit Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/*
+ * NOTE(review): the include targets were stripped from this copy of the
+ * patch. The five below are reconstructed from what the code uses (the
+ * FLB_UTF8_* macros, printf, strlen, fixed-width integers) -- confirm them
+ * against the upstream commit.
+ */
+#include <fluent-bit/flb_info.h>
+#include <fluent-bit/flb_utf8.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+/*
+ * Number of trailing bytes announced by each possible first byte. Bytes
+ * 0x80..0xBF map to 0 and the legacy 5/6-byte lead bytes (0xF8..0xFF) are
+ * still listed, so this table is a length hint and not a validator.
+ */
+static const char trailing_bytes_for_utf8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/*
+ * Returns the length, in bytes, that the first byte of 's' announces for
+ * the sequence it starts (trailing byte count + 1). Only s[0] is read; the
+ * following bytes are not validated.
+ */
+int flb_utf8_len(const char *s)
+{
+    return trailing_bytes_for_utf8[(unsigned int)(unsigned char)s[0]] + 1;
+}
+
+/* marker for bytes that can never start a well-formed sequence */
+#define UTF8_LEAD_INVALID 0xFF
+
+#if defined(__GNUC__) || defined(__clang__)
+/*
+ * If we are compiling with the GNU or Clang compiler, the range designator
+ * extension is available, so the first byte is classified with a lookup
+ * table.
+ *
+ * Format: {pending_continuation_bytes, payload_bitmask}
+ *
+ * Invalid first bytes use UTF8_LEAD_INVALID instead of FLB_UTF8_REJECT:
+ * FLB_UTF8_REJECT has the value 1, which is also the pending byte count of
+ * a 2-byte sequence, so it cannot be used to tell the two cases apart.
+ */
+static const uint8_t utf8_lookup[256][2] = {
+    [0x00 ... 0x7F] = {0, 0x7F},                 /* ASCII */
+    [0x80 ... 0xBF] = {UTF8_LEAD_INVALID, 0},    /* Continuation bytes */
+    [0xC0 ... 0xC1] = {UTF8_LEAD_INVALID, 0},    /* Overlong 2-byte leads */
+    [0xC2 ... 0xDF] = {1, 0x1F},                 /* Start of 2-byte sequence */
+    [0xE0 ... 0xEF] = {2, 0x0F},                 /* Start of 3-byte sequence */
+    [0xF0 ... 0xF7] = {3, 0x07},                 /* Start of 4-byte sequence */
+    [0xF8 ... 0xFF] = {UTF8_LEAD_INVALID, 0},    /* Invalid bytes */
+};
+
+/*
+ * Classify the first byte of a sequence: store its payload bits in 'codep'
+ * and return the number of continuation bytes that must follow, or -1 if
+ * the byte cannot start a sequence ('codep' is left untouched then).
+ */
+static int utf8_lead(uint8_t byte, uint32_t *codep)
+{
+    const uint8_t *entry = utf8_lookup[byte];
+
+    if (entry[0] == UTF8_LEAD_INVALID) {
+        return -1;
+    }
+
+    *codep = byte & entry[1];
+    return entry[0];
+}
+
+#else
+
+/* fallback classifier: same contract as above, no lookup table */
+static int utf8_lead(uint8_t byte, uint32_t *codep)
+{
+    if (byte <= 0x7F) {
+        /* ASCII */
+        *codep = byte;
+        return 0;
+    }
+    else if (byte >= 0xC2 && byte <= 0xDF) {
+        /* start of a 2-byte sequence (0xC0/0xC1 would be overlong) */
+        *codep = byte & 0x1F;
+        return 1;
+    }
+    else if ((byte & 0xF0) == 0xE0) {
+        /* start of a 3-byte sequence */
+        *codep = byte & 0x0F;
+        return 2;
+    }
+    else if ((byte & 0xF8) == 0xF0) {
+        /* start of a 4-byte sequence */
+        *codep = byte & 0x07;
+        return 3;
+    }
+
+    /* invalid first byte */
+    return -1;
+}
+
+#endif
+
+/*
+ * Feed one byte to the incremental UTF-8 decoder.
+ *
+ * state: decoder state owned by the caller; must be FLB_UTF8_ACCEPT (0)
+ *        before the first byte of every sequence. While a sequence is in
+ *        progress it holds the number of continuation bytes still expected.
+ * codep: accumulates the codepoint; only meaningful when the function
+ *        returns FLB_UTF8_ACCEPT.
+ * byte:  next input byte.
+ *
+ * Returns FLB_UTF8_ACCEPT when 'byte' completes a valid codepoint,
+ * FLB_UTF8_CONTINUE when more bytes are required, and FLB_UTF8_REJECT on
+ * malformed input (bad first byte, bad continuation byte, overlong form,
+ * surrogate, or value above U+10FFFF).
+ *
+ * On rejection *state is set to FLB_UTF8_REJECT. That value (1) is also a
+ * valid "one byte pending" state, so callers must act on the return value
+ * and reset *state to FLB_UTF8_ACCEPT before decoding again.
+ */
+uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
+{
+    int pending;
+
+    if (*state == FLB_UTF8_ACCEPT) {
+        /* starting a new character */
+        pending = utf8_lead(byte, codep);
+        if (pending < 0) {
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+        *state = (uint32_t) pending;
+    }
+    else {
+        if ((byte & 0xC0) != 0x80) {
+            /* invalid continuation byte */
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+
+        /*
+         * Overlong 3/4-byte forms: a zero payload can only be pending right
+         * after a 0xE0 (state 2) or 0xF0 (state 3) first byte, and then the
+         * second byte must be at least 0xA0 / 0x90 respectively.
+         */
+        if (*codep == 0 &&
+            ((*state == 2 && byte < 0xA0) || (*state == 3 && byte < 0x90))) {
+            *state = FLB_UTF8_REJECT;
+            return FLB_UTF8_REJECT;
+        }
+
+        *codep = (*codep << 6) | (byte & 0x3F);
+
+        /* one continuation byte less to wait for */
+        (*state)--;
+    }
+
+    if (*state != 0) {
+        /* we are still processing the current sequence */
+        return FLB_UTF8_CONTINUE;
+    }
+
+    /* sequence complete: validate the resulting codepoint */
+    if (*codep >= 0xD800 && *codep <= 0xDFFF) {
+        /* surrogates are not valid in UTF-8 */
+        *state = FLB_UTF8_REJECT;
+        return FLB_UTF8_REJECT;
+    }
+    else if (*codep > 0x10FFFF) {
+        /* codepoint is out of range */
+        *state = FLB_UTF8_REJECT;
+        return FLB_UTF8_REJECT;
+    }
+
+    return FLB_UTF8_ACCEPT;
+}
+
+/*
+ * Debug helper: decode the NUL-terminated string 'input' and print every
+ * codepoint found. Decoding stops at the first malformed sequence; an input
+ * that ends in the middle of a multibyte sequence is reported as invalid as
+ * well. A NULL input prints nothing.
+ */
+void flb_utf8_print(char *input)
+{
+    size_t i;
+    size_t len;
+    uint32_t ret = FLB_UTF8_ACCEPT;
+    uint32_t state = FLB_UTF8_ACCEPT;
+    uint32_t codepoint = 0;
+
+    if (input == NULL) {
+        return;
+    }
+
+    len = strlen(input);
+    for (i = 0; i < len; i++) {
+        ret = flb_utf8_decode(&state, &codepoint, (uint8_t) input[i]);
+        if (ret == FLB_UTF8_ACCEPT) {
+            printf("Valid Codepoint: U+%04" PRIX32 "\n", codepoint);
+        }
+        else if (ret == FLB_UTF8_REJECT) {
+            printf("Invalid UTF-8 sequence detected.\n");
+            return;
+        }
+    }
+
+    /* the string ended while continuation bytes were still expected */
+    if (ret == FLB_UTF8_CONTINUE) {
+        printf("Invalid UTF-8 sequence detected.\n");
+    }
+}