Skip to content

Commit

Permalink
utf8: enhance handling of multibyte sequences
Browse files Browse the repository at this point in the history
This patch refactors how UTF-8 decoding works, replacing the old lookup table
for special characters/codepoints with a new routine and an optional lookup table
that is used depending on the compiler type (GNU/Clang).

It also supports proper encoding of multibyte sequences.

Signed-off-by: Eduardo Silva <[email protected]>
  • Loading branch information
edsiper committed Dec 4, 2024
1 parent d573777 commit b770786
Show file tree
Hide file tree
Showing 2 changed files with 208 additions and 75 deletions.
82 changes: 7 additions & 75 deletions include/fluent-bit/flb_utf8.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,84 +20,16 @@
#ifndef FLB_UTF8_H
#define FLB_UTF8_H

/* result codes returned by flb_utf8_decode() */
#define FLB_UTF8_ACCEPT 0 /* a complete, valid codepoint has been decoded */
#define FLB_UTF8_REJECT 1 /* the bytes seen so far are not valid UTF-8 */
#define FLB_UTF8_CONTINUE 2 /* the current sequence needs more bytes */

#include <fluent-bit/flb_info.h>
#include <inttypes.h>

/* is this byte the start of a UTF-8 character (i.e. not a continuation byte)? */
#define flb_utf8_check(c) (((c) & 0xC0) != 0x80)

/*
 * Number of bytes that follow a given lead byte in a UTF-8 sequence,
 * indexed by the value of the lead byte.
 */
static const char trailingBytesForUTF8[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30..0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40..0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50..0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60..0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70..0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80..0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90..0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0..0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0..0xbf */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0..0xcf */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0..0xdf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0..0xef */
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5  /* 0xf0..0xff */
};

/*
 * Returns the length, in bytes, of the UTF-8 sequence introduced by the
 * first byte of 's': the trailing byte count for that byte plus one.
 */
static inline int flb_utf8_len(const char *s)
{
    const unsigned char lead = (unsigned char) s[0];

    return 1 + trailingBytesForUTF8[lead];
}

/*
 * UTF-8 Decoding routines were originally written by Bjoern Hoehrmann
 * <bjoern@hoehrmann.de> and taken from the following web site:
 *
 * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 *
 * They have been slightly renamed to follow Fluent Bit naming requirements.
 */

/*
 * Decoder states: decoding starts in FLB_UTF8_ACCEPT and the decoder is back
 * in that state each time a full codepoint has been consumed.
 * NOTE(review): FLB_UTF8_REJECT is presumably the error state of the
 * referenced DFA -- confirm against the transition table.
 */
#define FLB_UTF8_ACCEPT 0
#define FLB_UTF8_REJECT 1

/*
 * Combined lookup table used by the DFA based decoder.
 *
 * The first 256 entries map an input byte to its character class (read as
 * utf8d[byte]). The entries from offset 256 onwards hold the state
 * transitions and are read as utf8d[256 + state * 16 + class].
 */
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feeds one byte into the table driven decoder.
 *
 * 'state' holds the current decoder state and 'codep' the codepoint being
 * assembled; both are updated in place. The new state is also returned.
 */
static inline uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep,
                                       uint32_t byte)
{
    const uint32_t char_class = utf8d[byte];

    if (*state == FLB_UTF8_ACCEPT) {
        /* first byte of a sequence: keep only its payload bits */
        *codep = (0xff >> char_class) & (byte);
    }
    else {
        /* following byte: append its low six bits to the codepoint */
        *codep = (*codep << 6) | (byte & 0x3fu);
    }

    /* transitions are stored after the 256 character class entries */
    *state = utf8d[256 + *state * 16 + char_class];

    return *state;
}


/*
 * Decodes the NUL terminated byte string 's' and prints every decoded
 * codepoint; a final message is printed when the decoder does not end in
 * the accept state.
 */
static inline void flb_utf8_print(const uint8_t *s)
{
    const uint8_t *cursor;
    uint32_t state = 0;
    uint32_t codepoint;

    for (cursor = s; *cursor != 0; cursor++) {
        if (flb_utf8_decode(&state, &codepoint, *cursor) != 0) {
            /* no complete codepoint yet */
            continue;
        }
        printf("\\u%04x\n", codepoint);
    }

    if (state != FLB_UTF8_ACCEPT) {
        printf("The string is not well-formed\n");
    }
}
/* returns the length in bytes of the UTF-8 sequence announced by the first byte of 's' */
int flb_utf8_len(const char *s);

/*
 * feeds one byte into the decoder: 'state' and 'codep' carry the decoding
 * progress between calls. Returns FLB_UTF8_ACCEPT, FLB_UTF8_REJECT or
 * FLB_UTF8_CONTINUE.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte);

/* decodes the NUL terminated string 'input' and prints the result of each decoded sequence */
void flb_utf8_print(char *input);

#endif
201 changes: 201 additions & 0 deletions src/flb_utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* Fluent Bit
* ==========
* Copyright (C) 2015-2024 The Fluent Bit Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <fluent-bit/flb_info.h>
#include <fluent-bit/flb_utf8.h>

#include <stdio.h>
#include <string.h>
#include <inttypes.h>

/*
 * Number of bytes that follow a given lead byte in a UTF-8 sequence,
 * indexed by the value of the lead byte.
 */
static const char trailing_bytes_for_utf8[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30..0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40..0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x50..0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60..0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x70..0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80..0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90..0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xa0..0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xb0..0xbf */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xc0..0xcf */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xd0..0xdf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xe0..0xef */
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5  /* 0xf0..0xff */
};

/*
 * Returns the length, in bytes, of the UTF-8 sequence introduced by the
 * first byte of 's': the trailing byte count for that byte plus one.
 */
int flb_utf8_len(const char *s)
{
    const unsigned char lead = (unsigned char) s[0];

    return 1 + trailing_bytes_for_utf8[lead];
}

#if defined(__GNUC__) || defined(__clang__)
/*
 * If we are compiling with the GNU or Clang compiler, the designated range
 * initializer extension is available, so lead bytes are classified through
 * a lookup table.
 *
 * Lookup table format: {bitmask, continuation_bytes}
 *
 *   - bitmask: bits of the lead byte that belong to the codepoint.
 *   - continuation_bytes: number of continuation bytes that must follow, or
 *     FLB_UTF8_INVALID_LEAD when the byte cannot start a sequence.
 *
 * 0xC0 and 0xC1 are flagged as invalid because they can only start overlong
 * encodings of ASCII characters.
 */
#define FLB_UTF8_INVALID_LEAD 0xFF

/*
 * Decoder state meaning 'n continuation bytes are still expected' (n = 1..3).
 * These values are kept apart from FLB_UTF8_ACCEPT and FLB_UTF8_REJECT so a
 * sequence in progress can never be mistaken for a rejected one.
 */
#define FLB_UTF8_PENDING(n) ((uint32_t) (n) + FLB_UTF8_REJECT)

static const uint8_t utf8_lookup[256][2] = {
    [0x00 ... 0x7F] = {0x7F, 0},                     /* ASCII */
    [0x80 ... 0xBF] = {0, FLB_UTF8_INVALID_LEAD},    /* Continuation bytes */
    [0xC0 ... 0xC1] = {0, FLB_UTF8_INVALID_LEAD},    /* Overlong 2-byte leads */
    [0xC2 ... 0xDF] = {0x1F, 1},                     /* Start of 2-byte sequence */
    [0xE0 ... 0xEF] = {0x0F, 2},                     /* Start of 3-byte sequence */
    [0xF0 ... 0xF7] = {0x07, 3},                     /* Start of 4-byte sequence */
    [0xF8 ... 0xFF] = {0, FLB_UTF8_INVALID_LEAD},    /* Invalid bytes */
};

/*
 * Feeds one byte into the decoder.
 *
 * state: decoder state, updated in place. The caller sets it to
 *        FLB_UTF8_ACCEPT before the first byte of a character and must set
 *        it back to FLB_UTF8_ACCEPT to resume decoding after a rejection.
 * codep: codepoint being assembled, updated in place. It is only meaningful
 *        when FLB_UTF8_ACCEPT is returned.
 * byte : next input byte.
 *
 * Returns FLB_UTF8_ACCEPT when a codepoint is complete, FLB_UTF8_CONTINUE
 * when more bytes are needed and FLB_UTF8_REJECT when the input is invalid:
 * bad lead or continuation byte, overlong encoding, surrogate or codepoint
 * above U+10FFFF. Once rejected, the state stays rejected until it is reset.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
    const uint8_t *entry;
    uint32_t pending;

    if (*state == FLB_UTF8_REJECT) {
        /* a rejected sequence stays rejected until the caller resets it */
        return FLB_UTF8_REJECT;
    }

    if (*state == FLB_UTF8_ACCEPT) {
        /* starting a new character */
        entry = utf8_lookup[byte];
        if (entry[1] == FLB_UTF8_INVALID_LEAD) {
            /* invalid start byte */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }

        *codep = byte & entry[0];
        if (entry[1] == 0) {
            /* ASCII: complete in a single byte */
            return FLB_UTF8_ACCEPT;
        }

        *state = FLB_UTF8_PENDING(entry[1]);
        return FLB_UTF8_CONTINUE;
    }

    /* we are in the middle of a multibyte sequence */
    pending = *state - FLB_UTF8_REJECT;

    if ((byte & 0xC0) != 0x80) {
        /* invalid continuation byte */
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    /*
     * Overlong encodings: a zero payload so far means the lead byte was 0xE0
     * (two bytes pending) or 0xF0 (three bytes pending). The first
     * continuation byte must then be at least 0xA0 or 0x90 respectively,
     * otherwise the codepoint would fit in a shorter sequence.
     */
    if (*codep == 0 &&
        ((pending == 3 && byte < 0x90) || (pending == 2 && byte < 0xA0))) {
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    *codep = (*codep << 6) | (byte & 0x3F);
    pending--;

    if (pending > 0) {
        /* we are still processing the current sequence */
        *state = FLB_UTF8_PENDING(pending);
        return FLB_UTF8_CONTINUE;
    }

    /* the sequence is complete: validate the resulting codepoint */
    if ((*codep >= 0xD800 && *codep <= 0xDFFF) || *codep > 0x10FFFF) {
        /* surrogate half or out of range codepoint */
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    *state = FLB_UTF8_ACCEPT;
    return FLB_UTF8_ACCEPT;
}

#else

/* fallback decoder: no lookup table */

/*
 * Decoder state meaning 'n continuation bytes are still expected' (n = 1..3).
 * These values are kept apart from FLB_UTF8_ACCEPT and FLB_UTF8_REJECT so a
 * sequence in progress can never be mistaken for a rejected one.
 */
#define FLB_UTF8_PENDING(n) ((uint32_t) (n) + FLB_UTF8_REJECT)

/*
 * Feeds one byte into the decoder.
 *
 * state: decoder state, updated in place. The caller sets it to
 *        FLB_UTF8_ACCEPT before the first byte of a character and must set
 *        it back to FLB_UTF8_ACCEPT to resume decoding after a rejection.
 * codep: codepoint being assembled, updated in place. It is only meaningful
 *        when FLB_UTF8_ACCEPT is returned.
 * byte : next input byte.
 *
 * Returns FLB_UTF8_ACCEPT when a codepoint is complete, FLB_UTF8_CONTINUE
 * when more bytes are needed and FLB_UTF8_REJECT when the input is invalid:
 * bad lead or continuation byte, overlong encoding, surrogate or codepoint
 * above U+10FFFF. Once rejected, the state stays rejected until it is reset.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
    uint32_t pending;

    if (*state == FLB_UTF8_REJECT) {
        /* a rejected sequence stays rejected until the caller resets it */
        return FLB_UTF8_REJECT;
    }

    /* Start of a new character */
    if (*state == FLB_UTF8_ACCEPT) {
        if (byte <= 0x7F) {
            /* ASCII */
            *codep = byte;
            return FLB_UTF8_ACCEPT;
        }

        if (byte >= 0xC2 && byte <= 0xDF) {
            /*
             * start of a 2-byte sequence; 0xC0 and 0xC1 are excluded since
             * they can only start overlong encodings of ASCII characters
             */
            *codep = byte & 0x1F;
            pending = 1;
        }
        else if ((byte & 0xF0) == 0xE0) {
            /* start of a 3-byte sequence */
            *codep = byte & 0x0F;
            pending = 2;
        }
        else if ((byte & 0xF8) == 0xF0) {
            /* start of a 4-byte sequence */
            *codep = byte & 0x07;
            pending = 3;
        }
        else {
            /* invalid first byte */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }

        *state = FLB_UTF8_PENDING(pending);
        return FLB_UTF8_CONTINUE;
    }

    /* we are in the middle of a multibyte sequence */
    pending = *state - FLB_UTF8_REJECT;

    if ((byte & 0xC0) != 0x80) {
        /* invalid continuation byte */
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    /*
     * Overlong encodings: a zero payload so far means the lead byte was 0xE0
     * (two bytes pending) or 0xF0 (three bytes pending). The first
     * continuation byte must then be at least 0xA0 or 0x90 respectively,
     * otherwise the codepoint would fit in a shorter sequence.
     */
    if (*codep == 0 &&
        ((pending == 3 && byte < 0x90) || (pending == 2 && byte < 0xA0))) {
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    *codep = (*codep << 6) | (byte & 0x3F);

    /* reduce the expected continuation bytes */
    pending--;

    if (pending > 0) {
        /* we are still processing the current sequence */
        *state = FLB_UTF8_PENDING(pending);
        return FLB_UTF8_CONTINUE;
    }

    /* sequence complete: validate the resulting codepoint */
    if ((*codep >= 0xD800 && *codep <= 0xDFFF) || *codep > 0x10FFFF) {
        /* surrogate half or out of range codepoint */
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    *state = FLB_UTF8_ACCEPT;
    return FLB_UTF8_ACCEPT;
}

#endif

/*
 * Debugging helper: decodes the NUL terminated string 'input' and prints
 * every decoded codepoint to stdout.
 *
 * Decoding stops at the first invalid sequence, which is reported with a
 * message. Input that ends in the middle of a multibyte sequence is
 * reported the same way. A NULL 'input' is ignored.
 */
void flb_utf8_print(char *input)
{
    size_t i;
    size_t len;
    uint32_t ret = FLB_UTF8_ACCEPT;
    uint32_t state = FLB_UTF8_ACCEPT;
    uint32_t codepoint = 0;

    if (input == NULL) {
        return;
    }

    len = strlen(input);
    for (i = 0; i < len; i++) {
        ret = flb_utf8_decode(&state, &codepoint, (uint8_t) input[i]);
        if (ret == FLB_UTF8_ACCEPT) {
            /* PRIX32 keeps the format matched to uint32_t on every platform */
            printf("Valid Codepoint: U+%04" PRIX32 "\n", codepoint);
        }
        else if (ret == FLB_UTF8_REJECT) {
            printf("Invalid UTF-8 sequence detected.\n");
            return;
        }
    }

    /*
     * rejections return early, so a last result other than ACCEPT means the
     * input ended while a multibyte sequence was still incomplete
     */
    if (ret != FLB_UTF8_ACCEPT) {
        printf("Invalid UTF-8 sequence detected.\n");
    }
}

0 comments on commit b770786

Please sign in to comment.