-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
utf8: enhance handling of multibyte sequences
This patch slightly refactors how UTF-8 decoding works by replacing the old lookup table for special characters/codepoints with a new routine and an optional lookup table based on the compiler type (GNU/Clang). It also supports proper encoding of multibyte sequences. Signed-off-by: Eduardo Silva <[email protected]>
- Loading branch information
Showing
2 changed files
with
208 additions
and
75 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,84 +20,16 @@ | |
#ifndef FLB_UTF8_H | ||
#define FLB_UTF8_H | ||
|
||
#define FLB_UTF8_ACCEPT 0 | ||
#define FLB_UTF8_REJECT 1 | ||
#define FLB_UTF8_CONTINUE 2 | ||
|
||
#include <fluent-bit/flb_info.h> | ||
#include <inttypes.h> | ||
|
||
/* is this byte the start of a UTF-8 sequence (i.e. not a continuation byte)? */ | |
#define flb_utf8_check(c) (((c) & 0xC0) != 0x80) | ||
|
||
/*
 * Indexed by the first byte of a sequence: how many more bytes follow it.
 * Bytes 0x00..0xbf map to 0, so they are reported as 1-byte sequences.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00..0x0f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x10..0x1f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20..0x2f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x30..0x3f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40..0x4f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x50..0x5f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60..0x6f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x70..0x7f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80..0x8f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90..0x9f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0..0xaf */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0..0xbf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0..0xcf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0..0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0..0xef */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xf0..0xff */
};

/*
 * Length in bytes of the sequence that starts at s[0], derived only from
 * that first byte (the following bytes are not inspected).
 */
static inline int flb_utf8_len(const char *s)
{
    const unsigned char lead = (unsigned char) s[0];

    return 1 + trailingBytesForUTF8[lead];
}
|
||
/* | ||
* UTF-8 Decoding routines are originally written by Bjoern Hoehrmann | ||
* <[email protected]> and taken from the following web site: | ||
* | ||
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ | ||
* | ||
* They have been slightly renamed to follow Fluent Bit naming requirements. | |
*/ | ||
|
||
#define FLB_UTF8_ACCEPT 0
#define FLB_UTF8_REJECT 1

/*
 * Table-driven decoder data.
 *
 * Entries 0..255 map an input byte to a character class; the entries from
 * index 256 onwards form the transition table, read as
 * utf8d[256 + state * 16 + class].
 */
static const uint8_t utf8d[] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
    0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
    0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
    0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
    1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
    1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
    1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte to the decoder.
 *
 * state: in/out decoder state; FLB_UTF8_ACCEPT marks a character boundary.
 * codep: in/out code point accumulator.
 * byte:  next input byte; it indexes utf8d directly.
 *
 * Returns the new value of *state.
 */
static inline uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep,
                                       uint32_t byte)
{
    const uint32_t klass = utf8d[byte];

    if (*state == FLB_UTF8_ACCEPT) {
        /* first byte of a character: keep only its payload bits */
        *codep = (0xff >> klass) & (byte);
    }
    else {
        /* inside a sequence: append the low six bits */
        *codep = (byte & 0x3fu) | (*codep << 6);
    }

    *state = utf8d[256 + (*state * 16) + klass];
    return *state;
}
|
||
|
||
static inline void flb_utf8_print(const uint8_t *s) { | ||
uint32_t codepoint; | ||
uint32_t state = 0; | ||
|
||
for (; *s; ++s) | ||
if (!flb_utf8_decode(&state, &codepoint, *s)) { | ||
printf("\\u%04x\n", codepoint); | ||
} | ||
|
||
if (state != FLB_UTF8_ACCEPT) { | ||
printf("The string is not well-formed\n"); | ||
} | ||
} | ||
int flb_utf8_len(const char *s); | ||
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte); | ||
void flb_utf8_print(char *input); | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ | ||
|
||
/* Fluent Bit | ||
* ========== | ||
* Copyright (C) 2015-2024 The Fluent Bit Authors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <fluent-bit/flb_info.h> | ||
#include <fluent-bit/flb_utf8.h> | ||
|
||
#include <stdio.h> | ||
#include <string.h> | ||
#include <inttypes.h> | ||
|
||
/*
 * Indexed by the first byte of a sequence: how many more bytes follow it.
 * Bytes 0x00..0xbf map to 0, so they are reported as 1-byte sequences.
 */
static const char trailing_bytes_for_utf8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00..0x0f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x10..0x1f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20..0x2f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x30..0x3f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40..0x4f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x50..0x5f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60..0x6f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x70..0x7f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80..0x8f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90..0x9f */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xa0..0xaf */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xb0..0xbf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xc0..0xcf */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xd0..0xdf */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xe0..0xef */
    3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5  /* 0xf0..0xff */
};

/*
 * Length in bytes of the sequence that starts at s[0], derived only from
 * that first byte (the following bytes are not inspected).
 */
int flb_utf8_len(const char *s)
{
    const unsigned char lead = (unsigned char) s[0];

    return 1 + trailing_bytes_for_utf8[lead];
}
|
||
#if defined(__GNUC__) || defined(__clang__) | ||
/*
 * If we are compiling with the GNU or Clang compiler, designated range
 * initializers are available, so the decoder can classify every byte
 * through a lookup table.
 *
 * Lookup table for byte classification:
 *
 *   Format: {initial_state, bitmask, expected_continuation_bytes}
 *
 *   - ASCII: state 0, no continuation bytes.
 *   - Start of a multi-byte sequence: number of continuation bytes to come.
 *   - Invalid as a first byte: FLB_UTF8_REJECT with a zero bitmask.
 *
 * The zero bitmask is what identifies an invalid first byte. The
 * initial_state column cannot be used for that purpose: FLB_UTF8_REJECT
 * has the same numeric value as the initial state of a 2-byte sequence.
 */
static const uint8_t utf8_lookup[256][3] = {
    [0x00 ... 0x7F] = {0, 0x7F, 0},               /* ASCII */
    [0xC0 ... 0xDF] = {1, 0x1F, 1},               /* Start of 2-byte sequence */
    [0xE0 ... 0xEF] = {2, 0x0F, 2},               /* Start of 3-byte sequence */
    [0xF0 ... 0xF7] = {3, 0x07, 3},               /* Start of 4-byte sequence */
    [0x80 ... 0xBF] = {FLB_UTF8_REJECT, 0, 0},    /* Continuation bytes */
    [0xF8 ... 0xFF] = {FLB_UTF8_REJECT, 0, 0},    /* Invalid bytes */
};

/*
 * Decode one byte of a UTF-8 stream.
 *
 * state: decoder state owned by the caller. It must be FLB_UTF8_ACCEPT
 *        before the first byte of a character; while a sequence is in
 *        progress it holds the number of continuation bytes still expected.
 * codep: code point accumulator; holds the decoded code point when
 *        FLB_UTF8_ACCEPT is returned.
 * byte:  next input byte.
 *
 * Returns FLB_UTF8_ACCEPT when 'byte' completes a character,
 * FLB_UTF8_CONTINUE when more bytes are required, and FLB_UTF8_REJECT on
 * an invalid first byte, an invalid continuation byte, a surrogate
 * (U+D800..U+DFFF) or a code point above U+10FFFF.
 *
 * Because FLB_UTF8_REJECT shares its numeric value with the "one
 * continuation byte pending" state, callers should act on the return
 * value and set *state back to FLB_UTF8_ACCEPT before decoding more input
 * after a rejection.
 *
 * NOTE(review): overlong encodings (e.g. 0xC0 0x80) are not rejected.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
    const uint8_t *entry = utf8_lookup[byte];

    if (*state == FLB_UTF8_ACCEPT) {
        /*
         * Starting a new character. Validity is decided by the bitmask and
         * not by the state column, otherwise the lead byte of every 2-byte
         * sequence (state 1) would be mistaken for FLB_UTF8_REJECT.
         */
        if (entry[1] == 0) {
            /* invalid start byte */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
        *codep = byte & entry[1];
        *state = entry[2];
    }
    else {
        /* continuation byte */
        if ((byte & 0xC0) == 0x80) {
            *codep = (*codep << 6) | (byte & 0x3F);
            /* one continuation byte less to wait for */
            (*state)--;
        }
        else {
            /* invalid continuation byte */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
    }

    /* check if the sequence is complete */
    if (*state == 0) {
        if (*codep >= 0xD800 && *codep <= 0xDFFF) {
            /* surrogate code point (invalid in UTF-8) */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
        else if (*codep > 0x10FFFF) {
            /* out of range codepoint */
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }
        /* valid and complete sequence */
        return FLB_UTF8_ACCEPT;
    }

    /* we are still processing the current sequence */
    return FLB_UTF8_CONTINUE;
}
|
||
#else | ||
|
||
/*
 * Fallback decoder: no lookup table.
 *
 * state: decoder state owned by the caller; 0 at a character boundary,
 *        otherwise the number of continuation bytes still expected.
 * codep: code point accumulator; holds the decoded code point when
 *        FLB_UTF8_ACCEPT is returned.
 * byte:  next input byte.
 *
 * Returns FLB_UTF8_ACCEPT when 'byte' completes a character,
 * FLB_UTF8_CONTINUE when more bytes are required, and FLB_UTF8_REJECT on
 * an invalid first byte, an invalid continuation byte, a surrogate
 * (U+D800..U+DFFF) or a code point above U+10FFFF.
 */
uint32_t flb_utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
    if (*state != 0) {
        /* in the middle of a sequence: only 10xxxxxx bytes are allowed */
        if ((byte & 0xC0) != 0x80) {
            *state = FLB_UTF8_REJECT;
            return FLB_UTF8_REJECT;
        }

        *codep = (*codep << 6) | (byte & 0x3F);
        *state -= 1;
    }
    else if (byte < 0x80) {
        /* ASCII: a complete character on its own */
        *codep = byte;
        return FLB_UTF8_ACCEPT;
    }
    else if (byte >= 0xC0 && byte <= 0xDF) {
        /* 110xxxxx: one continuation byte follows */
        *codep = byte & 0x1F;
        *state = 1;
    }
    else if (byte >= 0xE0 && byte <= 0xEF) {
        /* 1110xxxx: two continuation bytes follow */
        *codep = byte & 0x0F;
        *state = 2;
    }
    else if (byte >= 0xF0 && byte <= 0xF7) {
        /* 11110xxx: three continuation bytes follow */
        *codep = byte & 0x07;
        *state = 3;
    }
    else {
        /* 0x80..0xBF and 0xF8..0xFF cannot start a character */
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    if (*state != 0) {
        /* the current sequence still needs more bytes */
        return FLB_UTF8_CONTINUE;
    }

    /* sequence complete: validate the resulting code point */
    if ((*codep >= 0xD800 && *codep <= 0xDFFF) || *codep > 0x10FFFF) {
        *state = FLB_UTF8_REJECT;
        return FLB_UTF8_REJECT;
    }

    return FLB_UTF8_ACCEPT;
}
|
||
#endif | ||
|
||
void flb_utf8_print(char *input) | ||
{ | ||
int i; | ||
int ret; | ||
int len; | ||
uint32_t state = 0; | ||
uint32_t codepoint = 0; | ||
|
||
len = strlen(input); | ||
for (i = 0; i < len; i++) { | ||
ret = flb_utf8_decode(&state, &codepoint, (uint8_t) input[i]); | ||
if (ret == FLB_UTF8_ACCEPT) { | ||
printf("Valid Codepoint: U+%04X\n", codepoint); | ||
} | ||
else if (ret == FLB_UTF8_REJECT) { | ||
printf("Invalid UTF-8 sequence detected.\n"); | ||
break; | ||
} | ||
} | ||
} |