diff --git a/grammars/f1_c_gen.py b/grammars/f1_c_gen.py index b8a7f18..31464d4 100644 --- a/grammars/f1_c_gen.py +++ b/grammars/f1_c_gen.py @@ -78,22 +78,13 @@ def to_bytes(self): # subnode_count subnode_count = len(self) ret += subnode_count.to_bytes(4, byteorder='little', signed=False) - # val_len - val_len = len(self.val) + + # Encode the value as UTF-8 + val_bytes = self.val.encode('utf-8') + # val_len (now stores the byte length of the UTF-8 encoded string) + val_len = len(val_bytes) ret += val_len.to_bytes(4, byteorder='little', signed=False) # val - # Latin-1 is an 8-bit character set. The first 128 characters of its - # set are identical to the US ASCII standard. By encoding the string as - # Latin-1, we can handle all hex characters from \u0000 to \u00ff - # Refs: - # - https://stackoverflow.com/questions/66601743/python3-str-to-bytes-convertation-problem - # - https://kb.iu.edu/d/aepu - val_bytes = bytes(self.val, 'latin-1') - if val_len != len(val_bytes): - print(f'The length of `val` should be {val_len}, but found {len(val_bytes)}.') - print(f'`val` bytes in UTF-8 encoding: {val_bytes}') - print('Please check your grammar file!') - sys.exit(1) ret += val_bytes # subnodes @@ -103,6 +94,7 @@ def to_bytes(self): return ret @staticmethod + def from_bytes(data: bytes): node = TreeNode() consumed = 0 @@ -133,6 +125,7 @@ def from_bytes(data: bytes): return node, consumed + def __str__(self): ret = '' if len(self) == 0: diff --git a/src/Makefile b/src/Makefile index 43a255f..8cf96c2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -47,6 +47,7 @@ BENCHMARK_OBJS = $(BENCHMARK_SRC_FILES:.c=.o) OBJS = $(LIB_OBJS) $(GEN_OBJS) $(BENCHMARK_OBJS) C_FLAGS = $(C_FLAGS_OPT) +C_FLAGS += -Wno-error=bidi-chars C_DEFINES = C_INCLUDES = -I../include -I../third_party/rxi_map -I../third_party/Cyan4973_xxHash