diff --git a/src/pre_tokenizer.cpp b/src/pre_tokenizer.cpp
index 5838e9e..956403d 100644
--- a/src/pre_tokenizer.cpp
+++ b/src/pre_tokenizer.cpp
@@ -78,7 +78,7 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
   type = json_config.at("type");
   if (type == "Split") {
     try {
-      pattern = json_config.at("pattern");
+      pattern = json_config.at("pattern").at("Regex");
     } catch (json::out_of_range&) {
     }
   } else if (type == "Digits") {
diff --git a/test/test_pre_tokenizer.cpp b/test/test_pre_tokenizer.cpp
index f87c892..baa795b 100644
--- a/test/test_pre_tokenizer.cpp
+++ b/test/test_pre_tokenizer.cpp
@@ -221,7 +221,8 @@ TEST_F(PreTokenizerConfigTest, Split) {
           .parse_json(json{
               {"type", "Split"},
               {"pattern",
-               R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"},
+               {{"Regex",
+                 R"((?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)"}}},
           })
           .create();
   assert_split_match(*ptok, "Hello World", {"Hello", " World"});
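
For context, a minimal standalone sketch of the parsing this diff fixes, assuming nlohmann::json (which the json::out_of_range catch in parse_json suggests): in the Hugging Face tokenizer.json layout, a "Split" pre-tokenizer stores its pattern as a tagged object such as {"Regex": "..."} rather than a bare string, so the regex must be read from the nested "Regex" key. The config values below are illustrative, not taken from the repository.

#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
  // Illustrative shape of the config fragment the fix targets.
  json config = json{
      {"type", "Split"},
      {"pattern", {{"Regex", R"(\p{N}{1,3})"}}},
  };

  std::string pattern;
  try {
    // Before the fix, config.at("pattern") yields an object, not a string;
    // the nested at("Regex") pulls out the actual regex text.
    pattern = config.at("pattern").at("Regex");
  } catch (json::out_of_range&) {
    // Pattern stays empty when the key is absent, mirroring parse_json.
  }
  std::cout << pattern << '\n';  // prints \p{N}{1,3}
  return 0;
}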