diff --git a/cpp/compiler.cc b/cpp/compiler.cc index 7ec5dc7e..01040735 100644 --- a/cpp/compiler.cc +++ b/cpp/compiler.cc @@ -330,6 +330,7 @@ class GrammarCompiler::Impl { CompiledGrammar CompileJSONSchema( const std::string& schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode = true @@ -342,7 +343,7 @@ class GrammarCompiler::Impl { private: /*! \brief The cache for the compiled grammar of a JSON schema. */ using SchemaKey = - std::tuple, std::pair, bool>; + std::tuple, std::pair, bool>; std::function GetCompileJSONSchemaCacheFunc(bool cache_enabled ) { @@ -350,8 +351,9 @@ class GrammarCompiler::Impl { return nullptr; } return [&](const SchemaKey& key) { - auto [schema, indent, separators, strict_mode] = key; - auto grammar = Grammar::FromJSONSchema(schema, indent, separators, strict_mode); + auto [schema, any_whitespace, indent, separators, strict_mode] = key; + auto grammar = + Grammar::FromJSONSchema(schema, any_whitespace, indent, separators, strict_mode); return MultiThreadCompileGrammar(grammar, tokenizer_info_, max_threads_); }; } @@ -404,13 +406,14 @@ CompiledGrammar GrammarCompiler::Impl::CompileBuiltinJSONGrammar() { CompiledGrammar GrammarCompiler::Impl::CompileJSONSchema( const std::string& schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode ) { if (!cache_enabled_) { return MultiThreadCompileGrammar( - Grammar::FromJSONSchema(schema, indent, separators, strict_mode), + Grammar::FromJSONSchema(schema, any_whitespace, indent, separators, strict_mode), tokenizer_info_, max_threads_ ); @@ -418,7 +421,7 @@ CompiledGrammar GrammarCompiler::Impl::CompileJSONSchema( auto separators_value = separators.value_or( (indent == std::nullopt) ? std::make_pair(", ", ": ") : std::make_pair(",", ": ") ); - auto key = std::make_tuple(schema, indent, separators_value, strict_mode); + auto key = std::make_tuple(schema, any_whitespace, indent, separators_value, strict_mode); return compile_json_schema_cache_.Get(key); } @@ -444,11 +447,12 @@ GrammarCompiler::GrammarCompiler( CompiledGrammar GrammarCompiler::CompileJSONSchema( const std::string& schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode ) { - return pimpl_->CompileJSONSchema(schema, indent, separators, strict_mode); + return pimpl_->CompileJSONSchema(schema, any_whitespace, indent, separators, strict_mode); } CompiledGrammar GrammarCompiler::CompileBuiltinJSONGrammar() { diff --git a/cpp/grammar.cc b/cpp/grammar.cc index 1522cb68..b221aa2f 100644 --- a/cpp/grammar.cc +++ b/cpp/grammar.cc @@ -23,11 +23,12 @@ Grammar Grammar::FromEBNF(const std::string& ebnf_string, const std::string& roo Grammar Grammar::FromJSONSchema( const std::string& schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode ) { - auto ebnf_string = JSONSchemaToEBNF(schema, indent, separators, strict_mode); + auto ebnf_string = JSONSchemaToEBNF(schema, any_whitespace, indent, separators, strict_mode); return FromEBNF(ebnf_string); } diff --git a/cpp/json_schema_converter.cc b/cpp/json_schema_converter.cc index ccb15320..b628b10d 100644 --- a/cpp/json_schema_converter.cc +++ b/cpp/json_schema_converter.cc @@ -27,11 +27,13 @@ namespace xgrammar { * \param indent The number of spaces for each indent. If it is std::nullopt, there will be no * indent or newline. * \param separator The separator between different elements in json. Examples include "," and ", ". + * \param any_whitespace Whether to ignore the indentation restrictions, and allow any whitespace. */ class IndentManager { public: - IndentManager(std::optional indent, const std::string& separator) - : enable_newline_(indent.has_value()), + IndentManager(std::optional indent, const std::string& separator, bool any_whitespace) + : any_whitespace_(any_whitespace), + enable_newline_(indent.has_value()), indent_(indent.value_or(0)), separator_(separator), total_indent_(0), @@ -66,10 +68,8 @@ class IndentManager { */ std::string NextSeparator(bool is_end = false); - /*! \brief Get the separator itself. */ - std::string GetBareSeparator() { return separator_; } - private: + bool any_whitespace_; bool enable_newline_; int indent_; std::string separator_; @@ -79,6 +79,15 @@ class IndentManager { }; std::string IndentManager::NextSeparator(bool is_end) { + if (any_whitespace_) { + if (is_first_.back() || is_end) { + is_first_.back() = false; + return "[ \\n\\t]*"; + } else { + return "[ \\n\\t]* \"" + separator_ + "\" [ \\n\\t]*"; + } + } + std::string res = ""; if (!is_first_.back() && !is_end) { res += separator_; @@ -110,9 +119,10 @@ class JSONSchemaConverter { public: JSONSchemaConverter( const picojson::value& json_schema, - std::optional indent = std::nullopt, - std::optional> separators = std::nullopt, - bool strict_mode = false + bool any_whitespace, + std::optional indent, + std::optional> separators, + bool strict_mode ); /*! \brief The root method. Convert the JSON schema to EBNF grammar string. */ @@ -327,32 +337,49 @@ class JSONSchemaConverter { ); // The indent manager to get separators - std::unique_ptr indentManager_; + std::optional indentManager_; // The root JSON schema picojson::value json_schema_; // Whether to use strict mode in conversion. See JSONSchemaToEBNF(). bool strict_mode_; + // Whether to allow empty object/array + bool allow_empty_; // The colon separator - std::string colon_; + std::string colon_pattern_; // The rules constructed std::vector> rules_; // The cache for basic rules. Mapping from the key of schema returned by GetSchemaCacheIndex() // to the basic rule name. std::map basic_rules_cache_; + // Whether to use any whitespace in the conversion + bool any_whitespace_; }; JSONSchemaConverter::JSONSchemaConverter( const picojson::value& json_schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode ) - : json_schema_(json_schema), strict_mode_(strict_mode) { + : json_schema_(json_schema), strict_mode_(strict_mode), any_whitespace_(any_whitespace) { if (!separators.has_value()) { - separators = (indent == std::nullopt) ? std::make_pair(", ", ": ") : std::make_pair(",", ": "); + if (indent == std::nullopt) { + separators = std::make_pair(", ", ": "); + } else { + separators = std::make_pair(",", ": "); + } + } + if (any_whitespace) { + separators = std::make_pair(",", ":"); + } + indentManager_ = IndentManager(indent, separators->first, any_whitespace); + if (any_whitespace) { + colon_pattern_ = "[ \\n\\t]* \"" + separators->second + "\" [ \\n\\t]*"; + } else { + colon_pattern_ = "\"" + separators->second + "\""; } - indentManager_ = std::make_unique(indent, separators->first); - colon_ = separators->second; + allow_empty_ = !strict_mode_; AddBasicRules(); } @@ -368,11 +395,15 @@ std::string JSONSchemaConverter::Convert() { void JSONSchemaConverter::AddBasicRules() { bool past_strict_mode = strict_mode_; + // Allow any field for basic array/obj rules strict_mode_ = false; - auto past_indent_manager = std::move(indentManager_); - indentManager_ = - std::make_unique(std::nullopt, past_indent_manager->GetBareSeparator()); + auto past_indent_manager = indentManager_; + if (any_whitespace_) { + indentManager_ = IndentManager(std::nullopt, ",", true); + } else { + indentManager_ = IndentManager(std::nullopt, ", ", false); + } AddHelperRules(); CreateBasicRule(picojson::value(true), kBasicAny); @@ -398,7 +429,7 @@ void JSONSchemaConverter::AddBasicRules() { ); strict_mode_ = past_strict_mode; - indentManager_ = std::move(past_indent_manager); + indentManager_ = past_indent_manager; } void JSONSchemaConverter::AddHelperRules() { @@ -664,39 +695,39 @@ std::string JSONSchemaConverter::GenerateRangeRegex( return "^\\d+$"; // Match any positive number if no start or end is specified } - std::vector positiveParts; - std::vector negativeParts; + std::vector positive_parts; + std::vector negative_parts; - auto generateGroup = [](int s, int e) -> std::string { + auto generate_group = [](int s, int e) -> std::string { std::ostringstream oss; if (s == e) { return std::to_string(s); } - std::string startStr = std::to_string(s); - std::string endStr = std::to_string(e); + std::string start_str = std::to_string(s); + std::string end_str = std::to_string(e); - size_t commonPrefix = 0; - while (commonPrefix < startStr.size() && startStr[commonPrefix] == endStr[commonPrefix]) { - ++commonPrefix; + size_t common_prefix = 0; + while (common_prefix < start_str.size() && start_str[common_prefix] == end_str[common_prefix]) { + ++common_prefix; } - if (commonPrefix > 0) { - oss << startStr.substr(0, commonPrefix); + if (common_prefix > 0) { + oss << start_str.substr(0, common_prefix); } - if (commonPrefix < startStr.size()) { + if (common_prefix < start_str.size()) { oss << "["; - oss << startStr[commonPrefix]; - if (startStr[commonPrefix] != endStr[commonPrefix]) { - oss << "-" << endStr[commonPrefix]; + oss << start_str[common_prefix]; + if (start_str[common_prefix] != end_str[common_prefix]) { + oss << "-" << end_str[common_prefix]; } oss << "]"; // Add trailing zero ranges - if (commonPrefix + 1 < startStr.size()) { - oss << "\\d{" << startStr.size() - commonPrefix - 1 << "}"; + if (common_prefix + 1 < start_str.size()) { + oss << "\\d{" << start_str.size() - common_prefix - 1 << "}"; } } @@ -704,70 +735,70 @@ std::string JSONSchemaConverter::GenerateRangeRegex( }; if (start && end) { - int rangeStart = start.value(); - int rangeEnd = end.value(); + int range_start = start.value(); + int range_end = end.value(); // Handle negative part of the range - if (rangeStart < 0) { - int negativeEnd = std::min(rangeEnd, -1); - while (rangeStart <= negativeEnd) { - int nextRangeEnd = (rangeStart / 10 - 1) * 10 + 9; // Handle negative tens group - if (nextRangeEnd < negativeEnd) { - nextRangeEnd = negativeEnd; + if (range_start < 0) { + int negative_end = std::min(range_end, -1); + while (range_start <= negative_end) { + int next_range_end = (range_start / 10 - 1) * 10 + 9; // Handle negative tens group + if (next_range_end < negative_end) { + next_range_end = negative_end; } - negativeParts.push_back("-" + generateGroup(-nextRangeEnd, -rangeStart)); - rangeStart = nextRangeEnd + 1; + negative_parts.push_back("-" + generate_group(-next_range_end, -range_start)); + range_start = next_range_end + 1; } } // Handle positive part of the range - if (rangeEnd >= 0) { - rangeStart = std::max(rangeStart, 0); - while (rangeStart <= rangeEnd) { - int nextRangeEnd = (rangeStart / 10 + 1) * 10 - 1; // Handle positive tens group - if (nextRangeEnd > rangeEnd) { - nextRangeEnd = rangeEnd; + if (range_end >= 0) { + range_start = std::max(range_start, 0); + while (range_start <= range_end) { + int next_range_end = (range_start / 10 + 1) * 10 - 1; // Handle positive tens group + if (next_range_end > range_end) { + next_range_end = range_end; } - positiveParts.push_back(generateGroup(rangeStart, nextRangeEnd)); - rangeStart = nextRangeEnd + 1; + positive_parts.push_back(generate_group(range_start, next_range_end)); + range_start = next_range_end + 1; } } } else if (start) { if (start.value() < 0) { - negativeParts.push_back("-" + std::to_string(-start.value()) + "\\d*"); + negative_parts.push_back("-" + std::to_string(-start.value()) + "\\d*"); } else { - positiveParts.push_back(std::to_string(start.value()) + "\\d*"); + positive_parts.push_back(std::to_string(start.value()) + "\\d*"); } } else if (end) { if (end.value() < 0) { - negativeParts.push_back("-" + std::to_string(-end.value())); + negative_parts.push_back("-" + std::to_string(-end.value())); } else { - positiveParts.push_back(std::to_string(end.value())); + positive_parts.push_back(std::to_string(end.value())); } } std::ostringstream result; result << "^("; - if (!negativeParts.empty()) { + if (!negative_parts.empty()) { result << "("; - for (size_t i = 0; i < negativeParts.size(); ++i) { + for (size_t i = 0; i < negative_parts.size(); ++i) { if (i > 0) { result << "|"; } - result << negativeParts[i]; + result << negative_parts[i]; } result << ")"; - if (!positiveParts.empty()) { + if (!positive_parts.empty()) { result << "|"; } } - if (!positiveParts.empty()) { + if (!positive_parts.empty()) { result << "("; - for (size_t i = 0; i < positiveParts.size(); ++i) { + for (size_t i = 0; i < positive_parts.size(); ++i) { if (i > 0) { result << "|"; } - result << positiveParts[i]; + result << positive_parts[i]; } result << ")"; } @@ -942,8 +973,10 @@ std::string JSONSchemaConverter::VisitArray( result += " \"]\""; - if (could_be_empty) { - result = "(" + result + ") | \"[]\""; + if (allow_empty_ && could_be_empty) { + // result = (result) | [] + auto rest = "\"[\" " + std::string(any_whitespace_ ? "[ \\n\\t]* " : "") + "\"]\""; + result = "(" + result + ") | " + rest; } return result; @@ -958,9 +991,8 @@ std::string JSONSchemaConverter::GetPropertyPattern( // the outer quote is for the string in EBNF grammar, and the inner quote is for // the string in JSON std::string key = "\"\\\"" + prop_name + "\\\"\""; - std::string colon = "\"" + colon_ + "\""; std::string value = CreateRuleFromSchema(prop_schema, rule_name + "_prop_" + std::to_string(idx)); - return key + " " + colon + " " + value; + return key + " " + colon_pattern_ + " " + value; } std::string JSONSchemaConverter::GetOtherPropertyPattern( @@ -969,9 +1001,8 @@ std::string JSONSchemaConverter::GetOtherPropertyPattern( const std::string& rule_name, const std::string& rule_name_suffix ) { - std::string colon = "\"" + colon_ + "\""; std::string value = CreateRuleFromSchema(prop_schema, rule_name + "_" + rule_name_suffix); - return key_pattern + " " + colon + " " + value; + return key_pattern + " " + colon_pattern_ + " " + value; } std::string JSONSchemaConverter::GetPartialRuleForPropertiesAllOptional( @@ -1177,8 +1208,10 @@ std::string JSONSchemaConverter::VisitObject( indentManager_->EndIndent(); result += " \"}\""; - if (could_be_empty) { - result = "(" + result + ") | \"{}\""; + if (allow_empty_ && could_be_empty) { + // result = (result) | {} + auto rest = "\"{\" " + std::string(any_whitespace_ ? "[ \\n\\t]* " : "") + "\"}\""; + result = "(" + result + ") | " + rest; } return result; @@ -1186,6 +1219,7 @@ std::string JSONSchemaConverter::VisitObject( std::string JSONSchemaToEBNF( const std::string& schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode @@ -1194,16 +1228,17 @@ std::string JSONSchemaToEBNF( std::string err = picojson::parse(schema_value, schema); XGRAMMAR_CHECK(err.empty()) << "Failed to parse JSON: " << err << ". The JSON string is:" << schema; - return JSONSchemaToEBNF(schema_value, indent, separators, strict_mode); + return JSONSchemaToEBNF(schema_value, any_whitespace, indent, separators, strict_mode); } std::string JSONSchemaToEBNF( const picojson::value& schema, + bool any_whitespace, std::optional indent, std::optional> separators, bool strict_mode ) { - JSONSchemaConverter converter(schema, indent, separators, strict_mode); + JSONSchemaConverter converter(schema, any_whitespace, indent, separators, strict_mode); return converter.Convert(); } diff --git a/cpp/json_schema_converter.h b/cpp/json_schema_converter.h index 8ad82bbd..e278fb6f 100644 --- a/cpp/json_schema_converter.h +++ b/cpp/json_schema_converter.h @@ -16,20 +16,51 @@ namespace xgrammar { /*! - * \brief Convert a JSON schema string to EBNF grammar string. + * \brief Convert JSON schema string to EBNF grammar string. + * \param schema The JSON schema string. + * \param indent The number of spaces for indentation. If set to std::nullopt, the output will be + * in one line. Default: 2. + * \param separators Two separators used in the schema: comma and colon. Examples: {",", ":"}, + * {", ", ": "}. If std::nullopt, the default separators will be used: {",", ": "} when the + * indent is not -1, and {", ", ": "} otherwise. This follows the convention in python + * json.dumps(). Default: std::nullopt. \param strict_mode Whether to use strict mode. In strict + * mode, the generated grammar will not allow properties and items that is not specified in the + * schema. This is equivalent to setting unevaluatedProperties and unevaluatedItems to false. + * + * This helps LLM to generate accurate output in the grammar-guided generation with JSON + * schema. Default: true. + * \returns The EBNF grammar string. */ std::string JSONSchemaToEBNF( const std::string& schema, - std::optional indent, - std::optional> separators, - bool strict_mode + bool any_whitespace = true, + std::optional indent = std::nullopt, + std::optional> separators = std::nullopt, + bool strict_mode = true ); +/*! + * \brief Convert JSON schema string to EBNF grammar string. + * \param schema The JSON schema object. + * \param indent The number of spaces for indentation. If set to std::nullopt, the output will be + * in one line. Default: 2. + * \param separators Two separators used in the schema: comma and colon. Examples: {",", ":"}, + * {", ", ": "}. If std::nullopt, the default separators will be used: {",", ": "} when the + * indent is not -1, and {", ", ": "} otherwise. This follows the convention in python + * json.dumps(). Default: std::nullopt. \param strict_mode Whether to use strict mode. In strict + * mode, the generated grammar will not allow properties and items that is not specified in the + * schema. This is equivalent to setting unevaluatedProperties and unevaluatedItems to false. + * + * This helps LLM to generate accurate output in the grammar-guided generation with JSON + * schema. Default: true. + * \returns The EBNF grammar string. + */ std::string JSONSchemaToEBNF( const picojson::value& schema, - std::optional indent, - std::optional> separators, - bool strict_mode + bool any_whitespace = true, + std::optional indent = std::nullopt, + std::optional> separators = std::nullopt, + bool strict_mode = true ); } // namespace xgrammar diff --git a/cpp/pybind/pybind.cc b/cpp/pybind/pybind.cc index 82ea867f..e80255f7 100644 --- a/cpp/pybind/pybind.cc +++ b/cpp/pybind/pybind.cc @@ -7,7 +7,8 @@ #include #include -#include "../testing.h" +#include "../json_schema_converter.h" +#include "../regex_converter.h" #include "python_methods.h" namespace py = pybind11; @@ -71,7 +72,16 @@ PYBIND11_MODULE(xgrammar_bindings, m) { .def("_debug_accept_string", &GrammarMatcher::_DebugAcceptString); auto pyTestingModule = m.def_submodule("testing"); - pyTestingModule.def("_json_schema_to_ebnf", &_JSONSchemaToEBNF) - .def("_regex_to_ebnf", &_RegexToEBNF) + pyTestingModule + .def( + "_json_schema_to_ebnf", + py::overload_cast< + const std::string&, + bool, + std::optional, + std::optional>, + bool>(&JSONSchemaToEBNF) + ) + .def("_regex_to_ebnf", &RegexToEBNF) .def("_get_masked_tokens_from_bitmask", &Matcher_DebugGetMaskedTokensFromBitmask); } diff --git a/cpp/testing.cc b/cpp/testing.cc deleted file mode 100644 index 85740f6d..00000000 --- a/cpp/testing.cc +++ /dev/null @@ -1,26 +0,0 @@ -/*! - * Copyright (c) 2024 by Contributors - * \file xgrammar/testing.cc - */ -#include "testing.h" - -#include "grammar_parser.h" -#include "json_schema_converter.h" -#include "regex_converter.h" - -namespace xgrammar { - -std::string _JSONSchemaToEBNF( - const std::string& schema, - std::optional indent, - std::optional> separators, - bool strict_mode -) { - return JSONSchemaToEBNF(schema, indent, separators, strict_mode); -} - -std::string _RegexToEBNF(const std::string& regex, bool with_rule_name) { - return RegexToEBNF(regex, with_rule_name); -} - -} // namespace xgrammar diff --git a/cpp/testing.h b/cpp/testing.h deleted file mode 100644 index 4a0a9a00..00000000 --- a/cpp/testing.h +++ /dev/null @@ -1,48 +0,0 @@ -/*! - * Copyright (c) 2024 by Contributors - * \file xgrammar/xgrammar.h - * \brief The header for the support of grammar-guided generation. - */ - -#ifndef XGRAMMAR_TESTING_H_ -#define XGRAMMAR_TESTING_H_ - -#include - -#include -#include -#include - -namespace xgrammar { - -/*! - * \brief Convert JSON schema string to EBNF grammar string. - * \param json_schema The JSON schema string. - * \param indent The number of spaces for indentation. If set to std::nullopt, the output will be - * in one line. Default: 2. - * \param separators Two separators used in the schema: comma and colon. Examples: {",", ":"}, - * {", ", ": "}. If std::nullopt, the default separators will be used: {",", ": "} when the - * indent is not -1, and {", ", ": "} otherwise. This follows the convention in python - * json.dumps(). Default: std::nullopt. \param strict_mode Whether to use strict mode. In strict - * mode, the generated grammar will not allow properties and items that is not specified in the - * schema. This is equivalent to setting unevaluatedProperties and unevaluatedItems to false. - * - * This helps LLM to generate accurate output in the grammar-guided generation with JSON - * schema. Default: true. - * \returns The EBNF grammar string. - */ -std::string _JSONSchemaToEBNF( - const std::string& schema, - std::optional indent = std::nullopt, - std::optional> separators = std::nullopt, - bool strict_mode = true -); - -/*! - * \brief Convert a regex string to EBNF grammar string. - */ -std::string _RegexToEBNF(const std::string& regex, bool with_rule_name = true); - -} // namespace xgrammar - -#endif // XGRAMMAR_TESTING_H_ diff --git a/include/xgrammar/compiler.h b/include/xgrammar/compiler.h index 5f69b2f6..b2e352ba 100644 --- a/include/xgrammar/compiler.h +++ b/include/xgrammar/compiler.h @@ -49,6 +49,7 @@ class GrammarCompiler { /*! \brief Get the compiled grammar for a JSON schema string. */ CompiledGrammar CompileJSONSchema( const std::string& schema, + bool any_whitespace = true, std::optional indent = std::nullopt, std::optional> separators = std::nullopt, bool strict_mode = true diff --git a/include/xgrammar/grammar.h b/include/xgrammar/grammar.h index dfd4d628..8f458ed8 100644 --- a/include/xgrammar/grammar.h +++ b/include/xgrammar/grammar.h @@ -97,6 +97,7 @@ class Grammar { */ static Grammar FromJSONSchema( const std::string& schema, + bool any_whitespace = true, std::optional indent = std::nullopt, std::optional> separators = std::nullopt, bool strict_mode = true diff --git a/python/xgrammar/compiler.py b/python/xgrammar/compiler.py index 1024a4fa..a7589610 100644 --- a/python/xgrammar/compiler.py +++ b/python/xgrammar/compiler.py @@ -70,6 +70,7 @@ def compile_json_schema( self, schema: Union[str, Type[BaseModel]], *, + any_whitespace: bool = True, indent: Optional[int] = None, separators: Optional[Tuple[str, str]] = None, strict_mode: bool = True, @@ -104,7 +105,9 @@ def compile_json_schema( schema = json.dumps(schema.model_json_schema()) return CompiledGrammar._create_from_handle( - self._handle.compile_json_schema(schema, indent, separators, strict_mode) + self._handle.compile_json_schema( + schema, any_whitespace, indent, separators, strict_mode + ) ) def compile_builtin_json_grammar(self) -> CompiledGrammar: diff --git a/python/xgrammar/grammar.py b/python/xgrammar/grammar.py index f480c39b..21e875c3 100644 --- a/python/xgrammar/grammar.py +++ b/python/xgrammar/grammar.py @@ -49,6 +49,7 @@ def from_ebnf(ebnf_string: str, *, root_rule_name: str = "root") -> "Grammar": def from_json_schema( schema: Union[str, Type[BaseModel]], *, + any_whitespace: bool = True, indent: Optional[int] = None, separators: Optional[Tuple[str, str]] = None, strict_mode: bool = True, @@ -56,9 +57,9 @@ def from_json_schema( """Construct a grammar from JSON schema. Pydantic model or JSON schema string can be used to specify the schema. - The format of the JSON schema can be specified with the `indent` and `separators` - parameters. The meaning and the default values of the parameters follows the convention in - json.dumps(). + It allows any whitespace by default. If user want to specify the format of the JSON, + set `any_whitespace` to False and use the `indent` and `separators` parameters. The + meaning and the default values of the parameters follows the convention in json.dumps(). It internally converts the JSON schema to a EBNF grammar. @@ -67,9 +68,20 @@ def from_json_schema( schema : Union[str, Type[BaseModel]] The schema string or Pydantic model. + any_whitespace : bool, default: True + Whether to use any whitespace. If True, the generated grammar will ignore the + indent and separators parameters, and allow any whitespace. + indent : Optional[int], default: None The number of spaces for indentation. If None, the output will be in one line. + Note that specifying the indentation means forcing the LLM to generate JSON strings + strictly formatted. However, some models may tend to generate JSON strings that + are not strictly formatted. In this case, forcing the LLM to generate strictly + formatted JSON strings may degrade the generation quality. See + for more + details. + separators : Optional[Tuple[str, str]], default: None Two separators used in the schema: comma and colon. Examples: (",", ":"), (", ", ": "). If None, the default separators will be used: (",", ": ") when the indent is not None, @@ -78,7 +90,8 @@ def from_json_schema( strict_mode : bool, default: True Whether to use strict mode. In strict mode, the generated grammar will not allow properties and items that is not specified in the schema. This is equivalent to - setting unevaluatedProperties and unevaluatedItems to false. + setting unevaluatedProperties and unevaluatedItems to false. It also disallows empty + JSON objects and arrays. This helps LLM to generate accurate output in the grammar-guided generation with JSON schema. @@ -89,10 +102,19 @@ def from_json_schema( The constructed grammar. """ if isinstance(schema, type) and issubclass(schema, BaseModel): - schema = json.dumps(schema.model_json_schema()) + if hasattr(schema, "model_json_schema"): + # pydantic 2.x + schema = json.dumps(schema.model_json_schema()) + elif hasattr(schema, "schema_json"): + # pydantic 1.x + schema = json.dumps(schema.schema_json()) + else: + raise ValueError( + "The schema should have a model_json_schema or json_schema method." + ) return Grammar._create_from_handle( - _core.Grammar.from_json_schema(schema, indent, separators, strict_mode), + _core.Grammar.from_json_schema(schema, any_whitespace, indent, separators, strict_mode), ) @staticmethod diff --git a/python/xgrammar/testing.py b/python/xgrammar/testing.py index a10e2e6f..2db103f1 100644 --- a/python/xgrammar/testing.py +++ b/python/xgrammar/testing.py @@ -14,6 +14,7 @@ def _json_schema_to_ebnf( schema: str, *, + any_whitespace: bool = True, indent: Optional[int] = None, separators: Optional[Tuple[str, str]] = None, strict_mode: bool = True, @@ -48,6 +49,7 @@ def _json_schema_to_ebnf( """ return _core.testing._json_schema_to_ebnf( schema, + any_whitespace, indent, separators, strict_mode, diff --git a/tests/python/test_grammar_compiler.py b/tests/python/test_grammar_compiler.py index 82c84ac2..3703cbb1 100644 --- a/tests/python/test_grammar_compiler.py +++ b/tests/python/test_grammar_compiler.py @@ -106,12 +106,12 @@ class MainModel(BaseModel): nested_object_field={"foo": {"bar": 42}}, ) - def check_with_fmt(indent, separators, test_id): + def check_with_fmt(any_whitespace, indent, separators, test_id): instance_str = instance.model_dump_json(indent=indent, round_trip=True) time_start = time.monotonic_ns() compiled_grammar = grammar_compiler.compile_json_schema( - MainModel, indent=indent, separators=separators + MainModel, any_whitespace=any_whitespace, indent=indent, separators=separators ) time_end = time.monotonic_ns() print(f"Time to get compiled grammar {test_id}: {(time_end - time_start) / 1e3} us") @@ -121,14 +121,19 @@ def check_with_fmt(indent, separators, test_id): assert matcher._debug_accept_string(instance_str) assert matcher.is_terminated() - check_with_fmt(None, (",", ":"), "1") - check_with_fmt(None, (",", ":"), "2") - check_with_fmt(2, None, "3") - check_with_fmt(2, (",", ": "), "4") + check_with_fmt(False, None, (",", ":"), "1") + check_with_fmt(False, None, (",", ":"), "2") + check_with_fmt(False, 2, None, "3") + check_with_fmt(False, 2, (",", ": "), "4") + + check_with_fmt(True, None, (",", ":"), "5") + check_with_fmt(True, None, (",", ":"), "6") + check_with_fmt(True, 2, None, "7") + check_with_fmt(True, 2, (",", ": "), "8") grammar_compiler.clear_cache() - check_with_fmt(None, (",", ":"), "5") + check_with_fmt(False, None, (",", ":"), "9") schema_instances = [ diff --git a/tests/python/test_json_schema_converter.py b/tests/python/test_json_schema_converter.py index 24e2a40b..4466874a 100644 --- a/tests/python/test_json_schema_converter.py +++ b/tests/python/test_json_schema_converter.py @@ -1,7 +1,7 @@ import json import sys from enum import Enum -from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union import pytest from pydantic import BaseModel, Field, TypeAdapter, WithJsonSchema, create_model @@ -13,6 +13,7 @@ def check_schema_with_grammar( schema: Dict[str, Any], expected_grammar_ebnf: str, + any_whitespace: bool = True, indent: Optional[int] = None, separators: Optional[Tuple[str, str]] = None, strict_mode: bool = True, @@ -20,6 +21,7 @@ def check_schema_with_grammar( schema_str = json.dumps(schema) json_schema_ebnf = _json_schema_to_ebnf( schema_str, + any_whitespace=any_whitespace, indent=indent, separators=separators, strict_mode=strict_mode, @@ -30,19 +32,21 @@ def check_schema_with_grammar( def check_schema_with_json( schema: Dict[str, Any], json_str: str, - check_accepted: bool = True, + is_accepted: bool = True, + any_whitespace: bool = True, indent: Optional[int] = None, separators: Optional[Tuple[str, str]] = None, strict_mode: bool = True, ): json_schema_grammar = xgr.Grammar.from_json_schema( json.dumps(schema), + any_whitespace=any_whitespace, indent=indent, separators=separators, strict_mode=strict_mode, ) - if check_accepted: + if is_accepted: assert _match_grammar_with_string(json_schema_grammar, json_str) else: assert not _match_grammar_with_string(json_schema_grammar, json_str) @@ -51,14 +55,17 @@ def check_schema_with_json( def check_schema_with_instance( schema: Dict[str, Any], instance: BaseModel, - check_accepted: bool = True, + is_accepted: bool = True, + any_whitespace: bool = True, indent: Optional[int] = None, separators: Optional[Tuple[str, str]] = None, strict_mode: bool = True, ): instance_obj = instance.model_dump(mode="json", round_trip=True) instance_str = json.dumps(instance_obj, indent=indent, separators=separators) - check_schema_with_json(schema, instance_str, check_accepted, indent, separators, strict_mode) + check_schema_with_json( + schema, instance_str, is_accepted, any_whitespace, indent, separators, strict_mode + ) def test_basic() -> None: @@ -80,20 +87,20 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" -root_prop_3 ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -root_prop_4 ::= ("[" "" basic_string (", " basic_string)* "" "]") | "[]" -root_prop_5_item_2 ::= ("[" "" basic_string (", " basic_string)* "" "]") | "[]" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" +root_prop_3 ::= "[" "" basic_any (", " basic_any)* "" "]" +root_prop_4 ::= "[" "" basic_string (", " basic_string)* "" "]" +root_prop_5_item_2 ::= "[" "" basic_string (", " basic_string)* "" "]" root_prop_5 ::= "[" "" basic_string ", " basic_integer ", " root_prop_5_item_2 "" "]" -root_prop_6 ::= ("{" "" basic_string ": " basic_integer (", " basic_string ": " basic_integer)* "" "}") | "{}" -root_prop_7_addl ::= ("{" "" basic_string ": " basic_integer (", " basic_string ": " basic_integer)* "" "}") | "{}" -root_prop_7 ::= ("{" "" basic_string ": " root_prop_7_addl (", " basic_string ": " root_prop_7_addl)* "" "}") | "{}" +root_prop_6 ::= "{" "" basic_string ": " basic_integer (", " basic_string ": " basic_integer)* "" "}" +root_prop_7_addl ::= "{" "" basic_string ": " basic_integer (", " basic_string ": " basic_integer)* "" "}" +root_prop_7 ::= "{" "" basic_string ": " root_prop_7_addl (", " basic_string ": " root_prop_7_addl)* "" "}" root ::= "{" "" "\"integer_field\"" ": " basic_integer ", " "\"number_field\"" ": " basic_number ", " "\"boolean_field\"" ": " basic_boolean ", " "\"any_array_field\"" ": " root_prop_3 ", " "\"array_field\"" ": " root_prop_4 ", " "\"tuple_field\"" ": " root_prop_5 ", " "\"object_field\"" ": " root_prop_6 ", " "\"nested_object_field\"" ": " root_prop_7 "" "}" """ schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False) instance = MainModel( integer_field=42, @@ -105,7 +112,7 @@ class MainModel(BaseModel): object_field={"foo": 42, "bar": 43}, nested_object_field={"foo": {"bar": 42}}, ) - check_schema_with_instance(schema, instance) + check_schema_with_instance(schema, instance, any_whitespace=False) instance_empty = MainModel( integer_field=42, @@ -119,7 +126,7 @@ class MainModel(BaseModel): ) schema = MainModel.model_json_schema() - check_schema_with_instance(schema, instance_empty) + check_schema_with_instance(schema, instance_empty, is_accepted=False, any_whitespace=False) def test_indent() -> None: @@ -136,12 +143,12 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any ("," basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any ("," basic_string ": " basic_any)* "" "}") | "{}" -root_prop_0 ::= ("[" "\n " basic_string (",\n " basic_string)* "\n " "]") | "[]" -root_prop_1_item_2 ::= ("[" "\n " basic_string (",\n " basic_string)* "\n " "]") | "[]" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" +root_prop_0 ::= "[" "\n " basic_string (",\n " basic_string)* "\n " "]" +root_prop_1_item_2 ::= "[" "\n " basic_string (",\n " basic_string)* "\n " "]" root_prop_1 ::= "[" "\n " basic_string ",\n " basic_integer ",\n " root_prop_1_item_2 "\n " "]" -root_prop_2 ::= ("{" "\n " basic_string ": " basic_integer (",\n " basic_string ": " basic_integer)* "\n " "}") | "{}" +root_prop_2 ::= "{" "\n " basic_string ": " basic_integer (",\n " basic_string ": " basic_integer)* "\n " "}" root ::= "{" "\n " "\"array_field\"" ": " root_prop_0 ",\n " "\"tuple_field\"" ": " root_prop_1 ",\n " "\"object_field\"" ": " root_prop_2 "\n" "}" """ @@ -152,9 +159,11 @@ class MainModel(BaseModel): ) schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar, indent=2) - check_schema_with_instance(schema, instance, indent=2) - check_schema_with_instance(schema, instance, indent=None, separators=(",", ":")) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False, indent=2) + check_schema_with_instance(schema, instance, any_whitespace=False, indent=2) + check_schema_with_instance( + schema, instance, any_whitespace=False, indent=None, separators=(",", ":") + ) def test_non_strict() -> None: @@ -164,6 +173,8 @@ class Foo(BaseModel): class MainModel(BaseModel): tuple_field: Tuple[str, Tuple[int, int]] foo_field: Foo + list_field: List[str] + object_field: Dict[str, Any] ebnf_grammar = r"""basic_escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] basic_string_sub ::= ("\"" | [^"\\\r\n] basic_string_sub | "\\" basic_escape basic_string_sub) (= [ \n\t]* [,}\]:]) @@ -173,15 +184,16 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any ("," basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any ("," basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[" "]" +basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{" "}" root_prop_0_item_1 ::= "[" "\n " basic_integer ",\n " basic_integer (",\n " basic_any)* "\n " "]" root_prop_0 ::= "[" "\n " basic_string ",\n " root_prop_0_item_1 (",\n " basic_any)* "\n " "]" -root_prop_1 ::= ("{" "\n " basic_string ": " basic_any (",\n " basic_string ": " basic_any)* "\n " "}") | "{}" -root ::= "{" "\n " "\"tuple_field\"" ": " root_prop_0 ",\n " "\"foo_field\"" ": " root_prop_1 (",\n " basic_string ": " basic_any)* "\n" "}" +root_prop_1 ::= ("{" "\n " basic_string ": " basic_any (",\n " basic_string ": " basic_any)* "\n " "}") | "{" "}" +root_prop_2 ::= ("[" "\n " basic_string (",\n " basic_string)* "\n " "]") | "[" "]" +root ::= "{" "\n " "\"tuple_field\"" ": " root_prop_0 ",\n " "\"foo_field\"" ": " root_prop_1 ",\n " "\"list_field\"" ": " root_prop_2 ",\n " "\"object_field\"" ": " basic_object (",\n " basic_string ": " basic_any)* "\n" "}" """ - instance_json = """{ + instance_json = r"""{ "tuple_field": [ "foo", [ @@ -194,12 +206,16 @@ class MainModel(BaseModel): "foo_field": { "tmp": "str" }, + "list_field": [], + "object_field": {}, "extra": "field" }""" schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar, indent=2, strict_mode=False) - check_schema_with_json(schema, instance_json, indent=2, strict_mode=False) + check_schema_with_grammar( + schema, ebnf_grammar, any_whitespace=False, indent=2, strict_mode=False + ) + check_schema_with_json(schema, instance_json, any_whitespace=False, indent=2, strict_mode=False) def test_enum_const() -> None: @@ -222,8 +238,8 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root_prop_0 ::= "\"a\"" root_prop_1 ::= "\"a\\n\\r\\\"\"" root_prop_2 ::= ("\"a\"") | ("\"b\"") | ("\"c\"") @@ -234,8 +250,8 @@ class MainModel(BaseModel): schema = MainModel.model_json_schema() instance = MainModel(foo="a", values=1, bars="a", str_values='a\n\r"', field=Field.FOO) - check_schema_with_grammar(schema, ebnf_grammar) - check_schema_with_instance(schema, instance) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False) + check_schema_with_instance(schema, instance, any_whitespace=False) def test_optional() -> None: @@ -253,25 +269,25 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root_prop_1 ::= basic_boolean | basic_null root_prop_2 ::= basic_number | basic_null root ::= "{" "" ("\"num\"" ": " basic_integer ", ")? ("\"opt_bool\"" ": " root_prop_1 ", ")? "\"size\"" ": " root_prop_2 (", " "\"name\"" ": " basic_string)? "" "}" """ schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False) instance = MainModel(num=42, opt_bool=True, size=3.14, name="foo") - check_schema_with_instance(schema, instance) + check_schema_with_instance(schema, instance, any_whitespace=False) instance = MainModel(size=None) - check_schema_with_instance(schema, instance) + check_schema_with_instance(schema, instance, any_whitespace=False) - check_schema_with_json(schema, '{"size": null}') - check_schema_with_json(schema, '{"size": null, "name": "foo"}') - check_schema_with_json(schema, '{"num": 1, "size": null, "name": "foo"}') + check_schema_with_json(schema, '{"size": null}', any_whitespace=False) + check_schema_with_json(schema, '{"size": null, "name": "foo"}', any_whitespace=False) + check_schema_with_json(schema, '{"num": 1, "size": null, "name": "foo"}', any_whitespace=False) def test_all_optional() -> None: @@ -288,21 +304,21 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root_part_1 ::= "" | ", " "\"num\"" ": " basic_number "" root_part_0 ::= root_part_1 | ", " "\"state\"" ": " basic_boolean root_part_1 -root ::= ("{" "" (("\"size\"" ": " basic_integer root_part_0) | ("\"state\"" ": " basic_boolean root_part_1) | ("\"num\"" ": " basic_number "")) "" "}") | "{}" +root ::= "{" "" (("\"size\"" ": " basic_integer root_part_0) | ("\"state\"" ": " basic_boolean root_part_1) | ("\"num\"" ": " basic_number "")) "" "}" """ schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False) instance = MainModel(size=42, state=True, num=3.14) - check_schema_with_instance(schema, instance) + check_schema_with_instance(schema, instance, any_whitespace=False) - check_schema_with_json(schema, '{"state": false}') - check_schema_with_json(schema, '{"size": 1, "num": 1.5}') + check_schema_with_json(schema, '{"state": false}', any_whitespace=False) + check_schema_with_json(schema, '{"size": 1, "num": 1.5}', any_whitespace=False) ebnf_grammar_non_strict = r"""basic_escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] basic_string_sub ::= ("\"" | [^"\\\r\n] basic_string_sub | "\\" basic_escape basic_string_sub) (= [ \n\t]* [,}\]:]) @@ -312,18 +328,22 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[" "]" +basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{" "}" root_part_2 ::= (", " basic_string ": " basic_any)* root_part_1 ::= root_part_2 | ", " "\"num\"" ": " basic_number root_part_2 root_part_0 ::= root_part_1 | ", " "\"state\"" ": " basic_boolean root_part_1 -root ::= ("{" "" (("\"size\"" ": " basic_integer root_part_0) | ("\"state\"" ": " basic_boolean root_part_1) | ("\"num\"" ": " basic_number root_part_2) | basic_string ": " basic_any root_part_2) "" "}") | "{}" +root ::= ("{" "" (("\"size\"" ": " basic_integer root_part_0) | ("\"state\"" ": " basic_boolean root_part_1) | ("\"num\"" ": " basic_number root_part_2) | basic_string ": " basic_any root_part_2) "" "}") | "{" "}" """ - check_schema_with_grammar(schema, ebnf_grammar_non_strict, strict_mode=False) + check_schema_with_grammar( + schema, ebnf_grammar_non_strict, any_whitespace=False, strict_mode=False + ) - check_schema_with_json(schema, '{"size": 1, "num": 1.5, "other": false}', strict_mode=False) - check_schema_with_json(schema, '{"other": false}', strict_mode=False) + check_schema_with_json( + schema, '{"size": 1, "num": 1.5, "other": false}', any_whitespace=False, strict_mode=False + ) + check_schema_with_json(schema, '{"other": false}', any_whitespace=False, strict_mode=False) def test_empty() -> None: @@ -338,18 +358,18 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root ::= "{" "}" """ schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False) instance = MainModel() - check_schema_with_instance(schema, instance) + check_schema_with_instance(schema, instance, any_whitespace=False) - check_schema_with_json(schema, '{"tmp": 123}', strict_mode=False) + check_schema_with_json(schema, '{"tmp": 123}', any_whitespace=False, strict_mode=False) def test_reference() -> None: @@ -378,19 +398,19 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root_prop_0_prop_1 ::= basic_number | basic_null root_prop_0 ::= "{" "" "\"count\"" ": " basic_integer (", " "\"size\"" ": " root_prop_0_prop_1)? "" "}" root_prop_1_items_part_0 ::= "" | ", " "\"banana\"" ": " basic_string "" -root_prop_1_items ::= ("{" "" (("\"apple\"" ": " basic_string root_prop_1_items_part_0) | ("\"banana\"" ": " basic_string "")) "" "}") | "{}" -root_prop_1 ::= ("[" "" root_prop_1_items (", " root_prop_1_items)* "" "]") | "[]" +root_prop_1_items ::= "{" "" (("\"apple\"" ": " basic_string root_prop_1_items_part_0) | ("\"banana\"" ": " basic_string "")) "" "}" +root_prop_1 ::= "[" "" root_prop_1_items (", " root_prop_1_items)* "" "]" root ::= "{" "" "\"foo\"" ": " root_prop_0 ", " "\"bars\"" ": " root_prop_1 "" "}" """ schema = MainModel.model_json_schema() - check_schema_with_grammar(schema, ebnf_grammar) - check_schema_with_instance(schema, instance) + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=False) + check_schema_with_instance(schema, instance, any_whitespace=False) def test_union() -> None: @@ -414,18 +434,22 @@ class Dog(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root_case_0 ::= "{" "" "\"name\"" ": " basic_string ", " "\"color\"" ": " basic_string "" "}" root_case_1 ::= "{" "" "\"name\"" ": " basic_string ", " "\"breed\"" ": " basic_string "" "}" root ::= root_case_0 | root_case_1 """ - check_schema_with_grammar(model_schema, ebnf_grammar) + check_schema_with_grammar(model_schema, ebnf_grammar, any_whitespace=False) - check_schema_with_instance(model_schema, Cat(name="kitty", color="black")) - check_schema_with_instance(model_schema, Dog(name="doggy", breed="bulldog")) - check_schema_with_json(model_schema, '{"name": "kitty", "test": "black"}', False) + check_schema_with_instance(model_schema, Cat(name="kitty", color="black"), any_whitespace=False) + check_schema_with_instance( + model_schema, Dog(name="doggy", breed="bulldog"), any_whitespace=False + ) + check_schema_with_json( + model_schema, '{"name": "kitty", "test": "black"}', False, any_whitespace=False + ) def test_alias() -> None: @@ -440,19 +464,23 @@ class MainModel(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root ::= "{" "" "\"name\"" ": " basic_string "" "}" """ - check_schema_with_grammar(MainModel.model_json_schema(), ebnf_grammar) + check_schema_with_grammar(MainModel.model_json_schema(), ebnf_grammar, any_whitespace=False) instance = MainModel(name="kitty") instance_str = json.dumps(instance.model_dump(mode="json", round_trip=True, by_alias=False)) - check_schema_with_json(MainModel.model_json_schema(by_alias=False), instance_str) + check_schema_with_json( + MainModel.model_json_schema(by_alias=False), instance_str, any_whitespace=False + ) instance_str = json.dumps(instance.model_dump(mode="json", round_trip=True, by_alias=True)) - check_schema_with_json(MainModel.model_json_schema(by_alias=True), instance_str) + check_schema_with_json( + MainModel.model_json_schema(by_alias=True), instance_str, any_whitespace=False + ) # property name contains space class MainModelSpace(BaseModel): @@ -466,19 +494,23 @@ class MainModelSpace(BaseModel): basic_string ::= ["] basic_string_sub basic_boolean ::= "true" | "false" basic_null ::= "null" -basic_array ::= ("[" "" basic_any (", " basic_any)* "" "]") | "[]" -basic_object ::= ("{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}") | "{}" +basic_array ::= "[" "" basic_any (", " basic_any)* "" "]" +basic_object ::= "{" "" basic_string ": " basic_any (", " basic_string ": " basic_any)* "" "}" root_prop_0 ::= "\"abc\"" root ::= "{" "" "\"name 1\"" ": " root_prop_0 "" "}" """ - check_schema_with_grammar(MainModelSpace.model_json_schema(), ebnf_grammar_space) + check_schema_with_grammar( + MainModelSpace.model_json_schema(), ebnf_grammar_space, any_whitespace=False + ) instance_space = MainModelSpace(**{"name 1": "abc"}) instance_space_str = json.dumps( instance_space.model_dump(mode="json", round_trip=True, by_alias=True), ) - check_schema_with_json(MainModelSpace.model_json_schema(by_alias=True), instance_space_str) + check_schema_with_json( + MainModelSpace.model_json_schema(by_alias=True), instance_space_str, any_whitespace=False + ) def test_restricted_string() -> None: @@ -487,33 +519,40 @@ class MainModel(BaseModel): instance = MainModel(restricted_string="a") instance_str = json.dumps(instance.model_dump(mode="json")) - check_schema_with_json(MainModel.model_json_schema(), instance_str) + check_schema_with_json(MainModel.model_json_schema(), instance_str, any_whitespace=False) check_schema_with_json( - MainModel.model_json_schema(), '{"restricted_string": "j"}', check_accepted=False + MainModel.model_json_schema(), + '{"restricted_string": "j"}', + is_accepted=False, + any_whitespace=False, ) def test_complex_restrictions() -> None: - string_without_quotes = Annotated[str, WithJsonSchema({"type": "string", "pattern": r"[^\"]*"})] - class RestrictedModel(BaseModel): - restricted_string: string_without_quotes + restricted_string: Annotated[str, WithJsonSchema({"type": "string", "pattern": r"[^\"]*"})] restricted_value: Annotated[int, Field(strict=True, ge=0, lt=44)] # working instance instance = RestrictedModel(restricted_string="abd", restricted_value=42) instance_str = json.dumps(instance.model_dump(mode="json")) - check_schema_with_json(RestrictedModel.model_json_schema(), instance_str) + check_schema_with_json(RestrictedModel.model_json_schema(), instance_str, any_whitespace=False) instance_err = RestrictedModel(restricted_string='"', restricted_value=42) instance_str = json.dumps(instance_err.model_dump(mode="json")) - check_schema_with_json(RestrictedModel.model_json_schema(), instance_str, check_accepted=False) + check_schema_with_json( + RestrictedModel.model_json_schema(), + instance_str, + is_accepted=False, + any_whitespace=False, + ) check_schema_with_json( RestrictedModel.model_json_schema(), '{"restricted_string": "j", "restricted_value": 45}', - check_accepted=False, + is_accepted=False, + any_whitespace=False, ) @@ -521,16 +560,72 @@ def test_dynamic_model() -> None: class MainModel(BaseModel): restricted_string: Annotated[str, WithJsonSchema({"type": "string", "pattern": r"[a-f]"})] - additional_fields = {} - additional_fields["restricted_string_dynamic"] = ( - Annotated[str, WithJsonSchema({"type": "string", "pattern": r"[a-x]"})], - ..., - ) + additional_fields = { + "restricted_string_dynamic": ( + Annotated[str, WithJsonSchema({"type": "string", "pattern": r"[a-x]"})], + ..., + ) + } - CompleteModel = create_model("CompleteModel", **additional_fields) + CompleteModel: Type[BaseModel] = create_model( + "CompleteModel", __base__=MainModel, **additional_fields + ) instance = CompleteModel(restricted_string="a", restricted_string_dynamic="j") instance_str = json.dumps(instance.model_dump(mode="json")) - check_schema_with_json(CompleteModel.model_json_schema(), instance_str) + check_schema_with_json(CompleteModel.model_json_schema(), instance_str, any_whitespace=False) + + +def test_any_whitespace() -> None: + class SimpleModel(BaseModel): + value: str + arr: List[int] + obj: Dict[str, int] + + schema = SimpleModel.model_json_schema() + + ebnf_grammar = r"""basic_escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] +basic_string_sub ::= ("\"" | [^"\\\r\n] basic_string_sub | "\\" basic_escape basic_string_sub) (= [ \n\t]* [,}\]:]) +basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object +basic_integer ::= ("0" | "-"? [1-9] [0-9]*) +basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)? +basic_string ::= ["] basic_string_sub +basic_boolean ::= "true" | "false" +basic_null ::= "null" +basic_array ::= "[" [ \n\t]* basic_any ([ \n\t]* "," [ \n\t]* basic_any)* [ \n\t]* "]" +basic_object ::= "{" [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_any ([ \n\t]* "," [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_any)* [ \n\t]* "}" +root_prop_1 ::= "[" [ \n\t]* basic_integer ([ \n\t]* "," [ \n\t]* basic_integer)* [ \n\t]* "]" +root_prop_2 ::= "{" [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_integer ([ \n\t]* "," [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_integer)* [ \n\t]* "}" +root ::= "{" [ \n\t]* "\"value\"" [ \n\t]* ":" [ \n\t]* basic_string [ \n\t]* "," [ \n\t]* "\"arr\"" [ \n\t]* ":" [ \n\t]* root_prop_1 [ \n\t]* "," [ \n\t]* "\"obj\"" [ \n\t]* ":" [ \n\t]* root_prop_2 [ \n\t]* "}" +""" + + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=True, strict_mode=True) + + ebnf_grammar = r"""basic_escape ::= ["\\/bfnrt] | "u" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] +basic_string_sub ::= ("\"" | [^"\\\r\n] basic_string_sub | "\\" basic_escape basic_string_sub) (= [ \n\t]* [,}\]:]) +basic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object +basic_integer ::= ("0" | "-"? [1-9] [0-9]*) +basic_number ::= ("0" | "-"? [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)? +basic_string ::= ["] basic_string_sub +basic_boolean ::= "true" | "false" +basic_null ::= "null" +basic_array ::= ("[" [ \n\t]* basic_any ([ \n\t]* "," [ \n\t]* basic_any)* [ \n\t]* "]") | "[" [ \n\t]* "]" +basic_object ::= ("{" [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_any ([ \n\t]* "," [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_any)* [ \n\t]* "}") | "{" [ \n\t]* "}" +root_prop_1 ::= ("[" [ \n\t]* basic_integer ([ \n\t]* "," [ \n\t]* basic_integer)* [ \n\t]* "]") | "[" [ \n\t]* "]" +root_prop_2 ::= ("{" [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_integer ([ \n\t]* "," [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_integer)* [ \n\t]* "}") | "{" [ \n\t]* "}" +root ::= "{" [ \n\t]* "\"value\"" [ \n\t]* ":" [ \n\t]* basic_string [ \n\t]* "," [ \n\t]* "\"arr\"" [ \n\t]* ":" [ \n\t]* root_prop_1 [ \n\t]* "," [ \n\t]* "\"obj\"" [ \n\t]* ":" [ \n\t]* root_prop_2 ([ \n\t]* "," [ \n\t]* basic_string [ \n\t]* ":" [ \n\t]* basic_any)* [ \n\t]* "}" +""" + + check_schema_with_grammar(schema, ebnf_grammar, any_whitespace=True, strict_mode=False) + + # Test that different whitespace variations are accepted when any_whitespace=True + instances = [ + '{"value": "test", "arr": [1, 2], "obj": {"a": 1}}', + '{ "value" : "test", "arr": [1, 2], "obj": {"a": 1} }', + '{\n "value" : "test",\n "arr" : [1, 2],\n "obj" : {"a": 1}\n}', + '{\t"value"\t:\t"test",\t"arr":\t[1,\t2],\t"obj":\t{"a":\t1}\t}', + ] + for instance in instances: + check_schema_with_json(schema, instance, any_whitespace=True) if __name__ == "__main__":