From f55e29dfe8182641c28cca99175a5305da887840 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Marj=C3=A1n?= Date: Mon, 16 Jan 2023 11:08:30 +0100 Subject: [PATCH] port performance fixes from go-yaml --- converter.go | 2 + decode.go | 30 ++++++++++ scannerc.go | 155 ++++++++++++++++++++++++++++----------------------- yamlh.go | 10 +++- 4 files changed, 125 insertions(+), 72 deletions(-) diff --git a/converter.go b/converter.go index fc5d3d8..fdb1343 100644 --- a/converter.go +++ b/converter.go @@ -21,6 +21,8 @@ type ConverterConfig struct { // converter in Standard should be good enough. type Converter struct { encodeAsFlow bool + decodeCount int + aliasCount int } // NewConverter creates a new Converter with the given configuration. diff --git a/decode.go b/decode.go index e369ff2..eaa5cc4 100644 --- a/decode.go +++ b/decode.go @@ -70,7 +70,33 @@ func (c *Converter) unmarshalParse(an *valueAnalysis, p *yaml_parser_t) (cty.Val return c.unmarshalParseRemainder(an, &evt, p) } +const ( + // 400,000 decode operations is ~500kb of dense object declarations, or ~5kb of dense object declarations with 10000% alias expansion + alias_ratio_range_low = 400000 + // 4,000,000 decode operations is ~5MB of dense object declarations, or ~4.5MB of dense object declarations with 10% alias expansion + alias_ratio_range_high = 4000000 + // alias_ratio_range is the range over which we scale allowed alias ratios + alias_ratio_range = float64(alias_ratio_range_high - alias_ratio_range_low) +) + +func allowedAliasRatio(decodeCount int) float64 { + switch { + case decodeCount <= alias_ratio_range_low: + // allow 99% to come from alias expansion for small-to-medium documents + return 0.99 + case decodeCount >= alias_ratio_range_high: + // allow 10% to come from alias expansion for very large documents + return 0.10 + default: + // scale smoothly from 99% down to 10% over the range. + // this maps to 396,000 - 400,000 allowed alias-driven decodes over the range. + // 400,000 decode operations is ~100MB of allocations in worst-case scenarios (single-item maps). + return 0.99 - 0.89*(float64(decodeCount-alias_ratio_range_low)/alias_ratio_range) + } +} + func (c *Converter) unmarshalParseRemainder(an *valueAnalysis, evt *yaml_event_t, p *yaml_parser_t) (cty.Value, error) { + c.decodeCount++ switch evt.typ { case yaml_SCALAR_EVENT: return c.unmarshalScalar(an, evt, p) @@ -222,6 +248,10 @@ func (c *Converter) unmarshalSequence(an *valueAnalysis, evt *yaml_event_t, p *y } func (c *Converter) unmarshalAlias(an *valueAnalysis, evt *yaml_event_t, p *yaml_parser_t) (cty.Value, error) { + c.aliasCount++ + if c.aliasCount > 100 && c.decodeCount > 1000 && float64(c.aliasCount)/float64(c.decodeCount) > allowedAliasRatio(c.decodeCount) { + failf("document contains excessive aliasing") + } v, err := an.anchorVal(string(evt.anchor)) if err != nil { err = parseEventErrorWrap(evt, err) diff --git a/scannerc.go b/scannerc.go index 077fd1d..cba6656 100644 --- a/scannerc.go +++ b/scannerc.go @@ -626,30 +626,17 @@ func trace(args ...interface{}) func() { func yaml_parser_fetch_more_tokens(parser *yaml_parser_t) bool { // While we need more tokens to fetch, do it. for { - // Check if we really need to fetch more tokens. - need_more_tokens := false - - if parser.tokens_head == len(parser.tokens) { - // Queue is empty. - need_more_tokens = true - } else { - // Check if any potential simple key may occupy the head position. - if !yaml_parser_stale_simple_keys(parser) { + if parser.tokens_head != len(parser.tokens) { + // If queue is non-empty, check if any potential simple key may + // occupy the head position. + head_tok_idx, ok := parser.simple_keys_by_tok[parser.tokens_parsed] + if !ok { + break + } else if valid, ok := yaml_simple_key_is_valid(parser, &parser.simple_keys[head_tok_idx]); !ok { return false + } else if !valid { + break } - - for i := range parser.simple_keys { - simple_key := &parser.simple_keys[i] - if simple_key.possible && simple_key.token_number == parser.tokens_parsed { - need_more_tokens = true - break - } - } - } - - // We are finished. - if !need_more_tokens { - break } // Fetch the next token. if !yaml_parser_fetch_next_token(parser) { @@ -678,11 +665,6 @@ func yaml_parser_fetch_next_token(parser *yaml_parser_t) bool { return false } - // Remove obsolete potential simple keys. - if !yaml_parser_stale_simple_keys(parser) { - return false - } - // Check the indentation level against the current column. if !yaml_parser_unroll_indent(parser, parser.mark.column) { return false @@ -837,29 +819,30 @@ func yaml_parser_fetch_next_token(parser *yaml_parser_t) bool { "found character that cannot start any token") } -// Check the list of potential simple keys and remove the positions that -// cannot contain simple keys anymore. -func yaml_parser_stale_simple_keys(parser *yaml_parser_t) bool { - // Check for a potential simple key for each flow level. - for i := range parser.simple_keys { - simple_key := &parser.simple_keys[i] - - // The specification requires that a simple key - // - // - is limited to a single line, - // - is shorter than 1024 characters. - if simple_key.possible && (simple_key.mark.line < parser.mark.line || simple_key.mark.index+1024 < parser.mark.index) { - - // Check if the potential simple key to be removed is required. - if simple_key.required { - return yaml_parser_set_scanner_error(parser, - "while scanning a simple key", simple_key.mark, - "could not find expected ':'") - } - simple_key.possible = false +func yaml_simple_key_is_valid(parser *yaml_parser_t, simple_key *yaml_simple_key_t) (valid, ok bool) { + if !simple_key.possible { + return false, true + } + + // The 1.2 specification says: + // + // "If the ? indicator is omitted, parsing needs to see past the + // implicit key to recognize it as such. To limit the amount of + // lookahead required, the “:” indicator must appear at most 1024 + // Unicode characters beyond the start of the key. In addition, the key + // is restricted to a single line." + // + if simple_key.mark.line < parser.mark.line || simple_key.mark.index+1024 < parser.mark.index { + // Check if the potential simple key to be removed is required. + if simple_key.required { + return false, yaml_parser_set_scanner_error(parser, + "while scanning a simple key", simple_key.mark, + "could not find expected ':'") } + simple_key.possible = false + return false, true } - return true + return true, true } // Check if a simple key may start at the current position and add it if @@ -879,6 +862,7 @@ func yaml_parser_save_simple_key(parser *yaml_parser_t) bool { possible: true, required: required, token_number: parser.tokens_parsed + (len(parser.tokens) - parser.tokens_head), + mark: parser.mark, } simple_key.mark = parser.mark @@ -886,6 +870,7 @@ func yaml_parser_save_simple_key(parser *yaml_parser_t) bool { return false } parser.simple_keys[len(parser.simple_keys)-1] = simple_key + parser.simple_keys_by_tok[simple_key.token_number] = len(parser.simple_keys) - 1 } return true } @@ -900,19 +885,33 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool { "while scanning a simple key", parser.simple_keys[i].mark, "could not find expected ':'") } + // Remove the key from the stack. + parser.simple_keys[i].possible = false + delete(parser.simple_keys_by_tok, parser.simple_keys[i].token_number) } - // Remove the key from the stack. - parser.simple_keys[i].possible = false return true } +// max_flow_level limits the flow_level +const max_flow_level = 10000 + // Increase the flow level and resize the simple key list if needed. func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool { // Reset the simple key on the next level. - parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{}) + parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{ + possible: false, + required: false, + token_number: parser.tokens_parsed + (len(parser.tokens) - parser.tokens_head), + mark: parser.mark, + }) // Increase the flow level. parser.flow_level++ + if parser.flow_level > max_flow_level { + return yaml_parser_set_scanner_error(parser, + "while increasing flow level", parser.simple_keys[len(parser.simple_keys)-1].mark, + fmt.Sprintf("exceeded max depth of %d", max_flow_level)) + } return true } @@ -920,11 +919,16 @@ func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool { func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool { if parser.flow_level > 0 { parser.flow_level-- - parser.simple_keys = parser.simple_keys[:len(parser.simple_keys)-1] + last := len(parser.simple_keys) - 1 + delete(parser.simple_keys_by_tok, parser.simple_keys[last].token_number) + parser.simple_keys = parser.simple_keys[:last] } return true } +// max_indents limits the indents stack size +const max_indents = 10000 + // Push the current indentation level to the stack and set the new level // the current column is greater than the indentation level. In this case, // append or insert the specified token into the token queue. @@ -939,6 +943,11 @@ func yaml_parser_roll_indent(parser *yaml_parser_t, column, number int, typ yaml // indentation level. parser.indents = append(parser.indents, parser.indent) parser.indent = column + if len(parser.indents) > max_indents { + return yaml_parser_set_scanner_error(parser, + "while increasing indent level", parser.simple_keys[len(parser.simple_keys)-1].mark, + fmt.Sprintf("exceeded max depth of %d", max_indents)) + } // Create a token and insert it into the queue. token := yaml_token_t{ @@ -989,6 +998,8 @@ func yaml_parser_fetch_stream_start(parser *yaml_parser_t) bool { // Initialize the simple key stack. parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{}) + parser.simple_keys_by_tok = make(map[int]int) + // A simple key is allowed at the beginning of the stream. parser.simple_key_allowed = true @@ -1270,7 +1281,10 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool { simple_key := &parser.simple_keys[len(parser.simple_keys)-1] // Have we found a simple key? - if simple_key.possible { + if valid, ok := yaml_simple_key_is_valid(parser, simple_key); !ok { + return false + + } else if valid { // Create the KEY token and insert it into the queue. token := yaml_token_t{ typ: yaml_KEY_TOKEN, @@ -1288,6 +1302,7 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool { // Remove the simple key. simple_key.possible = false + delete(parser.simple_keys_by_tok, simple_key.token_number) // A simple key cannot follow another simple key. parser.simple_key_allowed = false @@ -1485,11 +1500,11 @@ func yaml_parser_scan_to_next_token(parser *yaml_parser_t) bool { // Scan a YAML-DIRECTIVE or TAG-DIRECTIVE token. // // Scope: -// %YAML 1.1 # a comment \n -// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -// %TAG !yaml! tag:yaml.org,2002: \n -// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // +// %YAML 1.1 # a comment \n +// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +// %TAG !yaml! tag:yaml.org,2002: \n +// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ func yaml_parser_scan_directive(parser *yaml_parser_t, token *yaml_token_t) bool { // Eat '%'. start_mark := parser.mark @@ -1586,11 +1601,11 @@ func yaml_parser_scan_directive(parser *yaml_parser_t, token *yaml_token_t) bool // Scan the directive name. // // Scope: -// %YAML 1.1 # a comment \n -// ^^^^ -// %TAG !yaml! tag:yaml.org,2002: \n -// ^^^ // +// %YAML 1.1 # a comment \n +// ^^^^ +// %TAG !yaml! tag:yaml.org,2002: \n +// ^^^ func yaml_parser_scan_directive_name(parser *yaml_parser_t, start_mark yaml_mark_t, name *[]byte) bool { // Consume the directive name. if parser.unread < 1 && !yaml_parser_update_buffer(parser, 1) { @@ -1625,8 +1640,9 @@ func yaml_parser_scan_directive_name(parser *yaml_parser_t, start_mark yaml_mark // Scan the value of VERSION-DIRECTIVE. // // Scope: -// %YAML 1.1 # a comment \n -// ^^^^^^ +// +// %YAML 1.1 # a comment \n +// ^^^^^^ func yaml_parser_scan_version_directive_value(parser *yaml_parser_t, start_mark yaml_mark_t, major, minor *int8) bool { // Eat whitespaces. if parser.unread < 1 && !yaml_parser_update_buffer(parser, 1) { @@ -1664,10 +1680,11 @@ const max_number_length = 2 // Scan the version number of VERSION-DIRECTIVE. // // Scope: -// %YAML 1.1 # a comment \n -// ^ -// %YAML 1.1 # a comment \n -// ^ +// +// %YAML 1.1 # a comment \n +// ^ +// %YAML 1.1 # a comment \n +// ^ func yaml_parser_scan_version_directive_number(parser *yaml_parser_t, start_mark yaml_mark_t, number *int8) bool { // Repeat while the next character is digit. @@ -1701,9 +1718,9 @@ func yaml_parser_scan_version_directive_number(parser *yaml_parser_t, start_mark // Scan the value of a TAG-DIRECTIVE token. // // Scope: -// %TAG !yaml! tag:yaml.org,2002: \n -// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ // +// %TAG !yaml! tag:yaml.org,2002: \n +// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ func yaml_parser_scan_tag_directive_value(parser *yaml_parser_t, start_mark yaml_mark_t, handle, prefix *[]byte) bool { var handle_value, prefix_value []byte diff --git a/yamlh.go b/yamlh.go index e25cee5..640f9d9 100644 --- a/yamlh.go +++ b/yamlh.go @@ -408,7 +408,9 @@ type yaml_document_t struct { // The number of written bytes should be set to the size_read variable. // // [in,out] data A pointer to an application data specified by -// yaml_parser_set_input(). +// +// yaml_parser_set_input(). +// // [out] buffer The buffer to write the data from the source. // [in] size The size of the buffer. // [out] size_read The actual number of bytes read from the source. @@ -579,6 +581,7 @@ type yaml_parser_t struct { simple_key_allowed bool // May a simple key occur at the current position? simple_keys []yaml_simple_key_t // The stack of simple keys. + simple_keys_by_tok map[int]int // possible simple_key indexes indexed by token_number // Parser stuff @@ -603,13 +606,14 @@ type yaml_parser_t struct { // @a buffer to the output. // // @param[in,out] data A pointer to an application data specified by -// yaml_emitter_set_output(). +// +// yaml_emitter_set_output(). +// // @param[in] buffer The buffer with bytes to be written. // @param[in] size The size of the buffer. // // @returns On success, the handler should return @c 1. If the handler failed, // the returned value should be @c 0. -// type yaml_write_handler_t func(emitter *yaml_emitter_t, buffer []byte) error type yaml_emitter_state_t int