Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

port performance fixes from go-yaml #10

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ type ConverterConfig struct {
// converter in Standard should be good enough.
type Converter struct {
encodeAsFlow bool
decodeCount int
aliasCount int
}

// NewConverter creates a new Converter with the given configuration.
Expand Down
30 changes: 30 additions & 0 deletions decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,33 @@ func (c *Converter) unmarshalParse(an *valueAnalysis, p *yaml_parser_t) (cty.Val
return c.unmarshalParseRemainder(an, &evt, p)
}

const (
// 400,000 decode operations is ~500kb of dense object declarations, or ~5kb of dense object declarations with 10000% alias expansion
alias_ratio_range_low = 400000
// 4,000,000 decode operations is ~5MB of dense object declarations, or ~4.5MB of dense object declarations with 10% alias expansion
alias_ratio_range_high = 4000000
// alias_ratio_range is the range over which we scale allowed alias ratios
alias_ratio_range = float64(alias_ratio_range_high - alias_ratio_range_low)
)

func allowedAliasRatio(decodeCount int) float64 {
switch {
case decodeCount <= alias_ratio_range_low:
// allow 99% to come from alias expansion for small-to-medium documents
return 0.99
case decodeCount >= alias_ratio_range_high:
// allow 10% to come from alias expansion for very large documents
return 0.10
default:
// scale smoothly from 99% down to 10% over the range.
// this maps to 396,000 - 400,000 allowed alias-driven decodes over the range.
// 400,000 decode operations is ~100MB of allocations in worst-case scenarios (single-item maps).
return 0.99 - 0.89*(float64(decodeCount-alias_ratio_range_low)/alias_ratio_range)
}
}

func (c *Converter) unmarshalParseRemainder(an *valueAnalysis, evt *yaml_event_t, p *yaml_parser_t) (cty.Value, error) {
c.decodeCount++
switch evt.typ {
case yaml_SCALAR_EVENT:
return c.unmarshalScalar(an, evt, p)
Expand Down Expand Up @@ -222,6 +248,10 @@ func (c *Converter) unmarshalSequence(an *valueAnalysis, evt *yaml_event_t, p *y
}

func (c *Converter) unmarshalAlias(an *valueAnalysis, evt *yaml_event_t, p *yaml_parser_t) (cty.Value, error) {
c.aliasCount++
if c.aliasCount > 100 && c.decodeCount > 1000 && float64(c.aliasCount)/float64(c.decodeCount) > allowedAliasRatio(c.decodeCount) {
failf("document contains excessive aliasing")
}
v, err := an.anchorVal(string(evt.anchor))
if err != nil {
err = parseEventErrorWrap(evt, err)
Expand Down
155 changes: 86 additions & 69 deletions scannerc.go
Original file line number Diff line number Diff line change
Expand Up @@ -626,30 +626,17 @@ func trace(args ...interface{}) func() {
func yaml_parser_fetch_more_tokens(parser *yaml_parser_t) bool {
// While we need more tokens to fetch, do it.
for {
// Check if we really need to fetch more tokens.
need_more_tokens := false

if parser.tokens_head == len(parser.tokens) {
// Queue is empty.
need_more_tokens = true
} else {
// Check if any potential simple key may occupy the head position.
if !yaml_parser_stale_simple_keys(parser) {
if parser.tokens_head != len(parser.tokens) {
// If queue is non-empty, check if any potential simple key may
// occupy the head position.
head_tok_idx, ok := parser.simple_keys_by_tok[parser.tokens_parsed]
if !ok {
break
} else if valid, ok := yaml_simple_key_is_valid(parser, &parser.simple_keys[head_tok_idx]); !ok {
return false
} else if !valid {
break
}

for i := range parser.simple_keys {
simple_key := &parser.simple_keys[i]
if simple_key.possible && simple_key.token_number == parser.tokens_parsed {
need_more_tokens = true
break
}
}
}

// We are finished.
if !need_more_tokens {
break
}
// Fetch the next token.
if !yaml_parser_fetch_next_token(parser) {
Expand Down Expand Up @@ -678,11 +665,6 @@ func yaml_parser_fetch_next_token(parser *yaml_parser_t) bool {
return false
}

// Remove obsolete potential simple keys.
if !yaml_parser_stale_simple_keys(parser) {
return false
}

// Check the indentation level against the current column.
if !yaml_parser_unroll_indent(parser, parser.mark.column) {
return false
Expand Down Expand Up @@ -837,29 +819,30 @@ func yaml_parser_fetch_next_token(parser *yaml_parser_t) bool {
"found character that cannot start any token")
}

// Check the list of potential simple keys and remove the positions that
// cannot contain simple keys anymore.
func yaml_parser_stale_simple_keys(parser *yaml_parser_t) bool {
// Check for a potential simple key for each flow level.
for i := range parser.simple_keys {
simple_key := &parser.simple_keys[i]

// The specification requires that a simple key
//
// - is limited to a single line,
// - is shorter than 1024 characters.
if simple_key.possible && (simple_key.mark.line < parser.mark.line || simple_key.mark.index+1024 < parser.mark.index) {

// Check if the potential simple key to be removed is required.
if simple_key.required {
return yaml_parser_set_scanner_error(parser,
"while scanning a simple key", simple_key.mark,
"could not find expected ':'")
}
simple_key.possible = false
func yaml_simple_key_is_valid(parser *yaml_parser_t, simple_key *yaml_simple_key_t) (valid, ok bool) {
if !simple_key.possible {
return false, true
}

// The 1.2 specification says:
//
// "If the ? indicator is omitted, parsing needs to see past the
// implicit key to recognize it as such. To limit the amount of
// lookahead required, the “:” indicator must appear at most 1024
// Unicode characters beyond the start of the key. In addition, the key
// is restricted to a single line."
//
if simple_key.mark.line < parser.mark.line || simple_key.mark.index+1024 < parser.mark.index {
// Check if the potential simple key to be removed is required.
if simple_key.required {
return false, yaml_parser_set_scanner_error(parser,
"while scanning a simple key", simple_key.mark,
"could not find expected ':'")
}
simple_key.possible = false
return false, true
}
return true
return true, true
}

// Check if a simple key may start at the current position and add it if
Expand All @@ -879,13 +862,15 @@ func yaml_parser_save_simple_key(parser *yaml_parser_t) bool {
possible: true,
required: required,
token_number: parser.tokens_parsed + (len(parser.tokens) - parser.tokens_head),
mark: parser.mark,
}
simple_key.mark = parser.mark

if !yaml_parser_remove_simple_key(parser) {
return false
}
parser.simple_keys[len(parser.simple_keys)-1] = simple_key
parser.simple_keys_by_tok[simple_key.token_number] = len(parser.simple_keys) - 1
}
return true
}
Expand All @@ -900,31 +885,50 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
"while scanning a simple key", parser.simple_keys[i].mark,
"could not find expected ':'")
}
// Remove the key from the stack.
parser.simple_keys[i].possible = false
delete(parser.simple_keys_by_tok, parser.simple_keys[i].token_number)
}
// Remove the key from the stack.
parser.simple_keys[i].possible = false
return true
}

// max_flow_level limits the flow_level
const max_flow_level = 10000

// Increase the flow level and resize the simple key list if needed.
func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
// Reset the simple key on the next level.
parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})
parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{
possible: false,
required: false,
token_number: parser.tokens_parsed + (len(parser.tokens) - parser.tokens_head),
mark: parser.mark,
})

// Increase the flow level.
parser.flow_level++
if parser.flow_level > max_flow_level {
return yaml_parser_set_scanner_error(parser,
"while increasing flow level", parser.simple_keys[len(parser.simple_keys)-1].mark,
fmt.Sprintf("exceeded max depth of %d", max_flow_level))
}
return true
}

// Decrease the flow level.
func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
if parser.flow_level > 0 {
parser.flow_level--
parser.simple_keys = parser.simple_keys[:len(parser.simple_keys)-1]
last := len(parser.simple_keys) - 1
delete(parser.simple_keys_by_tok, parser.simple_keys[last].token_number)
parser.simple_keys = parser.simple_keys[:last]
}
return true
}

// max_indents limits the indents stack size
const max_indents = 10000

// Push the current indentation level to the stack and set the new level
// the current column is greater than the indentation level. In this case,
// append or insert the specified token into the token queue.
Expand All @@ -939,6 +943,11 @@ func yaml_parser_roll_indent(parser *yaml_parser_t, column, number int, typ yaml
// indentation level.
parser.indents = append(parser.indents, parser.indent)
parser.indent = column
if len(parser.indents) > max_indents {
return yaml_parser_set_scanner_error(parser,
"while increasing indent level", parser.simple_keys[len(parser.simple_keys)-1].mark,
fmt.Sprintf("exceeded max depth of %d", max_indents))
}

// Create a token and insert it into the queue.
token := yaml_token_t{
Expand Down Expand Up @@ -989,6 +998,8 @@ func yaml_parser_fetch_stream_start(parser *yaml_parser_t) bool {
// Initialize the simple key stack.
parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})

parser.simple_keys_by_tok = make(map[int]int)

// A simple key is allowed at the beginning of the stream.
parser.simple_key_allowed = true

Expand Down Expand Up @@ -1270,7 +1281,10 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool {
simple_key := &parser.simple_keys[len(parser.simple_keys)-1]

// Have we found a simple key?
if simple_key.possible {
if valid, ok := yaml_simple_key_is_valid(parser, simple_key); !ok {
return false

} else if valid {
// Create the KEY token and insert it into the queue.
token := yaml_token_t{
typ: yaml_KEY_TOKEN,
Expand All @@ -1288,6 +1302,7 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool {

// Remove the simple key.
simple_key.possible = false
delete(parser.simple_keys_by_tok, simple_key.token_number)

// A simple key cannot follow another simple key.
parser.simple_key_allowed = false
Expand Down Expand Up @@ -1485,11 +1500,11 @@ func yaml_parser_scan_to_next_token(parser *yaml_parser_t) bool {
// Scan a YAML-DIRECTIVE or TAG-DIRECTIVE token.
//
// Scope:
// %YAML 1.1 # a comment \n
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// %TAG !yaml! tag:yaml.org,2002: \n
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//
// %YAML 1.1 # a comment \n
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// %TAG !yaml! tag:yaml.org,2002: \n
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
func yaml_parser_scan_directive(parser *yaml_parser_t, token *yaml_token_t) bool {
// Eat '%'.
start_mark := parser.mark
Expand Down Expand Up @@ -1586,11 +1601,11 @@ func yaml_parser_scan_directive(parser *yaml_parser_t, token *yaml_token_t) bool
// Scan the directive name.
//
// Scope:
// %YAML 1.1 # a comment \n
// ^^^^
// %TAG !yaml! tag:yaml.org,2002: \n
// ^^^
//
// %YAML 1.1 # a comment \n
// ^^^^
// %TAG !yaml! tag:yaml.org,2002: \n
// ^^^
func yaml_parser_scan_directive_name(parser *yaml_parser_t, start_mark yaml_mark_t, name *[]byte) bool {
// Consume the directive name.
if parser.unread < 1 && !yaml_parser_update_buffer(parser, 1) {
Expand Down Expand Up @@ -1625,8 +1640,9 @@ func yaml_parser_scan_directive_name(parser *yaml_parser_t, start_mark yaml_mark
// Scan the value of VERSION-DIRECTIVE.
//
// Scope:
// %YAML 1.1 # a comment \n
// ^^^^^^
//
// %YAML 1.1 # a comment \n
// ^^^^^^
func yaml_parser_scan_version_directive_value(parser *yaml_parser_t, start_mark yaml_mark_t, major, minor *int8) bool {
// Eat whitespaces.
if parser.unread < 1 && !yaml_parser_update_buffer(parser, 1) {
Expand Down Expand Up @@ -1664,10 +1680,11 @@ const max_number_length = 2
// Scan the version number of VERSION-DIRECTIVE.
//
// Scope:
// %YAML 1.1 # a comment \n
// ^
// %YAML 1.1 # a comment \n
// ^
//
// %YAML 1.1 # a comment \n
// ^
// %YAML 1.1 # a comment \n
// ^
func yaml_parser_scan_version_directive_number(parser *yaml_parser_t, start_mark yaml_mark_t, number *int8) bool {

// Repeat while the next character is digit.
Expand Down Expand Up @@ -1701,9 +1718,9 @@ func yaml_parser_scan_version_directive_number(parser *yaml_parser_t, start_mark
// Scan the value of a TAG-DIRECTIVE token.
//
// Scope:
// %TAG !yaml! tag:yaml.org,2002: \n
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
//
// %TAG !yaml! tag:yaml.org,2002: \n
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
func yaml_parser_scan_tag_directive_value(parser *yaml_parser_t, start_mark yaml_mark_t, handle, prefix *[]byte) bool {
var handle_value, prefix_value []byte

Expand Down
10 changes: 7 additions & 3 deletions yamlh.go
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,9 @@ type yaml_document_t struct {
// The number of written bytes should be set to the size_read variable.
//
// [in,out] data A pointer to an application data specified by
// yaml_parser_set_input().
//
// yaml_parser_set_input().
//
// [out] buffer The buffer to write the data from the source.
// [in] size The size of the buffer.
// [out] size_read The actual number of bytes read from the source.
Expand Down Expand Up @@ -579,6 +581,7 @@ type yaml_parser_t struct {

simple_key_allowed bool // May a simple key occur at the current position?
simple_keys []yaml_simple_key_t // The stack of simple keys.
simple_keys_by_tok map[int]int // possible simple_key indexes indexed by token_number

// Parser stuff

Expand All @@ -603,13 +606,14 @@ type yaml_parser_t struct {
// @a buffer to the output.
//
// @param[in,out] data A pointer to an application data specified by
// yaml_emitter_set_output().
//
// yaml_emitter_set_output().
//
// @param[in] buffer The buffer with bytes to be written.
// @param[in] size The size of the buffer.
//
// @returns On success, the handler should return @c 1. If the handler failed,
// the returned value should be @c 0.
//
type yaml_write_handler_t func(emitter *yaml_emitter_t, buffer []byte) error

type yaml_emitter_state_t int
Expand Down