Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add large document benchmarks, tune alias heuristic, add max depth limits #515

Merged
merged 1 commit into from
Oct 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package yaml_test

import (
"strings"
"testing"

. "gopkg.in/check.v1"
"gopkg.in/yaml.v2"
)

// testcase describes one YAML document shared by TestLimits and the benchmarks.
type testcase struct {
	// name labels the case; benchmark selects a case by this exact string.
	name string
	// data is the raw document handed to yaml.Unmarshal.
	data []byte
	// error is the error text decoding is expected to produce. When it is
	// empty, decoding is expected to succeed.
	error string
}

// testcases returns the list of limit/benchmark fixtures. Every call rebuilds
// all documents, so callers receive fresh slices each time.
func testcases() []testcase {
	const (
		kb       = 1024
		depthErr = "yaml: exceeded max depth of 10000"
	)

	// flowMaps builds a document of roughly sizeKB kilobytes: one anchored
	// sequence filled with four-byte ",{a}" flow-map entries.
	flowMaps := func(sizeKB int) []byte {
		return []byte("a: &a [{a}" + strings.Repeat(",{a}", sizeKB*kb/4-1) + "]")
	}

	// A large anchored sequence that is then referenced through 100 aliases.
	aliased := "{a: &a [{a}" + strings.Repeat(",{a}", 1000*kb/4-100) +
		"], b: &b [*a" + strings.Repeat(",*a", 99) + "]}"

	// A single line that opens 1000 nested block sequences.
	indentLine := strings.Repeat("- ", 1000) + "\n"

	return []testcase{
		{
			name:  "1000kb of maps with 100 aliases",
			data:  []byte(aliased),
			error: "yaml: document contains excessive aliasing",
		},
		{
			name:  "1000kb of deeply nested slices",
			data:  []byte(strings.Repeat("[", 1000*kb)),
			error: depthErr,
		},
		{
			name:  "1000kb of deeply nested maps",
			data:  []byte("x: " + strings.Repeat("{", 1000*kb)),
			error: depthErr,
		},
		{
			name:  "1000kb of deeply nested indents",
			data:  []byte(strings.Repeat("- ", 1000*kb)),
			error: depthErr,
		},
		{
			name: "1000kb of 1000-indent lines",
			data: []byte(strings.Repeat(indentLine, kb/2)),
		},
		{name: "1kb of maps", data: flowMaps(1)},
		{name: "10kb of maps", data: flowMaps(10)},
		{name: "100kb of maps", data: flowMaps(100)},
		{name: "1000kb of maps", data: flowMaps(1000)},
	}
}

// TestLimits decodes every document returned by testcases and asserts the
// outcome: when a case carries an expected error, the decode error must match
// it (ErrorMatches treats the expectation as a pattern); otherwise decoding
// must succeed. The whole test is a no-op under -short.
func (s *S) TestLimits(c *C) {
	if testing.Short() {
		return
	}
	for _, tc := range testcases() {
		var decoded interface{}
		err := yaml.Unmarshal(tc.data, &decoded)
		note := Commentf("testcase: %s", tc.name)

		if tc.error == "" {
			c.Assert(err, IsNil, note)
			continue
		}
		c.Assert(err, ErrorMatches, tc.error, note)
	}
}

// Benchmark1000KB100Aliases runs the "1000kb of maps with 100 aliases" testcase,
// which is expected to fail with the excessive-aliasing error.
func Benchmark1000KB100Aliases(b *testing.B) {
benchmark(b, "1000kb of maps with 100 aliases")
}
// Benchmark1000KBDeeplyNestedSlices runs the "1000kb of deeply nested slices"
// testcase, which is expected to fail with the max-depth error.
func Benchmark1000KBDeeplyNestedSlices(b *testing.B) {
benchmark(b, "1000kb of deeply nested slices")
}
// Benchmark1000KBDeeplyNestedMaps runs the "1000kb of deeply nested maps"
// testcase, which is expected to fail with the max-depth error.
func Benchmark1000KBDeeplyNestedMaps(b *testing.B) {
benchmark(b, "1000kb of deeply nested maps")
}
// Benchmark1000KBDeeplyNestedIndents runs the "1000kb of deeply nested indents"
// testcase, which is expected to fail with the max-depth error.
func Benchmark1000KBDeeplyNestedIndents(b *testing.B) {
benchmark(b, "1000kb of deeply nested indents")
}
// Benchmark1000KB1000IndentLines runs the "1000kb of 1000-indent lines"
// testcase, which is expected to decode without error.
func Benchmark1000KB1000IndentLines(b *testing.B) {
benchmark(b, "1000kb of 1000-indent lines")
}
// Benchmark1KBMaps runs the "1kb of maps" testcase (expected to succeed).
func Benchmark1KBMaps(b *testing.B) {
benchmark(b, "1kb of maps")
}
// Benchmark10KBMaps runs the "10kb of maps" testcase (expected to succeed).
func Benchmark10KBMaps(b *testing.B) {
benchmark(b, "10kb of maps")
}
// Benchmark100KBMaps runs the "100kb of maps" testcase (expected to succeed).
func Benchmark100KBMaps(b *testing.B) {
benchmark(b, "100kb of maps")
}
// Benchmark1000KBMaps runs the "1000kb of maps" testcase (expected to succeed).
func Benchmark1000KBMaps(b *testing.B) {
benchmark(b, "1000kb of maps")
}

// benchmark looks up the testcase called name and repeatedly decodes its
// document with yaml.Unmarshal, b.N times. Fixture construction and lookup
// happen before the timer is reset, so only decoding is measured.
//
// Each iteration also verifies the outcome: a case with an expected error must
// fail with exactly that message, and a case without one must succeed.
// Mismatches are reported through b.Errorf, so the loop keeps running.
func benchmark(b *testing.B, name string) {
	// Keep the first fixture whose name matches; the zero value remains when
	// nothing matches, which the name comparison below detects.
	var selected testcase
	for _, candidate := range testcases() {
		if candidate.name != name {
			continue
		}
		selected = candidate
		break
	}
	if selected.name != name {
		b.Errorf("testcase %q not found", name)
		return
	}

	wantErr := selected.error
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		var decoded interface{}
		err := yaml.Unmarshal(selected.data, &decoded)

		if len(wantErr) == 0 {
			if err != nil {
				b.Errorf("unexpected error: %v", err)
			}
			continue
		}

		switch {
		case err == nil:
			b.Errorf("expected error, got none")
		case err.Error() != wantErr:
			b.Errorf("expected error '%s', got '%s'", wantErr, err.Error())
		}
	}
}
27 changes: 26 additions & 1 deletion decode.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,12 +318,37 @@ func (d *decoder) prepare(n *node, out reflect.Value) (newout reflect.Value, unm
return out, false, false
}

const (
	// Lower edge of the scaling window. 400,000 decode operations is ~500kb of
	// dense object declarations, or ~5kb of dense object declarations with
	// 10000% alias expansion.
	alias_ratio_range_low = 400000
	// Upper edge of the scaling window. 4,000,000 decode operations is ~5MB of
	// dense object declarations, or ~4.5MB of dense object declarations with
	// 10% alias expansion.
	alias_ratio_range_high = 4000000
	// Width of the window over which the allowed alias ratio is scaled.
	alias_ratio_range = float64(alias_ratio_range_high - alias_ratio_range_low)
)

// allowedAliasRatio returns the largest fraction of decode operations that may
// come from alias expansion, given the number of decode operations so far.
//
// At or below alias_ratio_range_low the limit is 0.99; at or above
// alias_ratio_range_high it is 0.10; in between it falls linearly from 0.99 to
// 0.10.
func allowedAliasRatio(decodeCount int) float64 {
	// Small-to-medium documents: nearly everything may come from aliases.
	if decodeCount <= alias_ratio_range_low {
		return 0.99
	}
	// Very large documents: only a tenth may come from aliases.
	if decodeCount >= alias_ratio_range_high {
		return 0.10
	}
	// Inside the window, interpolate linearly. The product of ratio and count
	// stays between 396,000 and 400,000 alias-driven decodes at the two edges;
	// 400,000 decode operations is ~100MB of allocations in worst-case
	// scenarios (single-item maps).
	return 0.99 - 0.89*(float64(decodeCount-alias_ratio_range_low)/alias_ratio_range)
}

func (d *decoder) unmarshal(n *node, out reflect.Value) (good bool) {
d.decodeCount++
if d.aliasDepth > 0 {
d.aliasCount++
}
if d.aliasCount > 100 && d.decodeCount > 1000 && float64(d.aliasCount)/float64(d.decodeCount) > 0.99 {
if d.aliasCount > 100 && d.decodeCount > 1000 && float64(d.aliasCount)/float64(d.decodeCount) > allowedAliasRatio(d.decodeCount) {
failf("document contains excessive aliasing")
}
switch n.kind {
Expand Down
16 changes: 16 additions & 0 deletions scannerc.go
Original file line number Diff line number Diff line change
Expand Up @@ -906,13 +906,21 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
return true
}

// max_flow_level is the largest value parser.flow_level may reach; going past
// it makes yaml_parser_increase_flow_level report a scanner error.
const max_flow_level = 10000

// yaml_parser_increase_flow_level enters one more flow level: it appends a
// fresh simple-key slot for the new level and increments parser.flow_level.
//
// It returns true on success. If the new level exceeds max_flow_level it
// returns the result of yaml_parser_set_scanner_error, called with an
// "exceeded max depth" problem. The slot and the counter have already been
// updated by then.
func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
	// The new level starts with an empty (zero-value) simple key.
	parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})
	parser.flow_level++

	if parser.flow_level <= max_flow_level {
		return true
	}

	// NOTE(review): the context mark is read from the slot appended just
	// above, which is a zero value - confirm that no real position was
	// intended here.
	top := len(parser.simple_keys) - 1
	problem := fmt.Sprintf("exceeded max depth of %d", max_flow_level)
	return yaml_parser_set_scanner_error(parser,
		"while increasing flow level", parser.simple_keys[top].mark, problem)
}

Expand All @@ -925,6 +933,9 @@ func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
return true
}

// max_indents limits the indents stack size: the scanner reports an
// "exceeded max depth" error once len(parser.indents) grows past this value.
const max_indents = 10000

// Push the current indentation level to the stack and set the new level if
// the current column is greater than the indentation level. In this case,
// append or insert the specified token into the token queue.
Expand All @@ -939,6 +950,11 @@ func yaml_parser_roll_indent(parser *yaml_parser_t, column, number int, typ yaml
// indentation level.
parser.indents = append(parser.indents, parser.indent)
parser.indent = column
if len(parser.indents) > max_indents {
return yaml_parser_set_scanner_error(parser,
"while increasing indent level", parser.simple_keys[len(parser.simple_keys)-1].mark,
fmt.Sprintf("exceeded max depth of %d", max_indents))
}

// Create a token and insert it into the queue.
token := yaml_token_t{
Expand Down