Skip to content

Commit

Permalink
perf: Faster string encoding (#1350)
Browse files Browse the repository at this point in the history
Recently we found an application were using zap.Reflect was faster than
using zapcore.ObjectMarshaler. After profiling we found that string
encoding is really expensive.

After replicating what encoding/json does,
we're able to get much better performance out of string encoding.
The optimization is roughly this:
instead of appending a rune at a time to the buffer,
scan and append contiguous chunks of
runes that don't need special handling
(valid runes that don't need to be escaped).

### Benchmark results

```
goos: linux
goarch: amd64
pkg: go.uber.org/zap/zapcore
cpu: AMD EPYC 7B13
               │ /tmp/old.txt │            /tmp/new.txt             │
               │    sec/op    │   sec/op     vs base                │
ZapJSON-8         89.10µ ± 1%   33.38µ ± 3%  -62.54% (p=0.000 n=10)
StandardJSON-8    40.74µ ± 1%   42.46µ ± 1%   +4.22% (p=0.000 n=10)
geomean           60.25µ        37.65µ       -37.52%
```

---------

Co-authored-by: Abhinav Gupta <mail@abhinavg.net>
  • Loading branch information
cdvr1993 and abhinav authored Sep 9, 2023
1 parent 82c728b commit 5a27bab
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 63 deletions.
5 changes: 5 additions & 0 deletions buffer/buffer.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ func (b *Buffer) AppendByte(v byte) {
b.bs = append(b.bs, v)
}

// AppendBytes writes a single byte to the Buffer.
func (b *Buffer) AppendBytes(v []byte) {
b.bs = append(b.bs, v...)
}

// AppendString writes a string to the Buffer.
func (b *Buffer) AppendString(s string) {
b.bs = append(b.bs, s...)
Expand Down
143 changes: 84 additions & 59 deletions zapcore/json_encoder.go
Original file line number Diff line number Diff line change
Expand Up @@ -486,73 +486,98 @@ func (enc *jsonEncoder) appendFloat(val float64, bitSize int) {
// Unlike the standard library's encoder, it doesn't attempt to protect the
// user from browser vulnerabilities or JSONP-related problems.
func (enc *jsonEncoder) safeAddString(s string) {
for i := 0; i < len(s); {
if enc.tryAddRuneSelf(s[i]) {
i++
continue
}
r, size := utf8.DecodeRuneInString(s[i:])
if enc.tryAddRuneError(r, size) {
i++
continue
}
enc.buf.AppendString(s[i : i+size])
i += size
}
safeAppendStringLike(
(*buffer.Buffer).AppendString,
utf8.DecodeRuneInString,
enc.buf,
s,
)
}

// safeAddByteString is no-alloc equivalent of safeAddString(string(s)) for s []byte.
func (enc *jsonEncoder) safeAddByteString(s []byte) {
safeAppendStringLike(
(*buffer.Buffer).AppendBytes,
utf8.DecodeRune,
enc.buf,
s,
)
}

// safeAppendStringLike is a generic implementation of safeAddString and safeAddByteString.
// It appends a string or byte slice to the buffer, escaping all special characters.
func safeAppendStringLike[S []byte | string](
// appendTo appends this string-like object to the buffer.
appendTo func(*buffer.Buffer, S),
// decodeRune decodes the next rune from the string-like object
// and returns its value and width in bytes.
decodeRune func(S) (rune, int),
buf *buffer.Buffer,
s S,
) {
// The encoding logic below works by skipping over characters
// that can be safely copied as-is,
// until a character is found that needs special handling.
// At that point, we copy everything we've seen so far,
// and then handle that special character.
//
// last is the index of the last byte that was copied to the buffer.
last := 0
for i := 0; i < len(s); {
if enc.tryAddRuneSelf(s[i]) {
if s[i] >= utf8.RuneSelf {
// Character >= RuneSelf may be part of a multi-byte rune.
// They need to be decoded before we can decide how to handle them.
r, size := decodeRune(s[i:])
if r != utf8.RuneError || size != 1 {
// No special handling required.
// Skip over this rune and continue.
i += size
continue
}

// Invalid UTF-8 sequence.
// Replace it with the Unicode replacement character.
appendTo(buf, s[last:i])
buf.AppendString(`\ufffd`)

i++
continue
}
r, size := utf8.DecodeRune(s[i:])
if enc.tryAddRuneError(r, size) {
last = i
} else {
// Character < RuneSelf is a single-byte UTF-8 rune.
if s[i] >= 0x20 && s[i] != '\\' && s[i] != '"' {
// No escaping necessary.
// Skip over this character and continue.
i++
continue
}

// This character needs to be escaped.
appendTo(buf, s[last:i])
switch s[i] {
case '\\', '"':
buf.AppendByte('\\')
buf.AppendByte(s[i])
case '\n':
buf.AppendByte('\\')
buf.AppendByte('n')
case '\r':
buf.AppendByte('\\')
buf.AppendByte('r')
case '\t':
buf.AppendByte('\\')
buf.AppendByte('t')
default:
// Encode bytes < 0x20, except for the escape sequences above.
buf.AppendString(`\u00`)
buf.AppendByte(_hex[s[i]>>4])
buf.AppendByte(_hex[s[i]&0xF])
}

i++
continue
last = i
}
enc.buf.Write(s[i : i+size])
i += size
}
}

// tryAddRuneSelf appends b if it is valid UTF-8 character represented in a single byte.
func (enc *jsonEncoder) tryAddRuneSelf(b byte) bool {
if b >= utf8.RuneSelf {
return false
}
if b >= 0x20 && b != '\\' && b != '"' {
enc.buf.AppendByte(b)
return true
}
switch b {
case '\\', '"':
enc.buf.AppendByte('\\')
enc.buf.AppendByte(b)
case '\n':
enc.buf.AppendByte('\\')
enc.buf.AppendByte('n')
case '\r':
enc.buf.AppendByte('\\')
enc.buf.AppendByte('r')
case '\t':
enc.buf.AppendByte('\\')
enc.buf.AppendByte('t')
default:
// Encode bytes < 0x20, except for the escape sequences above.
enc.buf.AppendString(`\u00`)
enc.buf.AppendByte(_hex[b>>4])
enc.buf.AppendByte(_hex[b&0xF])
}
return true
}

func (enc *jsonEncoder) tryAddRuneError(r rune, size int) bool {
if r == utf8.RuneError && size == 1 {
enc.buf.AppendString(`\ufffd`)
return true
}
return false
// add remaining
appendTo(buf, s[last:])
}
33 changes: 29 additions & 4 deletions zapcore/json_encoder_bench_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ package zapcore_test

import (
"encoding/json"
"fmt"
"testing"
"time"

Expand Down Expand Up @@ -51,7 +52,28 @@ func BenchmarkZapJSONFloat32AndComplex64(b *testing.B) {
})
}

const _sliceSize = 5000

type StringSlice []string

func (s StringSlice) MarshalLogArray(encoder ArrayEncoder) error {
for _, str := range s {
encoder.AppendString(str)
}
return nil
}

func generateStringSlice(n int) StringSlice {
output := make(StringSlice, 0, n)
for i := 0; i < n; i++ {
output = append(output, fmt.Sprint("00000000-0000-0000-0000-0000000000", i))
}
return output
}

func BenchmarkZapJSON(b *testing.B) {
additional := generateStringSlice(_sliceSize)
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
enc := NewJSONEncoder(testEncoderConfig())
Expand All @@ -64,6 +86,7 @@ func BenchmarkZapJSON(b *testing.B) {
enc.AddString("string3", "🤔")
enc.AddString("string4", "🙊")
enc.AddBool("bool", true)
_ = enc.AddArray("test", additional)
buf, _ := enc.EncodeEntry(Entry{
Message: "fake",
Level: DebugLevel,
Expand All @@ -75,10 +98,11 @@ func BenchmarkZapJSON(b *testing.B) {

func BenchmarkStandardJSON(b *testing.B) {
record := struct {
Level string `json:"level"`
Message string `json:"msg"`
Time time.Time `json:"ts"`
Fields map[string]interface{} `json:"fields"`
Level string `json:"level"`
Message string `json:"msg"`
Time time.Time `json:"ts"`
Fields map[string]interface{} `json:"fields"`
Additional StringSlice
}{
Level: "debug",
Message: "fake",
Expand All @@ -94,6 +118,7 @@ func BenchmarkStandardJSON(b *testing.B) {
"string4": "🙊",
"bool": true,
},
Additional: generateStringSlice(_sliceSize),
}
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
Expand Down
72 changes: 72 additions & 0 deletions zapcore/json_encoder_impl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@ import (
"testing"
"testing/quick"
"time"
"unicode/utf8"

"go.uber.org/zap/buffer"
"go.uber.org/zap/internal/bufferpool"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/multierr"
)

Expand Down Expand Up @@ -662,3 +665,72 @@ func TestJSONQuick(t *testing.T) {
check(asciiRoundTripsCorrectlyString)
check(asciiRoundTripsCorrectlyByteString)
}

var _stringLikeCorpus = []string{
"",
"foo",
"bar",
"a\nb",
"a\tb",
"a\\b",
`a"b`,
}

func FuzzSafeAppendStringLike_bytes(f *testing.F) {
for _, s := range _stringLikeCorpus {
f.Add([]byte(s))
}
f.Fuzz(func(t *testing.T, b []byte) {
if !utf8.Valid(b) {
t.Skip()
}

fuzzSafeAppendStringLike(t, string(b), func(buf *buffer.Buffer) {
safeAppendStringLike(
(*buffer.Buffer).AppendBytes,
utf8.DecodeRune,
buf,
b,
)
})
})
}

func FuzzSafeAppendStringLike_string(f *testing.F) {
for _, s := range _stringLikeCorpus {
f.Add(s)
}
f.Fuzz(func(t *testing.T, s string) {
if !utf8.ValidString(s) {
t.Skip()
}

fuzzSafeAppendStringLike(t, s, func(buf *buffer.Buffer) {
safeAppendStringLike(
(*buffer.Buffer).AppendString,
utf8.DecodeRuneInString,
buf,
s,
)
})
})
}

func fuzzSafeAppendStringLike(
t *testing.T,
want string,
writeString func(*buffer.Buffer),
) {
t.Helper()

buf := bufferpool.Get()
defer buf.Free()

buf.AppendByte('"')
writeString(buf)
buf.AppendByte('"')

var got string
require.NoError(t, json.Unmarshal(buf.Bytes(), &got))
assert.Equal(t, want, got)
}

0 comments on commit 5a27bab

Please sign in to comment.