feat: streaming encoding

go-faster · Jan 25, 2023 · 57ff135 · 57ff135
1 parent 644d74e
commit 57ff135
Show file tree

Hide file tree

Showing 17 changed files with 1,192 additions and 0 deletions.
diff --git a/internal/byteseq/byteseq.go b/internal/byteseq/byteseq.go
@@ -1,7 +1,9 @@
+// Package byteseq provides a Byteseq type that can be used to represent a sequence of bytes.
 package byteseq
 
 import "unicode/utf8"
 
+// Byteseq is a type constraint satisfied by string and []byte.
 type Byteseq interface {
 	string | []byte
 }

diff --git a/stream/enc.go b/stream/enc.go
@@ -0,0 +1,76 @@
+package stream
+
+import "io"
+
+// Encoder writes JSON tokens to the given writer.
+//
+// Commas between values are emitted automatically; when an indentation step
+// is configured, a newline and indentation follow every comma.
+type Encoder[W io.Writer] struct {
+	w      writer[W]
+	indent int // count of spaces for single indentation level
+
+	// first holds the state needed for comma and indentation writing.
+	//
+	// Every new Object or Array appends a level to this slice; the last
+	// element stays true until the first element of that level is written.
+	//
+	// Commas are written only before non-first elements of an Array or Object.
+	//
+	// See comma, begin, end and FieldStart for implementation details.
+	//
+	// Note: this could probably be stored as a bit set to reduce memory
+	// consumption.
+	//
+	// See https://yourbasic.org/algorithms/your-basic-int/#simple-sets
+	first []bool
+}
+
+// NewEncoder creates an Encoder that writes to w through an internal
+// buffer with an initial capacity of 512 bytes.
+func NewEncoder[W io.Writer](w W) *Encoder[W] {
+	enc := new(Encoder[W])
+	enc.w = writer[W]{
+		writer: w,
+		buf:    make([]byte, 0, 512),
+	}
+	return enc
+}
+
+// Close flushes all buffered data to the underlying writer and returns
+// the flush error stored on the writer.
+func (e *Encoder[W]) Close() error {
+	_ = e.w.flush() // the outcome is reported through flushErr below
+	return e.w.flushErr
+}
+
+// SetIdent sets the number of spaces written per indentation level.
+//
+// A value of zero disables indentation and newlines entirely.
+func (e *Encoder[W]) SetIdent(n int) {
+	e.indent = n
+}
+
+// Reset points the encoder at w and forgets all open Array/Object levels,
+// keeping the already allocated state slice for reuse.
+func (e *Encoder[W]) Reset(w W) {
+	e.first = e.first[:0]
+	e.w.Reset(w)
+}
+
+// RawStr writes v verbatim as a JSON value, preceded by a comma when needed.
+// The string is not escaped or validated here.
+func (e *Encoder[W]) RawStr(v string) bool {
+	if e.comma() {
+		return true
+	}
+	return e.w.writeString(v)
+}
+
+// Raw writes b verbatim as a JSON value, preceded by a comma when needed.
+// The bytes are not escaped or validated here.
+func (e *Encoder[W]) Raw(b []byte) bool {
+	if e.comma() {
+		return true
+	}
+	return e.w.writeBytes(b...)
+}
+
+// writeIndent writes a newline followed by indent spaces for every open
+// level. It does nothing when indentation is disabled (indent == 0).
+//
+// A true result from any underlying write is returned immediately.
+func (e *Encoder[W]) writeIndent() bool {
+	if e.indent == 0 {
+		return false
+	}
+	if e.w.writeByte('\n') {
+		return true
+	}
+	for spaces := len(e.first) * e.indent; spaces > 0; spaces-- {
+		if e.w.writeByte(' ') {
+			return true
+		}
+	}
+	return false
+}
diff --git a/stream/enc_arr.go b/stream/enc_arr.go
@@ -0,0 +1,40 @@
+package stream
+
+// ArrStart writes the opening bracket of an array, preceded by a comma when
+// needed, then opens a new nesting level and writes indentation if enabled.
+//
+// Use Arr as convenience helper for writing arrays.
+func (e *Encoder[W]) ArrStart() bool {
+	if e.comma() {
+		return true
+	}
+	if e.w.writeByte('[') {
+		return true
+	}
+	e.begin()
+	return e.writeIndent()
+}
+
+// ArrEnd closes the current nesting level, writes indentation if enabled
+// and then the closing bracket of the array.
+//
+// Use Arr as convenience helper for writing arrays.
+func (e *Encoder[W]) ArrEnd() bool {
+	e.end()
+	if e.writeIndent() {
+		return true
+	}
+	return e.w.writeByte(']')
+}
+
+// ArrEmpty writes an empty array ("[]") as a single value.
+//
+// The separator is handled exactly once here and the brackets are written
+// directly. Delegating to ArrStart would run comma a second time: the first
+// call clears the "first element" flag, so the second one would always emit
+// a stray ',' in front of the array.
+func (e *Encoder[W]) ArrEmpty() bool {
+	return e.comma() || e.w.writeString("[]")
+}
+
+// Arr writes a complete array: the opening bracket, whatever the callback
+// emits, and the closing bracket. It stops at the first step returning true.
+//
+// A nil callback produces an empty array.
+func (e *Encoder[W]) Arr(f func(e *Encoder[W]) bool) bool {
+	switch {
+	case f == nil:
+		return e.ArrEmpty()
+	case e.ArrStart():
+		return true
+	case f(e):
+		return true
+	default:
+		return e.ArrEnd()
+	}
+}
diff --git a/stream/enc_b64.go b/stream/enc_b64.go
@@ -0,0 +1,50 @@
+package stream
+
+import "github.com/segmentio/asm/base64"
+
+// Base64 encodes data as standard base64 encoded string.
+//
+// Same as encoding/json, base64.StdEncoding or RFC 4648.
+// A nil slice is written as null.
+func (e *Encoder[W]) Base64(data []byte) bool {
+	if data == nil {
+		return e.Null()
+	}
+
+	if e.comma() || e.w.writeByte('"') {
+		return true
+	}
+
+	encodedLen := base64.StdEncoding.EncodedLen(len(data))
+	if len(e.w.buf)+encodedLen > cap(e.w.buf) {
+		// Not enough free space left: flush pending bytes first, the
+		// encoded data may fit into the buffer afterwards.
+		if e.w.flush() {
+			return true
+		}
+	}
+
+	// Read the buffer only after the possible flush. A snapshot taken earlier
+	// keeps the pre-flush length, so encoding at its tail could slice past
+	// the capacity (panic) and, assuming flush truncates the buffer, storing
+	// it back would re-emit bytes that were already written out.
+	if buf := e.w.buf; len(buf)+encodedLen <= cap(buf) {
+		// Encode in place, straight into the free tail of the buffer.
+		start := len(buf)
+		buf = buf[:start+encodedLen]
+		base64.StdEncoding.Encode(buf[start:], data)
+		e.w.buf = buf
+	} else {
+		// The encoded data does not fit into the buffer even after flushing,
+		// so it is encoded into a temporary slice and handed to the writer.
+		//
+		// StdEncoding includes padding, so we can't just split the data into chunks.
+		// FIXME(tdakkota): Is there a way to avoid this allocation?
+		//  If we can't, we should at least use a pool.
+		//  Or use streaming encoder from stdlib.
+		//  Or remove this method.
+		r := make([]byte, encodedLen)
+		base64.StdEncoding.Encode(r, data)
+		if e.w.writeBytes(r...) {
+			return true
+		}
+	}
+
+	return e.w.writeByte('"')
+}
diff --git a/stream/enc_bool.go b/stream/enc_bool.go
@@ -0,0 +1,19 @@
+package stream
+
+// True writes the JSON literal true, preceded by a comma when needed.
+func (e *Encoder[W]) True() bool {
+	if e.comma() {
+		return true
+	}
+	return e.w.writeString("true")
+}
+
+// False writes the JSON literal false, preceded by a comma when needed.
+func (e *Encoder[W]) False() bool {
+	if e.comma() {
+		return true
+	}
+	return e.w.writeString("false")
+}
+
+// Bool writes v as the JSON literal true or false.
+func (e *Encoder[W]) Bool(v bool) bool {
+	if !v {
+		return e.False()
+	}
+	return e.True()
+}
diff --git a/stream/enc_comma.go b/stream/enc_comma.go
@@ -0,0 +1,33 @@
+package stream
+
+// begin opens a new nesting level whose first element is still pending.
+// It should be called before a new Array or Object.
+func (e *Encoder[W]) begin() {
+	e.first = append(e.first, true)
+}
+
+// end drops the innermost nesting level, if there is one.
+// It should be called after an Array or Object.
+func (e *Encoder[W]) end() {
+	if depth := len(e.first); depth > 0 {
+		e.first = e.first[:depth-1]
+	}
+}
+
+// current returns the index of the innermost open level in e.first,
+// or -1 when no level is open.
+func (e *Encoder[W]) current() int {
+	return len(e.first) - 1
+}
+
+// comma should be called before any new value.
+//
+// A comma (followed by indentation, if enabled) is written:
+//  1. Before every field except the first.
+//  2. Before every array element except the first.
+func (e *Encoder[W]) comma() bool {
+	depth := len(e.first)
+	if depth == 0 {
+		// No open Array or Object: nothing to separate.
+		return false
+	}
+	pending := &e.first[depth-1]
+	if *pending {
+		// First element of the current level: no comma, just mark it as seen.
+		*pending = false
+		return false
+	}
+	return e.w.writeByte(',') || e.writeIndent()
+}
diff --git a/stream/enc_float.go b/stream/enc_float.go
@@ -0,0 +1,56 @@
+package stream
+
+import (
+	"math"
+	"strconv"
+)
+
+// Float32 writes v as a JSON number using 32-bit float formatting.
+//
+// NB: Infinities and NaN are represented as null.
+func (e *Encoder[W]) Float32(v float32) bool {
+	return e.Float(float64(v), 32)
+}
+
+// Float64 writes v as a JSON number using 64-bit float formatting.
+//
+// NB: Infinities and NaN are represented as null.
+func (e *Encoder[W]) Float64(v float64) bool {
+	return e.Float(v, 64)
+}
+
+// Float writes v as a JSON number; bits (32 or 64) is the size of the
+// original floating-point type and selects the formatting precision.
+//
+// NaN and infinities are written as null.
+func (e *Encoder[W]) Float(v float64, bits int) bool {
+	if math.IsNaN(v) || math.IsInf(v, 0) {
+		// Like in ECMA:
+		// NaN and Infinity regardless of sign are represented
+		// as the String null.
+		//
+		// JSON.stringify({"foo":NaN}) -> {"foo":null}
+		return e.Null() // FIXME(tdakkota): probably, we should return false here and set error
+	}
+
+	// From go std sources, strconv/ftoa.go:
+
+	// Convert as if by ES6 number to string conversion.
+	// This matches most other JSON generators.
+	// See golang.org/issue/6384 and golang.org/issue/14135.
+	// Like fmt %g, but the exponent cutoffs are different
+	// and exponents themselves are not padded to two digits.
+	format := byte('f')
+	if abs := math.Abs(v); abs != 0 {
+		// Note: Must use float32 comparisons for underlying float32 value to get precise cutoffs right.
+		useExp := false
+		switch bits {
+		case 64:
+			useExp = abs < 1e-6 || abs >= 1e21
+		case 32:
+			abs32 := float32(abs)
+			useExp = abs32 < 1e-6 || abs32 >= 1e21
+		}
+		if useExp {
+			format = 'e'
+		}
+	}
+
+	out := make([]byte, 0, 32) // FIXME(tdakkota): use e.w.buf?
+	out = strconv.AppendFloat(out, v, format, -1, bits)
+	if format == 'e' {
+		// clean up e-09 to e-9
+		if n := len(out); n >= 4 && out[n-4] == 'e' && out[n-3] == '-' && out[n-2] == '0' {
+			out[n-2] = out[n-1]
+			out = out[:n-1]
+		}
+	}
+	return e.comma() || e.w.writeBytes(out...)
+}