diff --git a/byteseq.go b/byteseq.go index 231c358..8f99300 100644 --- a/byteseq.go +++ b/byteseq.go @@ -1,3 +1,5 @@ +//go:build go1.18 + package jx import "unicode/utf8" diff --git a/byteseq_go117.go b/byteseq_go117.go new file mode 100644 index 0000000..de69428 --- /dev/null +++ b/byteseq_go117.go @@ -0,0 +1,11 @@ +//go:build !go1.18 + +package jx + +import "unicode/utf8" + +func decodeRuneInByteseq(val string) (r rune, size int) { + var tmp [4]byte + n := copy(tmp[:], val) + return utf8.DecodeRune(tmp[:n]) +} diff --git a/byteseq_go117_test.go b/byteseq_go117_test.go new file mode 100644 index 0000000..4a491ef --- /dev/null +++ b/byteseq_go117_test.go @@ -0,0 +1,23 @@ +//go:build !go1.18 + +package jx + +import ( + "testing" +) + +func Benchmark_decodeRuneInByteseq(b *testing.B) { + var result rune + const buf = `ж` + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + result, _ = decodeRuneInByteseq(buf[:]) + } + + if result != 'ж' { + b.Fatal(result) + } +} diff --git a/byteseq_test.go b/byteseq_test.go index f79aa79..d2c8176 100644 --- a/byteseq_test.go +++ b/byteseq_test.go @@ -1,3 +1,5 @@ +//go:build go1.18 + package jx import ( diff --git a/go.mod b/go.mod index 5090900..8a8946b 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/go-faster/jx go 1.18 require ( - github.com/go-faster/errors v0.5.0 + github.com/go-faster/errors v0.6.1 github.com/segmentio/asm v1.2.0 github.com/stretchr/testify v1.7.1 ) diff --git a/go.sum b/go.sum index 99d5c5f..3499d0e 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/go-faster/errors v0.5.0 h1:hS/zHFJ2Vb14jcupq5J9tk05XW+PFTmySOkDRByHBo4= -github.com/go-faster/errors v0.5.0/go.mod h1:/9SNBcg2ESJTYztBFEiM5Np6ns85BtPNMJd8lFTiFwk= +github.com/go-faster/errors v0.6.1 h1:nNIPOBkprlKzkThvS/0YaX8Zs9KewLCOSFQS5BU06FI= +github.com/go-faster/errors v0.6.1/go.mod h1:5MGV2/2T9yvlrbhe9pD9LO5Z/2zCSq2T8j+Jpi2LAyY= github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= diff --git a/w_str.go b/w_str.go index 7a72d83..3952aa3 100644 --- a/w_str.go +++ b/w_str.go @@ -112,104 +112,6 @@ var htmlSafeSet = [utf8.RuneSelf]bool{ const hexChars = "0123456789abcdef" -// StrEscape encodes string with html special characters escaping. -func (w *Writer) StrEscape(v string) { - strEscape(w, v) -} - -// ByteStrEscape encodes string with html special characters escaping. -func (w *Writer) ByteStrEscape(v []byte) { - strEscape(w, v) -} - -func strEscape[T byteseq](w *Writer, v T) { - length := len(v) - w.Buf = append(w.Buf, '"') - // Fast path, probably does not require escaping. - i := 0 - for ; i < length; i++ { - c := v[i] - if c >= utf8.RuneSelf || !(htmlSafeSet[c]) { - break - } - } - w.Buf = append(w.Buf, v[:i]...) - if i == length { - w.Buf = append(w.Buf, '"') - return - } - strEscapeSlow[T](w, i, v, length) -} - -func strEscapeSlow[T byteseq](w *Writer, i int, v T, valLen int) { - start := i - // for the remaining parts, we process them char by char - for i < valLen { - if b := v[i]; b < utf8.RuneSelf { - if htmlSafeSet[b] { - i++ - continue - } - if start < i { - w.Buf = append(w.Buf, v[start:i]...) - } - switch b { - case '\\', '"': - w.twoBytes('\\', b) - case '\n': - w.twoBytes('\\', 'n') - case '\r': - w.twoBytes('\\', 'r') - case '\t': - w.twoBytes('\\', 't') - default: - // This encodes bytes < 0x20 except for \t, \n and \r. - // If escapeHTML is set, it also escapes <, >, and & - // because they can lead to security holes when - // user-controlled strings are rendered into JSON - // and served to some browsers. - w.rawStr(`\u00`) - w.twoBytes(hexChars[b>>4], hexChars[b&0xF]) - } - i++ - start = i - continue - } - c, size := decodeRuneInByteseq(v[i:]) - if c == utf8.RuneError && size == 1 { - if start < i { - w.Buf = append(w.Buf, v[start:i]...) - } - w.rawStr(`\ufffd`) - i++ - start = i - continue - } - // U+2028 is LINE SEPARATOR. - // U+2029 is PARAGRAPH SEPARATOR. - // They are both technically valid characters in JSON strings, - // but don't work in JSONP, which has to be evaluated as JavaScript, - // and can lead to security holes there. It is valid JSON to - // escape them, so we do so unconditionally. - // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. - if c == '\u2028' || c == '\u2029' { - if start < i { - w.Buf = append(w.Buf, v[start:i]...) - } - w.rawStr(`\u202`) - w.byte(hexChars[c&0xF]) - i += size - start = i - continue - } - i += size - } - if start < len(v) { - w.Buf = append(w.Buf, v[start:]...) - } - w.byte('"') -} - // safeSet holds the value true if the ASCII character with the given array // position can be represented inside a JSON string without any further // escaping. @@ -225,83 +127,3 @@ var safeSet = [256]byte{ '"': 1, '\\': 1, } - -// Str encodes string without html escaping. -// -// Use StrEscape to escape html, this is default for encoding/json and -// should be used by default for untrusted strings. -func (w *Writer) Str(v string) { - writeStr(w, v) -} - -// ByteStr encodes string without html escaping. -// -// Use ByteStrEscape to escape html, this is default for encoding/json and -// should be used by default for untrusted strings. -func (w *Writer) ByteStr(v []byte) { - writeStr(w, v) -} - -func writeStr[T byteseq](w *Writer, v T) { - w.Buf = append(w.Buf, '"') - - // Fast path, without utf8 and escape support. - var ( - i = 0 - length = len(v) - c byte - ) - for i, c = range []byte(v) { - if safeSet[c] != 0 { - goto slow - } - } - if i == length-1 { - w.Buf = append(w.Buf, v...) - w.Buf = append(w.Buf, '"') - return - } -slow: - w.Buf = append(w.Buf, v[:i]...) - strSlow[T](w, v[i:]) -} - -func strSlow[T byteseq](w *Writer, v T) { - var i, start int - // for the remaining parts, we process them char by char - for i < len(v) { - b := v[i] - if safeSet[b] == 0 { - i++ - continue - } - if start < i { - w.Buf = append(w.Buf, v[start:i]...) - } - switch b { - case '\\', '"': - w.twoBytes('\\', b) - case '\n': - w.twoBytes('\\', 'n') - case '\r': - w.twoBytes('\\', 'r') - case '\t': - w.twoBytes('\\', 't') - default: - // This encodes bytes < 0x20 except for \t, \n and \r. - // If escapeHTML is set, it also escapes <, >, and & - // because they can lead to security holes when - // user-controlled strings are rendered into JSON - // and served to some browsers. - w.rawStr(`\u00`) - w.twoBytes(hexChars[b>>4], hexChars[b&0xF]) - } - i++ - start = i - continue - } - if start < len(v) { - w.Buf = append(w.Buf, v[start:]...) - } - w.byte('"') -} diff --git a/w_str_go117.go b/w_str_go117.go new file mode 100644 index 0000000..c737ebb --- /dev/null +++ b/w_str_go117.go @@ -0,0 +1,190 @@ +//go:build !go1.18 + +package jx + +import ( + "unicode/utf8" + "unsafe" +) + +// Str encodes string without html escaping. +// +// Use StrEscape to escape html, this is default for encoding/json and +// should be used by default for untrusted strings. +func (w *Writer) Str(v string) { + writeStr(w, v) +} + +func bts(buf []byte) string { + return *(*string)(unsafe.Pointer(&buf)) // #nosec G103: internal usage +} + +// ByteStr encodes string without html escaping. +// +// Use ByteStrEscape to escape html, this is default for encoding/json and +// should be used by default for untrusted strings. +func (w *Writer) ByteStr(v []byte) { + writeStr(w, bts(v)) +} + +func writeStr(w *Writer, v string) { + w.Buf = append(w.Buf, '"') + + // Fast path, without utf8 and escape support. + var ( + i = 0 + length = len(v) + c byte + ) + for i, c = range []byte(v) { + if safeSet[c] != 0 { + goto slow + } + } + if i == length-1 { + w.Buf = append(w.Buf, v...) + w.Buf = append(w.Buf, '"') + return + } +slow: + w.Buf = append(w.Buf, v[:i]...) + strSlow(w, v[i:]) +} + +func strSlow(w *Writer, v string) { + var i, start int + // for the remaining parts, we process them char by char + for i < len(v) { + b := v[i] + if safeSet[b] == 0 { + i++ + continue + } + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + switch b { + case '\\', '"': + w.twoBytes('\\', b) + case '\n': + w.twoBytes('\\', 'n') + case '\r': + w.twoBytes('\\', 'r') + case '\t': + w.twoBytes('\\', 't') + default: + // This encodes bytes < 0x20 except for \t, \n and \r. + // If escapeHTML is set, it also escapes <, >, and & + // because they can lead to security holes when + // user-controlled strings are rendered into JSON + // and served to some browsers. + w.rawStr(`\u00`) + w.twoBytes(hexChars[b>>4], hexChars[b&0xF]) + } + i++ + start = i + continue + } + if start < len(v) { + w.Buf = append(w.Buf, v[start:]...) + } + w.byte('"') +} + +// StrEscape encodes string with html special characters escaping. +func (w *Writer) StrEscape(v string) { + strEscape(w, v) +} + +// ByteStrEscape encodes string with html special characters escaping. +func (w *Writer) ByteStrEscape(v []byte) { + strEscape(w, bts(v)) +} + +func strEscape(w *Writer, v string) { + length := len(v) + w.Buf = append(w.Buf, '"') + // Fast path, probably does not require escaping. + i := 0 + for ; i < length; i++ { + c := v[i] + if c >= utf8.RuneSelf || !(htmlSafeSet[c]) { + break + } + } + w.Buf = append(w.Buf, v[:i]...) + if i == length { + w.Buf = append(w.Buf, '"') + return + } + strEscapeSlow(w, i, v, length) +} + +func strEscapeSlow(w *Writer, i int, v string, valLen int) { + start := i + // for the remaining parts, we process them char by char + for i < valLen { + if b := v[i]; b < utf8.RuneSelf { + if htmlSafeSet[b] { + i++ + continue + } + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + switch b { + case '\\', '"': + w.twoBytes('\\', b) + case '\n': + w.twoBytes('\\', 'n') + case '\r': + w.twoBytes('\\', 'r') + case '\t': + w.twoBytes('\\', 't') + default: + // This encodes bytes < 0x20 except for \t, \n and \r. + // If escapeHTML is set, it also escapes <, >, and & + // because they can lead to security holes when + // user-controlled strings are rendered into JSON + // and served to some browsers. + w.rawStr(`\u00`) + w.twoBytes(hexChars[b>>4], hexChars[b&0xF]) + } + i++ + start = i + continue + } + c, size := decodeRuneInByteseq(v[i:]) + if c == utf8.RuneError && size == 1 { + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + w.rawStr(`\ufffd`) + i++ + start = i + continue + } + // U+2028 is LINE SEPARATOR. + // U+2029 is PARAGRAPH SEPARATOR. + // They are both technically valid characters in JSON strings, + // but don't work in JSONP, which has to be evaluated as JavaScript, + // and can lead to security holes there. It is valid JSON to + // escape them, so we do so unconditionally. + // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. + if c == '\u2028' || c == '\u2029' { + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + w.rawStr(`\u202`) + w.byte(hexChars[c&0xF]) + i += size + start = i + continue + } + i += size + } + if start < len(v) { + w.Buf = append(w.Buf, v[start:]...) + } + w.byte('"') +} diff --git a/w_str_go118.go b/w_str_go118.go new file mode 100644 index 0000000..c95938a --- /dev/null +++ b/w_str_go118.go @@ -0,0 +1,183 @@ +//go:build go1.18 + +package jx + +import "unicode/utf8" + +// Str encodes string without html escaping. +// +// Use StrEscape to escape html, this is default for encoding/json and +// should be used by default for untrusted strings. +func (w *Writer) Str(v string) { + writeStr(w, v) +} + +// ByteStr encodes string without html escaping. +// +// Use ByteStrEscape to escape html, this is default for encoding/json and +// should be used by default for untrusted strings. +func (w *Writer) ByteStr(v []byte) { + writeStr(w, v) +} + +func writeStr[T byteseq](w *Writer, v T) { + w.Buf = append(w.Buf, '"') + + // Fast path, without utf8 and escape support. + var ( + i = 0 + length = len(v) + c byte + ) + for i, c = range []byte(v) { + if safeSet[c] != 0 { + goto slow + } + } + if i == length-1 { + w.Buf = append(w.Buf, v...) + w.Buf = append(w.Buf, '"') + return + } +slow: + w.Buf = append(w.Buf, v[:i]...) + strSlow[T](w, v[i:]) +} + +func strSlow[T byteseq](w *Writer, v T) { + var i, start int + // for the remaining parts, we process them char by char + for i < len(v) { + b := v[i] + if safeSet[b] == 0 { + i++ + continue + } + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + switch b { + case '\\', '"': + w.twoBytes('\\', b) + case '\n': + w.twoBytes('\\', 'n') + case '\r': + w.twoBytes('\\', 'r') + case '\t': + w.twoBytes('\\', 't') + default: + // This encodes bytes < 0x20 except for \t, \n and \r. + // If escapeHTML is set, it also escapes <, >, and & + // because they can lead to security holes when + // user-controlled strings are rendered into JSON + // and served to some browsers. + w.rawStr(`\u00`) + w.twoBytes(hexChars[b>>4], hexChars[b&0xF]) + } + i++ + start = i + continue + } + if start < len(v) { + w.Buf = append(w.Buf, v[start:]...) + } + w.byte('"') +} + +// StrEscape encodes string with html special characters escaping. +func (w *Writer) StrEscape(v string) { + strEscape(w, v) +} + +// ByteStrEscape encodes string with html special characters escaping. +func (w *Writer) ByteStrEscape(v []byte) { + strEscape(w, v) +} + +func strEscape[T byteseq](w *Writer, v T) { + length := len(v) + w.Buf = append(w.Buf, '"') + // Fast path, probably does not require escaping. + i := 0 + for ; i < length; i++ { + c := v[i] + if c >= utf8.RuneSelf || !(htmlSafeSet[c]) { + break + } + } + w.Buf = append(w.Buf, v[:i]...) + if i == length { + w.Buf = append(w.Buf, '"') + return + } + strEscapeSlow[T](w, i, v, length) +} + +func strEscapeSlow[T byteseq](w *Writer, i int, v T, valLen int) { + start := i + // for the remaining parts, we process them char by char + for i < valLen { + if b := v[i]; b < utf8.RuneSelf { + if htmlSafeSet[b] { + i++ + continue + } + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + switch b { + case '\\', '"': + w.twoBytes('\\', b) + case '\n': + w.twoBytes('\\', 'n') + case '\r': + w.twoBytes('\\', 'r') + case '\t': + w.twoBytes('\\', 't') + default: + // This encodes bytes < 0x20 except for \t, \n and \r. + // If escapeHTML is set, it also escapes <, >, and & + // because they can lead to security holes when + // user-controlled strings are rendered into JSON + // and served to some browsers. + w.rawStr(`\u00`) + w.twoBytes(hexChars[b>>4], hexChars[b&0xF]) + } + i++ + start = i + continue + } + c, size := decodeRuneInByteseq(v[i:]) + if c == utf8.RuneError && size == 1 { + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + w.rawStr(`\ufffd`) + i++ + start = i + continue + } + // U+2028 is LINE SEPARATOR. + // U+2029 is PARAGRAPH SEPARATOR. + // They are both technically valid characters in JSON strings, + // but don't work in JSONP, which has to be evaluated as JavaScript, + // and can lead to security holes there. It is valid JSON to + // escape them, so we do so unconditionally. + // See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion. + if c == '\u2028' || c == '\u2029' { + if start < i { + w.Buf = append(w.Buf, v[start:i]...) + } + w.rawStr(`\u202`) + w.byte(hexChars[c&0xF]) + i += size + start = i + continue + } + i += size + } + if start < len(v) { + w.Buf = append(w.Buf, v[start:]...) + } + w.byte('"') +}