From 895291cf4524385c0d42b56e173a6022fee9b400 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 27 Jul 2023 00:57:26 -0700 Subject: [PATCH] Add xerial snappy read/writer (#838) Forked from [github.com/eapache/go-xerial-snappy](https://github.com/eapache/go-xerial-snappy). Changes: * Uses [S2](https://github.com/klauspost/compress/tree/master/s2#snappy-compatibility) for better/faster compression and decompression. * Fixes 0-length roundtrips. * Adds `DecodeCapped`, which allows decompression with capped output size. * `DecodeInto` will decode directly into destination if there is space enough. * `Encode` will now encode directly into 'dst' if it has space enough. * Fixes short snappy buffers returning `ErrMalformed`. * Renames `EncodeStream` to `Encode`. * Adds `EncodeBetter` for better than default compression at ~half the speed. Comparison (before/after): ``` BenchmarkSnappyStreamEncode-32 959010 1170 ns/op 875.15 MB/s 1280 B/op 1 allocs/op BenchmarkSnappyStreamEncode-32 1000000 1107 ns/op 925.04 MB/s 0 B/op 0 allocs/op --> Output size: 913 -> 856 bytes BenchmarkSnappyStreamEncodeBetter-32 477739 2506 ns/op 408.62 MB/s 0 B/op 0 allocs/op --> Output size: 835 bytes BenchmarkSnappyStreamEncodeMassive-32 100 10596963 ns/op 966.31 MB/s 40977 B/op 1 allocs/op BenchmarkSnappyStreamEncodeMassive-32 100 10220236 ns/op 1001.93 MB/s 0 B/op 0 allocs/op --> Output size: 2365547 -> 2256991 bytes BenchmarkSnappyStreamEncodeBetterMassive-32 69 16983314 ns/op 602.94 MB/s 0 B/op 0 allocs/op --> Output size: 2011997 bytes BenchmarkSnappyStreamDecodeInto-32 1887378 639.5 ns/op 1673.19 MB/s 1088 B/op 3 allocs/op BenchmarkSnappyStreamDecodeInto-32 2707915 436.2 ns/op 2452.99 MB/s 0 B/op 0 allocs/op BenchmarkSnappyStreamDecodeIntoMassive-32 267 4559594 ns/op 2245.81 MB/s 71120 B/op 1 allocs/op BenchmarkSnappyStreamDecodeIntoMassive-32 282 4285844 ns/op 2389.26 MB/s 0 B/op 0 allocs/op ``` --- snappy/xerial/LICENSE | 22 ++ snappy/xerial/README.md | 50 +++++ snappy/xerial/fuzz_test.go | 64 ++++++ snappy/xerial/testdata/FuzzDecoder.zip | Bin 0 -> 19797 bytes snappy/xerial/xerial.go | 262 +++++++++++++++++++++++ snappy/xerial/xerial_test.go | 276 +++++++++++++++++++++++++ 6 files changed, 674 insertions(+) create mode 100644 snappy/xerial/LICENSE create mode 100644 snappy/xerial/README.md create mode 100644 snappy/xerial/fuzz_test.go create mode 100644 snappy/xerial/testdata/FuzzDecoder.zip create mode 100644 snappy/xerial/xerial.go create mode 100644 snappy/xerial/xerial_test.go diff --git a/snappy/xerial/LICENSE b/snappy/xerial/LICENSE new file mode 100644 index 0000000000..a97223897a --- /dev/null +++ b/snappy/xerial/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2016 Evan Huus +Copyright (c) 2023 Klaus Post + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/snappy/xerial/README.md b/snappy/xerial/README.md new file mode 100644 index 0000000000..08d4332897 --- /dev/null +++ b/snappy/xerial/README.md @@ -0,0 +1,50 @@ +# go-xerial-snappy + +Xerial-compatible Snappy framing support for golang. + +Packages using Xerial for snappy encoding use a framing format incompatible with +basically everything else in existence. + +Apps that use this format include Apache Kafka (see +https://github.com/dpkp/kafka-python/issues/126#issuecomment-35478921 for +details). + +# Fork + +Forked from [github.com/eapache/go-xerial-snappy](https://github.com/eapache/go-xerial-snappy). + +Changes: + +* Uses [S2](https://github.com/klauspost/compress/tree/master/s2#snappy-compatibility) for better/faster compression and decompression. +* Fixes 0-length roundtrips. +* Adds `DecodeCapped`, which allows decompression with capped output size. +* `DecodeInto` will decode directly into destination if there is space enough. +* `Encode` will now encode directly into 'dst' if it has space enough. +* Fixes short snappy buffers returning `ErrMalformed`. +* Renames `EncodeStream` to `Encode`. +* Adds `EncodeBetter` for better than default compression at ~half the speed. + + +Comparison (before/after): + +``` +BenchmarkSnappyStreamEncode-32 959010 1170 ns/op 875.15 MB/s 1280 B/op 1 allocs/op +BenchmarkSnappyStreamEncode-32 1000000 1107 ns/op 925.04 MB/s 0 B/op 0 allocs/op +--> Output size: 913 -> 856 bytes + +BenchmarkSnappyStreamEncodeBetter-32 477739 2506 ns/op 408.62 MB/s 0 B/op 0 allocs/op +--> Output size: 835 bytes + +BenchmarkSnappyStreamEncodeMassive-32 100 10596963 ns/op 966.31 MB/s 40977 B/op 1 allocs/op +BenchmarkSnappyStreamEncodeMassive-32 100 10220236 ns/op 1001.93 MB/s 0 B/op 0 allocs/op +--> Output size: 2365547 -> 2256991 bytes + +BenchmarkSnappyStreamEncodeBetterMassive-32 69 16983314 ns/op 602.94 MB/s 0 B/op 0 allocs/op +--> Output size: 2011997 bytes + +BenchmarkSnappyStreamDecodeInto-32 1887378 639.5 ns/op 1673.19 MB/s 1088 B/op 3 allocs/op +BenchmarkSnappyStreamDecodeInto-32 2707915 436.2 ns/op 2452.99 MB/s 0 B/op 0 allocs/op + +BenchmarkSnappyStreamDecodeIntoMassive-32 267 4559594 ns/op 2245.81 MB/s 71120 B/op 1 allocs/op +BenchmarkSnappyStreamDecodeIntoMassive-32 282 4285844 ns/op 2389.26 MB/s 0 B/op 0 allocs/op +``` \ No newline at end of file diff --git a/snappy/xerial/fuzz_test.go b/snappy/xerial/fuzz_test.go new file mode 100644 index 0000000000..b03e5bb455 --- /dev/null +++ b/snappy/xerial/fuzz_test.go @@ -0,0 +1,64 @@ +package xerial + +import ( + "bytes" + "testing" + + "github.com/klauspost/compress/internal/fuzz" + "github.com/klauspost/compress/s2" +) + +func FuzzDecode(f *testing.F) { + fuzz.AddFromZip(f, "testdata/FuzzDecoder.zip", fuzz.TypeGoFuzz, false) + const limit = 1 << 20 + dst := make([]byte, 0, limit) + f.Fuzz(func(t *testing.T, data []byte) { + got, _ := DecodeCapped(dst[:0], data) + if len(got) > cap(dst) { + t.Fatalf("cap exceeded: %d > %d", len(got), cap(dst)) + } + }) +} + +func FuzzEncode(f *testing.F) { + fuzz.AddFromZip(f, "../../s2/testdata/enc_regressions.zip", fuzz.TypeRaw, false) + fuzz.AddFromZip(f, "../../s2/testdata/fuzz/block-corpus-raw.zip", fuzz.TypeRaw, testing.Short()) + fuzz.AddFromZip(f, "../../s2/testdata/fuzz/block-corpus-enc.zip", fuzz.TypeGoFuzz, testing.Short()) + + f.Fuzz(func(t *testing.T, data []byte) { + t.Run("standard", func(t *testing.T) { + encoded := Encode(make([]byte, 0, len(data)/2), data) + decoded, err := Decode(encoded) + if err != nil { + t.Errorf("input: %+v, encoded: %+v", data, encoded) + t.Fatal(err) + } + if !bytes.Equal(decoded, data) { + t.Fatal("mismatch") + } + + }) + t.Run("better", func(t *testing.T) { + encoded := EncodeBetter(make([]byte, 0, len(data)/2), data) + decoded, err := Decode(encoded) + if err != nil { + t.Errorf("input: %+v, encoded: %+v", data, encoded) + t.Fatal(err) + } + if !bytes.Equal(decoded, data) { + t.Fatal("mismatch") + } + }) + t.Run("snappy", func(t *testing.T) { + encoded := s2.EncodeSnappy(make([]byte, 0, len(data)/2), data) + decoded, err := Decode(encoded) + if err != nil { + t.Errorf("input: %+v, encoded: %+v", data, encoded) + t.Fatal(err) + } + if !bytes.Equal(decoded, data) { + t.Fatal("mismatch") + } + }) + }) +} diff --git a/snappy/xerial/testdata/FuzzDecoder.zip b/snappy/xerial/testdata/FuzzDecoder.zip new file mode 100644 index 0000000000000000000000000000000000000000..e5eeec822ac97342f06803596bc552d9c1ea8590 GIT binary patch literal 19797 zcmbta4VYC`xjw@TBjBh5A`^)=3hoy zIw(vdA}M0=V0OKN+P6Z-!L*aC^sb6bM2u;Pk;AAmz zts)(}%5q&Nj$F?Yfh!!rnPYjer95c~#bp$Ep6&Z{e{#XZ*^{R%Xg__&_Rfjo)wS#A zZ$D%93mt3wcd}uBy`pJI-@1Or+MeiL`S)nf=3GaKGW&M_>^mQY5?2+KQMPTn4wJFV zE#Hd0STW0Cg1J)okuABjl?)tTDG>^r2g(g?R|b~lv%uAU^v@_`jUQI`(e+=ucFOv3 z`|P(TEgxj)c3Rh?_2)ycOX~`T(t)xg-}gORIxOH)`O4NTv~3~0Adr4wam^j2T&7Ti@EDXBJdfH2c}z3_Y++cf7Lv$cg@mK3-A8T%cHN~)_BK9e;>5Gn^*S9F@yHD zJuvynWwyf z$37Qv==;o%{McrG5C)Nyu@FI!rSjE}U3X^l%spMBM^7v>+nznmb)As^eo==|_=;(5 ziCDz0c5O?$Q5?m>_8>9#L+-?mbS>sPTKgh&J*BOXTb>Pw7pN>9O8skKW^1zskDqT` z^U^QWT{I24ZY<~t79#^(|tf>29Zv?pk>U)S6sSO)F`^Qg&op zh|?$xkTj%?u<;!PFBI@B3o#N{u0X!=bjU)B`w=u0q12JgHeLIi8~fYm*Y92a##?U; zzPr8b66==#W$9VaA}DAQ3a0R<&+IUaBCUADg4ppa9SJF{FbKFSBJFF*W67P+wYldq zD-s?q(2ls3tx0)OOI_mnvIMuQ@5o^%K&dfB#cXKhd6Gv63lRmb#UtBQNV__UEiaNW zw{pClz{M57ft|=f z+J~!a+jlt+p@zknBL!FRYUL=#rSC|kn8=b`8i(-fhQBqInPJVEo_pJ&$UTKz8cEBF zq!6~WSsWuVahSjhS>X8$UIb5-jY!Vn44XKsjmV zSZGu$Bn%75Ln0C*C&;|4*7kWnK6l=E=XaD{+K(FMJv_G zf5apYsx2+32O)^#FlM%p%3+kUg8-2hD5N9LQGpeD&>uBg1wPD%+#2{n9HAD4o?`aE zt}&l0lVV+VZC&TQH=-39O)O{?gl^<`uH##p{(w`*E(O0Y7<5xwBC#{Y+&Dn%z`{6` zv5b()xy@Y8EIlx-$}eyEmoI;mD&KPjJv0}N7ik-Ud^nA;+<>``$09q7(LyM6S`sPT z31q|rq(vU1v9V($Y4kfbV_AA!{n+Jiy#4O!x^+Wm(AxQY&y{0VpF8)4zVY`w`qD97 z_a8r79=CAtzFh032ai9c>G`j29^y1l`Rm2+Us#qj4}SODZ;{Q@pM(Y{khl(=2V#;^KadM-Z{|W z*9AS06OpAn+b8G;aQkLcvwWpaSYJ3BeV zn{O=d2A4H{=Ds#4*i%%}V;nUGSuApqDqT-V2UQHW>G?vsfzKm1l<2aU?>e@HwnsQ_ zj=5#%j9bfMaaPxL!|5(37ZSav z+*r!U(vo@TNE|0dW57MseDs%yI@D*Ov6TQwPG2ETE+wHR(yM1;=ac34ZE~&Qh77ocm8`jJ^~GQ70f9RfkH_xTrN?fF^niHV2)-s zg3xu5+_>*X$ab+~;bJk2gbs>5bd**o+^Qj(8rJsEQ1p7xw!&+jb%n*2&OX+#;3-R= z`{vouYe@ZJ3)-R59`02-XzfGK3M@NDDpQ`P8OL~u;nLU2Wf&$s9&^oDgnVaf5iq5A z{4eMwTV=yjkAJ=_nLK#=FIs0ovrF-7!_RS|(AGXOT8yrOe#&;?cW48-wp`RAv=|zF z5jD?9z&3hAD9d7@lvH62N%*E=eQd^?-+1?$vN7$UEx`q(+f(?_-=&Q~O~+9XMUrU; zH_A{5tOytEqD;je6?jW~n7$Dq=m1UL1U-^#L4TYB$VuDA4`O<+&A|(A7 zOA|~$eyGv+AO>Qzxd>r&vccmN?%#3oMKZP>HNeHnz!ZMCY0M*2PCNg$ub=$X zk5Br3(Ea@l6O4#Hq2#wW|d?jv*^E|F0j+vcvsRDzw0 z8)8_FFuX@GMkkD>G5#Z!nG>-(_@IJp8hTIX)UK^%x_s`KvrZx%id`_)LD&}9Rv2mR zL|6}n=zp=Gz-j}t6vEXCQE0J{vBocJy-Tp_(4rF_*fVuH)S*2_Gk&g*gil))%q9+6 z@=X7qG=J_~aTCVtoBCP%JyXt|bIoHHpV>EP%%_KrrA4Nxpaja;KsO>6NZv;YkZ$ZC-fmvf)kP*r&QaKKk!| zW5WzuD&3`~H(fjCs~2DaRvg-1zPD6?r+W@9THOAvi!BJASx_KEO^i`bun&o`oEzA5 z(No_%cIepamyMr4CG~xS@qITedHt9GNB^>Ld?ein;|pBh4>1;3`#zYmeb>7F_6KvX z@Rp&ktqM2nZP~SIRej%^*S>#Y%d`!1*6e(~?VXW*Yj1CvuyLBxu=qH2#b2kbR+|?d z`=5=wpSti5FMXr*b1%E$(4sScaQ0`%L6?&Y`Y?%}DnJX0$tyzk$h_eG_6NH5zM5Wd zeiEP8`@j3%qkxZ!$A#E-MQDQ1J)rPprfuGbb#88WZ~MO7&3ncl{q?DBvu`^xWx9XI z*$0n$|2lxAgxzbx%jHA2OKJ?J3 zIF{C&rzG_O;Ih44 z@R_S_zLO3n-_IY6JtW#bPoUzV->Z7?F4mK4d7f_@^kiqLOOVF#X$R-fKuA*r9n6un z948Q%pRvBnQ$*J3NbhY4m$O!)w6JQy&WsMd#3fQVQc>M!>K(RCLEu- zK*}#`hg@?ysZGsL-W*u(VqcB{4ns|q`JBvwpKW>@nc}LImn@9s zi=C{xq_m_Hk3QpzTgV$|&zILS#sdTfEwn=@L^?C?^PQ}1@bJccQ|Ar- zh`P6eo4vd9qIc4&3Us8A??rY@?*=UUU`h~er>rYgmG11@SB^MH#w>n3W=Ow}D$&s%9@h=@^TvHB8-go~kJ1Jmk3z}Cl4lP@_ zO5)p@A6Gr`gUKvktnO%gt<`Jm<>58F%Y<~Pv zgXV1hM9RD9$}XSYT~FuIkJQF83r}aXf={Ko&3bfzgkB&m|8g5!7Ldv>~%0(V6B(!x|dpy=d795agDtlO@=m8;*fX&5?Lqd?HXk4}GM9`%?pX0p7tSanG|M$3!oi}9*Sx5SCp|1g*{viH`e zjWe3Z47qOX#BMdN>*XU#-GoAW$DN}NQVK3sdk1YZy&B==+0_`Fs+xzfhQNqzD)=|t z_ReW!oubL{s2pHPCGY|wd<dsuCy>yWM6mSi__8?V^P2Z z*LM)!imKc{I2FC+h~@niG$#M^a~EogZz}v`VVCyN5r!XPr&ak~sFDD{e2T zqSOsf`uds`6yHUsa3x9zmMcPFI~7G$odVtP;V#g~8{`wAyXicN?_%qNE=^-`K|3hV z#>-YUjal2cruk;?Lnb~atb3JuU@~*w40znd6DHm}@UF&Xwi)1P$1R^w8fJ9hO?Um@ zB>MVK6%Wi0*?20&dl#Mqs#MV8EYS0bF%6|viAI(`Pg^jN)|VZ{b7L%~H4#YASYz3d z_4SmG&3u8^4&BRDY^|o2LF~9r*>VB*`FPtiU9ZN5+t$mu--Igye7U<(3Cm zRj4Q*TluLS6d!YnCu6e@dWVlD9*+&#TP5DXdRA>3RzLmoqqoi~9gxV8Pjhz;vitcri|(G-wM>_PCL+eTOqe zKJNPJgXuHO4u*{U%;w1p(%l;hD%uwYP6>Je;bjK);a1?%K^uCzzif1GWC}5T<=@j; z2=BlawqTKkwKljuRf`qnicxLP8uv!I?UyU=yqMyynA`BqhIbDT6MPooi7PXnVU6~B zRewc&OGoK^M8W*G)7Sl$+)Ng3%@+bs1I2K|>ZV1MBq@&RzL3uF zG7fs0$0KlAs_IJBetKzh;$MGgQR{)HH&UGKDjtcKP`orFNd{n~WjhwNQ{wR>M*il$ zO~Xb{-9Bu!H){47bIjEj&3VnA{_vILWX0u-W{!iJj|USUhR!bd6v?TQSE@R7f(J$% z_!?z1T0N6BG#>l{n;ty`co*5d_u=G&eck9iu94Ho(fsWv+(-^lES(xJDID7=4=*}c zQ)MqCsA(OtY{;kurFoJr?&|IrPN$12CSN?-2=GOhW5F#()#SKXS=tSoQPtJMGV+1crsK@ zwM9((gJlajy7Vtk@D9<37qd$Qsu`X!w7@$k4zFHwdxkF`Wu^hDj$gRplTCCz6kl{j z2q-*wVAY3*KnI+}tSidbuv7!GNq1LpfHxoMT|seGY^3qd>|<-B9OADAVP-+6yDCm? zgLW?}ch*-{Z{J1ETD-5AGny)Fj*Wo@CP!7Z8841Nb%RFcvPsrBKUbG)p}*mOwNkD- z=9Wq`S=1!su(X>=T8*4)Q<}+(PNYr3S|F~GC2dMGDb0zr7a^@yg0m^jBrhk@x(7j8 zjZ|h+nn^uQq&3&Wk~I>MO=%{PIFYuKq}50(Hl>-|;6&Qo1{ALvnZl+tlmDAYdjZmF zWdobiOj2(m?UKQeRwKpNlx7ln6KSvKrPW63HKmzM-9*}F8W90C@^(#WCgV1dHueb2 zDmC(NO=%{(Hj#EGq}9r`HKmzk*+kmeLoj^QNS!sMnIzam+FnSjl@e=8GpVkLv?W6k z-8B+lO=%`kHIX)K7^d+WX{)9*la885+W={`l2c7-Ca*M+<{k-YHL^@iX(mrJk@f;f ztC2NoN;AoziL?uw5ZyIWLrrNWt22>Se-!%08abY(G?ST`NZUcuYUF8}(oDW(B5mf; z=(uZSUz*ZP4rLasm@+TOh4ghM+0UY_LQKfoKQ}JN~0^0 zn-X*{p$)228cniI5>tYHF^?{#G?{3VsFa}PiRj%7+LDPj=|~BBJ)sS9l8H8XM+y2B zLK`e36K#@<67<<80c}u?(rEHjlPQ#-cM#g(4W-ePkxfoeg02JI+GGfsXp{JppqCKZ zpaG@Pl_)+5`kRC{2tR2weYZ*CNzlVu&^s8Eo;11=Stmhv6WXBeq|xN5CLJe1e~-`x zIVX*-M7c@Oqs9Sk5OC6Hx-pYjlb|mlv_Z2;qbo6J67&;!bSa<4M4NP(1U>Q;$Tr9` zX>=u0OoCoPXoDh?MpvS`B=v_NrLwAHfC~B(rEf_lVp;hZvon*oTSkdVO7)67)7g8&r%mnlv{l z776;jJi3%{kw#OLm}H6s-3dmFNv%kuNw!IuNYLAXHVG7IbR~8~f_|6K23I1DuEcyu z&`UuCFnJJZbR~L2f_|9L2H7EvrW-R^3<>%eFjGuULmEw1Gx-V$S`*q}FQm~V+oUEW z=x+mU5*19e$w5faN8-8LWF(}~q`65pNYLGRbSd?~M4QBd1pNS^4Vpn3P2X*j2oiJy zcm*b= needLen { + // Encode directly into dst + dstStart := len(dst) + 4 // Start offset in dst + dstSizePos := dst[len(dst):dstStart] // Reserve space for compressed size + dstEnd := len(dst) + needLen // End offset in dst + // Compress into dst and get actual size. + actual := s2.EncodeSnappy(dst[dstStart:dstEnd], src[pos:newPos]) + // Update dst size + dst = dst[:dstStart+len(actual)] + // Store compressed size + binary.BigEndian.PutUint32(dstSizePos, uint32(len(actual))) + } else { + chunk = s2.EncodeSnappy(chunk[:cap(chunk)], src[pos:newPos]) + origLen := len(dst) + // First encode the compressed size (big-endian) + // Put* panics if the buffer is too small, so pad 4 bytes first + dst = append(dst, dst[0:4]...) + binary.BigEndian.PutUint32(dst[origLen:], uint32(len(chunk))) + // And now the compressed data + dst = append(dst, chunk...) + } + pos = newPos + } + return dst +} + +// EncodeBetter *appends* to the specified 'dst' the compressed +// 'src' in xerial framing format. If 'dst' does not have enough +// capacity, then a new slice will be allocated. If 'dst' has +// non-zero length, then if *must* have been built using this function. +func EncodeBetter(dst, src []byte) []byte { + if len(dst) == 0 { + dst = append(dst, xerialHeader...) + dst = append(dst, xerialVersionInfo...) + } + + // Snappy encode in blocks of maximum 32KB + var ( + max = len(src) + blockSize = 32 * 1024 + pos = 0 + chunk []byte + ) + + for pos < max { + newPos := min(pos+blockSize, max) + // Find maximum length we need + needLen := s2.MaxEncodedLen(newPos-pos) + 4 + if cap(dst)-len(dst) >= needLen { + // Encode directly into dst + dstStart := len(dst) + 4 // Start offset in dst + dstSizePos := dst[len(dst):dstStart] // Reserve space for compressed size + dstEnd := len(dst) + needLen // End offset in dst + // Compress into dst and get actual size. + actual := s2.EncodeSnappyBetter(dst[dstStart:dstEnd], src[pos:newPos]) + // Update dst size + dst = dst[:dstStart+len(actual)] + // Store compressed size + binary.BigEndian.PutUint32(dstSizePos, uint32(len(actual))) + } else { + chunk = s2.EncodeSnappyBetter(chunk[:cap(chunk)], src[pos:newPos]) + origLen := len(dst) + // First encode the compressed size (big-endian) + // Put* panics if the buffer is too small, so pad 4 bytes first + dst = append(dst, dst[0:4]...) + binary.BigEndian.PutUint32(dst[origLen:], uint32(len(chunk))) + // And now the compressed data + dst = append(dst, chunk...) + } + pos = newPos + } + return dst +} + +func min(x, y int) int { + if x < y { + return x + } + return y +} + +const ( + sizeOffset = 16 + sizeBytes = 4 +) + +// Decode decodes snappy data whether it is traditional unframed +// or includes the xerial framing format. +func Decode(src []byte) ([]byte, error) { + return DecodeInto(nil, src) +} + +// DecodeInto decodes snappy data whether it is traditional unframed +// or includes the xerial framing format into the specified `dst`. +// It is assumed that the entirety of `dst` including all capacity is available +// for use by this function. If `dst` is nil *or* insufficiently large to hold +// the decoded `src`, new space will be allocated. +// To never allocate bigger destination, use DecodeCapped. +func DecodeInto(dst, src []byte) ([]byte, error) { + var max = len(src) + + if max < len(xerialHeader) || !bytes.Equal(src[:8], xerialHeader) { + dst, err := s2.Decode(dst[:cap(dst)], src) + if err != nil { + return dst, ErrMalformed + } + return dst, nil + } + if max == sizeOffset { + return []byte{}, nil + } + if max < sizeOffset+sizeBytes { + return nil, ErrMalformed + } + if len(dst) > 0 { + dst = dst[:0] + } + var ( + pos = sizeOffset + chunk []byte + ) + + for pos+sizeBytes <= max { + size := int(binary.BigEndian.Uint32(src[pos : pos+sizeBytes])) + pos += sizeBytes + + nextPos := pos + size + // On architectures where int is 32-bytes wide size + pos could + // overflow so we need to check the low bound as well as the + // high + if nextPos < pos || nextPos > max { + return nil, ErrMalformed + } + nextLen, err := s2.DecodedLen(src[pos:nextPos]) + if err != nil { + return nil, err + } + if cap(dst)-len(dst) >= nextLen { + // Decode directly into dst + dstStart := len(dst) + dstEnd := dstStart + nextLen + _, err = s2.Decode(dst[dstStart:dstEnd], src[pos:nextPos]) + if err != nil { + return nil, err + } + dst = dst[:dstEnd] + } else { + chunk, err = s2.Decode(chunk[:cap(chunk)], src[pos:nextPos]) + if err != nil { + return nil, err + } + dst = append(dst, chunk...) + } + pos = nextPos + } + return dst, nil +} + +var ErrDstTooSmall = errors.New("destination buffer too small") + +// DecodeCapped decodes snappy data whether it is traditional unframed +// or includes the xerial framing format into the specified `dst`. +// It is assumed that the entirety of `dst` including all capacity is available +// for use by this function. If `dst` is nil *or* insufficiently large to hold +// the decoded `src`, ErrDstTooSmall is returned. +func DecodeCapped(dst, src []byte) ([]byte, error) { + var max = len(src) + if dst == nil { + return nil, ErrDstTooSmall + } + if max < len(xerialHeader) || !bytes.Equal(src[:8], xerialHeader) { + l, err := s2.DecodedLen(src) + if err != nil { + return nil, ErrMalformed + } + if l > cap(dst) { + return nil, ErrDstTooSmall + } + return s2.Decode(dst[:cap(dst)], src) + } + dst = dst[:0] + if max == sizeOffset { + return dst, nil + } + if max < sizeOffset+sizeBytes { + return nil, ErrMalformed + } + pos := sizeOffset + + for pos+sizeBytes <= max { + size := int(binary.BigEndian.Uint32(src[pos : pos+sizeBytes])) + pos += sizeBytes + + nextPos := pos + size + // On architectures where int is 32-bytes wide size + pos could + // overflow so we need to check the low bound as well as the + // high + if nextPos < pos || nextPos > max { + return nil, ErrMalformed + } + nextLen, err := s2.DecodedLen(src[pos:nextPos]) + if err != nil { + return nil, err + } + if cap(dst)-len(dst) < nextLen { + return nil, ErrDstTooSmall + } + // Decode directly into dst + dstStart := len(dst) + dstEnd := dstStart + nextLen + _, err = s2.Decode(dst[dstStart:dstEnd], src[pos:nextPos]) + if err != nil { + return nil, err + } + dst = dst[:dstEnd] + pos = nextPos + } + return dst, nil +} diff --git a/snappy/xerial/xerial_test.go b/snappy/xerial/xerial_test.go new file mode 100644 index 0000000000..acfca868a4 --- /dev/null +++ b/snappy/xerial/xerial_test.go @@ -0,0 +1,276 @@ +package xerial + +import ( + "bytes" + "math/rand" + "testing" + + "github.com/klauspost/compress/s2" +) + +const largeString = `Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur? At vero eos et accusamus et iusto odio dignissimos ducimus qui blanditiis praesentium voluptatum deleniti atque corrupti quos dolores et quas molestias except` + +var snappyStreamTestCases = map[string][]byte{ + "PLAINDATA": {130, 83, 78, 65, 80, 80, 89, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 11, 9, 32, 80, 76, 65, 73, 78, 68, 65, 84, 65}, + `{"a":"UtaitILHMDAAAAfU","b":"日本"}`: {130, 83, 78, 65, 80, 80, 89, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 39, 37, 144, 123, 34, 97, 34, 58, 34, 85, 116, 97, 105, 116, 73, 76, 72, 77, 68, 65, 65, 65, 65, 102, 85, 34, 44, 34, 98, 34, 58, 34, 230, 151, 165, 230, 156, 172, 34, 125}, + largeString: {130, 83, 78, 65, 80, 80, 89, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 3, 89, 128, 8, 240, 90, 83, 101, 100, 32, 117, 116, 32, 112, 101, 114, 115, 112, 105, 99, 105, 97, 116, 105, 115, 32, 117, 110, 100, 101, 32, 111, 109, 110, 105, 115, 32, 105, 115, 116, 101, 32, 110, 97, 116, 117, 115, 32, 101, 114, 114, 111, 114, 32, 115, 105, 116, 32, 118, 111, 108, 117, 112, 116, 97, 116, 101, 109, 32, 97, 99, 99, 117, 115, 97, 110, 116, 105, 117, 109, 32, 100, 111, 108, 111, 114, 101, 109, 113, 117, 101, 32, 108, 97, 117, 100, 97, 5, 22, 240, 60, 44, 32, 116, 111, 116, 97, 109, 32, 114, 101, 109, 32, 97, 112, 101, 114, 105, 97, 109, 44, 32, 101, 97, 113, 117, 101, 32, 105, 112, 115, 97, 32, 113, 117, 97, 101, 32, 97, 98, 32, 105, 108, 108, 111, 32, 105, 110, 118, 101, 110, 116, 111, 114, 101, 32, 118, 101, 114, 105, 116, 97, 1, 141, 4, 101, 116, 1, 36, 88, 115, 105, 32, 97, 114, 99, 104, 105, 116, 101, 99, 116, 111, 32, 98, 101, 97, 116, 97, 101, 32, 118, 105, 1, 6, 120, 100, 105, 99, 116, 97, 32, 115, 117, 110, 116, 32, 101, 120, 112, 108, 105, 99, 97, 98, 111, 46, 32, 78, 101, 109, 111, 32, 101, 110, 105, 109, 5, 103, 0, 109, 46, 180, 0, 12, 113, 117, 105, 97, 17, 16, 0, 115, 5, 209, 72, 97, 115, 112, 101, 114, 110, 97, 116, 117, 114, 32, 97, 117, 116, 32, 111, 100, 105, 116, 5, 9, 36, 102, 117, 103, 105, 116, 44, 32, 115, 101, 100, 9, 53, 32, 99, 111, 110, 115, 101, 113, 117, 117, 110, 1, 42, 20, 109, 97, 103, 110, 105, 32, 9, 245, 16, 115, 32, 101, 111, 115, 1, 36, 28, 32, 114, 97, 116, 105, 111, 110, 101, 17, 96, 33, 36, 1, 51, 36, 105, 32, 110, 101, 115, 99, 105, 117, 110, 116, 1, 155, 1, 254, 16, 112, 111, 114, 114, 111, 1, 51, 36, 115, 113, 117, 97, 109, 32, 101, 115, 116, 44, 1, 14, 13, 81, 5, 183, 4, 117, 109, 1, 18, 0, 97, 9, 19, 4, 32, 115, 1, 149, 12, 109, 101, 116, 44, 9, 135, 76, 99, 116, 101, 116, 117, 114, 44, 32, 97, 100, 105, 112, 105, 115, 99, 105, 32, 118, 101, 108, 50, 173, 0, 24, 110, 111, 110, 32, 110, 117, 109, 9, 94, 84, 105, 117, 115, 32, 109, 111, 100, 105, 32, 116, 101, 109, 112, 111, 114, 97, 32, 105, 110, 99, 105, 100, 33, 52, 20, 117, 116, 32, 108, 97, 98, 33, 116, 4, 101, 116, 9, 106, 0, 101, 5, 219, 20, 97, 109, 32, 97, 108, 105, 5, 62, 33, 164, 8, 114, 97, 116, 29, 212, 12, 46, 32, 85, 116, 41, 94, 52, 97, 100, 32, 109, 105, 110, 105, 109, 97, 32, 118, 101, 110, 105, 33, 221, 72, 113, 117, 105, 115, 32, 110, 111, 115, 116, 114, 117, 109, 32, 101, 120, 101, 114, 99, 105, 33, 202, 104, 111, 110, 101, 109, 32, 117, 108, 108, 97, 109, 32, 99, 111, 114, 112, 111, 114, 105, 115, 32, 115, 117, 115, 99, 105, 112, 105, 13, 130, 8, 105, 111, 115, 1, 64, 12, 110, 105, 115, 105, 1, 150, 5, 126, 44, 105, 100, 32, 101, 120, 32, 101, 97, 32, 99, 111, 109, 5, 192, 0, 99, 41, 131, 33, 172, 8, 63, 32, 81, 1, 107, 4, 97, 117, 33, 101, 96, 118, 101, 108, 32, 101, 117, 109, 32, 105, 117, 114, 101, 32, 114, 101, 112, 114, 101, 104, 101, 110, 100, 101, 114, 105, 65, 63, 12, 105, 32, 105, 110, 1, 69, 16, 118, 111, 108, 117, 112, 65, 185, 1, 47, 24, 105, 116, 32, 101, 115, 115, 101, 1, 222, 64, 109, 32, 110, 105, 104, 105, 108, 32, 109, 111, 108, 101, 115, 116, 105, 97, 101, 46, 103, 0, 0, 44, 1, 45, 16, 32, 105, 108, 108, 117, 37, 143, 45, 36, 0, 109, 5, 110, 65, 33, 20, 97, 116, 32, 113, 117, 111, 17, 92, 44, 115, 32, 110, 117, 108, 108, 97, 32, 112, 97, 114, 105, 9, 165, 24, 65, 116, 32, 118, 101, 114, 111, 69, 34, 44, 101, 116, 32, 97, 99, 99, 117, 115, 97, 109, 117, 115, 1, 13, 104, 105, 117, 115, 116, 111, 32, 111, 100, 105, 111, 32, 100, 105, 103, 110, 105, 115, 115, 105, 109, 111, 115, 32, 100, 117, 99, 105, 1, 34, 80, 113, 117, 105, 32, 98, 108, 97, 110, 100, 105, 116, 105, 105, 115, 32, 112, 114, 97, 101, 115, 101, 101, 87, 17, 111, 56, 116, 117, 109, 32, 100, 101, 108, 101, 110, 105, 116, 105, 32, 97, 116, 65, 89, 28, 99, 111, 114, 114, 117, 112, 116, 105, 1, 150, 0, 115, 13, 174, 5, 109, 8, 113, 117, 97, 65, 5, 52, 108, 101, 115, 116, 105, 97, 115, 32, 101, 120, 99, 101, 112, 116, 0, 0, 0, 1, 0}, +} + +func makeMassive(input string, numCopies int) string { + outBuff := make([]byte, len(input)*numCopies) + + for i := 0; i < numCopies; i++ { + copy(outBuff[len(outBuff):], input) + } + + return string(outBuff) +} + +func TestSnappyEncodeStream(t *testing.T) { + for src := range snappyStreamTestCases { + dst := Encode(nil, []byte(src)) + + // Block size can change the bytes generated, so let's just decode and make sure in matches out + dec, err := Decode(dst) + if err != nil { + t.Error(err) + } + if src != string(dec) { + t.Errorf("Expected decode to match encode orig = %s, decoded = %s", src, string(dec)) + } + } +} + +func TestSnappyLargeStringEncodeStream(t *testing.T) { + massiveString := makeMassive(largeString, 10000) + dst := Encode(nil, []byte(massiveString)) + dec, err := Decode(dst) + if err != nil { + t.Error(err) + } + if massiveString != string(dec) { + t.Errorf("Decoded string didn't match original input (not printing due to size)") + } +} + +func TestSnappyDecodeStreams(t *testing.T) { + for exp, src := range snappyStreamTestCases { + dst, err := Decode(src) + if err != nil { + t.Error("Encoding error: ", err) + } else if !bytes.Equal(dst, []byte(exp)) { + t.Errorf("Expected %s to be generated from [%d]byte, but was %s", exp, len(src), string(dst)) + } + } +} + +func TestSnappyDecodeMalformedTruncatedHeader(t *testing.T) { + // Truncated headers should not cause a panic. + for i := 0; i < len(xerialHeader); i++ { + buf := make([]byte, i) + copy(buf, xerialHeader[:i]) + if _, err := Decode(buf); err != ErrMalformed { + t.Errorf("expected ErrMalformed got %v", err) + } + } +} + +func TestSnappyDecodeMalformedTruncatedSize(t *testing.T) { + // Inputs with valid Xerial header but truncated "size" field + sizes := []int{sizeOffset + 1, sizeOffset + 2, sizeOffset + 3} + for _, size := range sizes { + buf := make([]byte, size) + copy(buf, xerialHeader) + if _, err := Decode(buf); err != ErrMalformed { + t.Errorf("expected ErrMalformed got %v", err) + } + } +} + +func TestSnappyDecodeMalformedBNoData(t *testing.T) { + // No data after the size field + buf := make([]byte, 20) + copy(buf, xerialHeader) + // indicate that there's one byte of data to be read + buf[len(buf)-1] = 1 + if _, err := Decode(buf); err != ErrMalformed { + t.Errorf("expected ErrMalformed got %v", err) + } +} + +func TestSnappyMasterDecodeFailed(t *testing.T) { + buf := make([]byte, 21) + copy(buf, xerialHeader) + // indicate that there's one byte of data to be read + buf[len(buf)-2] = 1 + // A payload which will not decode + buf[len(buf)-1] = 1 + if _, err := Decode(buf); err == ErrMalformed || err == nil { + t.Errorf("unexpected err: %v", err) + } +} + +func BenchmarkSnappyStreamDecode(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + + for n := 0; n < b.N; n++ { + bytes := 0 + for _, test := range snappyStreamTestCases { + dst, err := Decode(test) + if err != nil { + b.Error("Decoding error: ", err) + } + bytes += len(dst) + } + b.SetBytes(int64(bytes)) + } +} + +func BenchmarkSnappyStreamDecodeInto(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + + var ( + dst = make([]byte, 1024, 1024) + err error + ) + + for n := 0; n < b.N; n++ { + bytes := 0 + for _, test := range snappyStreamTestCases { + dst, err = DecodeInto(dst, test) + if err != nil { + b.Error("Decoding error: ", err) + } + bytes += len(dst) + } + b.SetBytes(int64(bytes)) + } +} +func BenchmarkSnappyStreamDecodeMassive(b *testing.B) { + massiveString := makeMassive(largeString, 10000) + enc := Encode(nil, []byte(massiveString)) + + b.ReportAllocs() + b.ResetTimer() + b.SetBytes(int64(len(massiveString))) + + for n := 0; n < b.N; n++ { + _, err := Decode(enc) + if err != nil { + b.Error("Decoding error: ", err) + } + } +} + +func BenchmarkSnappyStreamDecodeIntoMassive(b *testing.B) { + massiveString := makeMassive(largeString, 10000) + enc := Encode(nil, []byte(massiveString)) + + var ( + dst = make([]byte, 1024, len(massiveString)) + err error + ) + + b.ReportAllocs() + b.ResetTimer() + b.SetBytes(int64(len(massiveString))) + + for n := 0; n < b.N; n++ { + dst, err = DecodeInto(dst, enc) + if err != nil { + b.Error("Decoding error: ", err) + } + } +} + +func BenchmarkSnappyStreamEncode(b *testing.B) { + test := []byte(largeString) + + var ( + dst = make([]byte, 0, 20+s2.MaxEncodedLen(len(test))) + err error + ) + + b.ReportAllocs() + b.ResetTimer() + b.SetBytes(int64(len(test))) + + for n := 0; n < b.N; n++ { + dst = Encode(dst[:0], test) + if err != nil { + b.Error("Encoding error: ", err) + } + } +} + +func BenchmarkSnappyStreamEncodeBetter(b *testing.B) { + test := []byte(largeString) + + var ( + dst = make([]byte, 0, 20+s2.MaxEncodedLen(len(test))) + err error + ) + + b.ReportAllocs() + b.ResetTimer() + b.SetBytes(int64(len(test))) + + for n := 0; n < b.N; n++ { + dst = EncodeBetter(dst[:0], test) + if err != nil { + b.Error("Encoding error: ", err) + } + } +} + +func BenchmarkSnappyStreamEncodeMassive(b *testing.B) { + massiveString := []byte(makeMassive(largeString, 10000)) + + // Inject some randomness, so it isn't just all copies. + rng := rand.New(rand.NewSource(0)) + for i := 0; i < len(massiveString)/10; i++ { + massiveString[rng.Intn(len(massiveString))]++ + } + var ( + dst = make([]byte, 0, s2.MaxEncodedLen(len(massiveString))) + err error + ) + + b.ReportAllocs() + b.ResetTimer() + b.SetBytes(int64(len(massiveString))) + + for n := 0; n < b.N; n++ { + dst = Encode(dst[:0], massiveString) + if err != nil { + b.Error("Encoding error: ", err) + } + } +} + +func BenchmarkSnappyStreamEncodeBetterMassive(b *testing.B) { + massiveString := []byte(makeMassive(largeString, 10000)) + + // Inject some randomness, so it isn't just all copies. + rng := rand.New(rand.NewSource(0)) + for i := 0; i < len(massiveString)/10; i++ { + massiveString[rng.Intn(len(massiveString))]++ + } + var ( + dst = make([]byte, 0, s2.MaxEncodedLen(len(massiveString))) + err error + ) + + b.ReportAllocs() + b.ResetTimer() + b.SetBytes(int64(len(massiveString))) + + for n := 0; n < b.N; n++ { + dst = EncodeBetter(dst[:0], massiveString) + if err != nil { + b.Error("Encoding error: ", err) + } + } +}