From 2b473416f59540603e8df6073160ae5709d874f2 Mon Sep 17 00:00:00 2001
From: greatroar <61184462+greatroar@users.noreply.github.com>
Date: Fri, 2 Dec 2022 12:36:17 +0100
Subject: [PATCH] zstd: Improve best compression's match selection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The best encoder selects matches based on the criterion

	a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10

If this were computed on the reals, it would be equivalent to
a.est < b.est, so the added terms only capture round-off error (this is
also why CSE doesn't eliminate them). Changing the formula to

	a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0

captures the intention better, I think, and improves compression:

enwik9            260989017  259699309  -0.4942%
silesia/dickens   3233958    3222189    -0.3639%
silesia/mozilla   16980973   16912341   -0.4042%
silesia/mr        3505223    3505553    0.0094%
silesia/nci       2313702    2289871    -1.0300%
silesia/ooffice   2915199    2896410    -0.6445%
silesia/osdb      3364752    3390871    0.7763%
silesia/reymont   1658404    1656006    -0.1446%
silesia/samba     4330660    4326783    -0.0895%
silesia/sao       5399736    5416932    0.3185%
silesia/webster   9987784    9966351    -0.2146%
silesia/xml       542081     538378     -0.6831%
silesia/x-ray     5756210    5733061    -0.4022%

... as well as throughput:

name                              old speed      new speed      delta
Encoder_EncodeAllSimple/best-8    12.1MB/s ± 1%  12.2MB/s ± 1%  +1.17%  (p=0.000 n=18+20)
Encoder_EncodeAllSimple4K/best-8  10.4MB/s ± 1%  10.5MB/s ± 1%  +0.82%  (p=0.000 n=20+20)
---
 zstd/enc_best.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zstd/enc_best.go b/zstd/enc_best.go
index 35675e4df0..817df64e54 100644
--- a/zstd/enc_best.go
+++ b/zstd/enc_best.go
@@ -190,7 +190,7 @@ encodeLoop:
 		}
 
 		bestOf := func(a, b match) match {
-			if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
+			if a.est-b.est+(a.s-b.s)*bitsPerByte>>10 < 0 {
 				return a
 			}
 			return b