diff --git a/.deepsource.toml b/.deepsource.toml deleted file mode 100644 index 40609eff..00000000 --- a/.deepsource.toml +++ /dev/null @@ -1,17 +0,0 @@ -version = 1 - -test_patterns = [ - '**/*_test.go' -] - -exclude_patterns = [ - -] - -[[analyzers]] -name = 'go' -enabled = true - - - [analyzers.meta] - import_path = 'github.com/dgraph-io/ristretto' diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2cbef4ca..a8e87729 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,4 @@ # CODEOWNERS info: https://help.github.com/en/articles/about-code-owners # Owners are automatically requested for review for PRs that changes code # that they own. -* @akon-dey @nosql22 @joshua-goldstein @skrdgraph +* @joshua-goldstein @mangalaman93 @harshil-goel diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 64d1bf16..00000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,29 +0,0 @@ - - -## Problem - - -## Solution - \ No newline at end of file diff --git a/.github/workflows/ci-aqua-security-trivy-tests.yml b/.github/workflows/ci-aqua-security-trivy-tests.yml index 6dff9300..1d3deff4 100644 --- a/.github/workflows/ci-aqua-security-trivy-tests.yml +++ b/.github/workflows/ci-aqua-security-trivy-tests.yml @@ -10,7 +10,7 @@ on: - synchronize - ready_for_review branches: - - main + - main schedule: - cron: "0 * * * *" jobs: @@ -24,7 +24,7 @@ jobs: uses: aquasecurity/trivy-action@master with: scan-type: 'fs' - scan-ref: '.' + scan-ref: '.' 
format: 'sarif' output: 'trivy-results.sarif' - name: Upload Trivy scan results to GitHub Security tab diff --git a/.github/workflows/ci-ristretto-lint.yml b/.github/workflows/ci-ristretto-lint.yml index 1cfcbba0..d795a24b 100644 --- a/.github/workflows/ci-ristretto-lint.yml +++ b/.github/workflows/ci-ristretto-lint.yml @@ -10,17 +10,26 @@ on: - cron: "0 * * * *" jobs: go-lint: + if: github.event.pull_request.draft == false name: lint - runs-on: ubuntu-latest + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 + - name: Get Go Version + run: | + #!/bin/bash + GOVERSION=$({ [ -f .go-version ] && cat .go-version; }) + echo "GOVERSION=$GOVERSION" >> $GITHUB_ENV + - name: Setup Go + uses: actions/setup-go@v3 + with: + go-version: ${{ env.GOVERSION }} - name: golang-lint env: # prevent OOM GOGC: 10 uses: golangci/golangci-lint-action@v3 with: - # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. - version: v1.48 + version: latest only-new-issues: true args: --timeout=10m diff --git a/.github/workflows/ci-ristretto-tests.yml b/.github/workflows/ci-ristretto-tests.yml index 4b20c06c..dc8786c0 100644 --- a/.github/workflows/ci-ristretto-tests.yml +++ b/.github/workflows/ci-ristretto-tests.yml @@ -2,10 +2,10 @@ name: ci-ristretto-tests on: push: branches: - - main + - main pull_request_target: branches: - - main + - main schedule: - cron: "30 * * * *" jobs: diff --git a/.go-version b/.go-version index b8f1e3fd..bc449347 100644 --- a/.go-version +++ b/.go-version @@ -1 +1 @@ -1.17.11 +1.19 diff --git a/.golangci.yml b/.golangci.yml index 7318e9a3..c342c7aa 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,23 +1,29 @@ run: - tests: false skip-dirs: - - contrib - - sim + skip-files: linters-settings: lll: line-length: 120 + staticcheck: + checks: + - all + - '-SA1019' # it is okay to use math/rand at times. + gosec: + excludes: + - G404 # it is okay to use math/rand at times. 
linters: disable-all: true enable: - #- errcheck - #- ineffassign - - gas - #- gofmt - #- golint - #- gosimple - #- govet + - errcheck + - gofmt + - goimports + - gosec + - gosimple + - govet + - ineffassign - lll - #- varcheck - #- unused + - staticcheck + - unconvert + - unused diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d18e39e..5e1ced12 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -172,7 +172,7 @@ improve performance and reduce memory requirements. ### Fixed - Fix the way metrics are handled for deletions. ([#111][]) -- Support nil `*Cache` values in `Clear` and `Close`. ([#119][]) +- Support nil `*Cache` values in `Clear` and `Close`. ([#119][]) - Delete item immediately. ([#113][]) - Remove key from policy after TTL eviction. ([#130][]) diff --git a/README.md b/README.md index 79fe2bfc..8793ad0d 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,7 @@ Ristretto is a fast, concurrent cache library built with a focus on performance and correctness. -The motivation to build Ristretto comes from the need for a contention-free -cache in [Dgraph][]. +The motivation to build Ristretto comes from the need for a contention-free cache in [Dgraph][]. [Dgraph]: https://github.com/dgraph-io/dgraph @@ -19,7 +18,7 @@ cache in [Dgraph][]. * **Admission: TinyLFU** - extra performance with little memory overhead (12 bits per counter). * **Fast Throughput** - we use a variety of techniques for managing contention and the result is excellent throughput. * **Cost-Based Eviction** - any large new item deemed valuable can evict multiple smaller items (cost could be anything). -* **Fully Concurrent** - you can use as many goroutines as you want with little throughput degradation. +* **Fully Concurrent** - you can use as many goroutines as you want with little throughput degradation. * **Metrics** - optional performance metrics for throughput, hit ratios, and other stats. * **Simple API** - just figure out your ideal `Config` values and you're off and running. 
@@ -29,28 +28,27 @@ Ristretto is production-ready. See [Projects using Ristretto](#projects-using-ri ## Table of Contents -* [Usage](#Usage) - * [Example](#Example) - * [Config](#Config) - * [NumCounters](#Config) - * [MaxCost](#Config) - * [BufferItems](#Config) - * [Metrics](#Config) - * [OnEvict](#Config) - * [KeyToHash](#Config) - * [Cost](#Config) -* [Benchmarks](#Benchmarks) - * [Hit Ratios](#Hit-Ratios) - * [Search](#Search) - * [Database](#Database) - * [Looping](#Looping) - * [CODASYL](#CODASYL) - * [Throughput](#Throughput) - * [Mixed](#Mixed) - * [Read](#Read) - * [Write](#Write) -* [Projects using Ristretto](#projects-using-ristretto) -* [FAQ](#FAQ) +- [Ristretto](#ristretto) + - [Features](#features) + - [Status](#status) + - [Table of Contents](#table-of-contents) + - [Usage](#usage) + - [Example](#example) + - [Config](#config) + - [Benchmarks](#benchmarks) + - [Hit Ratios](#hit-ratios) + - [Search](#search) + - [Database](#database) + - [Looping](#looping) + - [CODASYL](#codasyl) + - [Throughput](#throughput) + - [Mixed](#mixed) + - [Read](#read) + - [Write](#write) + - [Projects Using Ristretto](#projects-using-ristretto) + - [FAQ](#faq) + - [How are you achieving this performance? What shortcuts are you taking?](#how-are-you-achieving-this-performance-what-shortcuts-are-you-taking) + - [Is Ristretto distributed?](#is-ristretto-distributed) ## Usage @@ -95,31 +93,31 @@ func main() { ### Config -The `Config` struct is passed to `NewCache` when creating Ristretto instances (see the example above). +The `Config` struct is passed to `NewCache` when creating Ristretto instances (see the example above). **NumCounters** `int64` -NumCounters is the number of 4-bit access counters to keep for admission and eviction. We've seen good performance in setting this to 10x the number of items you expect to keep in the cache when full. +NumCounters is the number of 4-bit access counters to keep for admission and eviction. 
We've seen good performance in setting this to 10x the number of items you expect to keep in the cache when full. -For example, if you expect each item to have a cost of 1 and MaxCost is 100, set NumCounters to 1,000. Or, if you use variable cost values but expect the cache to hold around 10,000 items when full, set NumCounters to 100,000. The important thing is the *number of unique items* in the full cache, not necessarily the MaxCost value. +For example, if you expect each item to have a cost of 1 and MaxCost is 100, set NumCounters to 1,000. Or, if you use variable cost values but expect the cache to hold around 10,000 items when full, set NumCounters to 100,000. The important thing is the *number of unique items* in the full cache, not necessarily the MaxCost value. **MaxCost** `int64` -MaxCost is how eviction decisions are made. For example, if MaxCost is 100 and a new item with a cost of 1 increases total cache cost to 101, 1 item will be evicted. +MaxCost is how eviction decisions are made. For example, if MaxCost is 100 and a new item with a cost of 1 increases total cache cost to 101, 1 item will be evicted. -MaxCost can also be used to denote the max size in bytes. For example, if MaxCost is 1,000,000 (1MB) and the cache is full with 1,000 1KB items, a new item (that's accepted) would cause 5 1KB items to be evicted. +MaxCost can also be used to denote the max size in bytes. For example, if MaxCost is 1,000,000 (1MB) and the cache is full with 1,000 1KB items, a new item (that's accepted) would cause 5 1KB items to be evicted. -MaxCost could be anything as long as it matches how you're using the cost values when calling Set. +MaxCost could be anything as long as it matches how you're using the cost values when calling Set. **BufferItems** `int64` -BufferItems is the size of the Get buffers. The best value we've found for this is 64. +BufferItems is the size of the Get buffers. The best value we've found for this is 64. 
If for some reason you see Get performance decreasing with lots of contention (you shouldn't), try increasing this value in increments of 64. This is a fine-tuning mechanism and you probably won't have to touch this. **Metrics** `bool` -Metrics is true when you want real-time logging of a variety of stats. The reason this is a Config flag is because there's a 10% throughput performance overhead. +Metrics is true when you want real-time logging of a variety of stats. The reason this is a Config flag is because there's a 10% throughput performance overhead. **OnEvict** `func(hashes [2]uint64, value interface{}, cost int64)` @@ -224,8 +222,8 @@ Below is a list of known projects that use Ristretto: We go into detail in the [Ristretto blog post](https://blog.dgraph.io/post/introducing-ristretto-high-perf-go-cache/), but in short: our throughput performance can be attributed to a mix of batching and eventual consistency. Our hit ratio performance is mostly due to an excellent [admission policy](https://arxiv.org/abs/1512.00727) and SampledLFU eviction policy. -As for "shortcuts," the only thing Ristretto does that could be construed as one is dropping some Set calls. That means a Set call for a new item (updates are guaranteed) isn't guaranteed to make it into the cache. The new item could be dropped at two points: when passing through the Set buffer or when passing through the admission policy. However, this doesn't affect hit ratios much at all as we expect the most popular items to be Set multiple times and eventually make it in the cache. +As for "shortcuts," the only thing Ristretto does that could be construed as one is dropping some Set calls. That means a Set call for a new item (updates are guaranteed) isn't guaranteed to make it into the cache. The new item could be dropped at two points: when passing through the Set buffer or when passing through the admission policy. 
However, this doesn't affect hit ratios much at all as we expect the most popular items to be Set multiple times and eventually make it in the cache. ### Is Ristretto distributed? -No, it's just like any other Go library that you can import into your project and use in a single process. +No, it's just like any other Go library that you can import into your project and use in a single process. diff --git a/cache_test.go b/cache_test.go index 657d54f0..f32c3043 100644 --- a/cache_test.go +++ b/cache_test.go @@ -824,7 +824,7 @@ func TestDropUpdates(t *testing.T) { var err error handler := func(_ interface{}, value interface{}) { v := value.(string) - lastEvictedSet, err = strconv.ParseInt(string(v), 10, 32) + lastEvictedSet, err = strconv.ParseInt(v, 10, 32) require.NoError(t, err) _, ok := droppedMap[int(lastEvictedSet)] diff --git a/contrib/demo/node.go b/contrib/demo/node.go index 05719baa..3138437b 100644 --- a/contrib/demo/node.go +++ b/contrib/demo/node.go @@ -3,7 +3,6 @@ package main import ( "fmt" "runtime" - "unsafe" "github.com/dgraph-io/ristretto/z" "github.com/dustin/go-humanize" @@ -14,7 +13,6 @@ type node struct { next *node } -var nodeSz = int(unsafe.Sizeof(node{})) var alloc *z.Allocator func printNode(n *node) { diff --git a/contrib/demo/node_allocator.go b/contrib/demo/node_allocator.go index b0cb25b4..2db63de1 100644 --- a/contrib/demo/node_allocator.go +++ b/contrib/demo/node_allocator.go @@ -1,3 +1,4 @@ +//go:build jemalloc && allocator // +build jemalloc,allocator package main @@ -10,7 +11,7 @@ import ( // Defined in node.go. 
func init() { - alloc = z.NewAllocator(10 << 20, "demo") + alloc = z.NewAllocator(10<<20, "demo") } func newNode(val int) *node { diff --git a/contrib/demo/node_golang.go b/contrib/demo/node_golang.go index 9c5858b0..9f4b3f1b 100644 --- a/contrib/demo/node_golang.go +++ b/contrib/demo/node_golang.go @@ -1,3 +1,4 @@ +//go:build !jemalloc // +build !jemalloc package main diff --git a/contrib/demo/node_jemalloc.go b/contrib/demo/node_jemalloc.go index 7f8e5459..34752dd0 100644 --- a/contrib/demo/node_jemalloc.go +++ b/contrib/demo/node_jemalloc.go @@ -1,3 +1,4 @@ +//go:build jemalloc && !allocator // +build jemalloc,!allocator package main diff --git a/contrib/memtest/main.go b/contrib/memtest/main.go index 3cf6fe1a..ce9f6f26 100644 --- a/contrib/memtest/main.go +++ b/contrib/memtest/main.go @@ -153,7 +153,7 @@ func main() { fill = make([]byte, maxMB<<20) rand.Read(fill) - c := make(chan os.Signal) + c := make(chan os.Signal, 10) signal.Notify(c, os.Interrupt, syscall.SIGTERM) go func() { <-c diff --git a/contrib/memtest/nojemalloc.go b/contrib/memtest/nojemalloc.go index 8aa04c1e..776b558b 100644 --- a/contrib/memtest/nojemalloc.go +++ b/contrib/memtest/nojemalloc.go @@ -22,6 +22,7 @@ func Calloc(size int) []byte { } hdr := reflect.SliceHeader{Data: uintptr(ptr), Len: size, Cap: size} atomic.AddInt64(&numbytes, int64(size)) + //nolint:govet return *(*[]byte)(unsafe.Pointer(&hdr)) } diff --git a/go.mod b/go.mod index 55d6caeb..1d5e517a 100644 --- a/go.mod +++ b/go.mod @@ -1,13 +1,18 @@ module github.com/dgraph-io/ristretto -go 1.12 +go 1.19 require ( github.com/cespare/xxhash/v2 v2.1.1 - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 - github.com/dustin/go-humanize v1.0.0 + github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13 + github.com/dustin/go-humanize v1.0.1 github.com/pkg/errors v0.9.1 - github.com/stretchr/testify v1.4.0 - golang.org/x/sys v0.0.0-20221010170243-090e33056c14 + 
github.com/stretchr/testify v1.8.4 + golang.org/x/sys v0.11.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index b87717dd..df96a4ef 100644 --- a/go.sum +++ b/go.sum @@ -1,22 +1,20 @@ github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= -github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= -github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= -github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13 h1:fAjc9m62+UWV/WAFKLNi6ZS0675eEUC9y3AlwSbQu1Y= +github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 
-github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= -github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -golang.org/x/sys v0.0.0-20221010170243-090e33056c14 h1:k5II8e6QD8mITdi+okbbmR/cIyEbeXLBhy5Ha4nevyc= -golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/policy.go b/policy.go index bf23f91f..d11c1bcb 100644 --- a/policy.go +++ b/policy.go @@ -33,7 +33,7 @@ const ( // policy is the interface encapsulating eviction/admission behavior. // // TODO: remove this interface and just rename defaultPolicy to policy, as we -// are probably only going to use/implement/maintain one policy. +// are probably only going to use/implement/maintain one policy. type policy interface { ringConsumer // Add attempts to Add the key-cost pair to the Policy. 
It returns a slice @@ -223,7 +223,7 @@ func (p *defaultPolicy) Del(key uint64) { func (p *defaultPolicy) Cap() int64 { p.Lock() - capacity := int64(p.evict.getMaxCost() - p.evict.used) + capacity := p.evict.getMaxCost() - p.evict.used p.Unlock() return capacity } @@ -346,7 +346,7 @@ func (p *sampledLFU) updateIfHas(key uint64, cost int64) bool { p.metrics.add(keyUpdate, key, 1) if prev > cost { diff := prev - cost - p.metrics.add(costAdd, key, ^uint64(uint64(diff)-1)) + p.metrics.add(costAdd, key, ^uint64(diff-1)) } else if cost > prev { diff := cost - prev p.metrics.add(costAdd, key, uint64(diff)) diff --git a/sketch.go b/sketch.go index 6368d2bd..196bdfdc 100644 --- a/sketch.go +++ b/sketch.go @@ -103,7 +103,7 @@ func newCmRow(numCounters int64) cmRow { } func (r cmRow) get(n uint64) byte { - return byte(r[n/2]>>((n&1)*4)) & 0x0f + return (r[n/2] >> ((n & 1) * 4)) & 0x0f } func (r cmRow) increment(n uint64) { diff --git a/z/README.md b/z/README.md index 6d77e146..ad48a3c1 100644 --- a/z/README.md +++ b/z/README.md @@ -1,22 +1,22 @@ ## bbloom: a bitset Bloom filter for go/golang === -package implements a fast bloom filter with real 'bitset' and JSONMarshal/JSONUnmarshal to store/reload the Bloom filter. +package implements a fast bloom filter with real 'bitset' and JSONMarshal/JSONUnmarshal to store/reload the Bloom filter. NOTE: the package uses unsafe.Pointer to set and read the bits from the bitset. If you're uncomfortable with using the unsafe package, please consider using my bloom filter package at github.com/AndreasBriese/bloom === -changelog 11/2015: new thread safe methods AddTS(), HasTS(), AddIfNotHasTS() following a suggestion from Srdjan Marinovic (github @a-little-srdjan), who used this to code a bloomfilter cache. +changelog 11/2015: new thread safe methods AddTS(), HasTS(), AddIfNotHasTS() following a suggestion from Srdjan Marinovic (github @a-little-srdjan), who used this to code a bloomfilter cache. 
-This bloom filter was developed to strengthen a website-log database and was tested and optimized for this log-entry mask: "2014/%02i/%02i %02i:%02i:%02i /info.html". -Nonetheless bbloom should work with any other form of entries. +This bloom filter was developed to strengthen a website-log database and was tested and optimized for this log-entry mask: "2014/%02i/%02i %02i:%02i:%02i /info.html". +Nonetheless bbloom should work with any other form of entries. ~~Hash function is a modified Berkeley DB sdbm hash (to optimize for smaller strings). sdbm http://www.cse.yorku.ca/~oz/hash.html~~ Found sipHash (SipHash-2-4, a fast short-input PRF created by Jean-Philippe Aumasson and Daniel J. Bernstein.) to be about as fast. sipHash had been ported by Dimtry Chestnyk to Go (github.com/dchest/siphash ) -Minimum hashset size is: 512 ([4]uint64; will be set automatically). +Minimum hashset size is: 512 ([4]uint64; will be set automatically). ###install @@ -25,7 +25,7 @@ go get github.com/AndreasBriese/bbloom ``` ###test -+ change to folder ../bbloom ++ change to folder ../bbloom + create wordlist in file "words.txt" (you might use `python permut.py`) + run 'go test -bench=.' within the folder @@ -52,10 +52,10 @@ import ( at your header. 
In the program use ```go -// create a bloom filter for 65536 items and 1 % wrong-positive ratio +// create a bloom filter for 65536 items and 1 % wrong-positive ratio bf := bbloom.New(float64(1<<16), float64(0.01)) -// or +// or // create a bloom filter with 650000 for 65536 items and 7 locs per hash explicitly // bf = bbloom.New(float64(650000), float64(7)) // or @@ -64,7 +64,7 @@ bf = bbloom.New(650000.0, 7.0) // add one item bf.Add([]byte("butter")) -// Number of elements added is exposed now +// Number of elements added is exposed now // Note: ElemNum will not be included in JSON export (for compatability to older version) nOfElementsInFilter := bf.ElemNum @@ -86,7 +86,7 @@ isNotIn = bf.HasTS([]byte("peanutButter")) // should be false added = bf.AddIfNotHasTS([]byte("butter")) // should be false because 'peanutbutter' is already in the set added = bf.AddIfNotHasTS([]byte("peanutbuTTer")) // should be true because 'penutbuTTer' is new -// convert to JSON ([]byte) +// convert to JSON ([]byte) Json := bf.JSONMarshal() // bloomfilters Mutex is exposed for external un-/locking @@ -95,7 +95,7 @@ bf.Mtx.Lock() Json = bf.JSONMarshal() bf.Mtx.Unlock() -// restore a bloom filter from storage +// restore a bloom filter from storage bfNew := bbloom.JSONUnmarshal(Json) isInNew := bfNew.Has([]byte("butter")) // should be true @@ -105,17 +105,17 @@ isNotInNew := bfNew.Has([]byte("Butter")) // should be false to work with the bloom filter. -### why 'fast'? +### why 'fast'? + +It's about 3 times faster than William Fitzgeralds bitset bloom filter https://github.com/willf/bloom . And it is about so fast as my []bool set variant for Boom filters (see https://github.com/AndreasBriese/bloom ) but having a 8times smaller memory footprint: -It's about 3 times faster than William Fitzgeralds bitset bloom filter https://github.com/willf/bloom . 
And it is about so fast as my []bool set variant for Boom filters (see https://github.com/AndreasBriese/bloom ) but having a 8times smaller memory footprint: - Bloom filter (filter size 524288, 7 hashlocs) github.com/AndreasBriese/bbloom 'Add' 65536 items (10 repetitions): 6595800 ns (100 ns/op) github.com/AndreasBriese/bbloom 'Has' 65536 items (10 repetitions): 5986600 ns (91 ns/op) github.com/AndreasBriese/bloom 'Add' 65536 items (10 repetitions): 6304684 ns (96 ns/op) github.com/AndreasBriese/bloom 'Has' 65536 items (10 repetitions): 6568663 ns (100 ns/op) - + github.com/willf/bloom 'Add' 65536 items (10 repetitions): 24367224 ns (371 ns/op) github.com/willf/bloom 'Test' 65536 items (10 repetitions): 21881142 ns (333 ns/op) github.com/dataence/bloom/standard 'Add' 65536 items (10 repetitions): 23041644 ns (351 ns/op) @@ -126,4 +126,4 @@ It's about 3 times faster than William Fitzgeralds bitset bloom filter https://g (on MBPro15 OSX10.8.5 i7 4Core 2.4Ghz) -With 32bit bloom filters (bloom32) using modified sdbm, bloom32 does hashing with only 2 bit shifts, one xor and one substraction per byte. smdb is about as fast as fnv64a but gives less collisions with the dataset (see mask above). bloom.New(float64(10 * 1<<16),float64(7)) populated with 1<<16 random items from the dataset (see above) and tested against the rest results in less than 0.05% collisions. +With 32bit bloom filters (bloom32) using modified sdbm, bloom32 does hashing with only 2 bit shifts, one xor and one subtraction per byte. sdbm is about as fast as fnv64a but gives fewer collisions with the dataset (see mask above). bloom.New(float64(10 * 1<<16),float64(7)) populated with 1<<16 random items from the dataset (see above) and tested against the rest results in less than 0.05% collisions. 
diff --git a/z/allocator_test.go b/z/allocator_test.go index c828f8ff..8959f875 100644 --- a/z/allocator_test.go +++ b/z/allocator_test.go @@ -137,7 +137,7 @@ func TestAllocateConcurrent(t *testing.T) { mu := new(sync.Mutex) for i := 0; i < M; i++ { wg.Add(1) - go func() { + go func() { //nolint:staticcheck,govet defer wg.Done() var bufs []uintptr for j := 0; j < N; j++ { @@ -149,6 +149,7 @@ func TestAllocateConcurrent(t *testing.T) { mu.Lock() for _, b := range bufs { if _, ok := m[b]; ok { + //nolint:staticcheck,govet t.Fatalf("Did not expect to see the same ptr") } m[b] = struct{}{} diff --git a/z/bbloom_test.go b/z/bbloom_test.go index fcd301fb..bf84c603 100644 --- a/z/bbloom_test.go +++ b/z/bbloom_test.go @@ -18,14 +18,13 @@ func TestMain(m *testing.M) { wordlist1 = make([][]byte, n) for i := range wordlist1 { b := make([]byte, 32) - rand.Read(b) + _, _ = rand.Read(b) wordlist1[i] = b } fmt.Println("\n###############\nbbloom_test.go") fmt.Print("Benchmarks relate to 2**16 OP. --> output/65536 op/ns\n###############\n\n") m.Run() - } func TestM_NumberOfWrongs(t *testing.T) { @@ -38,6 +37,7 @@ func TestM_NumberOfWrongs(t *testing.T) { cnt++ } } + //nolint:lll fmt.Printf("Bloomfilter New(7* 2**16, 7) (-> size=%v bit): \n Check for 'false positives': %v wrong positive 'Has' results on 2**16 entries => %v %%\n", len(bf.bitset)<<6, cnt, float64(cnt)/float64(n)) } diff --git a/z/btree.go b/z/btree.go index 12b735bb..33c3046b 100644 --- a/z/btree.go +++ b/z/btree.go @@ -30,6 +30,7 @@ import ( var ( pageSize = os.Getpagesize() maxKeys = (pageSize / 16) - 1 + //nolint:unused oneThird = int(float64(maxKeys) / 3) ) @@ -480,6 +481,8 @@ func (t *Tree) split(pid uint64) node { // shareWithSiblingXXX is unused for now. The idea is to move some keys to // sibling when a node is full. But, I don't see any special benefits in our // access pattern. It doesn't result in better occupancy ratios. 
+// +//nolint:unused func (t *Tree) shareWithSiblingXXX(n node, idx int) bool { if idx == 0 { return false diff --git a/z/btree_test.go b/z/btree_test.go index 61406aa4..ea7410a6 100644 --- a/z/btree_test.go +++ b/z/btree_test.go @@ -18,7 +18,6 @@ package z import ( "fmt" - "io/ioutil" "math" "math/rand" "os" @@ -61,7 +60,7 @@ func TestTree(t *testing.T) { } func TestTreePersistent(t *testing.T) { - dir, err := ioutil.TempDir("", "") + dir, err := os.MkdirTemp("", "") require.NoError(t, err) defer os.RemoveAll(dir) path := filepath.Join(dir, "tree.buf") @@ -439,7 +438,7 @@ func BenchmarkSearch(b *testing.B) { jumpBy := []int{8, 16, 32, 64, 128, 196, 255} for _, sz := range jumpBy { - f, err := ioutil.TempFile(".", "tree") + f, err := os.CreateTemp(".", "tree") require.NoError(b, err) mf, err := OpenMmapFileUsing(f, pageSize, true) diff --git a/z/buffer.go b/z/buffer.go index b3a48ed0..fc363b89 100644 --- a/z/buffer.go +++ b/z/buffer.go @@ -19,7 +19,6 @@ package z import ( "encoding/binary" "fmt" - "io/ioutil" "log" "os" "sort" @@ -92,7 +91,7 @@ func NewBufferTmp(dir string, capacity int) (*Buffer, error) { if dir == "" { dir = tmpDir } - file, err := ioutil.TempFile(dir, "buffer") + file, err := os.CreateTemp(dir, "buffer") if err != nil { return nil, err } @@ -200,7 +199,7 @@ func (b *Buffer) Grow(n int) { // If autoMmap gets triggered, copy the slice over to an mmaped file. 
if b.autoMmapAfter > 0 && b.curSz > b.autoMmapAfter { b.bufType = UseMmap - file, err := ioutil.TempFile(b.autoMmapDir, "") + file, err := os.CreateTemp(b.autoMmapDir, "") if err != nil { panic(err) } @@ -281,7 +280,9 @@ func (b *Buffer) SliceIterate(f func(slice []byte) error) error { if b.IsEmpty() { return nil } - slice, next := []byte{}, b.StartOffset() + + next := b.StartOffset() + var slice []byte for next >= 0 { slice, next = b.Slice(next) if len(slice) == 0 { @@ -291,6 +292,7 @@ func (b *Buffer) SliceIterate(f func(slice []byte) error) error { return err } } + return nil } @@ -339,7 +341,7 @@ func (s *sortHelper) sortSmall(start, end int) { }) // Now we iterate over the s.small offsets and copy over the slices. The result is now in order. for _, off := range s.small { - s.tmp.Write(rawSlice(s.b.buf[off:])) + _, _ = s.tmp.Write(rawSlice(s.b.buf[off:])) } assert(end-start == copy(s.b.buf[start:end], s.tmp.Bytes())) } @@ -454,7 +456,7 @@ func (b *Buffer) SortSliceBetween(start, end int, less LessFunc) { small: make([]int, 0, 1024), tmp: NewBuffer(szTmp, b.tag), } - defer s.tmp.Release() + defer func() { _ = s.tmp.Release() }() left := offsets[0] for _, off := range offsets[1:] { diff --git a/z/buffer_test.go b/z/buffer_test.go index 4df1208a..4e67cdb9 100644 --- a/z/buffer_test.go +++ b/z/buffer_test.go @@ -109,12 +109,12 @@ func TestBufferAutoMmap(t *testing.T) { var count int var last []byte - buf.SliceIterate(func(slice []byte) error { + require.NoError(t, buf.SliceIterate(func(slice []byte) error { require.True(t, bytes.Compare(slice, last) >= 0) last = append(last[:0], slice...) 
count++ return nil - }) + })) require.Equal(t, N, count) } @@ -134,7 +134,7 @@ func TestBufferSimpleSort(t *testing.T) { }) var last uint32 var i int - buf.SliceIterate(func(slice []byte) error { + require.NoError(t, buf.SliceIterate(func(slice []byte) error { num := binary.BigEndian.Uint32(slice) if num < last { fmt.Printf("num: %d idx: %d last: %d\n", num, i, last) @@ -144,7 +144,7 @@ func TestBufferSimpleSort(t *testing.T) { last = num // fmt.Printf("Got number: %d\n", num) return nil - }) + })) }) } } @@ -174,7 +174,7 @@ func TestBufferSlice(t *testing.T) { compare := func() { i := 0 - buf.SliceIterate(func(slice []byte) error { + require.NoError(t, buf.SliceIterate(func(slice []byte) error { // All the slices returned by the buffer should be equal to what we // inserted earlier. if !bytes.Equal(exp[i], slice) { @@ -184,7 +184,7 @@ func TestBufferSlice(t *testing.T) { require.Equal(t, exp[i], slice) i++ return nil - }) + })) require.Equal(t, len(exp), i) } compare() // same order as inserted. @@ -227,7 +227,8 @@ func TestBufferSort(t *testing.T) { return lhs < rhs }) - slice, next := []byte{}, start + next := start + var slice []byte var last uint64 var count int for next >= 0 && next < end { diff --git a/z/calloc_64bit.go b/z/calloc_64bit.go index b898248b..6c02cabd 100644 --- a/z/calloc_64bit.go +++ b/z/calloc_64bit.go @@ -2,6 +2,7 @@ // of this source code is governed by a BSD-style license that can be found in // the LICENSE file. +//go:build amd64 || arm64 || arm64be || ppc64 || ppc64le || mips64 || mips64le || riscv64 || s390x || sparc64 // +build amd64 arm64 arm64be ppc64 ppc64le mips64 mips64le riscv64 s390x sparc64 package z diff --git a/z/calloc_nojemalloc.go b/z/calloc_nojemalloc.go index 93ceedf9..20c9ae35 100644 --- a/z/calloc_nojemalloc.go +++ b/z/calloc_nojemalloc.go @@ -2,6 +2,7 @@ // of this source code is governed by a BSD-style license that can be found in // the LICENSE file. 
+//go:build !jemalloc || !cgo // +build !jemalloc !cgo package z @@ -34,4 +35,4 @@ func StatsPrint() { // ReadMemStats doesn't do anything since all the memory is being managed // by the Go runtime. -func ReadMemStats(_ *MemStats) { return } +func ReadMemStats(_ *MemStats) {} diff --git a/z/calloc_test.go b/z/calloc_test.go index bcb860ba..93cb6535 100644 --- a/z/calloc_test.go +++ b/z/calloc_test.go @@ -57,6 +57,7 @@ func BenchmarkAllocation(b *testing.B) { x = make([]byte, sz) } r.Read(x) + //nolint:staticcheck pool.Put(x) } }) diff --git a/z/file.go b/z/file.go index 880caf0a..c07949a7 100644 --- a/z/file.go +++ b/z/file.go @@ -61,7 +61,9 @@ func OpenMmapFileUsing(fd *os.File, sz int, writable bool) (*MmapFile, error) { if fileSize == 0 { dir, _ := filepath.Split(filename) - go SyncDir(dir) + if err := SyncDir(dir); err != nil { + return nil, err + } } return &MmapFile{ Data: buf, diff --git a/z/file_default.go b/z/file_default.go index d9c0db43..00e7d087 100644 --- a/z/file_default.go +++ b/z/file_default.go @@ -1,3 +1,4 @@ +//go:build !linux // +build !linux /* diff --git a/z/mremap_linux.go b/z/mremap_linux.go index 22567865..bc35e562 100644 --- a/z/mremap_linux.go +++ b/z/mremap_linux.go @@ -1,3 +1,4 @@ +//go:build !arm64 // +build !arm64 /* diff --git a/z/rtutil.go b/z/rtutil.go index 8f317c80..89e41764 100644 --- a/z/rtutil.go +++ b/z/rtutil.go @@ -27,10 +27,12 @@ import ( ) // NanoTime returns the current time in nanoseconds from a monotonic clock. +// //go:linkname NanoTime runtime.nanotime func NanoTime() int64 // CPUTicks is a faster alternative to NanoTime to measure time duration. +// //go:linkname CPUTicks runtime.cputicks func CPUTicks() int64 @@ -60,6 +62,7 @@ func MemHashString(str string) uint64 { } // FastRand is a fast thread local random function. 
+// //go:linkname FastRand runtime.fastrand func FastRand() uint32 diff --git a/z/rtutil_test.go b/z/rtutil_test.go index 4c15aff9..d5a11926 100644 --- a/z/rtutil_test.go +++ b/z/rtutil_test.go @@ -23,7 +23,8 @@ func BenchmarkMemHash(b *testing.B) { } func BenchmarkMemHashString(b *testing.B) { - s := "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." + s := "Lorem ipsum dolor sit amet, consectetur adipiscing elit, " + + "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." b.ReportAllocs() b.ResetTimer() @@ -322,6 +323,6 @@ func BenchmarkRandGlobal(b *testing.B) { func BenchmarkRandAtomic(b *testing.B) { var x uint32 benchmarkRand(b, func() func() uint32 { - return func() uint32 { return uint32(atomic.AddUint32(&x, 1)) } + return func() uint32 { return atomic.AddUint32(&x, 1) } }) } diff --git a/z/simd/baseline.go b/z/simd/baseline.go index 967e3a30..e98f6286 100644 --- a/z/simd/baseline.go +++ b/z/simd/baseline.go @@ -98,6 +98,7 @@ func Binary(keys []uint64, key uint64) int16 { })) } +//nolint:unused func cmp2_native(twos, pk [2]uint64) int16 { if twos[0] == pk[0] { return 0 @@ -108,6 +109,7 @@ func cmp2_native(twos, pk [2]uint64) int16 { return 2 } +//nolint:unused func cmp4_native(fours, pk [4]uint64) int16 { for i := range fours { if fours[i] >= pk[i] { @@ -117,6 +119,7 @@ func cmp4_native(fours, pk [4]uint64) int16 { return 4 } +//nolint:unused func cmp8_native(a [8]uint64, pk [4]uint64) int16 { for i := range a { if a[i] >= pk[0] { diff --git a/z/z.go b/z/z.go index 97455586..ae416946 100644 --- a/z/z.go +++ b/z/z.go @@ -23,14 +23,13 @@ import ( "github.com/cespare/xxhash/v2" ) -// TODO: Figure out a way to re-use memhash for the second uint64 hash, we -// already know that appending bytes isn't reliable for generating a -// second hash (see Ristretto PR #88). 
-// -// We also know that while the Go runtime has a runtime memhash128 -// function, it's not possible to use it to generate [2]uint64 or -// anything resembling a 128bit hash, even though that's exactly what -// we need in this situation. +// TODO: Figure out a way to re-use memhash for the second uint64 hash, +// we already know that appending bytes isn't reliable for generating a +// second hash (see Ristretto PR #88). +// We also know that while the Go runtime has a runtime memhash128 +// function, it's not possible to use it to generate [2]uint64 or +// anything resembling a 128bit hash, even though that's exactly what +// we need in this situation. func KeyToHash(key interface{}) (uint64, uint64) { if key == nil { return 0, 0