Skip to content

Commit

Permalink
Merge pull request #243 from dbaggerman/new-bloom-filter
Browse files Browse the repository at this point in the history
New bloom filter
  • Loading branch information
boyter authored Mar 22, 2021
2 parents c916c20 + d0205e5 commit 95d766e
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 16 deletions.
24 changes: 24 additions & 0 deletions processor/bloom.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package processor

// BloomPrime is a prime number less than 256. BloomHash XORs the input byte
// with it and multiplies the result by the byte to derive bit positions.
const BloomPrime = 251

// BloomTable holds the BloomHash value for every possible byte, indexed by
// the byte's value. It is filled in by init.
var BloomTable [256]uint64

// init fills BloomTable with the BloomHash of each byte value so the mask
// for a byte can be read with a single array lookup.
func init() {
	for b := 0; b < len(BloomTable); b++ {
		BloomTable[b] = BloomHash(byte(b))
	}
}

// BloomHash maps a byte to a 64-bit mask with between one and three bits set.
//
// The byte is mixed as (b XOR BloomPrime) * b, and three 6-bit windows of the
// mixed value, starting at bit offsets 0, 1 and 2, each select one bit
// position (0-63) to set in the result. Windows that select the same position
// collapse into a single bit.
func BloomHash(b byte) uint64 {
	v := uint64(b)
	mixed := (v ^ BloomPrime) * v

	var mask uint64
	for shift := uint(0); shift < 3; shift++ {
		// The low six bits of the shifted value pick a position in 0..63.
		pos := (mixed >> shift) & 0x3f
		mask |= uint64(1) << pos
	}
	return mask
}
18 changes: 9 additions & 9 deletions processor/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -350,14 +350,14 @@ func processLanguageFeature(name string, value Language) {
stringTrie := &Trie{}
tokenTrie := &Trie{}

complexityMask := byte(0)
singleLineCommentMask := byte(0)
multiLineCommentMask := byte(0)
stringMask := byte(0)
processMask := byte(0)
var complexityMask uint64
var singleLineCommentMask uint64
var multiLineCommentMask uint64
var stringMask uint64
var processMask uint64

for _, v := range value.ComplexityChecks {
complexityMask |= v[0]
complexityMask |= BloomHash(v[0])
complexityTrie.Insert(TComplexity, []byte(v))
if !Complexity {
tokenTrie.Insert(TComplexity, []byte(v))
Expand All @@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
}

for _, v := range value.LineComment {
singleLineCommentMask |= v[0]
singleLineCommentMask |= BloomHash(v[0])
slCommentTrie.Insert(TSlcomment, []byte(v))
tokenTrie.Insert(TSlcomment, []byte(v))
}
processMask |= singleLineCommentMask

for _, v := range value.MultiLine {
multiLineCommentMask |= v[0][0]
multiLineCommentMask |= BloomHash(v[0][0])
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
}
processMask |= multiLineCommentMask

for _, v := range value.Quotes {
stringMask |= v.Start[0]
stringMask |= BloomHash(v.Start[0])
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
}
Expand Down
10 changes: 5 additions & 5 deletions processor/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ type LanguageFeature struct {
Strings *Trie
Tokens *Trie
Nested bool
ComplexityCheckMask byte
SingleLineCommentMask byte
MultiLineCommentMask byte
StringCheckMask byte
ProcessMask byte
ComplexityCheckMask uint64
SingleLineCommentMask uint64
MultiLineCommentMask uint64
StringCheckMask uint64
ProcessMask uint64
Keywords []string
Quotes []Quote
}
Expand Down
5 changes: 3 additions & 2 deletions processor/workers.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,9 @@ func isBinary(index int, currentByte byte) bool {
return false
}

func shouldProcess(currentByte, processBytesMask byte) bool {
if currentByte&processBytesMask != currentByte {
func shouldProcess(currentByte byte, processBytesMask uint64) bool {
k := BloomTable[currentByte]
if k&processBytesMask != k {
return false
}
return true
Expand Down

0 comments on commit 95d766e

Please sign in to comment.