diff --git a/cmd/cindex/cindex.go b/cmd/cindex/cindex.go index 5960b48a6e..85aaaeb1ff 100644 --- a/cmd/cindex/cindex.go +++ b/cmd/cindex/cindex.go @@ -24,7 +24,8 @@ const ( DEFAULT_MAX_FILE_LENGTH = 1 << 30 DEFAULT_MAX_LINE_LENGTH = 2000 DEFAULT_MAX_TEXT_TRIGRAMS = 30000 - DEFAULT_MAX_INVALID_UTF8_PERCENTAGE = 0.1 + DEFAULT_MAX_INVALID_UTF8_RATIO = 0.1 + DEFAULT_MAX_NULL_RATIO = 0.1 ) var usageMessage = `usage: cindex [options] [path...] @@ -49,6 +50,8 @@ Options: skip indexing a file if it has more than this number of trigrams (Default: %v) -maxinvalidutf8ratio RATIO skip indexing a file if it has more than this ratio of invalid UTF-8 sequences (Default: %v) + -max-null-ratio RATIO + skip indexing a file if it has more than this ratio of null bytes (Default: %v) -exclude FILE path to file containing a list of file patterns to exclude from indexing -filelist FILE @@ -83,7 +86,7 @@ With no path arguments, cindex -reset removes the index. ` func usage() { - fmt.Fprintf(os.Stderr, usageMessage, DEFAULT_MAX_FILE_LENGTH, DEFAULT_MAX_LINE_LENGTH, DEFAULT_MAX_TEXT_TRIGRAMS, DEFAULT_MAX_INVALID_UTF8_PERCENTAGE) + fmt.Fprintf(os.Stderr, usageMessage, DEFAULT_MAX_FILE_LENGTH, DEFAULT_MAX_LINE_LENGTH, DEFAULT_MAX_TEXT_TRIGRAMS, DEFAULT_MAX_INVALID_UTF8_RATIO, DEFAULT_MAX_NULL_RATIO) os.Exit(2) } @@ -107,7 +110,8 @@ var ( maxFileLen = flag.Int64("maxfilelen", DEFAULT_MAX_FILE_LENGTH, "skip indexing a file if longer than this size in bytes") maxLineLen = flag.Int("maxlinelen", DEFAULT_MAX_LINE_LENGTH, "skip indexing a file if it has a line longer than this size in bytes") maxTextTrigrams = flag.Int("maxtrigrams", DEFAULT_MAX_TEXT_TRIGRAMS, "skip indexing a file if it has more than this number of trigrams") - maxInvalidUTF8Ratio = flag.Float64("maxinvalidutf8ratio", DEFAULT_MAX_INVALID_UTF8_PERCENTAGE, "skip indexing a file if it has more than this ratio of invalid UTF-8 sequences") + maxInvalidUTF8Ratio = flag.Float64("maxinvalidutf8ratio", DEFAULT_MAX_INVALID_UTF8_RATIO, "skip indexing a file if it has more than this ratio of invalid UTF-8 sequences") + maxNullRatio = flag.Float64("max-null-ratio", DEFAULT_MAX_NULL_RATIO, "skip indexing a file if it has more than this ratio of null bytes") excludePatterns = []string{ ".csearchindex", @@ -350,6 +354,7 @@ func main() { ix.MaxLineLen = *maxLineLen ix.MaxTextTrigrams = *maxTextTrigrams ix.MaxInvalidUTF8Ratio = *maxInvalidUTF8Ratio + ix.MaxNullRatio = *maxNullRatio ix.AddPaths(args) walkChan := make(chan string) diff --git a/index/write.go b/index/write.go index de5e514091..ce2df7abb4 100644 --- a/index/write.go +++ b/index/write.go @@ -61,6 +61,7 @@ type IndexWriter struct { MaxTextTrigrams int MaxInvalidUTF8Ratio float64 + MaxNullRatio float64 } const npost = 64 << 20 / 8 // 64 MB worth of post entries @@ -141,9 +142,11 @@ func (ix *IndexWriter) Add(name string, f io.Reader, size int64) { n = int64(0) linelen = 0 inv_cnt = int64(0) + null_cnt = int64(0) b1 = byte(0) b2 = byte(0) max_invalid = int64(float64(size) * ix.MaxInvalidUTF8Ratio) + max_null = int64(float64(size) * ix.MaxNullRatio) ) for { tv = (tv << 8) & (1<<24 - 1) @@ -172,7 +175,7 @@ func (ix *IndexWriter) Add(name string, f io.Reader, size int64) { if !validUTF8(b1, b2) { if inv_cnt++; inv_cnt > max_invalid { if ix.LogSkip { - log.Printf("%s: skipped. High invalid UTF-8 ratio. total: %d invalid: %d ratio: %f\n", name, size, inv_cnt, float64(inv_cnt)/float64(size)) + log.Printf("%s: skipped. High invalid UTF-8 ratio. total: %d, invalid: %d, ratio: %f\n", name, size, inv_cnt, float64(inv_cnt)/float64(size)) } return } @@ -180,11 +183,13 @@ func (ix *IndexWriter) Add(name string, f io.Reader, size int64) { ix.trigram.Add(tv) } } - if (b1 == 0x00 || b2 == 0x00) && n >= 3 { - if ix.LogSkip { - log.Printf("%s: skipped. Binary file. Bytes %02X%02X at offset %d\n", name, (tv>>8)&0xFF, tv&0xFF, n) + if c == 0x00 { + if null_cnt++; null_cnt > max_null { + if ix.LogSkip { + log.Printf("%s: skipped. High null byte ratio. total: %d, null: %d, ratio: %f\n", name, size, null_cnt, float64(null_cnt)/float64(size)) + } + return } - return } if linelen++; linelen > ix.MaxLineLen { if ix.LogSkip {