forked from diskoverdata/diskover-community
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiskover.cfg
168 lines (152 loc) · 6.88 KB
/
diskover.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
; diskover config file
; if you make any changes, restart worker bots so they get the new config
[excludes]
; patterns for directories and files to skip during a crawl; each value is a comma-separated list
; directory names and absolute paths you want to exclude from crawl, case-sensitive, can include wildcards (.* or backup* or /dir/dirname* or *tmp or *tmp* etc)
dirs = .*,.snapshot,.Snapshot,.zfs
; files you want to exclude from crawl, case-sensitive, can include wildcards (.*, *.doc or NULLEXT for files with no extension)
files = .*,Thumbs.db,.DS_Store,._.DS_Store,.localized,desktop.ini
[includes]
; whitelist patterns; both keys are commented out in this file, so no whitelist is configured here
; NOTE(review): presumably a whitelisted entry is crawled even if it matches an [excludes] pattern (e.g. .recycle vs .*) - confirm
; directory names and absolute paths you want to include (whitelist), case-sensitive, you don't need to whitelist rootdir (-d rootdir)
;dirs = .recycle
; files you want to include (whitelist), case-sensitive
;files =
[ownersgroups]
; control how owner (username) and group fields are stored for file and directory docs
; all keys below are commented out in this file, so the defaults stated in each comment apply
; store uids and gids instead of trying to get owner and group names (default is False)
;uidgidonly = False
; set to True if owner/group names contain a domain name (default is False)
;domain = False
; character separator used on cifs/nfs mounts to separate user/group and domain name, usually \\ or @
;domainsep = \\
; if domain name comes first before character separator, set this to True, otherwise False (default is True)
;domainfirst = True
; when indexing owner and group fields, keep the domain name (default is False)
;keepdomain = False
[autotag]
; pattern dictionaries for diskover bots to use when auto-tagging, values are case-sensitive, can include wildcard for ext, name or path (tmp* or TMP* or *tmp or *TMP* etc)
; each value is a JSON-style list of dictionaries written on a single line; both keys are commented out in this file
; NOTE(review): mtime/atime/ctime are presumably ages in days - confirm
;files = [{"name": [], "name_exclude": [], "ext": ["tmp*", "TMP*", "temp*", "TEMP*", "cache*", "CACHE*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
;dirs = [{"name": ["*tmp*", "*TMP*", "*temp*", "*TEMP*", "*Temp*", "*cache*", "*CACHE*", "*Cache*"], "name_exclude": ["*templates*", "*Templates*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
[storagecost]
; storage cost per GB (default is 0.03 cents per GB)
; NOTE(review): "0.03 cents" may be intended as 0.03 currency units (i.e. 3 cents) per GB - confirm
costpergb = 0.03
; use decimal base 10 (1000) or binary base 2 (1024) for GB size (default is 2, set to 2 or 10)
base = 2
; pattern dictionaries for diskover bots to use when determining cost per GB (overrides costpergb above)
; can also specify file name with json, example paths = storagecost.paths.json
;paths = [{"path": ["*/fastdiskpath1/*", "*/Fastdiskpath2/*"], "path_exclude": [], "costpergb": 0.05}, {"path": ["*/slowdiskpath1/*", "*/Slowdiskpath2/*"], "path_exclude": [], "costpergb": 0.02}]
;times = [{"mtime": 180, "atime": 0, "ctime": 180, "costpergb": 0.02}]
; deciding factor if a match is in both paths and times, can be path or time
;priority = path
[elasticsearch]
; Elasticsearch (ES) connection, index and crawl-time tuning settings
; uncomment the below three lines if you are using AWS ES
; NOTE(review): the sample value is aws = False; presumably it must be changed to True to actually enable AWS ES - confirm
;aws = False
;host = search-diskover-es-cluster-eg3yztrvzb6qucroyyjk2vokza.ap-northeast-1.es.amazonaws.com
;port = 443
; below two lines are for local ES, comment out if you are using AWS ES
; (leaving both host/port pairs active would define duplicate keys in this section)
host = localhost
port = 9200
; uncomment the below two lines if you installed X-Pack, for http-auth
; (changeme is a placeholder, replace it with the real password)
;user = elastic
;password = changeme
; index name for ES, cli arg overwrites this
indexname = diskover-index
; timeout for connection to ES (default is 10)
timeout = 30
; number of connections kept open to ES when crawling (default is 10)
maxsize = 20
; max retries for ES operations (default is 0)
maxretries = 10
; wait for at least yellow status before bulk uploading (default is False), set to True if you want to wait
wait = False
; chunk size for ES bulk operations (default is 500)
chunksize = 1000
; number of shards for index (default is 5)
shards = 1
; number of replicas for index (default is 1)
replicas = 0
; the below settings are to optimize ES for crawling
; index refresh interval (default is 1s), set to -1 to disable refresh during crawl (fastest performance but no index searches), after crawl is set back to 1s
indexrefresh = 30s
; disable replicas during crawl - set to True to turn off replicas or False to keep on (default False), after crawl is set back to replicas value above
disablereplicas = True
; transaction log flush threshold size (default 512mb)
translogsize = 1gb
[redis]
; Redis connection and rq queue settings
host = 127.0.0.1
port = 6379
;password =
; cache directory times in Redis
; used for -I index2 when comparing directory times to get metadata from index2 instead of off disk
; set to True to cache dir times or False to turn off (default False)
cachedirtimes = False
; how long in seconds directory keys live in Redis (default 1 day); the value set here, 604800, is 7 days
dirtimesttl = 604800
; database to use (default is 0)
db = 0
; rq default time out in sec (default 86400, i.e. 1 day)
timeout = 86400
; rq queue names to use (default is diskover, diskover_crawl, diskover_calcdir)
queue = diskover
queuecrawl = diskover_crawl
queuecalcdir = diskover_calcdir
[adaptivebatch]
; adaptive batch settings when using -a (intelligent crawling)
; batch size (number of dirs) to start at
startsize = 50
; maximum size of batch
maxsize = 500
; amount added to or subtracted from the batch size on each adjustment (increases when queue is 0, decreases when > 0)
stepsize = 10
[workerbot]
; worker bot logging settings
; enable bot logs (True or False), bot logs will slow down crawl, use for debugging only
botlogs = False
; log file directory to store worker logs
; log files are named diskover_bot_worker_<workername>_<time>_log
logfiledir = /tmp
[paths]
; used by diskover socket server
; both values below are relative, not absolute paths
; NOTE(review): presumably ./diskover.py resolves against the socket server's working directory and python against PATH - confirm
; path to diskover.py (default is ./diskover.py)
diskoverpath = ./diskover.py
; path to python executable (default is python)
pythonpath = python
[socketlistener]
; hostname and port (TCP) for diskover socket server for remote commands
; NOTE(review): 0.0.0.0 conventionally means listen on all network interfaces; consider 127.0.0.1 if remote commands should only come from this host - confirm
host = 0.0.0.0
port = 9999
; max connections for diskover socket server
maxconnections = 5
; port (TCP) for diskover socket server for messages from diskover treewalk client
twcport = 9998
[dupescheck]
; duplicate file check settings, all sizes are in bytes
; read size (bytes) for md5 sum check (how many bytes to read in at a time when md5 checking, default 64 KB = 65536)
readsize = 65536
; max size (bytes) of files to check (files larger than this will be skipped, default 1 GB = 1073741824)
maxsize = 1073741824
; bytes to check at start and end of file before doing md5 sum check (set large enough to account for file header info, default is 64)
checkbytes = 64
[crawlbot]
; settings for the continuous scanner
; time to sleep (seconds) between checking for directory changes
sleeptime = 0.1
; number of threads for checking directories, setting this to 2x the number of cores is a good starting point
threads = 8
; max time in seconds to wait until the bots finish crawling (default 120)
maxwaittime = 120
[gource]
; should match the value set in diskover-gource.sh
; NOTE(review): units are not stated here, presumably seconds - confirm
maxfilelag = 0.1
[qumulo]
; Qumulo connection settings; all keys are commented out in this file
; the admin/admin values shown are placeholders, replace them with real credentials before uncommenting
; Qumulo host
;cluster = 172.16.129.10
; Qumulo api user
;api_user = admin
; Qumulo api password
;api_password = admin
[crawlapi]
; crawl api settings; all keys are commented out in this file
; crawl api url endpoint
;url = http://localhost:8080/api
; optional api login
;user = admin
;pass = admin
; number of items per page for each directory list request
;pagesize = 1000