This repository has been archived by the owner on Nov 5, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
harvestd.yaml
255 lines (223 loc) · 11.1 KB
/
harvestd.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
### Default (baseline) configuration parameters.
### DO NOT ever change this config, use -c commandline option instead!
collectors:
  # Modules that collect the actual datapoints to be sent
  _default: # used as a base for all other sections here
    enabled: true
    # debug: # auto-filled from global "debug" section, if not specified
  ping:
    # Reports average (ewma) rtt of icmp ping to each specified host and packet loss (if any).
    interval: 5 # seconds between sending-out pings
    ewma_factor: 0.3 # ewma factor for rtt values
    resolve:
      no_reply: 30 # re-resolve hostnames after 30 seconds w/o reply
      time: 600 # re-resolve hostnames after fixed 600s intervals
      # "max_retries" restarts ping subprocess (e.g. to apply changes to
      #  /etc/hosts or other libc resolver configuration) after N name resolution failures.
      # Also, if resolver fails even after restart (i.e. on start), disable warnings
      #  (but issuing a message on next success) after that number of retries.
      max_retries: 5
    hosts: # explicitly split into ipv4/ipv6 to control how hostnames are resolved
      ipv4:
        # google_com: google.com
        # google_dns: 8.8.8.8
      ipv6:
        # ipv6_google_com: ipv6.google.com
        # ipv6_tunnelbroker_net: ipv6.tunnelbroker.net
  cron_log:
    # Reports start/stop, run time and errors for cron jobs from a logfile.
    # I use simple wrappers for cron-jobs to produce these logs (among other things):
    #  https://github.com/mk-fg/fgtk#task https://github.com/mk-fg/fgtk/tree/master/task
    source: # must be filled with path to a log file
    aliases: # either [alias, regexp] or ["_" + regexp_group, regexp], see "_script" example below
      # - ['logrotate', '(^|\b)logrotate\b']
      # - ['locate', '(^|\b)updatedb\b']
      # - ['_script', '/etc/cron\.\w+/*(?P<script>\S+)(\s+|$)']
    lines: # only named regexp groups here are mandatory, all lines are optional
      init: 'task\[(\d+|-)\]:\s+Queued\b[^:]*: (?P<job>.*)$'
      start: 'task\[(\d+|-)\]:\s+Started\b[^:]*: (?P<job>.*)$'
      finish: 'task\[(\d+|-)\]:\s+Finished\b[^:]*: (?P<job>.*)$'
      duration: 'task\[(\d+|-)\]:\s+Finished \([^):]*\bduration=(?P<val>\d+)[,)][^:]*: (?P<job>.*)$'
      error: 'task\[(\d+|-)\]:\s+Finished \([^):]*\bstatus=0*[^0]+0*[,)][^:]*: (?P<job>.*)$'
    xattr_name: user.collectd.logtail.pos # used to mark "last position" in the tailed log
  slabinfo:
    # Reports RAM usage by kernel, allocated via slab subsystem.
    include_prefixes: # takes priority over exclude_prefixes
    exclude_prefixes: ['kmalloc-', 'kmem_cache', 'dma-kmalloc-']
    pass_zeroes: false # to skip creating a lot of metrics for always-0 (for particular hosts) slab counts
  cgacct:
    # Accounting of cpu/mem/io for systemd-created per-service cgroups.
    cg_root: /sys/fs/cgroup
    systemd_prefix: system.slice # was just "system" for older versions
    resource_controllers: ['cpuacct', 'memory', 'blkio'] # mapped to methods in cgacct.py
  sysstat:
    # Processing of sysstat logs - cpu, io, network, temperatures, etc.
    # See collectors/sysstat.py for full list of parameters.
    force_interval: true # skip intervals of different length than core.interval
    force_interval_fuzz: 10 # +/- % to consider acceptable interval fuzz
    sa_path: /var/log/sa
    rate: # see "graphite_metrics.collectors.rate_limit"
      limiting_enabled: true
      max_interval: 30 # cycles
      sampling: 3
    skip:
      redundant: true # skip metrics, redundant with other default collectors
      sections: # optional list of sections in "sadf -j -- -A" output to skip, example: ['disk', 'cpu-load-all']
      # NOTE(review): nesting of "older_than_days" under "skip" reconstructed - confirm against upstream
      older_than_days: 4 # do not check sysstat logs older than this number of days on each run
    xattr_name: user.sa_carbon.pos # used to mark "last position" in sa logs
    # Max timespan to dump with "sadf -j" in seconds.
    # Use if resulting json is too huge for processing in one go (e.g. ram-wise).
    max_dump_span: # example: 7200
  iptables_counts:
    # Packet/byte counters from iptables/ip6tables.
    # In my case, these bindings are generated from higher-level configuration
    #  by trilobite script (https://github.com/mk-fg/trilobite).
    rule_metrics_path:
      # Paths to files with "table_name chain_name rule_no metric_name"
      #  lines for iptables/ip6tables.
      # Example line in such files: "filter FORWARD 30 network.services.tor.out"
      ipv4: # example: /var/lib/iptables/metrics.list
      ipv6: # example: /var/lib/ip6tables/metrics.list
    # One of: pkt, bytes, both (metric.pkt + metric.bytes), both_flat (metric_pkt + metric_bytes)
    units: both_flat
    # Consider counter invalid (and skip it) if rule has changed without rule_metrics file update
    discard_changed_rules: true
  irq:
    # Interrupt counters (/proc/interrupts, /proc/softirqs) processing.
    # No configuration.
  memstats:
    # System memory usage statistics (/proc/vmstat, /proc/meminfo).
    # No configuration.
  memfrag:
    # Memory fragmentation statistics (/proc/buddyinfo, /proc/pagetypeinfo).
    # No configuration.
  stats:
    # General system statistics (/proc/stat) - irq.total.{hard,soft}, processes.forks, etc.
    # No configuration.
  cjdns_peer_stats:
    # Traffic/state stats for cjdns daemon - https://github.com/cjdelisle/cjdns/
    # Collects these via InterfaceController_peerStats() admin interface call.
    # Doesn't need/use threaded cjdnsadmin module that comes with it.
    enabled: false # more rare than other stats
    # How to get peer metric name.
    # Can be either "pubkey", "ipv6" or whatever key (e.g. "user")
    #  that cjdns returns or a list of these, to use first one available (e.g. ["user", "ipv6"]).
    # Note that "pubkey" and "ipv6" keys are synthetic and always available.
    peer_id: ipv6
    # Path to standard cjdcmd/cjdmaid/cjdnsadmin configuration file,
    #  which should contain address, port and password keys.
    # See https://github.com/cjdelisle/cjdns/blob/master/contrib/python/cjdnsadminmaker.py
    cjdnsadmin_conf: ~/.cjdnsadmin
    # Prefix under which to create "<peer_id>.{bytes_in,bytes_out}" counters
    prefix: network.services.cjdns.peers
    filter:
      # Log stats only for peers with following connection properties.
      direction: any # one of "any", "incoming", "outgoing"
      established_only: true # don't send byte counters of configured but disconnected peers
    # Some extra metrics to pass along with byte counters.
    # Each one can be set to null or false to skip sending it.
    special_metrics:
      # Add specified key for each peer, set to 0 or 1, depending on connection state.
      peer_link: link
      # Total number of configured peers.
      count: network.services.cjdns.peer_state.total
      # Prefix for counts of peers by state (e.g. "established", "unresponsive", etc).
      count_state: network.services.cjdns.peer_state
    timeout: 8 # how long to wait for cjdns responses
    recv_retries: 10 # how many responses with wrong txid (likely prev timeouts) to tolerate
  # self_profiling: # TODO
  #   main_loop: true
  #   collectors: true
processors:
  # Modules that process the datapoints before they are passed to sinks.
  # Datapoints are passed to processors in the same order as they're specified here,
  #  with all the entry points without config section afterwards in no particular order.
  # Passed a list of sinks along with the datapoints,
  #  so can facilitate filtering, by dropping particular sinks from the list.
  _default: # used as a base for all other sections here
    enabled: true
    # debug: # auto-filled from global "debug" section, if not specified
  hostname_prefix:
    hostname: # uname(), if unset
sinks:
  _default: # used as a base for all other sections here
    # Default host/port for sinks can be overridden by CLI flags
    host: localhost # can be specified as "host[:port]"
    default_port: 2003
    enabled: false # should be explicitly enabled
    # debug: # auto-filled from global "debug" section, if not specified
  carbon_socket:
    enabled: true # the only sink enabled by default
    max_reconnects: # before bailing out with the error
    reconnect_delay: 5 # seconds
  librato_metrics: # see http://dev.librato.com/v1/post/metrics
    http_parameters:
      # See http://docs.python-requests.org/en/latest/api/#main-interface for a complete list
      url: https://metrics-api.librato.com/v1/metrics
      auth: ['example@librato.com:', '75AFDB82'] # override with the actual values, no url-encoding needed
      timeout: # defaults to half of the loop.interval or 30, if former is inaccessible
      # Might be useful in some setups:
      # proxies:
      # cert:
      # verify: false
    # Derive "source" field from first component of metric name
    # See also "hostname_prefix" processor
    source_from_prefix: true
    # Explicit source specification, overrides "source_from_prefix", if set
    # If neither "source" nor "source_from_prefix" are set, it won't be sent at all
    source:
    # Discard "measure_time" field of individual metrics,
    #  sending just one value (when data reached the sink)
    # Saves quite a bit of traffic (roughly 1/3),
    #  but MUST NOT be used with historical data collectors, like sysstat
    unified_measure_time: false
    # Split measurement submissions into concurrent requests, as suggested by docs
    # Goal is to minimize overall submission time given the current api limitations
    # Uses async api in requests module, which requires gevent (gevent.org),
    #  will be disabled with a warning (or fail, if enabled explicitly), if unavailable
    chunk_data:
      # Can be explicitly disabled (enabled: false) to remove gevent-related
      #  warnings on init, or enabled (=true) to fail if async api is unavailable
      enabled:
      max_chunk_size: 500
      max_concurrent_requests: 10 # 0 or false to remove this limit
  # dump: # just logs all the datapoints with level=INFO for testing purposes
  #   enabled: true
loop:
  name: basic # entry point name to use, only one loop can be used
  interval: 60 # seconds
core:
  # Emulate filesystem extended attributes (used in some collectors
  #  like sysstat or cron_log), storing per-path data in a simple shelve db.
  # Done by faking "xattr" module. Attached data will be lost on path changes.
  # Specify a path to db file (will be created) to use it.
  xattr_emulation:
debug: # values here can be overridden by special CLI flags
  dry_run: false
  logging: # see http://docs.python.org/library/logging.config.html
    # "custom" level means WARNING or DEBUG, depending on CLI options
    warnings: true # capture python warnings
    tracebacks: true # much easier to debug with these, but noisy and multiline
    version: 1
    formatters:
      basic:
        format: '%(asctime)s :: %(levelname)s :: %(name)s: %(message)s'
        datefmt: '%Y-%m-%d %H:%M:%S'
    handlers:
      console:
        class: logging.StreamHandler
        stream: ext://sys.stdout
        formatter: basic
        level: custom
      # file:
      #   class: logging.handlers.WatchedFileHandler
      #   filename: /var/log/harvestd.log
      #   formatter: basic
      #   encoding: utf-8
      #   level: DEBUG
    # loggers:
    #   graphite_metrics.collectors.irq:
    #     level: ERROR
    root:
      handlers: [console]
      level: custom