From 5eab6b97ab8f85cbb10bccc6fe9b3578c4b03f8e Mon Sep 17 00:00:00 2001 From: SuperQ Date: Thu, 14 Mar 2024 10:53:26 +0100 Subject: [PATCH] Improve DNS lookup debugging Rework DNS lookups in order to improve debugging * Create a unified reverse DNS lookup function. * Add lookup timing logging * Add a scrape ID to the logger to identify concurrent scrapes. Signed-off-by: SuperQ --- collector/collector.go | 43 ++++++++++++++++++++++++++++++++------- collector/serverstats.go | 27 ++++++++++++------------ collector/sources.go | 18 +++++----------- collector/tracking.go | 44 ++++++++++++++++------------------------ 4 files changed, 73 insertions(+), 59 deletions(-) diff --git a/collector/collector.go b/collector/collector.go index 88677aa..22d6d00 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -18,7 +18,10 @@ import ( "net" "os" "path" + "slices" + "sort" "strings" + "sync/atomic" "time" "github.com/facebook/time/ntp/chrony" @@ -41,6 +44,9 @@ var ( ), prometheus.GaugeValue, } + + // Globally track scrapes to provide better logging context. + scrapeID atomic.Uint64 ) // Exporter collects chrony stats from the given server and exports @@ -136,14 +142,18 @@ func (e Exporter) dial() (net.Conn, error, func()) { // Collect implements prometheus.Collector. func (e Exporter) Collect(ch chan<- prometheus.Metric) { + logger := log.With(e.logger, "scrape_id", scrapeID.Add(1)) + start := time.Now() + level.Debug(logger).Log("msg", "Scrape starting") var up float64 defer func() { + level.Debug(logger).Log("msg", "Scrape completed", "seconds", time.Since(start).Seconds()) ch <- upMetric.mustNewConstMetric(up) }() conn, err, cleanup := e.dial() defer cleanup() if err != nil { - level.Debug(e.logger).Log("msg", "Couldn't connect to chrony", "address", e.address, "err", err) + level.Debug(logger).Log("msg", "Couldn't connect to chrony", "address", e.address, "err", err) return } @@ -152,26 +162,45 @@ func (e Exporter) Collect(ch chan<- prometheus.Metric) { client := chrony.Client{Sequence: 1, Connection: conn} if e.collectSources { - err = e.getSourcesMetrics(ch, client) + err = e.getSourcesMetrics(logger, ch, client) if err != nil { - level.Debug(e.logger).Log("msg", "Couldn't get sources", "err", err) + level.Debug(logger).Log("msg", "Couldn't get sources", "err", err) up = 0 } } if e.collectTracking { - err = e.getTrackingMetrics(ch, client) + err = e.getTrackingMetrics(logger, ch, client) if err != nil { - level.Debug(e.logger).Log("msg", "Couldn't get tracking", "err", err) + level.Debug(logger).Log("msg", "Couldn't get tracking", "err", err) up = 0 } } if e.collectServerstats { - err = e.getServerstatsMetrics(ch, client) + err = e.getServerstatsMetrics(logger, ch, client) if err != nil { - level.Debug(e.logger).Log("msg", "Couldn't get serverstats", "err", err) + level.Debug(logger).Log("msg", "Couldn't get serverstats", "err", err) up = 0 } } } + +func (e Exporter) dnsLookup(logger log.Logger, address net.IP) string { + start := time.Now() + defer func() { + level.Debug(logger).Log("msg", "DNS lookup took", "seconds", time.Since(start).Seconds()) + }() + if !e.dnsLookups { + return address.String() + } + names, err := net.LookupAddr(address.String()) + if err != nil || len(names) < 1 { + return address.String() + } + for i, name := range names { + names[i] = strings.TrimRight(name, ".") + } + sort.Strings(names) + return strings.Join(slices.Compact(names), ",") +} diff --git a/collector/serverstats.go b/collector/serverstats.go index a7a8175..72fd096 100644 --- a/collector/serverstats.go +++ b/collector/serverstats.go @@ -17,6 +17,7 @@ import ( "fmt" "github.com/facebook/time/ntp/chrony" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" ) @@ -137,12 +138,12 @@ var ( } ) -func (e Exporter) getServerstatsMetrics(ch chan<- prometheus.Metric, client chrony.Client) error { +func (e Exporter) getServerstatsMetrics(logger log.Logger, ch chan<- prometheus.Metric, client chrony.Client) error { packet, err := client.Communicate(chrony.NewServerStatsPacket()) if err != nil { return err } - level.Debug(e.logger).Log("msg", "Got 'serverstats' response", "serverstats_packet", packet.GetStatus()) + level.Debug(logger).Log("msg", "Got 'serverstats' response", "serverstats_packet", packet.GetStatus()) serverstats, ok := packet.(*chrony.ReplyServerStats3) if !ok { @@ -150,37 +151,37 @@ func (e Exporter) getServerstatsMetrics(ch chan<- prometheus.Metric, client chro } ch <- serverstatsNTPHits.mustNewConstMetric(float64(serverstats.NTPHits)) - level.Debug(e.logger).Log("msg", "Serverstats NTP Hits", "ntp_hits", serverstats.NTPHits) + level.Debug(logger).Log("msg", "Serverstats NTP Hits", "ntp_hits", serverstats.NTPHits) ch <- serverstatsNKEHits.mustNewConstMetric(float64(serverstats.NKEHits)) - level.Debug(e.logger).Log("msg", "Serverstats NKE Hits", "nke_hits", serverstats.NKEHits) + level.Debug(logger).Log("msg", "Serverstats NKE Hits", "nke_hits", serverstats.NKEHits) ch <- serverstatsCMDHits.mustNewConstMetric(float64(serverstats.CMDHits)) - level.Debug(e.logger).Log("msg", "Serverstats CMD Hits", "cmd_hits", serverstats.CMDHits) + level.Debug(logger).Log("msg", "Serverstats CMD Hits", "cmd_hits", serverstats.CMDHits) ch <- serverstatsNTPDrops.mustNewConstMetric(float64(serverstats.NTPDrops)) - level.Debug(e.logger).Log("msg", "Serverstats NTP Drops", "ntp_drops", serverstats.NTPDrops) + level.Debug(logger).Log("msg", "Serverstats NTP Drops", "ntp_drops", serverstats.NTPDrops) ch <- serverstatsNKEDrops.mustNewConstMetric(float64(serverstats.NKEDrops)) - level.Debug(e.logger).Log("msg", "Serverstats NKE Drops", "nke_drops", serverstats.NKEDrops) + level.Debug(logger).Log("msg", "Serverstats NKE Drops", "nke_drops", serverstats.NKEDrops) ch <- serverstatsCMDDrops.mustNewConstMetric(float64(serverstats.CMDDrops)) - level.Debug(e.logger).Log("msg", "Serverstats CMD Drops", "cmd_drops", serverstats.CMDDrops) + level.Debug(logger).Log("msg", "Serverstats CMD Drops", "cmd_drops", serverstats.CMDDrops) ch <- serverstatsLogDrops.mustNewConstMetric(float64(serverstats.LogDrops)) - level.Debug(e.logger).Log("msg", "Serverstats Log Drops", "log_drops", serverstats.LogDrops) + level.Debug(logger).Log("msg", "Serverstats Log Drops", "log_drops", serverstats.LogDrops) ch <- serverstatsNTPAuthHits.mustNewConstMetric(float64(serverstats.NTPAuthHits)) - level.Debug(e.logger).Log("msg", "Serverstats Authenticated Packets", "auth_hits", serverstats.NTPAuthHits) + level.Debug(logger).Log("msg", "Serverstats Authenticated Packets", "auth_hits", serverstats.NTPAuthHits) ch <- serverstatsNTPInterleavedHits.mustNewConstMetric(float64(serverstats.NTPInterleavedHits)) - level.Debug(e.logger).Log("msg", "Serverstats Interleaved Packets", "interleaved_hits", serverstats.NTPInterleavedHits) + level.Debug(logger).Log("msg", "Serverstats Interleaved Packets", "interleaved_hits", serverstats.NTPInterleavedHits) ch <- serverstatsNTPTimestamps.mustNewConstMetric(float64(serverstats.NTPTimestamps)) - level.Debug(e.logger).Log("msg", "Serverstats Timestamps Held", "ntp_timestamps_held", serverstats.NTPTimestamps) + level.Debug(logger).Log("msg", "Serverstats Timestamps Held", "ntp_timestamps_held", serverstats.NTPTimestamps) ch <- serverstatsNTPSpanSeconds.mustNewConstMetric(float64(serverstats.NTPSpanSeconds)) - level.Debug(e.logger).Log("msg", "Serverstats Timestamps Span", "ntp_timestamps_span", serverstats.NTPSpanSeconds) + level.Debug(logger).Log("msg", "Serverstats Timestamps Span", "ntp_timestamps_span", serverstats.NTPSpanSeconds) return nil } diff --git a/collector/sources.go b/collector/sources.go index 159f68a..550bc36 100644 --- a/collector/sources.go +++ b/collector/sources.go @@ -17,11 +17,9 @@ import ( "encoding/binary" "fmt" "math" - "net" - "sort" - "strings" "github.com/facebook/time/ntp/chrony" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" ) @@ -92,12 +90,12 @@ var ( } ) -func (e Exporter) getSourcesMetrics(ch chan<- prometheus.Metric, client chrony.Client) error { +func (e Exporter) getSourcesMetrics(logger log.Logger, ch chan<- prometheus.Metric, client chrony.Client) error { packet, err := client.Communicate(chrony.NewSourcesPacket()) if err != nil { return err } - level.Debug(e.logger).Log("msg", "Got 'sources' response", "sources_packet", packet.GetStatus()) + level.Debug(logger).Log("msg", "Got 'sources' response", "sources_packet", packet.GetStatus()) sources, ok := packet.(*chrony.ReplySources) if !ok { @@ -107,7 +105,7 @@ func (e Exporter) getSourcesMetrics(ch chan<- prometheus.Metric, client chrony.C results := make([]chrony.ReplySourceData, sources.NSources) for i := 0; i < int(sources.NSources); i++ { - level.Debug(e.logger).Log("msg", "Fetching source", "source", i) + level.Debug(logger).Log("msg", "Fetching source", "source", i) packet, err = client.Communicate(chrony.NewSourceDataPacket(int32(i))) if err != nil { return fmt.Errorf("Failed to get sourcedata response: %d", i) @@ -121,13 +119,7 @@ func (e Exporter) getSourcesMetrics(ch chan<- prometheus.Metric, client chrony.C for _, r := range results { sourceAddress := r.IPAddr.String() - sourceName := sourceAddress - if e.dnsLookups { - // Ignore reverse lookup errors. - sourceNames, _ := net.LookupAddr(sourceAddress) - sort.Strings(sourceNames) - sourceName = strings.Join(sourceNames, ",") - } + sourceName := e.dnsLookup(logger, r.IPAddr) if r.Mode == chrony.SourceModeRef && r.IPAddr.To4() != nil { sourceName = chrony.RefidToString(binary.BigEndian.Uint32(r.IPAddr)) diff --git a/collector/tracking.go b/collector/tracking.go index a1d33ec..ece7a4e 100644 --- a/collector/tracking.go +++ b/collector/tracking.go @@ -16,9 +16,9 @@ package collector import ( "fmt" "net" - "strings" "github.com/facebook/time/ntp/chrony" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" ) @@ -162,74 +162,66 @@ var ( } ) -func (e Exporter) chronyFormatName(tracking chrony.Tracking) string { +func (e Exporter) trackingFormatName(logger log.Logger, tracking chrony.Tracking) string { if tracking.IPAddr.IsUnspecified() { return chrony.RefidToString(tracking.RefID) } - if !e.dnsLookups { - return tracking.IPAddr.String() - } - - names, err := net.LookupAddr(tracking.IPAddr.String()) - if err != nil || len(names) < 1 { - return tracking.IPAddr.String() - } - return strings.TrimRight(names[0], ".") + return e.dnsLookup(logger, tracking.IPAddr) } -func (e Exporter) getTrackingMetrics(ch chan<- prometheus.Metric, client chrony.Client) error { +func (e Exporter) getTrackingMetrics(logger log.Logger, ch chan<- prometheus.Metric, client chrony.Client) error { packet, err := client.Communicate(chrony.NewTrackingPacket()) if err != nil { return err } - level.Debug(e.logger).Log("msg", "Got 'tracking' response", "tracking_packet", packet.GetStatus()) + level.Debug(logger).Log("msg", "Got 'tracking' response", "tracking_packet", packet.GetStatus()) tracking, ok := packet.(*chrony.ReplyTracking) if !ok { return fmt.Errorf("Got wrong 'tracking' response: %q", packet) } - ch <- trackingInfo.mustNewConstMetric(1.0, tracking.IPAddr.String(), e.chronyFormatName(tracking.Tracking), chrony.RefidAsHEX(tracking.RefID)) + ch <- trackingInfo.mustNewConstMetric(1.0, tracking.IPAddr.String(), e.trackingFormatName(logger, tracking.Tracking), chrony.RefidAsHEX(tracking.RefID)) ch <- trackingLastOffset.mustNewConstMetric(tracking.LastOffset) - level.Debug(e.logger).Log("msg", "Tracking Last Offset", "offset", tracking.LastOffset) + level.Debug(logger).Log("msg", "Tracking Last Offset", "offset", tracking.LastOffset) ch <- trackingRefTime.mustNewConstMetric(float64(tracking.RefTime.UnixNano()) / 1e9) - level.Debug(e.logger).Log("msg", "Tracking Ref Time", "ref_time", tracking.RefTime) + level.Debug(logger).Log("msg", "Tracking Ref Time", "ref_time", tracking.RefTime) ch <- trackingSystemTime.mustNewConstMetric(float64(tracking.CurrentCorrection)) - level.Debug(e.logger).Log("msg", "Tracking System Time", "system_time", tracking.CurrentCorrection) + level.Debug(logger).Log("msg", "Tracking System Time", "system_time", tracking.CurrentCorrection) remoteTracking := 1.0 if tracking.IPAddr.Equal(trackingLocalIP) { remoteTracking = 0.0 } ch <- trackingRemoteTracking.mustNewConstMetric(remoteTracking) - level.Debug(e.logger).Log("msg", "Tracking is remote", "bool_value", remoteTracking) + level.Debug(logger).Log("msg", "Tracking is remote", "bool_value", remoteTracking) ch <- trackingRMSOffset.mustNewConstMetric(tracking.RMSOffset) - level.Debug(e.logger).Log("msg", "Tracking RMS Offset", "rms_offset", tracking.RMSOffset) + level.Debug(logger).Log("msg", "Tracking RMS Offset", "rms_offset", tracking.RMSOffset) ch <- trackingRootDelay.mustNewConstMetric(tracking.RootDelay) - level.Debug(e.logger).Log("msg", "Tracking Root delay", "root_delay", tracking.RootDelay) + level.Debug(logger).Log("msg", "Tracking Root delay", "root_delay", tracking.RootDelay) ch <- trackingRootDispersion.mustNewConstMetric(tracking.RootDispersion) - level.Debug(e.logger).Log("msg", "Tracking Root dispersion", "root_dispersion", tracking.RootDispersion) + level.Debug(logger).Log("msg", "Tracking Root dispersion", "root_dispersion", tracking.RootDispersion) ch <- trackingFrequency.mustNewConstMetric(tracking.FreqPPM) - level.Debug(e.logger).Log("msg", "Tracking Frequency", "frequency", tracking.FreqPPM) + level.Debug(logger).Log("msg", "Tracking Frequency", "frequency", tracking.FreqPPM) ch <- trackingResidualFrequency.mustNewConstMetric(tracking.ResidFreqPPM) - level.Debug(e.logger).Log("msg", "Tracking Residual Frequency", "residual_frequency", tracking.ResidFreqPPM) + level.Debug(logger).Log("msg", "Tracking Residual Frequency", "residual_frequency", tracking.ResidFreqPPM) ch <- trackingSkew.mustNewConstMetric(tracking.SkewPPM) - level.Debug(e.logger).Log("msg", "Tracking Skew", "skew", tracking.SkewPPM) + level.Debug(logger).Log("msg", "Tracking Skew", "skew", tracking.SkewPPM) ch <- trackingUpdateInterval.mustNewConstMetric(tracking.LastUpdateInterval) - level.Debug(e.logger).Log("msg", "Tracking Last Update Interval", "update_interval", tracking.LastUpdateInterval) + level.Debug(logger).Log("msg", "Tracking Last Update Interval", "update_interval", tracking.LastUpdateInterval) ch <- trackingStratum.mustNewConstMetric(float64(tracking.Stratum)) - level.Debug(e.logger).Log("msg", "Tracking Stratum", "stratum", tracking.Stratum) + level.Debug(logger).Log("msg", "Tracking Stratum", "stratum", tracking.Stratum) return nil }