Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make replication stats return whole number #28824

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/replication-stats-conversion-failure.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: deprecation

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: postgresqlreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: "Deprecate the PostgreSQL replication lag metric `postgresql.wal.lag` in favor of the more precise `postgresql.wal.delay`"

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [26714]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ func stopSampling(_ context.Context) error {
// no load scraper is running nothing to do
return nil
}

// only stop sampling if all load scrapers have been closed
scraperCount--
if scraperCount > 0 {
Expand Down
70 changes: 63 additions & 7 deletions receiver/postgresqlreceiver/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,19 @@ import (
"github.com/lib/pq"
"go.opentelemetry.io/collector/config/confignet"
"go.opentelemetry.io/collector/config/configtls"
"go.opentelemetry.io/collector/featuregate"
"go.uber.org/multierr"
)

// lagMetricsInSecondsFeatureGateID is the identifier under which the
// replication lag feature gate is registered in the global registry.
const lagMetricsInSecondsFeatureGateID = "postgresqlreceiver.preciselagmetrics"

// preciseLagMetricsFg selects which replication-stats query this client runs:
// getReplicationStats falls back to getDeprecatedReplicationStats while the
// gate is not enabled.
// NOTE(review): the gate is registered at the Alpha stage, which is expected
// to mean "disabled unless explicitly enabled" — confirm against the
// featuregate package documentation.
var preciseLagMetricsFg = featuregate.GlobalRegistry().MustRegister(
lagMetricsInSecondsFeatureGateID,
featuregate.StageAlpha,
featuregate.WithRegisterDescription("Metric `postgresql.wal.lag` is replaced by more precise `postgresql.wal.delay`."),
featuregate.WithRegisterFromVersion("0.89.0"),
)

// databaseName is a name that refers to a database so that it can be uniquely referred to later
// i.e. database1
type databaseName string
Expand Down Expand Up @@ -484,18 +494,63 @@ func (c *postgreSQLClient) getMaxConnections(ctx context.Context) (int64, error)
// replicationStats holds the values scanned from a single row of the
// pg_stat_replication queries issued by this client.
type replicationStats struct {
// clientAddr is the client_addr column of the row.
clientAddr string
// pendingBytes is pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn),
// or -1 when that expression is NULL.
pendingBytes int64
flushLag int64
replayLag int64
writeLag int64
// The *Int fields carry the lag as whole seconds and are filled in by
// getDeprecatedReplicationStats; each is -1 when the lag column is NULL.
flushLagInt int64 // Deprecated
replayLagInt int64 // Deprecated
writeLagInt int64 // Deprecated
// The float64 fields carry the lag in seconds including the fractional part.
// NOTE(review): presumably populated by getReplicationStats when the
// precise-lag feature gate is enabled — confirm against that function.
flushLag float64
replayLag float64
writeLag float64
}

// getDeprecatedReplicationStats queries pg_stat_replication and returns one
// replicationStats per row, with the write, flush and replay lag reported as
// whole seconds in the deprecated *Int fields.
//
// NULL lag columns and a NULL pending-bytes expression are reported as -1.
// A row that cannot be scanned is skipped and its error is accumulated, so the
// function may return a partial result together with a non-nil error. A
// failure to run the query itself returns a nil slice and the wrapped error.
//
// NOTE(review): client_addr is scanned into a plain string, so a row whose
// client_addr is NULL would be reported as a scan error and skipped — confirm
// whether that is intended.
func (c *postgreSQLClient) getDeprecatedReplicationStats(ctx context.Context) ([]replicationStats, error) {
	query := `SELECT
	client_addr,
	coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn), -1) AS replication_bytes_pending,
	extract('epoch' from coalesce(write_lag, '-1 seconds'))::integer,
	extract('epoch' from coalesce(flush_lag, '-1 seconds'))::integer,
	extract('epoch' from coalesce(replay_lag, '-1 seconds'))::integer
	FROM pg_stat_replication;
	`
	rows, err := c.client.QueryContext(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("unable to query pg_stat_replication: %w", err)
	}
	defer rows.Close()

	var rs []replicationStats
	// Named errs rather than errors so the standard library package name is
	// not shadowed inside this function.
	var errs error
	for rows.Next() {
		var client string
		var replicationBytes int64
		var writeLagInt, flushLagInt, replayLagInt int64
		err = rows.Scan(&client, &replicationBytes,
			&writeLagInt, &flushLagInt, &replayLagInt)
		if err != nil {
			// Keep going: one bad row should not hide the remaining replicas.
			errs = multierr.Append(errs, err)
			continue
		}
		rs = append(rs, replicationStats{
			clientAddr:   client,
			pendingBytes: replicationBytes,
			replayLagInt: replayLagInt,
			writeLagInt:  writeLagInt,
			flushLagInt:  flushLagInt,
		})
	}

	// rows.Next returns false both at the end of the result set and when
	// iteration fails; rows.Err distinguishes the two so a truncated result
	// is not silently reported as complete.
	if err = rows.Err(); err != nil {
		errs = multierr.Append(errs, fmt.Errorf("unable to read pg_stat_replication rows: %w", err))
	}

	return rs, errs
}

func (c *postgreSQLClient) getReplicationStats(ctx context.Context) ([]replicationStats, error) {
if !preciseLagMetricsFg.IsEnabled() {
return c.getDeprecatedReplicationStats(ctx)
}

query := `SELECT
client_addr,
coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn), -1) AS replication_bytes_pending,
extract('epoch' from coalesce(write_lag, '-1 seconds')),
extract('epoch' from coalesce(flush_lag, '-1 seconds')),
extract('epoch' from coalesce(replay_lag, '-1 seconds'))
extract('epoch' from coalesce(write_lag, '-1 seconds'))::decimal AS write_lag_fractional,
extract('epoch' from coalesce(flush_lag, '-1 seconds'))::decimal AS flush_lag_fractional,
extract('epoch' from coalesce(replay_lag, '-1 seconds'))::decimal AS replay_lag_fractional
FROM pg_stat_replication;
`
rows, err := c.client.QueryContext(ctx, query)
Expand All @@ -507,7 +562,8 @@ func (c *postgreSQLClient) getReplicationStats(ctx context.Context) ([]replicati
var errors error
for rows.Next() {
var client string
var replicationBytes, writeLag, flushLag, replayLag int64
var replicationBytes int64
var writeLag, flushLag, replayLag float64
err = rows.Scan(&client, &replicationBytes, &writeLag, &flushLag, &replayLag)
if err != nil {
errors = multierr.Append(errors, err)
Expand Down
18 changes: 18 additions & 0 deletions receiver/postgresqlreceiver/documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,24 @@ The number of temp files.
| ---- | ----------- | ---------- | ----------------------- | --------- |
| {temp_file} | Sum | Int | Cumulative | true |

### postgresql.wal.delay

Time between flushing recent WAL locally and receiving notification that the standby server has completed an operation with it.

This metric requires WAL to be enabled with at least one replica.


| Unit | Metric Type | Value Type |
| ---- | ----------- | ---------- |
| s | Gauge | Double |

#### Attributes

| Name | Description | Values |
| ---- | ----------- | ------ |
| operation | The operation which is responsible for the lag. | Str: ``flush``, ``replay``, ``write`` |
| replication_client | The IP address of the client connected to this backend. If this field is "unix", it indicates that the client is connected via a Unix socket. | Any Str |

## Resource Attributes

| Name | Description | Values | Enabled |
Expand Down
5 changes: 4 additions & 1 deletion receiver/postgresqlreceiver/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.20
require (
github.com/google/go-cmp v0.6.0
github.com/lib/pq v1.10.9
github.com/open-telemetry/opentelemetry-collector-contrib/internal/common v0.88.0
github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal v0.88.0
github.com/open-telemetry/opentelemetry-collector-contrib/pkg/golden v0.0.0-00010101000000-000000000000
github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest v0.88.0
Expand All @@ -16,6 +17,7 @@ require (
go.opentelemetry.io/collector/config/configtls v0.88.1-0.20231026220224-6405e152a2d9
go.opentelemetry.io/collector/confmap v0.88.1-0.20231026220224-6405e152a2d9
go.opentelemetry.io/collector/consumer v0.88.1-0.20231026220224-6405e152a2d9
go.opentelemetry.io/collector/featuregate v1.0.0-rcv0017.0.20231026220224-6405e152a2d9
go.opentelemetry.io/collector/pdata v1.0.0-rcv0017.0.20231026220224-6405e152a2d9
go.opentelemetry.io/collector/receiver v0.88.1-0.20231026220224-6405e152a2d9
go.uber.org/multierr v1.11.0
Expand Down Expand Up @@ -75,7 +77,6 @@ require (
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/collector v0.88.1-0.20231026220224-6405e152a2d9 // indirect
go.opentelemetry.io/collector/config/configtelemetry v0.88.1-0.20231026220224-6405e152a2d9 // indirect
go.opentelemetry.io/collector/featuregate v1.0.0-rcv0017.0.20231026220224-6405e152a2d9 // indirect
go.opentelemetry.io/otel v1.19.0 // indirect
go.opentelemetry.io/otel/metric v1.19.0 // indirect
go.opentelemetry.io/otel/trace v1.19.0 // indirect
Expand All @@ -95,6 +96,8 @@ replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatatest

replace github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil => ../../pkg/pdatautil

replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/common => ../../internal/common

replace github.com/open-telemetry/opentelemetry-collector-contrib/internal/coreinternal => ../../internal/coreinternal

retract (
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ all_set:
enabled: true
postgresql.wal.age:
enabled: true
postgresql.wal.delay:
enabled: true
postgresql.wal.lag:
enabled: true
resource_attributes:
Expand Down Expand Up @@ -112,6 +114,8 @@ none_set:
enabled: false
postgresql.wal.age:
enabled: false
postgresql.wal.delay:
enabled: false
postgresql.wal.lag:
enabled: false
resource_attributes:
Expand Down
10 changes: 10 additions & 0 deletions receiver/postgresqlreceiver/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,13 @@ metrics:
value_type: int
extended_documentation: |
This metric requires WAL to be enabled with at least one replica.
postgresql.wal.delay:
attributes: [wal_operation_lag, replication_client]
description: Time between flushing recent WAL locally and receiving notification that the standby server has completed an operation with it.
enabled: false
unit: s
gauge:
value_type: double
extended_documentation: |
This metric requires WAL to be enabled with at least one replica.

Loading