Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make replication stats return whole number #28824

Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/replication-stats-conversion-failure.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: deprecation

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: postgresqlreceiver

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: "Deprecate the replication lag metric `postgresql.wal.lag` in favor of the more precise `postgresql.wal.delay`"

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [26714]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: [user]
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ func stopSampling(_ context.Context) error {
// no load scraper is running nothing to do
return nil
}

djaglowski marked this conversation as resolved.
Show resolved Hide resolved
// only stop sampling if all load scrapers have been closed
scraperCount--
if scraperCount > 0 {
Expand Down
70 changes: 63 additions & 7 deletions receiver/postgresqlreceiver/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,19 @@ import (
"github.com/lib/pq"
"go.opentelemetry.io/collector/config/confignet"
"go.opentelemetry.io/collector/config/configtls"
"go.opentelemetry.io/collector/featuregate"
"go.uber.org/multierr"
)

// lagMetricsInSecondsFeatureGateID is the identifier under which the
// precise-lag feature gate is registered.
const lagMetricsInSecondsFeatureGateID = "postgresqlreceiver.preciselagmetrics"

// preciseLagMetricsFg is the feature gate that switches replication lag
// reporting from the integer `postgresql.wal.lag` metric to the
// floating-point `postgresql.wal.delay` metric. It is registered in the
// global registry at the alpha stage; MustRegister panics if registration
// fails.
var preciseLagMetricsFg = featuregate.GlobalRegistry().MustRegister(
lagMetricsInSecondsFeatureGateID,
featuregate.StageAlpha,
featuregate.WithRegisterDescription("Metric `postgresql.wal.lag` is replaced by more precise `postgresql.wal.delay`."),
featuregate.WithRegisterFromVersion("0.89.0"),
)

// databaseName is a name that refers to a database so that it can be uniquely referred to later
// i.e. database1
type databaseName string
Expand Down Expand Up @@ -484,18 +494,63 @@ func (c *postgreSQLClient) getMaxConnections(ctx context.Context) (int64, error)
type replicationStats struct {
clientAddr string
pendingBytes int64
flushLag int64
replayLag int64
writeLag int64
flushLagInt int64 // Deprecated
replayLagInt int64 // Deprecated
writeLagInt int64 // Deprecated
flushLag float64
replayLag float64
writeLag float64
}

// getDeprecatedReplicationStats queries pg_stat_replication and returns one
// replicationStats entry per row, populating only clientAddr, pendingBytes and
// the deprecated integer lag fields (writeLagInt, flushLagInt, replayLagInt).
// The lag values are cast to integer seconds by the query itself.
//
// NULL pending-byte and lag values are coalesced to -1 in SQL so that callers
// can tell "not available" apart from a real measurement.
//
// A row that fails to scan is skipped and its error is accumulated; rows that
// scanned successfully are still returned together with the combined error.
// An error is also returned if the query cannot be executed or if iteration
// over the result set ends abnormally.
func (c *postgreSQLClient) getDeprecatedReplicationStats(ctx context.Context) ([]replicationStats, error) {
	query := `SELECT
client_addr,
coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn), -1) AS replication_bytes_pending,
extract('epoch' from coalesce(write_lag, '-1 seconds'))::integer,
extract('epoch' from coalesce(flush_lag, '-1 seconds'))::integer,
extract('epoch' from coalesce(replay_lag, '-1 seconds'))::integer
FROM pg_stat_replication;
`
	rows, err := c.client.QueryContext(ctx, query)
	if err != nil {
		return nil, fmt.Errorf("unable to query pg_stat_replication: %w", err)
	}
	defer rows.Close()

	var rs []replicationStats
	// Named errs (not errors) so the standard library package name is not shadowed.
	var errs error
	for rows.Next() {
		var client string
		var replicationBytes int64
		var writeLagInt, flushLagInt, replayLagInt int64
		err = rows.Scan(&client, &replicationBytes,
			&writeLagInt, &flushLagInt, &replayLagInt)
		if err != nil {
			// Keep going: one bad row should not discard the others.
			errs = multierr.Append(errs, err)
			continue
		}
		rs = append(rs, replicationStats{
			clientAddr:   client,
			pendingBytes: replicationBytes,
			replayLagInt: replayLagInt,
			writeLagInt:  writeLagInt,
			flushLagInt:  flushLagInt,
		})
	}

	// rows.Next returns false both at the end of the result set and on failure;
	// without this check an interrupted iteration would look like a short but
	// successful result.
	if err := rows.Err(); err != nil {
		errs = multierr.Append(errs, fmt.Errorf("iterating pg_stat_replication rows: %w", err))
	}

	return rs, errs
}

func (c *postgreSQLClient) getReplicationStats(ctx context.Context) ([]replicationStats, error) {
if !preciseLagMetricsFg.IsEnabled() {
return c.getDeprecatedReplicationStats(ctx)
}

query := `SELECT
client_addr,
coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn), -1) AS replication_bytes_pending,
extract('epoch' from coalesce(write_lag, '-1 seconds')),
extract('epoch' from coalesce(flush_lag, '-1 seconds')),
extract('epoch' from coalesce(replay_lag, '-1 seconds'))
extract('epoch' from coalesce(write_lag, '-1 seconds'))::decimal AS write_lag_fractional,
extract('epoch' from coalesce(flush_lag, '-1 seconds'))::decimal AS flush_lag_fractional,
extract('epoch' from coalesce(replay_lag, '-1 seconds'))::decimal AS replay_lag_fractional
FROM pg_stat_replication;
`
rows, err := c.client.QueryContext(ctx, query)
Expand All @@ -507,7 +562,8 @@ func (c *postgreSQLClient) getReplicationStats(ctx context.Context) ([]replicati
var errors error
for rows.Next() {
var client string
var replicationBytes, writeLag, flushLag, replayLag int64
var replicationBytes int64
var writeLag, flushLag, replayLag float64
err = rows.Scan(&client, &replicationBytes, &writeLag, &flushLag, &replayLag)
if err != nil {
errors = multierr.Append(errors, err)
Expand Down
18 changes: 18 additions & 0 deletions receiver/postgresqlreceiver/documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,24 @@ The number of temp files.
| ---- | ----------- | ---------- | ----------------------- | --------- |
| {temp_file} | Sum | Int | Cumulative | true |

### postgresql.wal.delay

Time between flushing recent WAL locally and receiving notification that the standby server has completed an operation with it, in seconds with sub-second precision.

This metric requires WAL to be enabled with at least one replica.


| Unit | Metric Type | Value Type |
| ---- | ----------- | ---------- |
| s | Gauge | Double |

#### Attributes

| Name | Description | Values |
| ---- | ----------- | ------ |
| operation | The operation which is responsible for the lag. | Str: ``flush``, ``replay``, ``write`` |
| replication_client | The IP address of the client connected to this backend. If this field is "unix", it indicates that the client is connected via a Unix socket. | Any Str |

## Resource Attributes

| Name | Description | Values | Enabled |
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ all_set:
enabled: true
postgresql.wal.age:
enabled: true
postgresql.wal.delay:
enabled: true
postgresql.wal.lag:
enabled: true
resource_attributes:
Expand Down Expand Up @@ -112,6 +114,8 @@ none_set:
enabled: false
postgresql.wal.age:
enabled: false
postgresql.wal.delay:
enabled: false
postgresql.wal.lag:
enabled: false
resource_attributes:
Expand Down
10 changes: 10 additions & 0 deletions receiver/postgresqlreceiver/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,13 @@ metrics:
value_type: int
extended_documentation: |
This metric requires WAL to be enabled with at least one replica.
postgresql.wal.delay:
attributes: [wal_operation_lag, replication_client]
description: Time between flushing recent WAL locally and receiving notification that the standby server has completed an operation with it, in seconds with sub-second precision.
michalpristas marked this conversation as resolved.
Show resolved Hide resolved
enabled: false
unit: s
gauge:
value_type: double
extended_documentation: |
This metric requires WAL to be enabled with at least one replica.

28 changes: 20 additions & 8 deletions receiver/postgresqlreceiver/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,14 +313,26 @@ func (p *postgreSQLScraper) collectReplicationStats(
if rs.pendingBytes >= 0 {
p.mb.RecordPostgresqlReplicationDataDelayDataPoint(now, rs.pendingBytes, rs.clientAddr)
}
if rs.writeLag >= 0 {
p.mb.RecordPostgresqlWalLagDataPoint(now, rs.writeLag, metadata.AttributeWalOperationLagWrite, rs.clientAddr)
}
if rs.replayLag >= 0 {
p.mb.RecordPostgresqlWalLagDataPoint(now, rs.replayLag, metadata.AttributeWalOperationLagReplay, rs.clientAddr)
}
if rs.flushLag >= 0 {
p.mb.RecordPostgresqlWalLagDataPoint(now, rs.flushLag, metadata.AttributeWalOperationLagFlush, rs.clientAddr)
if preciseLagMetricsFg.IsEnabled() {
if rs.writeLag >= 0 {
p.mb.RecordPostgresqlWalDelayDataPoint(now, rs.writeLag, metadata.AttributeWalOperationLagWrite, rs.clientAddr)
}
if rs.replayLag >= 0 {
p.mb.RecordPostgresqlWalDelayDataPoint(now, rs.replayLag, metadata.AttributeWalOperationLagReplay, rs.clientAddr)
}
if rs.flushLag >= 0 {
p.mb.RecordPostgresqlWalDelayDataPoint(now, rs.flushLag, metadata.AttributeWalOperationLagFlush, rs.clientAddr)
}
} else {
if rs.writeLagInt >= 0 {
p.mb.RecordPostgresqlWalLagDataPoint(now, rs.writeLagInt, metadata.AttributeWalOperationLagWrite, rs.clientAddr)
}
if rs.replayLagInt >= 0 {
p.mb.RecordPostgresqlWalLagDataPoint(now, rs.replayLagInt, metadata.AttributeWalOperationLagReplay, rs.clientAddr)
}
if rs.flushLagInt >= 0 {
p.mb.RecordPostgresqlWalLagDataPoint(now, rs.flushLagInt, metadata.AttributeWalOperationLagFlush, rs.clientAddr)
}
}
}
}
Expand Down
Loading
Loading