Skip to content

Commit

Permalink
proxy: improve error classification (#6841)
Browse files Browse the repository at this point in the history
## Problem

## Summary of changes

1. Classify further cplane API errors
2. add 'serviceratelimit' and make a few of the timeout errors return
that.
3. a few additional minor changes
  • Loading branch information
conradludgate authored Feb 21, 2024
1 parent e7452d3 commit e0af945
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 51 deletions.
14 changes: 3 additions & 11 deletions proxy/src/bin/pg_sni_router.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,16 +171,8 @@ async fn task_main(
.context("failed to set socket option")?;

info!(%peer_addr, "serving");
let mut ctx =
RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(
&mut ctx,
dest_suffix,
tls_config,
tls_server_end_point,
socket,
)
.await
let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
Expand Down Expand Up @@ -248,7 +240,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}

async fn handle_client(
ctx: &mut RequestMonitoring,
mut ctx: RequestMonitoring,
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
Expand Down
18 changes: 17 additions & 1 deletion proxy/src/console/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,22 @@ pub mod errors {
impl ReportableError for ApiError {
fn get_error_kind(&self) -> crate::error::ErrorKind {
match self {
ApiError::Console {
status: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
..
} => crate::error::ErrorKind::User,
ApiError::Console {
status: http::StatusCode::LOCKED,
text,
} if text.contains("quota exceeded")
|| text.contains("the limit for current plan reached") =>
{
crate::error::ErrorKind::User
}
ApiError::Console {
status: http::StatusCode::TOO_MANY_REQUESTS,
..
} => crate::error::ErrorKind::ServiceRateLimit,
ApiError::Console { .. } => crate::error::ErrorKind::ControlPlane,
ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane,
}
Expand Down Expand Up @@ -222,7 +238,7 @@ pub mod errors {
match self {
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
WakeComputeError::ApiError(e) => e.get_error_kind(),
WakeComputeError::TimeoutError => crate::error::ErrorKind::RateLimit,
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
}
}
}
Expand Down
10 changes: 4 additions & 6 deletions proxy/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,13 @@ impl RequestMonitoring {
self.success = true;
}

pub fn log(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());
}
}
pub fn log(self) {}
}

impl Drop for RequestMonitoring {
fn drop(&mut self) {
self.log()
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());
}
}
}
26 changes: 5 additions & 21 deletions proxy/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,12 @@ pub enum ErrorKind {
/// Network error between user and proxy. Not necessarily user error
ClientDisconnect,

/// Proxy self-imposed rate limits
/// Proxy self-imposed user rate limits
RateLimit,

/// Proxy self-imposed service-wise rate limits
ServiceRateLimit,

/// internal errors
Service,

Expand All @@ -54,25 +57,12 @@ pub enum ErrorKind {
}

impl ErrorKind {
pub fn to_str(&self) -> &'static str {
match self {
ErrorKind::User => "request failed due to user error",
ErrorKind::ClientDisconnect => "client disconnected",
ErrorKind::RateLimit => "request cancelled due to rate limit",
ErrorKind::Service => "internal service error",
ErrorKind::ControlPlane => "non-retryable control plane error",
ErrorKind::Postgres => "postgres error",
ErrorKind::Compute => {
"non-retryable compute connection error (or exhausted retry capacity)"
}
}
}

pub fn to_metric_label(&self) -> &'static str {
match self {
ErrorKind::User => "user",
ErrorKind::ClientDisconnect => "clientdisconnect",
ErrorKind::RateLimit => "ratelimit",
ErrorKind::ServiceRateLimit => "serviceratelimit",
ErrorKind::Service => "service",
ErrorKind::ControlPlane => "controlplane",
ErrorKind::Postgres => "postgres",
Expand All @@ -85,12 +75,6 @@ pub trait ReportableError: fmt::Display + Send + 'static {
fn get_error_kind(&self) -> ErrorKind;
}

impl ReportableError for tokio::time::error::Elapsed {
fn get_error_kind(&self) -> ErrorKind {
ErrorKind::RateLimit
}
}

impl ReportableError for tokio_postgres::error::Error {
fn get_error_kind(&self) -> ErrorKind {
if self.as_db_error().is_some() {
Expand Down
19 changes: 7 additions & 12 deletions proxy/src/serverless/sql_over_http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use hyper::StatusCode;
use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Value;
use tokio::join;
use tokio::try_join;
use tokio_postgres::error::DbError;
use tokio_postgres::error::ErrorPosition;
use tokio_postgres::GenericClient;
Expand All @@ -32,11 +32,9 @@ use crate::auth::ComputeUserInfoParseError;
use crate::config::ProxyConfig;
use crate::config::TlsConfig;
use crate::context::RequestMonitoring;
use crate::error::ReportableError;
use crate::metrics::HTTP_CONTENT_LENGTH;
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
use crate::proxy::NeonOptions;
use crate::serverless::backend::HttpConnError;
use crate::DbName;
use crate::RoleName;

Expand Down Expand Up @@ -287,8 +285,10 @@ pub async fn handle(
)?
}
},
Err(e) => {
ctx.set_error_kind(e.get_error_kind());
Err(_) => {
// TODO: when http error classification is done, distinguish between
// timeout on sql vs timeout in proxy/cplane
// ctx.set_error_kind(crate::error::ErrorKind::RateLimit);

let message = format!(
"HTTP-Connection timed out, execution time exeeded {} seconds",
Expand Down Expand Up @@ -402,16 +402,11 @@ async fn handle_inner(
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
ctx.latency_timer.success();
Ok::<_, HttpConnError>(client)
Ok::<_, anyhow::Error>(client)
};

// Run both operations in parallel
let (payload_result, auth_and_connect_result) =
join!(fetch_and_process_request, authenticate_and_connect,);

// Handle the results
let payload = payload_result?; // Handle errors appropriately
let mut client = auth_and_connect_result?; // Handle errors appropriately
let (payload, mut client) = try_join!(fetch_and_process_request, authenticate_and_connect)?;

let mut response = Response::builder()
.status(StatusCode::OK)
Expand Down

1 comment on commit e0af945

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2529 tests run: 2399 passed, 0 failed, 130 skipped (full report)


Flaky tests (1)

Postgres 16

  • test_pg_regress[None]: debug

Code coverage* (full report)

  • functions: 28.8% (6753 of 23411 functions)
  • lines: 47.7% (41045 of 86134 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
e0af945 at 2024-02-21T11:24:58.581Z :recycle:

Please sign in to comment.