Skip to content

Commit

Permalink
feat(gateway): get stats from cgroup file directly (#1464)
Browse files Browse the repository at this point in the history
We do this because getting it through docker can take up to 1 second
per project (container) which makes the ambulance task unacceptably slow.

---------

Co-authored-by: chesedo <pieter@chesedo.me>
  • Loading branch information
oddgrd and chesedo authored Dec 7, 2023
1 parent cc1bff0 commit 564ea0b
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 26 deletions.
6 changes: 3 additions & 3 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -823,9 +823,9 @@ workflows:
jobs:
- approve-push-unstable:
type: approval
filters:
branches:
only: main
# filters:
# branches:
# only: main
- build-and-push:
name: build-and-push-unstable
aws-access-key-id: DEV_AWS_ACCESS_KEY_ID
Expand Down
5 changes: 5 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ services:
# This image needs to run highly privileged in order to
# orchestrate user runtimes safely
- ${DOCKER_SOCK}:/var/run/docker.sock
# We need to mount this directory to be able to read docker
# stats from it. We mount it as read-only.
# TODO: this directory will vary depending on the host OS,
# we should make it more dynamic.
- /sys/fs/cgroup:/sys/fs/cgroup:ro
environment:
- RUST_LOG=${RUST_LOG}
- SHUTTLE_ENV=${SHUTTLE_ENV}
Expand Down
2 changes: 2 additions & 0 deletions gateway/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ pub mod task;
pub mod tls;
pub mod worker;

pub const DOCKER_STATS_PATH: &str = "/sys/fs/cgroup/cpuacct/docker";

static AUTH_CLIENT: Lazy<Client<HttpConnector>> = Lazy::new(Client::new);

/// Server-side errors that do not have to do with the user runtime
Expand Down
15 changes: 15 additions & 0 deletions gateway/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use shuttle_gateway::proxy::UserServiceBuilder;
use shuttle_gateway::service::{GatewayService, MIGRATIONS};
use shuttle_gateway::tls::make_tls_acceptor;
use shuttle_gateway::worker::{Worker, WORKER_QUEUE_SIZE};
use shuttle_gateway::DOCKER_STATS_PATH;
use sqlx::migrate::MigrateDatabase;
use sqlx::sqlite::{SqliteConnectOptions, SqliteJournalMode, SqliteSynchronous};
use sqlx::{Sqlite, SqlitePool};
Expand All @@ -37,6 +38,20 @@ async fn main() -> io::Result<()> {
Sqlite::create_database(db_uri).await.unwrap();
}

let docker_stats_path =
PathBuf::from_str(DOCKER_STATS_PATH).expect("to parse docker stats path");

// Return an error early if the docker stats path is not in the expected location.
if !docker_stats_path.exists() {
return Err(std::io::Error::new(
io::ErrorKind::NotFound,
format!(
"could not find docker stats at path: {:?}",
DOCKER_STATS_PATH
),
));
}

info!(
"state db: {}",
std::fs::canonicalize(&args.state)
Expand Down
72 changes: 49 additions & 23 deletions gateway/src/project.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ use std::net::{IpAddr, SocketAddr};
use std::time::Duration;

use bollard::container::{
Config, CreateContainerOptions, KillContainerOptions, RemoveContainerOptions, Stats,
StatsOptions, StopContainerOptions,
Config, CreateContainerOptions, KillContainerOptions, RemoveContainerOptions,
StopContainerOptions,
};
use bollard::errors::Error as DockerError;
use bollard::models::{ContainerInspectResponse, ContainerStateStatusEnum};
Expand Down Expand Up @@ -34,6 +34,7 @@ use ulid::Ulid;
use uuid::Uuid;

use crate::service::ContainerSettings;
use crate::DOCKER_STATS_PATH;
use crate::{DockerContext, Error, ErrorKind, IntoTryState, Refresh, State, TryState};

macro_rules! safe_unwrap {
Expand Down Expand Up @@ -1228,14 +1229,14 @@ pub struct ProjectStarted {
start_count: usize,
// Use default for backward compatibility. Can be removed when all projects in the DB have this property set
#[serde(default, deserialize_with = "ok_or_default")]
stats: VecDeque<Stats>,
stats: VecDeque<u64>,
}

impl ProjectStarted {
pub fn new(
container: ContainerInspectResponse,
start_count: usize,
stats: VecDeque<Stats>,
stats: VecDeque<u64>,
) -> Self {
Self {
container,
Expand Down Expand Up @@ -1309,14 +1310,14 @@ pub struct ProjectReady {
service: Service,
// Use default for backward compatibility. Can be removed when all projects in the DB have this property set
#[serde(default, deserialize_with = "ok_or_default")]
stats: VecDeque<Stats>,
stats: VecDeque<u64>,
}

impl ProjectReady {
pub fn new(
container: ContainerInspectResponse,
service: Service,
stats: VecDeque<Stats>,
stats: VecDeque<u64>,
) -> Self {
Self {
container,
Expand All @@ -1326,6 +1327,43 @@ impl ProjectReady {
}
}

#[instrument(name = "getting container stats from the cgroup file", skip_all)]
async fn get_container_stats(container: &ContainerInspectResponse) -> Result<u64, ProjectError> {
let id = safe_unwrap!(container.id);

let cpu_usage: u64 = tokio::fs::read_to_string(format!("{DOCKER_STATS_PATH}/{id}/cpuacct.usage"))
.map_err(|e| {
error!(error = %e, shuttle.container.id = id, "failed to read docker stats file for container");
ProjectError::internal("failed to read docker stats file for container")
}).await?
.trim()
.parse()
.map_err(|e| {
error!(error = %e, shuttle.container.id = id, "failed to parse cpu usage stat");

ProjectError::internal("failed to parse cpu usage to u64")
})?;

// TODO: the above solution only works for cgroup v1
// This is the version used by our server. However on my local nix setup I have cgroup v2 and had to use the
// following to get the 'usage_usec' which is on the first line
// let usage: u64 = std::fs::read_to_string(format!(
// "/sys/fs/cgroup/system.slice/docker-{id}.scope/cpu.stat"
// ))
// .unwrap_or_default()
// .lines()
// .next()
// .unwrap()
// .split(' ')
// .nth(1)
// .unwrap_or_default()
// .parse::<u64>()
// .unwrap_or_default()
// * 1_000;

Ok(cpu_usage)
}

#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum ProjectRunning {
Ready(ProjectReady),
Expand All @@ -1341,7 +1379,7 @@ where
type Error = ProjectError;

#[instrument(name = "check if container is still healthy", skip_all)]
async fn next(mut self, ctx: &Ctx) -> Result<Self::Next, Self::Error> {
async fn next(mut self, _ctx: &Ctx) -> Result<Self::Next, Self::Error> {
let Self {
container,
mut service,
Expand All @@ -1364,20 +1402,9 @@ where
}));
}

let new_stat = ctx
.docker()
.stats(
safe_unwrap!(container.id),
Some(StatsOptions {
one_shot: true,
stream: false,
}),
)
.next()
.await
.unwrap()?;
let new_stat = get_container_stats(&container).await?;

stats.push_back(new_stat.clone());
stats.push_back(new_stat);

let mut last = None;

Expand All @@ -1393,9 +1420,7 @@ where
}));
};

let cpu_per_minute = (new_stat.cpu_stats.cpu_usage.total_usage
- last.cpu_stats.cpu_usage.total_usage)
/ idle_minutes;
let cpu_per_minute = (new_stat - last) / idle_minutes;

debug!(
shuttle.container.id = container.id,
Expand Down Expand Up @@ -1496,6 +1521,7 @@ impl Service {
.map_err(|err| err.into())
}

#[instrument(name = "calling status endpoint on container", skip_all, fields(project_name = %self.name))]
pub async fn is_healthy(&mut self) -> bool {
let uri = self.uri(format!("/projects/{}/status", self.name)).unwrap();
let resp = timeout(IS_HEALTHY_TIMEOUT, CLIENT.get(uri)).await;
Expand Down

0 comments on commit 564ea0b

Please sign in to comment.