diff --git a/.circleci/config.yml b/.circleci/config.yml
index 40542b77a..24f9241ef 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -823,9 +823,9 @@ workflows:
     jobs:
       - approve-push-unstable:
           type: approval
-          filters:
-            branches:
-              only: main
+          # filters:
+          #   branches:
+          #     only: main
       - build-and-push:
           name: build-and-push-unstable
           aws-access-key-id: DEV_AWS_ACCESS_KEY_ID
diff --git a/docker-compose.yml b/docker-compose.yml
index e4f0708d9..bc2a54d30 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -126,6 +126,11 @@ services:
       # This image needs to run highly privileged in order to
      # orchestrate user runtimes safely
       - ${DOCKER_SOCK}:/var/run/docker.sock
+      # We need to mount this directory to be able to read docker
+      # stats from it. We mount it as read-only.
+      # TODO: this directory will vary depending on the host OS,
+      # we should make it more dynamic.
+      - /sys/fs/cgroup:/sys/fs/cgroup:ro
     environment:
       - RUST_LOG=${RUST_LOG}
       - SHUTTLE_ENV=${SHUTTLE_ENV}
diff --git a/gateway/src/lib.rs b/gateway/src/lib.rs
index 493b3b1a3..8dea1d92b 100644
--- a/gateway/src/lib.rs
+++ b/gateway/src/lib.rs
@@ -35,6 +35,8 @@ pub mod task;
 pub mod tls;
 pub mod worker;
 
+pub const DOCKER_STATS_PATH: &str = "/sys/fs/cgroup/cpuacct/docker";
+
 static AUTH_CLIENT: Lazy<Client<HttpConnector>> = Lazy::new(Client::new);
 
 /// Server-side errors that do not have to do with the user runtime
diff --git a/gateway/src/main.rs b/gateway/src/main.rs
index 89a7e7bb0..0ebe666b0 100644
--- a/gateway/src/main.rs
+++ b/gateway/src/main.rs
@@ -11,6 +11,7 @@ use shuttle_gateway::proxy::UserServiceBuilder;
 use shuttle_gateway::service::{GatewayService, MIGRATIONS};
 use shuttle_gateway::tls::make_tls_acceptor;
 use shuttle_gateway::worker::{Worker, WORKER_QUEUE_SIZE};
+use shuttle_gateway::DOCKER_STATS_PATH;
 use sqlx::migrate::MigrateDatabase;
 use sqlx::sqlite::{SqliteConnectOptions, SqliteJournalMode, SqliteSynchronous};
 use sqlx::{Sqlite, SqlitePool};
@@ -37,6 +38,20 @@ async fn main() -> io::Result<()> {
         Sqlite::create_database(db_uri).await.unwrap();
     }
 
+    let docker_stats_path =
+        PathBuf::from_str(DOCKER_STATS_PATH).expect("to parse docker stats path");
+
+    // Return an error early if the docker stats path is not in the expected location.
+    if !docker_stats_path.exists() {
+        return Err(std::io::Error::new(
+            io::ErrorKind::NotFound,
+            format!(
+                "could not find docker stats at path: {:?}",
+                DOCKER_STATS_PATH
+            ),
+        ));
+    }
+
     info!(
         "state db: {}",
         std::fs::canonicalize(&args.state)
diff --git a/gateway/src/project.rs b/gateway/src/project.rs
index 9e0d2349d..8f1f8acc2 100644
--- a/gateway/src/project.rs
+++ b/gateway/src/project.rs
@@ -4,8 +4,8 @@ use std::net::{IpAddr, SocketAddr};
 use std::time::Duration;
 
 use bollard::container::{
-    Config, CreateContainerOptions, KillContainerOptions, RemoveContainerOptions, Stats,
-    StatsOptions, StopContainerOptions,
+    Config, CreateContainerOptions, KillContainerOptions, RemoveContainerOptions,
+    StopContainerOptions,
 };
 use bollard::errors::Error as DockerError;
 use bollard::models::{ContainerInspectResponse, ContainerStateStatusEnum};
@@ -34,6 +34,7 @@ use ulid::Ulid;
 use uuid::Uuid;
 
 use crate::service::ContainerSettings;
+use crate::DOCKER_STATS_PATH;
 use crate::{DockerContext, Error, ErrorKind, IntoTryState, Refresh, State, TryState};
 
 macro_rules! safe_unwrap {
@@ -1228,14 +1229,14 @@ pub struct ProjectStarted {
     start_count: usize,
     // Use default for backward compatibility. Can be removed when all projects in the DB have this property set
     #[serde(default, deserialize_with = "ok_or_default")]
-    stats: VecDeque<Stats>,
+    stats: VecDeque<u64>,
 }
 
 impl ProjectStarted {
     pub fn new(
         container: ContainerInspectResponse,
         start_count: usize,
-        stats: VecDeque<Stats>,
+        stats: VecDeque<u64>,
     ) -> Self {
         Self {
             container,
@@ -1309,14 +1310,14 @@ pub struct ProjectReady {
     service: Service,
     // Use default for backward compatibility. Can be removed when all projects in the DB have this property set
     #[serde(default, deserialize_with = "ok_or_default")]
-    stats: VecDeque<Stats>,
+    stats: VecDeque<u64>,
 }
 
 impl ProjectReady {
     pub fn new(
         container: ContainerInspectResponse,
         service: Service,
-        stats: VecDeque<Stats>,
+        stats: VecDeque<u64>,
     ) -> Self {
         Self {
             container,
@@ -1326,6 +1327,43 @@ impl ProjectReady {
     }
 }
 
+#[instrument(name = "getting container stats from the cgroup file", skip_all)]
+async fn get_container_stats(container: &ContainerInspectResponse) -> Result<u64, ProjectError> {
+    let id = safe_unwrap!(container.id);
+
+    let cpu_usage: u64 = tokio::fs::read_to_string(format!("{DOCKER_STATS_PATH}/{id}/cpuacct.usage"))
+        .map_err(|e| {
+            error!(error = %e, shuttle.container.id = id, "failed to read docker stats file for container");
+            ProjectError::internal("failed to read docker stats file for container")
+        }).await?
+        .trim()
+        .parse()
+        .map_err(|e| {
+            error!(error = %e, shuttle.container.id = id, "failed to parse cpu usage stat");
+
+            ProjectError::internal("failed to parse cpu usage to u64")
+        })?;
+
+    // TODO: the above solution only works for cgroup v1
+    // This is the version used by our server. However on my local nix setup I have cgroup v2 and had to use the
+    // following to get the 'usage_usec' which is on the first line
+    // let usage: u64 = std::fs::read_to_string(format!(
+    //     "/sys/fs/cgroup/system.slice/docker-{id}.scope/cpu.stat"
+    // ))
+    // .unwrap_or_default()
+    // .lines()
+    // .next()
+    // .unwrap()
+    // .split(' ')
+    // .nth(1)
+    // .unwrap_or_default()
+    // .parse::<u64>()
+    // .unwrap_or_default()
+    // * 1_000;
+
+    Ok(cpu_usage)
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum ProjectRunning {
     Ready(ProjectReady),
@@ -1341,7 +1379,7 @@ where
     type Error = ProjectError;
 
     #[instrument(name = "check if container is still healthy", skip_all)]
-    async fn next(mut self, ctx: &Ctx) -> Result<Self::Next, Self::Error> {
+    async fn next(mut self, _ctx: &Ctx) -> Result<Self::Next, Self::Error> {
         let Self {
             container,
             mut service,
@@ -1364,20 +1402,9 @@ where
             }));
         }
 
-        let new_stat = ctx
-            .docker()
-            .stats(
-                safe_unwrap!(container.id),
-                Some(StatsOptions {
-                    one_shot: true,
-                    stream: false,
-                }),
-            )
-            .next()
-            .await
-            .unwrap()?;
+        let new_stat = get_container_stats(&container).await?;
 
-        stats.push_back(new_stat.clone());
+        stats.push_back(new_stat);
 
         let mut last = None;
 
@@ -1393,9 +1420,7 @@ where
             }));
         };
 
-        let cpu_per_minute = (new_stat.cpu_stats.cpu_usage.total_usage
-            - last.cpu_stats.cpu_usage.total_usage)
-            / idle_minutes;
+        let cpu_per_minute = (new_stat - last) / idle_minutes;
 
         debug!(
             shuttle.container.id = container.id,
@@ -1496,6 +1521,7 @@ impl Service {
             .map_err(|err| err.into())
     }
 
+    #[instrument(name = "calling status endpoint on container", skip_all, fields(project_name = %self.name))]
     pub async fn is_healthy(&mut self) -> bool {
         let uri = self.uri(format!("/projects/{}/status", self.name)).unwrap();
         let resp = timeout(IS_HEALTHY_TIMEOUT, CLIENT.get(uri)).await;