-
Notifications
You must be signed in to change notification settings - Fork 50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: port collision on large amount of tests #257
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,57 +1,101 @@ | ||
use crate::error::SandboxErrorCode; | ||
use crate::network::Sandbox; | ||
use std::fs::File; | ||
|
||
use crate::error::{ErrorKind, SandboxErrorCode}; | ||
use crate::result::Result; | ||
|
||
use async_process::Child; | ||
use fs2::FileExt; | ||
use portpicker::pick_unused_port; | ||
use tempfile::TempDir; | ||
use tracing::info; | ||
|
||
use near_sandbox_utils as sandbox; | ||
|
||
/// Acquire an unused port and lock it for the duration until the sandbox server has | ||
/// been started. | ||
fn acquire_unused_port() -> Result<(u16, File)> { | ||
loop { | ||
let port = pick_unused_port() | ||
.ok_or_else(|| SandboxErrorCode::InitFailure.message("no ports free"))?; | ||
let lockpath = std::env::temp_dir().join(format!("near-sandbox-port{}.lock", port)); | ||
let lockfile = File::create(lockpath).map_err(|err| { | ||
ErrorKind::Io.full(format!("failed to create lockfile for port {}", port), err) | ||
})?; | ||
if lockfile.try_lock_exclusive().is_ok() { | ||
Comment on lines
+21
to
+24
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Curious what I assume you've probably tested this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So lock files are purely advisory about a resource being taken up. They don't actually lock the file from being written to, so truncation wouldn't error out There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Gotcha! |
||
break Ok((port, lockfile)); | ||
} | ||
} | ||
} | ||
|
||
async fn init_home_dir() -> Result<TempDir> { | ||
let home_dir = tempfile::tempdir().map_err(|e| ErrorKind::Io.custom(e))?; | ||
let output = sandbox::init(&home_dir) | ||
.map_err(|e| SandboxErrorCode::InitFailure.custom(e))? | ||
.output() | ||
.await | ||
.map_err(|e| SandboxErrorCode::InitFailure.custom(e))?; | ||
info!(target: "workspaces", "sandbox init: {:?}", output); | ||
|
||
Ok(home_dir) | ||
} | ||
|
||
pub struct SandboxServer { | ||
pub(crate) rpc_port: u16, | ||
pub(crate) net_port: u16, | ||
pub(crate) home_dir: TempDir, | ||
|
||
rpc_port_lock: Option<File>, | ||
net_port_lock: Option<File>, | ||
process: Option<Child>, | ||
} | ||
|
||
impl SandboxServer { | ||
pub fn new(rpc_port: u16, net_port: u16) -> Self { | ||
Self { | ||
rpc_port, | ||
net_port, | ||
process: None, | ||
} | ||
} | ||
|
||
pub async fn start(&mut self) -> Result<()> { | ||
if self.process.is_some() { | ||
return Err(SandboxErrorCode::AlreadyStarted.into()); | ||
} | ||
|
||
info!(target: "workspaces", "Starting up sandbox at localhost:{}", self.rpc_port); | ||
let home_dir = Sandbox::home_dir(self.rpc_port); | ||
|
||
pub(crate) async fn run_new() -> Result<Self> { | ||
// Supress logs for the sandbox binary by default: | ||
supress_sandbox_logs_if_required(); | ||
|
||
// Remove dir if it already exists: | ||
let _ = std::fs::remove_dir_all(&home_dir); | ||
let output = sandbox::init(&home_dir) | ||
.map_err(|e| SandboxErrorCode::InitFailure.custom(e))? | ||
.output() | ||
.await | ||
.map_err(|e| SandboxErrorCode::InitFailure.custom(e))?; | ||
info!(target: "workspaces", "sandbox init: {:?}", output); | ||
// Try running the server with the follow provided rpc_ports and net_ports | ||
let (rpc_port, rpc_port_lock) = acquire_unused_port()?; | ||
let (net_port, net_port_lock) = acquire_unused_port()?; | ||
let home_dir = init_home_dir().await?; | ||
|
||
// Configure `$home_dir/config.json` to our liking. Sandbox requires extra settings | ||
// for the best user experience, and being able to offer patching large state payloads. | ||
crate::network::config::set_sandbox_configs(&home_dir)?; | ||
|
||
let child = sandbox::run(&home_dir, self.rpc_port, self.net_port) | ||
let child = sandbox::run(&home_dir, rpc_port, net_port) | ||
.map_err(|e| SandboxErrorCode::RunFailure.custom(e))?; | ||
|
||
info!(target: "workspaces", "Started sandbox: pid={:?}", child.id()); | ||
self.process = Some(child); | ||
info!(target: "workspaces", "Started up sandbox at localhost:{} with pid={:?}", rpc_port, child.id()); | ||
|
||
Ok(Self { | ||
rpc_port, | ||
net_port, | ||
home_dir, | ||
rpc_port_lock: Some(rpc_port_lock), | ||
net_port_lock: Some(net_port_lock), | ||
process: Some(child), | ||
}) | ||
} | ||
|
||
/// Unlock port lockfiles that were used to avoid port contention when starting up | ||
/// the sandbox node. | ||
pub(crate) fn unlock_lockfiles(&mut self) -> Result<()> { | ||
if let Some(rpc_port_lock) = self.rpc_port_lock.take() { | ||
rpc_port_lock.unlock().map_err(|e| { | ||
ErrorKind::Io.full( | ||
format!("failed to unlock lockfile for rpc_port={}", self.rpc_port), | ||
e, | ||
) | ||
})?; | ||
} | ||
if let Some(net_port_lock) = self.net_port_lock.take() { | ||
net_port_lock.unlock().map_err(|e| { | ||
ErrorKind::Io.full( | ||
format!("failed to unlock lockfile for net_port={}", self.net_port), | ||
e, | ||
) | ||
})?; | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
@@ -61,14 +105,6 @@ impl SandboxServer { | |
} | ||
} | ||
|
||
impl Default for SandboxServer { | ||
fn default() -> Self { | ||
let rpc_port = pick_unused_port().expect("no ports free"); | ||
let net_port = pick_unused_port().expect("no ports free"); | ||
Self::new(rpc_port, net_port) | ||
} | ||
} | ||
|
||
impl Drop for SandboxServer { | ||
fn drop(&mut self) { | ||
if self.process.is_none() { | ||
|
@@ -88,6 +124,9 @@ impl Drop for SandboxServer { | |
.kill() | ||
.map_err(|e| format!("Could not cleanup sandbox due to: {:?}", e)) | ||
.unwrap(); | ||
|
||
// Unlock the ports just in case they have not been preemptively done. | ||
self.unlock_lockfiles().unwrap(); | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was going to suggest the same thing as @DavidM-D, but it seems like this is the key difference: a potential race condition where two nodes see that a port is “free” at the same time, then proceed to simultaneously attempt to bind to it. And since there's no global node executor that controls the issuing of ports to nodes, there's no way to gate access to ports if workspaces instances can't communicate with one another — which is probably why the common denominator used here is the filesystem.