diff --git a/src/executor/wall_time/executor.rs b/src/executor/wall_time/executor.rs index fc956e6d..d2a77a0f 100644 --- a/src/executor/wall_time/executor.rs +++ b/src/executor/wall_time/executor.rs @@ -1,5 +1,5 @@ use super::helpers::validate_walltime_results; -use super::isolation::{requires_isolation, wrap_isolation_scope}; +use super::isolation::resolve_isolation_mode; use super::profiler::Profiler; use super::profiler::perf::PerfProfiler; use super::profiler::samply::SamplyProfiler; @@ -181,14 +181,11 @@ impl Executor for WallTimeExecutor { let (_env_file, _script_file, cmd_builder) = WallTimeExecutor::walltime_bench_cmd(&execution_context.config, execution_context)?; - // Resolve the isolation decision once and reuse it for both the scope + // Resolve the isolation decision once and reuse it for both the leaf // wrapping here and the privilege wrapping below (or in the profiler). - let isolate = requires_isolation(); - let cmd_builder = if isolate { - wrap_isolation_scope(cmd_builder)? - } else { - cmd_builder - }; + let isolation_mode = resolve_isolation_mode(); + let cmd_builder = isolation_mode.wrap_bench(cmd_builder)?; + let requires_sudo = isolation_mode.requires_sudo(); // Split-borrow `self` so the closure inside `run_with_profiler` can // capture `benchmark_state` while we hold `&mut profiler`. @@ -204,16 +201,13 @@ impl Executor for WallTimeExecutor { cmd_builder, &execution_context.config, &execution_context.profile_folder, - isolate, + requires_sudo, benchmark_state, ) .await } _ => { - // No profiler: still honor the isolation decision, since the - // scope (and the sudo it requires) is a property of the run, not - // of the profiler. - let cmd_builder = if isolate { + let cmd_builder = if requires_sudo { wrap_with_sudo(cmd_builder)? } else { cmd_builder @@ -262,11 +256,11 @@ async fn run_with_profiler( cmd_builder: CommandBuilder, config: &ExecutorConfig, profile_folder: &Path, - isolate: bool, + requires_sudo: bool, benchmark_state: &OnceCell<(FifoBenchmarkData, ExecutionTimestamps)>, ) -> Result { let wrapped = profiler - .wrap_command(cmd_builder, config, profile_folder, isolate) + .wrap_command(cmd_builder, config, profile_folder, requires_sudo) .await?; let cmd = wrapped.build(); debug!("cmd: {cmd:?}"); diff --git a/src/executor/wall_time/isolation.rs b/src/executor/wall_time/isolation.rs index cc691190..3025b932 100644 --- a/src/executor/wall_time/isolation.rs +++ b/src/executor/wall_time/isolation.rs @@ -1,60 +1,107 @@ +use std::path::Path; + use crate::executor::helpers::command::CommandBuilder; use crate::prelude::*; -/// Whether the benchmark must run inside the privileged systemd scope, which in -/// turn requires the profiler itself to record under elevated privileges. -/// -/// When isolated, the scope reparents the benchmark out of the profiler's -/// process subtree, so the profiler can only observe it by recording -/// system-wide — which needs `sudo`. When not isolated, the profiler records its -/// own descendant tree and runs unprivileged (relying on a permissive -/// `perf_event_paranoid`). -/// -/// `CODSPEED_ISOLATION` overrides the decision; otherwise we isolate only when we -/// can elevate without prompting (root or passwordless sudo, as on CI), so a -/// local run never blocks on a password. -/// -/// Isolation relies on `systemd-run`, so it is Linux-only; other platforms always -/// record their own descendant tree unprivileged. -pub fn requires_isolation() -> bool { - if !cfg!(target_os = "linux") { - return false; +/// How the benchmark is pinned to dedicated cores, away from the rest of the +/// system. Whether the profiler must run under sudo depends on the mode; see +/// [`requires_sudo`](Self::requires_sudo). +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum IsolationMode { + /// The benchmark runs in the runner's own process subtree, unpinned. + None, + /// `systemd-run --scope --slice=codspeed.slice`. The scope reparents the + /// benchmark out of the profiler's process subtree, so the profiler needs + /// elevated privileges (sudo) to observe it. + Systemd, + /// The benchmark runs in a delegated cgroup scope. + /// The scope is expected to be created with the expected leaves before CLI invocation. + Cgroup { + /// The delegated scope cgroup directory, as seen from the runner. + scope_dir: String, + }, +} + +impl IsolationMode { + /// Whether the profiler must run under sudo. True only when the benchmark is + /// reparented out of the profiler's subtree (i.e. for [`Systemd`](Self::Systemd)), + /// where the profiler needs elevated privileges to observe it. + pub fn requires_sudo(&self) -> bool { + matches!(self, IsolationMode::Systemd) } - match std::env::var("CODSPEED_ISOLATION").as_deref() { - Ok("true") => true, - Ok("false") => false, - _ => { - let can_isolate = crate::executor::helpers::run_with_sudo::can_elevate_without_prompt(); - if !can_isolate { - info!( - "Running without process isolation: elevating privileges would require a \ - password prompt. Set CODSPEED_ISOLATION=true to force it." - ); + + /// Wrap the benchmark leaf so it runs pinned according to this mode. The + /// profiler wraps its recorder around the result, so this must only touch the + /// leaf and never move the profiler onto the bench cores. + pub fn wrap_bench(&self, bench_cmd: CommandBuilder) -> Result { + match self { + IsolationMode::None => Ok(bench_cmd), + IsolationMode::Systemd => wrap_isolation_scope(bench_cmd), + IsolationMode::Cgroup { scope_dir } => { + // Move the runner (and the profiler it later spawns) onto the + // system cores before wrapping the benchmark for the bench cores. + place_runner_in_support(scope_dir)?; + wrap_cgroup_isolation(bench_cmd, scope_dir) } - can_isolate } } } -/// Run the benchmark command in an isolated process scope. +/// Environment variable selecting how walltime runs isolate the benchmark. +const ISOLATION_ENV: &str = "CODSPEED_ISOLATION"; + +/// Resolve how this run should isolate the benchmark from [`ISOLATION_ENV`]: /// -/// On Linux, the command is wrapped with `systemd-run --scope` so it runs inside the -/// `codspeed.slice` cgroup (predefined on CodSpeed CI runners to pin and isolate the -/// benchmark). Only applied when [`requires_isolation`] is true. +/// - non-Linux: [`None`](IsolationMode::None), no mechanism is available; +/// - unset: [`Systemd`](IsolationMode::Systemd) if we can elevate without a prompt +/// (root or passwordless sudo, as on CI), else [`None`](IsolationMode::None) so a +/// local run never blocks on a password prompt; +/// - `false`: [`None`](IsolationMode::None); +/// - `CGROUP:`: [`Cgroup`](IsolationMode::Cgroup) under ``, the only mode +/// that works without host systemd (used inside the sandbox); +/// - anything else: [`Systemd`](IsolationMode::Systemd). +pub fn resolve_isolation_mode() -> IsolationMode { + if !cfg!(target_os = "linux") { + return IsolationMode::None; + } + + let Some(value) = std::env::var(ISOLATION_ENV).ok().filter(|v| !v.is_empty()) else { + // Unset: isolate only if we can elevate silently, otherwise stay unpinned + // so a local run never blocks on a password prompt. + if crate::executor::helpers::run_with_sudo::can_elevate_without_prompt() { + return IsolationMode::Systemd; + } + info!( + "Running without process isolation: elevating privileges would require a \ + password prompt. Set {ISOLATION_ENV}=true to force it." + ); + return IsolationMode::None; + }; + + match value.as_str() { + "false" => IsolationMode::None, + _ => match value.strip_prefix("CGROUP:") { + Some(scope_dir) => IsolationMode::Cgroup { + scope_dir: scope_dir.to_string(), + }, + None => IsolationMode::Systemd, + }, + } +} + +/// Wrap the benchmark leaf so it runs inside the systemd `codspeed.slice` scope. /// -/// Remarks: -/// - We're using `--scope` so that the profiler is able to capture the events of the benchmark -/// process. -/// - We can't use `--user` here because we need to run in `codspeed.slice`, otherwise we'd run in -/// `user.slice` (which is isolated). We use `--uid` and `--gid` to keep running as the current -/// user. -/// - `--scope` only inherits the system environment, so the caller is expected to have already -/// forwarded the relevant variables (via `wrap_with_env`). -/// - The caller is expected to have already set the working directory on `bench_cmd`; it will be -/// propagated to `systemd-run` via [`CommandBuilder::wrap_with`], and `--same-dir` makes the -/// spawned scope inherit it. +/// Notes on the `systemd-run` flags: +/// - `--scope` (rather than a transient service) keeps the benchmark a child of +/// the runner, so the profiler can capture its events; see +/// [`IsolationMode::Systemd`]. +/// - `--user` would land us in `user.slice`, not `codspeed.slice`; `--uid`/`--gid` +/// instead keep the scope running as the current user. +/// - `--scope` only inherits the system environment, so the caller must have +/// already forwarded the benchmark's variables (via `wrap_with_env`) and set +/// its working directory, which `--same-dir` propagates into the scope. #[cfg(target_os = "linux")] -pub fn wrap_isolation_scope(mut bench_cmd: CommandBuilder) -> Result { +fn wrap_isolation_scope(mut bench_cmd: CommandBuilder) -> Result { use crate::executor::helpers::env::is_codspeed_debug_enabled; let mut cmd_builder = CommandBuilder::new("systemd-run"); @@ -75,6 +122,105 @@ pub fn wrap_isolation_scope(mut bench_cmd: CommandBuilder) -> Result Result { +fn wrap_isolation_scope(bench_cmd: CommandBuilder) -> Result { Ok(bench_cmd) } + +/// Names of the leaves that are expected to be created prior to the CLI invocation. +const BENCH_LEAF: &str = "bench"; +const SUPPORT_LEAF: &str = "support"; + +/// Move the runner itself into the scope's `support` leaf, so it and every child +/// it later spawns (notably the profiler) run on the system cores rather than the +/// bench cores. Call before spawning the profiler. +/// +/// The workload starts in the scope's `all` leaf (every core); this opts the +/// runner down onto the system cores, leaving the bench cores idle for the +/// benchmark the shim later places into `bench`. +fn place_runner_in_support(scope_dir: &str) -> Result<()> { + let procs = Path::new(scope_dir).join(SUPPORT_LEAF).join("cgroup.procs"); + let pid = std::process::id().to_string(); + std::fs::write(&procs, &pid).with_context(|| format!("move runner into {}", procs.display())) +} + +/// Wrap the benchmark so it self-places into the scope's `bench` leaf. +/// +/// The privileged side splits the delegated scope and pins the leaves; here a +/// tiny `bash` shim writes its own PID into `/bench/cgroup.procs` (moving +/// itself, and the benchmark it `exec`s, into the leaf) and then `exec`s the +/// original command, so the benchmark inherits the leaf's `cpuset.cpus` pinning +/// while staying the same PID and the same descendant of the profiler. +/// +/// Must wrap only the leaf, before the profiler wraps the recorder around it, so +/// the profiler itself is never moved onto the bench cores. +fn wrap_cgroup_isolation(mut bench_cmd: CommandBuilder, scope_dir: &str) -> Result { + let procs_path = Path::new(scope_dir).join(BENCH_LEAF).join("cgroup.procs"); + let quoted = shell_words::quote(&procs_path.to_string_lossy()).into_owned(); + // `$0`/`$@`: the shim's own argv[0] is a throwaway "bash"; the real program + // and its arguments follow and are re-exec'd verbatim via `exec "$@"`. + let script = format!("echo $$ > {quoted} && exec \"$@\""); + + bench_cmd.wrap("bash", ["-c".to_string(), script, "bash".to_string()]); + Ok(bench_cmd) +} + +#[cfg(test)] +mod tests { + use super::*; + use temp_env::with_vars; + + #[test] + fn cgroup_prefix_selects_cgroup_mode() { + if !cfg!(target_os = "linux") { + return; + } + with_vars( + [(ISOLATION_ENV, Some("CGROUP:/sys/fs/cgroup/scope"))], + || { + assert_eq!( + resolve_isolation_mode(), + IsolationMode::Cgroup { + scope_dir: "/sys/fs/cgroup/scope".to_string(), + } + ); + }, + ); + } + + #[test] + fn isolation_false_disables() { + with_vars([(ISOLATION_ENV, Some("false"))], || { + assert_eq!(resolve_isolation_mode(), IsolationMode::None); + }); + } + + #[test] + fn other_value_selects_systemd() { + if !cfg!(target_os = "linux") { + return; + } + with_vars([(ISOLATION_ENV, Some("true"))], || { + assert_eq!(resolve_isolation_mode(), IsolationMode::Systemd); + }); + } + + #[test] + fn cgroup_mode_keeps_profiler_unprivileged() { + let cgroup = IsolationMode::Cgroup { + scope_dir: "/x".to_string(), + }; + assert!(!cgroup.requires_sudo()); + assert!(IsolationMode::Systemd.requires_sudo()); + } + + #[test] + fn cgroup_wrap_places_then_execs() { + let mut cmd = CommandBuilder::new("bash"); + cmd.arg("/tmp/bench.sh"); + let wrapped = wrap_cgroup_isolation(cmd, "/sys/fs/cgroup").unwrap(); + assert_eq!( + wrapped.as_command_line(), + "bash -c 'echo $$ > /sys/fs/cgroup/bench/cgroup.procs && exec \"$@\"' bash bash /tmp/bench.sh" + ); + } +} diff --git a/src/executor/wall_time/profiler/mod.rs b/src/executor/wall_time/profiler/mod.rs index 0ee6becc..ab7f62cb 100644 --- a/src/executor/wall_time/profiler/mod.rs +++ b/src/executor/wall_time/profiler/mod.rs @@ -48,15 +48,16 @@ pub trait Profiler { /// Profilers stash any live state they need for the duration of the run /// (control fifos, output paths) on `self`. /// - /// `isolate` mirrors the decision the executor used to wrap the benchmark in - /// the systemd scope: when set, the profiler records system-wide under sudo; - /// otherwise it records its own descendant tree unprivileged. + /// When `requires_sudo` is set, the profiler must run elevated: the + /// isolation mechanism reparented the benchmark out of its subtree, so it can + /// only observe it with elevated privileges. Otherwise it records its own + /// descendant tree unprivileged. async fn wrap_command( &mut self, cmd: CommandBuilder, config: &ExecutorConfig, profile_folder: &Path, - isolate: bool, + requires_sudo: bool, ) -> anyhow::Result; /// The benchmarked process signaled the start of a measured region. diff --git a/src/executor/wall_time/profiler/perf/mod.rs b/src/executor/wall_time/profiler/perf/mod.rs index 5536af12..2c5514d2 100644 --- a/src/executor/wall_time/profiler/perf/mod.rs +++ b/src/executor/wall_time/profiler/perf/mod.rs @@ -91,7 +91,7 @@ impl Profiler for PerfProfiler { mut cmd_builder: CommandBuilder, config: &ExecutorConfig, profile_folder: &Path, - isolate: bool, + requires_sudo: bool, ) -> anyhow::Result { let perf_fifo = PerfFifo::new()?; let perf_file_path = profile_folder.join(PERF_PIPEDATA_FILE_NAME); @@ -183,10 +183,9 @@ impl Profiler for PerfProfiler { self.perf_fifo = Some(perf_fifo); self.perf_file_path = Some(perf_file_path); - // Isolated runs reparent the benchmark out of perf's subtree, so perf - // must record system-wide under sudo. Unisolated runs record perf's own - // descendant tree unprivileged. - let wrapped_builder = if isolate { + // When the benchmark was reparented out of perf's subtree, perf needs + // sudo to observe it; otherwise it records its own descendants unprivileged. + let wrapped_builder = if requires_sudo { wrap_with_sudo(wrapped_builder)? } else { wrapped_builder diff --git a/src/executor/wall_time/profiler/samply/mod.rs b/src/executor/wall_time/profiler/samply/mod.rs index 7a1a6f04..97b5bffd 100644 --- a/src/executor/wall_time/profiler/samply/mod.rs +++ b/src/executor/wall_time/profiler/samply/mod.rs @@ -86,7 +86,7 @@ impl Profiler for SamplyProfiler { mut cmd_builder: CommandBuilder, _config: &ExecutorConfig, profile_folder: &Path, - isolate: bool, + requires_sudo: bool, ) -> anyhow::Result { let output_path = profile_folder.join(SAMPLY_OUTPUT_FILE_NAME); @@ -157,10 +157,9 @@ impl Profiler for SamplyProfiler { self.output_path = Some(output_path); self.v8_log_dir = Some(v8_log_dir); - // Isolated runs reparent the benchmark out of samply's subtree, so samply - // must record system-wide under sudo. Unisolated runs record samply's own - // descendant tree unprivileged. - let cmd_builder = if isolate { + // When the benchmark was reparented out of samply's subtree, samply needs + // sudo to observe it; otherwise it records its own descendants unprivileged. + let cmd_builder = if requires_sudo { wrap_with_sudo(cmd_builder)? } else { cmd_builder