From cdbdb9b351d6215eb48c84c550749bf9bd1bf503 Mon Sep 17 00:00:00 2001
From: Aleksey Shakhmatov <dev@shakhmatov.com>
Date: Thu, 21 May 2026 08:33:24 +0300
Subject: [PATCH] refactor: simplify after code review

Aggregated fixes from a three-axis review (reuse, quality, efficiency):

- benchmark.zig: store raw bytes_per_op on Benchmark instead of pre-
  multiplying by b.n inside set_bytes. Reset no longer clobbers it, so
  the call order no longer matters and the math moves to result
  finalization where n is final.
- runner.zig: drop the runner-initiated b.reset_timer (was duplicative
  with the user's own reset_timer at the top of every bench). Pack
  counter/sub_run/sub_run_ctx/io into a runner.Env so run_one's
  signature stays at four params instead of seven.
- reporter.zig: remove the [256]f64 stack buffer that silently
  truncated --count=500. Reporters now take a pre-computed Summary and
  raw ns_samples slice from the suite. Replace the hand-rolled JSON
  string escape with std.json.Stringify.encodeJsonString.
- suite.zig: replace four nullable active_* fields with an explicit
  RunCtx threaded through run_named and the sub_run trampoline.
  Allocate ns_samples once per benchmark group. Extract match_flag for
  the eight identical --foo= prefix checks and open_stderr for the
  duplicated stderr-writer dance.

`--count=500` now produces 500 JSON samples (was 256 silently). No
behavior change for users at the API level beyond the set_bytes
ordering footgun being removed.
---
 src/benchmark.zig |  13 ++-
 src/reporter.zig  |  64 ++++---------
 src/runner.zig    |  86 ++++++++---------
 src/suite.zig     | 237 ++++++++++++++++++++++++++--------------------
 4 files changed, 198 insertions(+), 202 deletions(-)

diff --git a/src/benchmark.zig b/src/benchmark.zig
index d9747b6..5eb3e80 100644
--- a/src/benchmark.zig
+++ b/src/benchmark.zig
@@ -34,7 +34,9 @@ pub const Benchmark = struct {
     accumulated_ns: i128 = 0,
     timer_running: bool = true,
     start_ts: Io.Timestamp = Io.Timestamp.zero,
-    bytes_processed: u64 = 0,
+    /// Bytes processed per iteration, as declared by `set_bytes`. The
+    /// reporter multiplies by `n` to derive total throughput in MB/s.
+    bytes_per_op: u64 = 0,
     /// If the user calls `b.run`, this Benchmark is just a container — the
     /// outer runner will skip reporting its own result.
     is_container: bool = false,
@@ -43,10 +45,10 @@ pub const Benchmark = struct {
 
     /// Start (or restart) the timer fresh: zero accumulated time, zero
     /// allocation counters. Call after setup and before the measured loop.
+    /// `set_bytes` survives `reset_timer` — declare it at any point.
     pub fn reset_timer(b: *Benchmark) void {
         b.accumulated_ns = 0;
         b.counter.reset();
-        b.bytes_processed = 0;
         b.timer_running = true;
         b.start_ts = Io.Timestamp.now(b.io, .awake);
     }
@@ -67,10 +69,11 @@ pub const Benchmark = struct {
         b.timer_running = true;
     }
 
-    /// Record bytes processed by this iteration; the reporter divides by
-    /// `ns/op` to print MB/s.
+    /// Record bytes processed per iteration; the reporter multiplies by
+    /// `n` and divides by elapsed time to print MB/s. Safe to call before
+    /// or after `reset_timer`.
     pub fn set_bytes(b: *Benchmark, bytes_per_op: u64) void {
-        b.bytes_processed = bytes_per_op *| b.n;
+        b.bytes_per_op = bytes_per_op;
     }
 
     /// Mark this benchmark so allocations columns are always printed,
diff --git a/src/reporter.zig b/src/reporter.zig
index dddd9b1..ea05b54 100644
--- a/src/reporter.zig
+++ b/src/reporter.zig
@@ -18,31 +18,25 @@ pub fn write_text_header(w: *Writer) !void {
     });
 }
 
-/// Print one benchmark result group (>= 1 attempt) as a text row.
+/// Render one benchmark group as a text row. `summary` summarizes the
+/// `ns_per_op` distribution across all attempts in the group; `last`
+/// supplies metadata (n, B/op, MB/s, …) that is stable across attempts.
 pub fn write_text(
     w: *Writer,
     name: []const u8,
-    runs: []const Result,
+    last: Result,
+    summary: stats.Summary,
     opts: ReportOpts,
 ) !void {
-    if (runs.len == 0) return;
-    if (runs[0].is_container) return;
-
-    var buf: [256]f64 = undefined;
-    const ns_samples: []f64 = buf[0..@min(runs.len, buf.len)];
-    for (ns_samples, runs[0..ns_samples.len]) |*s, r| s.* = r.ns_per_op;
-    const ns_summary = stats.summarize(ns_samples);
-
-    const last = runs[runs.len - 1];
     const show_allocs = opts.always_allocs or last.force_report_allocs or
         last.bytes_per_op > 0 or last.allocs_per_op > 0;
 
     try w.print("{s:<30} {d:>10} ", .{ name, last.n });
 
-    if (runs.len > 1) {
-        try w.print("{d:>7.2} \xc2\xb1 {d:<2.2} ", .{ ns_summary.mean, ns_summary.stddev });
+    if (summary.n > 1) {
+        try w.print("{d:>7.2} \xc2\xb1 {d:<2.2} ", .{ summary.mean, summary.stddev });
     } else {
-        try w.print("{d:>12.2} ", .{ ns_summary.mean });
+        try w.print("{d:>12.2} ", .{summary.mean});
     }
 
     if (show_allocs) {
@@ -63,22 +57,14 @@ pub fn write_text(
 pub fn write_json(
     w: *Writer,
     name: []const u8,
-    runs: []const Result,
+    last: Result,
+    ns_samples: []const f64,
+    summary: stats.Summary,
 ) !void {
-    if (runs.len == 0) return;
-    if (runs[0].is_container) return;
-
-    var buf: [256]f64 = undefined;
-    const ns_samples: []f64 = buf[0..@min(runs.len, buf.len)];
-    for (ns_samples, runs[0..ns_samples.len]) |*s, r| s.* = r.ns_per_op;
-    const s = stats.summarize(ns_samples);
-
-    const last = runs[runs.len - 1];
-
-    try w.writeAll("{");
-    try write_json_string_field(w, "name", name);
+    try w.writeAll("{\"name\":");
+    try std.json.Stringify.encodeJsonString(name, .{}, w);
     try w.print(",\"n\":{d}", .{last.n});
-    try w.print(",\"ns_per_op\":{d}", .{s.mean});
+    try w.print(",\"ns_per_op\":{d}", .{summary.mean});
     try w.print(",\"bytes_per_op\":{d}", .{last.bytes_per_op});
     try w.print(",\"allocs_per_op\":{d}", .{last.allocs_per_op});
 
@@ -88,12 +74,12 @@ pub fn write_json(
         try w.writeAll(",\"mb_per_sec\":null");
     }
 
-    try w.print(",\"count\":{d}", .{runs.len});
+    try w.print(",\"count\":{d}", .{summary.n});
 
-    if (runs.len > 1) {
+    if (summary.n > 1) {
         try w.print(
             ",\"ns_per_op_mean\":{d},\"ns_per_op_stddev\":{d},\"ns_per_op_min\":{d}",
-            .{ s.mean, s.stddev, s.min },
+            .{ summary.mean, summary.stddev, summary.min },
         );
         try w.writeAll(",\"samples\":[");
         for (ns_samples, 0..) |x, i| {
@@ -105,19 +91,3 @@ pub fn write_json(
 
     try w.writeAll("}\n");
 }
-
-fn write_json_string_field(w: *Writer, key: []const u8, value: []const u8) !void {
-    try w.print("\"{s}\":\"", .{key});
-    for (value) |c| {
-        switch (c) {
-            '"' => try w.writeAll("\\\""),
-            '\\' => try w.writeAll("\\\\"),
-            '\n' => try w.writeAll("\\n"),
-            '\r' => try w.writeAll("\\r"),
-            '\t' => try w.writeAll("\\t"),
-            0x00...0x08, 0x0b, 0x0c, 0x0e...0x1f => try w.print("\\u{x:0>4}", .{c}),
-            else => try w.print("{c}", .{c}),
-        }
-    }
-    try w.writeAll("\"");
-}
diff --git a/src/runner.zig b/src/runner.zig
index 0123d54..17d5dfe 100644
--- a/src/runner.zig
+++ b/src/runner.zig
@@ -28,80 +28,78 @@ pub const Options = struct {
     max_iters: u64 = 1_000_000_000,
 };
 
-/// Adaptive single-run: grow `n` until `elapsed >= min_time_ns` or
-/// `n >= max_iters`. Returns the final `Result`.
-pub fn run_one(
-    name: []const u8,
-    f: BenchFn,
+/// Per-suite collaborators threaded into every `run_one` call.
+pub const Env = struct {
     counter: *CountingAllocator,
     sub_run: SubRunFn,
     sub_run_ctx: *anyopaque,
     io: std.Io,
-    opts: Options,
-) !Result {
+};
+
+/// Adaptive single-run: grow `n` until `elapsed >= min_time_ns` or
+/// `n >= max_iters`. Returns the final `Result`.
+pub fn run_one(name: []const u8, f: BenchFn, env: Env, opts: Options) !Result {
     var n: u64 = 1;
-    var last_elapsed_ns: u64 = 0;
-    var last_alloc_bytes: u64 = 0;
-    var last_alloc_count: u64 = 0;
-    var last_bytes_processed: u64 = 0;
-    var last_is_container: bool = false;
-    var last_force_report: bool = false;
+    var elapsed_ns: u64 = 0;
+    var alloc_bytes: u64 = 0;
+    var alloc_count: u64 = 0;
+    var bytes_per_op: u64 = 0;
+    var is_container = false;
+    var force_report = false;
 
     while (true) {
+        env.counter.reset();
         var b: Benchmark = .{
             .n = n,
-            .allocator = counter.allocator(),
-            .io = io,
+            .allocator = env.counter.allocator(),
+            .io = env.io,
             .name = name,
-            .counter = counter,
-            .sub_run = sub_run,
-            .sub_run_ctx = sub_run_ctx,
+            .counter = env.counter,
+            .sub_run = env.sub_run,
+            .sub_run_ctx = env.sub_run_ctx,
+            .start_ts = std.Io.Timestamp.now(env.io, .awake),
         };
-        counter.reset();
-        b.reset_timer();
 
         try f(&b);
-
         b.finish();
 
-        const elapsed_i: i128 = if (b.accumulated_ns < 0) 0 else b.accumulated_ns;
-        last_elapsed_ns = @intCast(@min(elapsed_i, std.math.maxInt(u64)));
-        last_alloc_bytes = counter.bytes_allocated;
-        last_alloc_count = counter.allocs;
-        last_bytes_processed = b.bytes_processed;
-        last_is_container = b.is_container;
-        last_force_report = b.force_report_allocs;
+        elapsed_ns = if (b.accumulated_ns <= 0) 0 else @intCast(@min(b.accumulated_ns, std.math.maxInt(u64)));
+        alloc_bytes = env.counter.bytes_allocated;
+        alloc_count = env.counter.allocs;
+        bytes_per_op = b.bytes_per_op;
+        is_container = b.is_container;
+        force_report = b.force_report_allocs;
 
-        if (last_is_container) break;
-        if (last_elapsed_ns >= opts.min_time_ns) break;
+        if (is_container) break;
+        if (elapsed_ns >= opts.min_time_ns) break;
         if (n >= opts.max_iters) break;
 
-        n = next_n(n, last_elapsed_ns, opts.min_time_ns, opts.max_iters);
+        n = next_n(n, elapsed_ns, opts.min_time_ns, opts.max_iters);
     }
 
     const fn_n: f64 = @floatFromInt(n);
-    const ns_per_op: f64 = if (n == 0) 0 else @as(f64, @floatFromInt(last_elapsed_ns)) / fn_n;
-    const bytes_per_op: f64 = if (n == 0) 0 else @as(f64, @floatFromInt(last_alloc_bytes)) / fn_n;
-    const allocs_per_op: f64 = if (n == 0) 0 else @as(f64, @floatFromInt(last_alloc_count)) / fn_n;
+    const ns_per_op_v: f64 = if (n == 0) 0 else @as(f64, @floatFromInt(elapsed_ns)) / fn_n;
+    const bytes_alloc_per_op: f64 = if (n == 0) 0 else @as(f64, @floatFromInt(alloc_bytes)) / fn_n;
+    const allocs_per_op: f64 = if (n == 0) 0 else @as(f64, @floatFromInt(alloc_count)) / fn_n;
 
-    const mb_per_sec: ?f64 = if (last_bytes_processed == 0 or last_elapsed_ns == 0)
+    const mb_per_sec: ?f64 = if (bytes_per_op == 0 or elapsed_ns == 0)
         null
     else blk: {
-        const bytes_f: f64 = @floatFromInt(last_bytes_processed);
-        const elapsed_s: f64 = @as(f64, @floatFromInt(last_elapsed_ns)) / @as(f64, std.time.ns_per_s);
-        break :blk (bytes_f / (1024.0 * 1024.0)) / elapsed_s;
+        const total_bytes: f64 = @floatFromInt(bytes_per_op *| n);
+        const elapsed_s: f64 = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, std.time.ns_per_s);
+        break :blk (total_bytes / (1024.0 * 1024.0)) / elapsed_s;
     };
 
     return .{
         .name = name,
         .n = n,
-        .elapsed_ns = last_elapsed_ns,
-        .ns_per_op = ns_per_op,
-        .bytes_per_op = bytes_per_op,
+        .elapsed_ns = elapsed_ns,
+        .ns_per_op = ns_per_op_v,
+        .bytes_per_op = bytes_alloc_per_op,
         .allocs_per_op = allocs_per_op,
         .mb_per_sec = mb_per_sec,
-        .force_report_allocs = last_force_report,
-        .is_container = last_is_container,
+        .force_report_allocs = force_report,
+        .is_container = is_container,
     };
 }
 
@@ -154,10 +152,8 @@ test "round_up snaps to nice numbers" {
 }
 
 test "next_n grows toward target" {
-    // first run: 0 ns -> jump by 100x
     try std.testing.expectEqual(@as(u64, 100), next_n(1, 0, std.time.ns_per_s, 1 << 30));
 
-    // 100 iters in 1 ms; target 1s -> predicted = 1.2e6 * 100 / 1e6 = 120000, rounded 200000
     const n2 = next_n(100, std.time.ns_per_ms, std.time.ns_per_s, 1 << 30);
     try std.testing.expect(n2 > 100);
     try std.testing.expect(n2 <= 100 * 100);
diff --git a/src/suite.zig b/src/suite.zig
index f14406a..dd582f0 100644
--- a/src/suite.zig
+++ b/src/suite.zig
@@ -6,6 +6,7 @@ const Writer = std.Io.Writer;
 const bench = @import("benchmark.zig");
 const runner = @import("runner.zig");
 const reporter = @import("reporter.zig");
+const stats = @import("stats.zig");
 const CountingAllocator = @import("alloc.zig").CountingAllocator;
 
 const Benchmark = bench.Benchmark;
@@ -33,14 +34,6 @@ pub const Suite = struct {
     io: Io,
     entries: std.ArrayListUnmanaged(Entry) = .empty,
 
-    /// Mutable state during a run — installed by `run`.
-    active_writer: ?*Writer = null,
-    active_opts: Options = .{},
-    active_counter: ?*CountingAllocator = null,
-    /// The composed name of the benchmark currently being executed.
-    /// Used by `sub_run_trampoline` to prefix sub-benchmark names.
-    current_name: ?[]const u8 = null,
-
     pub fn init(gpa: Allocator, io: Io) Suite {
         return .{ .gpa = gpa, .io = io };
     }
@@ -60,121 +53,60 @@ pub const Suite = struct {
     /// Run all registered benchmarks with `opts`, writing to `writer`.
     pub fn run(self: *Suite, opts: Options, writer: *Writer) !void {
         var counter = CountingAllocator.init(self.gpa);
-
-        self.active_writer = writer;
-        self.active_opts = opts;
-        self.active_counter = &counter;
-        defer {
-            self.active_writer = null;
-            self.active_counter = null;
-            self.current_name = null;
-        }
+        var ctx: RunCtx = .{
+            .gpa = self.gpa,
+            .io = self.io,
+            .writer = writer,
+            .opts = opts,
+            .counter = &counter,
+        };
 
         if (opts.format == .text) try reporter.write_text_header(writer);
 
         for (self.entries.items) |entry| {
             if (opts.filter) |needle| {
-                // Substring match on the top-level name, or on the parent
-                // segment when the user passes a `parent/leaf` style filter.
-                const target = if (std.mem.indexOf(u8, needle, "/")) |i|
+                // For `parent/leaf` style filters, gate the top level on the
+                // parent segment so we still descend into matching subs.
+                const target = if (std.mem.indexOfScalar(u8, needle, '/')) |i|
                     needle[0..i]
                 else
                     needle;
                 if (std.mem.indexOf(u8, entry.name, target) == null) continue;
             }
-            try self.run_named(entry.name, entry.f);
+            try run_named(&ctx, entry.name, entry.f);
         }
 
         try writer.flush();
     }
 
-    /// Run a benchmark by composed name. Used both for top-level entries
-    /// and sub-benchmarks (`parent/sub`).
-    fn run_named(self: *Suite, full_name: []const u8, f: BenchFn) !void {
-        const opts = self.active_opts;
-        const writer = self.active_writer.?;
-        const counter = self.active_counter.?;
-
-        const saved_current = self.current_name;
-        self.current_name = full_name;
-        defer self.current_name = saved_current;
-
-        const count = @max(@as(u32, 1), opts.count);
-        const samples = try self.gpa.alloc(Result, count);
-        defer self.gpa.free(samples);
-
-        var i: u32 = 0;
-        while (i < count) : (i += 1) {
-            samples[i] = try runner.run_one(
-                full_name,
-                f,
-                counter,
-                Suite.sub_run_trampoline,
-                self,
-                self.io,
-                .{ .min_time_ns = opts.min_time_ns, .max_iters = opts.max_iters },
-            );
-            // If the user delegated to sub-benchmarks, no point repeating.
-            if (samples[i].is_container) {
-                i += 1;
-                break;
-            }
-        }
-        const used = samples[0..i];
-
-        switch (opts.format) {
-            .text => try reporter.write_text(
-                writer,
-                full_name,
-                used,
-                .{ .always_allocs = opts.always_allocs },
-            ),
-            .json => try reporter.write_json(writer, full_name, used),
-        }
-    }
-
-    fn sub_run_trampoline(ctx: *anyopaque, sub_name: []const u8, f: BenchFn) anyerror!void {
-        const self: *Suite = @ptrCast(@alignCast(ctx));
-        const parent = self.current_name orelse "<orphan>";
-        const composed = try std.fmt.allocPrint(self.gpa, "{s}/{s}", .{ parent, sub_name });
-        defer self.gpa.free(composed);
-
-        // Apply filter against the composed name so users can target a
-        // specific sub-benchmark with --filter=parent/leaf or by substring.
-        if (self.active_opts.filter) |needle| {
-            if (std.mem.indexOf(u8, composed, needle) == null) return;
-        }
-
-        try self.run_named(composed, f);
-    }
-
     /// CLI entrypoint: parse `proc_init.minimal.args`, then dispatch to `run`.
     pub fn run_cli(self: *Suite, proc_init: std.process.Init) !void {
-        var args_it = std.process.Args.Iterator.init(proc_init.minimal.args);
-        _ = args_it.skip(); // argv[0]
-
         var opts: Options = .{};
         var list_only = false;
 
+        var args_it = std.process.Args.Iterator.init(proc_init.minimal.args);
+        _ = args_it.skip(); // argv[0]
+
         while (args_it.next()) |raw| {
             const arg: []const u8 = raw;
             if (std.mem.eql(u8, arg, "--help") or std.mem.eql(u8, arg, "-h")) {
-                try print_help(proc_init);
+                var buf: [2048]u8 = undefined;
+                var w = open_stderr(proc_init.io, &buf);
+                try print_help(&w.interface);
                 return;
             } else if (std.mem.eql(u8, arg, "--list")) {
                 list_only = true;
             } else if (std.mem.eql(u8, arg, "--allocs")) {
                 opts.always_allocs = true;
-            } else if (std.mem.startsWith(u8, arg, "--filter=")) {
-                opts.filter = arg["--filter=".len..];
-            } else if (std.mem.startsWith(u8, arg, "--min-time=")) {
-                opts.min_time_ns = try parse_duration_ns(arg["--min-time=".len..]);
-            } else if (std.mem.startsWith(u8, arg, "--count=")) {
-                opts.count = try std.fmt.parseInt(u32, arg["--count=".len..], 10);
-            } else if (std.mem.startsWith(u8, arg, "--max-iters=")) {
-                opts.max_iters = try std.fmt.parseInt(u64, arg["--max-iters=".len..], 10);
-            } else if (std.mem.startsWith(u8, arg, "--format=")) {
-                const v = arg["--format=".len..];
+            } else if (match_flag(arg, "--filter")) |v| {
+                opts.filter = v;
+            } else if (match_flag(arg, "--min-time")) |v| {
+                opts.min_time_ns = try parse_duration_ns(v);
+            } else if (match_flag(arg, "--count")) |v| {
+                opts.count = try std.fmt.parseInt(u32, v, 10);
+            } else if (match_flag(arg, "--max-iters")) |v| {
+                opts.max_iters = try std.fmt.parseInt(u64, v, 10);
+            } else if (match_flag(arg, "--format")) |v| {
                 if (std.mem.eql(u8, v, "text")) {
                     opts.format = .text;
                 } else if (std.mem.eql(u8, v, "json")) {
@@ -186,24 +118,112 @@ pub const Suite = struct {
         }
 
         var buf: [4096]u8 = undefined;
-        var stderr_file = std.Io.File.stderr().writerStreaming(proc_init.io, &buf);
+        var w = open_stderr(proc_init.io, &buf);
 
         if (list_only) {
-            for (self.entries.items) |e| {
-                try stderr_file.interface.print("{s}\n", .{e.name});
-            }
-            try stderr_file.interface.flush();
+            for (self.entries.items) |e| try w.interface.print("{s}\n", .{e.name});
+            try w.interface.flush();
             return;
         }
 
-        try self.run(opts, &stderr_file.interface);
+        try self.run(opts, &w.interface);
     }
 };
 
-fn print_help(proc_init: std.process.Init) !void {
-    var buf: [2048]u8 = undefined;
-    var stderr_file = std.Io.File.stderr().writerStreaming(proc_init.io, &buf);
-    const w = &stderr_file.interface;
+/// Per-run state threaded through `run_named` and the sub-benchmark
+/// trampoline. Avoids stashing mutable globals on `Suite`.
+const RunCtx = struct {
+    gpa: Allocator,
+    io: Io,
+    writer: *Writer,
+    opts: Options,
+    counter: *CountingAllocator,
+    /// Composed name of the benchmark currently executing — used as the
+    /// prefix when its `f` invokes `b.run(...)`.
+    parent_name: ?[]const u8 = null,
+};
+
+/// Run a benchmark by composed name. Used both for top-level entries and
+/// sub-benchmarks (parent/sub).
+fn run_named(ctx: *RunCtx, full_name: []const u8, f: BenchFn) !void {
+    const saved = ctx.parent_name;
+    ctx.parent_name = full_name;
+    defer ctx.parent_name = saved;
+
+    const count = @max(@as(u32, 1), ctx.opts.count);
+    const samples = try ctx.gpa.alloc(Result, count);
+    defer ctx.gpa.free(samples);
+
+    const env: runner.Env = .{
+        .counter = ctx.counter,
+        .sub_run = sub_run_trampoline,
+        .sub_run_ctx = ctx,
+        .io = ctx.io,
+    };
+    const r_opts: runner.Options = .{
+        .min_time_ns = ctx.opts.min_time_ns,
+        .max_iters = ctx.opts.max_iters,
+    };
+
+    var i: u32 = 0;
+    while (i < count) : (i += 1) {
+        samples[i] = try runner.run_one(full_name, f, env, r_opts);
+        // Containers delegate to sub-benchmarks; their own row is meaningless
+        // and there's nothing to repeat across `--count`.
+        if (samples[i].is_container) {
+            i += 1;
+            break;
+        }
+    }
+    const used = samples[0..i];
+    if (used.len == 0 or used[used.len - 1].is_container) return;
+
+    const ns_samples = try ctx.gpa.alloc(f64, used.len);
+    defer ctx.gpa.free(ns_samples);
+    for (ns_samples, used) |*s, r| s.* = r.ns_per_op;
+
+    const summary = stats.summarize(ns_samples);
+    const last = used[used.len - 1];
+
+    switch (ctx.opts.format) {
+        .text => try reporter.write_text(
+            ctx.writer,
+            full_name,
+            last,
+            summary,
+            .{ .always_allocs = ctx.opts.always_allocs },
+        ),
+        .json => try reporter.write_json(ctx.writer, full_name, last, ns_samples, summary),
+    }
+}
+
+fn sub_run_trampoline(opaque_ctx: *anyopaque, sub_name: []const u8, f: BenchFn) anyerror!void {
+    const ctx: *RunCtx = @ptrCast(@alignCast(opaque_ctx));
+    const parent = ctx.parent_name orelse "<orphan>";
+    const composed = try std.fmt.allocPrint(ctx.gpa, "{s}/{s}", .{ parent, sub_name });
+    defer ctx.gpa.free(composed);
+
+    // Sub-bench filter: match against the composed name so users can target
+    // `parent/leaf` or filter by substring.
+    if (ctx.opts.filter) |needle| {
+        if (std.mem.indexOf(u8, composed, needle) == null) return;
+    }
+    try run_named(ctx, composed, f);
+}
+
+fn open_stderr(io: Io, buf: []u8) std.Io.File.Writer {
+    return std.Io.File.stderr().writerStreaming(io, buf);
+}
+
+/// Match a `--name=value` style argument. Returns the value slice if the
+/// argument's prefix is `name=`, otherwise null.
+fn match_flag(arg: []const u8, comptime name: []const u8) ?[]const u8 {
+    const prefix = name ++ "=";
+    if (!std.mem.startsWith(u8, arg, prefix)) return null;
+    return arg[prefix.len..];
+}
+
+fn print_help(w: *Writer) !void {
     try w.writeAll(
         \\zbench — benchmark runner
         \\
@@ -260,3 +280,10 @@ test "parse_duration_ns" {
     try std.testing.expectError(error.InvalidDuration, parse_duration_ns(""));
     try std.testing.expectError(error.InvalidDuration, parse_duration_ns("10xyz"));
 }
+
+test "match_flag" {
+    try std.testing.expectEqualStrings("foo", match_flag("--filter=foo", "--filter").?);
+    try std.testing.expectEqualStrings("", match_flag("--filter=", "--filter").?);
+    try std.testing.expectEqual(@as(?[]const u8, null), match_flag("--other=x", "--filter"));
+    try std.testing.expectEqual(@as(?[]const u8, null), match_flag("--filter", "--filter"));
+}