diff --git a/src/alloc.zig b/src/alloc.zig
new file mode 100644
index 0000000..5038aac
--- /dev/null
+++ b/src/alloc.zig
@@ -0,0 +1,94 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const Alignment = std.mem.Alignment;
+
+/// Allocator wrapper that forwards every call to `inner` and tallies allocations, frees and bytes.
+pub const CountingAllocator = struct {
+    inner: Allocator,
+    bytes_allocated: u64 = 0,
+    allocs: u64 = 0,
+    frees: u64 = 0,
+
+    /// Wrap `inner`; all counters start at zero.
+    pub fn init(inner: Allocator) CountingAllocator {
+        return .{ .inner = inner };
+    }
+
+    /// Zero every counter; `inner` is left untouched.
+    pub fn reset(self: *CountingAllocator) void {
+        self.allocs = 0;
+        self.frees = 0;
+        self.bytes_allocated = 0;
+    }
+
+    /// Allocator interface whose calls go through the counting wrappers below.
+    pub fn allocator(self: *CountingAllocator) Allocator {
+        return .{ .ptr = self, .vtable = &.{
+            .alloc = alloc,
+            .resize = resize,
+            .remap = remap,
+            .free = free,
+        } };
+    }
+
+    fn from_ctx(ctx: *anyopaque) *CountingAllocator {
+        return @ptrCast(@alignCast(ctx));
+    }
+
+    fn alloc(ctx: *anyopaque, len: usize, alignment: Alignment, ret_addr: usize) ?[*]u8 {
+        const self = from_ctx(ctx);
+        const ptr = self.inner.vtable.alloc(self.inner.ptr, len, alignment, ret_addr) orelse return null;
+        self.allocs += 1;
+        self.bytes_allocated += len;
+        return ptr;
+    }
+
+    fn resize(ctx: *anyopaque, memory: []u8, alignment: Alignment, new_len: usize, ret_addr: usize) bool {
+        const self = from_ctx(ctx);
+        if (!self.inner.vtable.resize(self.inner.ptr, memory, alignment, new_len, ret_addr)) return false;
+        // Only growth adds to bytes_allocated; a successful shrink leaves it unchanged.
+        if (new_len > memory.len) self.bytes_allocated += new_len - memory.len;
+        return true;
+    }
+
+    fn remap(ctx: *anyopaque, memory: []u8, alignment: Alignment, new_len: usize, ret_addr: usize) ?[*]u8 {
+        const self = from_ctx(ctx);
+        const ptr = self.inner.vtable.remap(self.inner.ptr, memory, alignment, new_len, ret_addr) orelse return null;
+        if (new_len > memory.len) self.bytes_allocated += new_len - memory.len;
+        return ptr;
+    }
+
+    fn free(ctx: *anyopaque, memory: []u8, alignment: Alignment, ret_addr: usize) void {
+        const self = from_ctx(ctx);
+        self.inner.vtable.free(self.inner.ptr, memory, alignment, ret_addr);
+        self.frees += 1;
+    }
+};
+
+test "CountingAllocator counts allocations" {
+    var counting = CountingAllocator.init(std.testing.allocator);
+    const wrapped = counting.allocator();
+
+    const small = try wrapped.alloc(u8, 64);
+    const large = try wrapped.alloc(u8, 128);
+    try std.testing.expectEqual(@as(u64, 2), counting.allocs);
+    try std.testing.expectEqual(@as(u64, 64 + 128), counting.bytes_allocated);
+    // Release in reverse order; each free call bumps `frees` by one.
+    wrapped.free(large);
+    wrapped.free(small);
+    try std.testing.expectEqual(@as(u64, 2), counting.frees);
+}
+
+test "CountingAllocator reset clears counters" {
+    var counting = CountingAllocator.init(std.testing.allocator);
+    const wrapped = counting.allocator();
+
+    const scratch = try wrapped.alloc(u8, 32);
+    defer wrapped.free(scratch);
+    try std.testing.expect(counting.allocs != 0);
+
+    counting.reset();
+    for ([_]u64{ counting.allocs, counting.bytes_allocated, counting.frees }) |value| {
+        try std.testing.expectEqual(@as(u64, 0), value);
+    }
+}
diff --git a/src/benchmark.zig b/src/benchmark.zig
new file mode 100644
index 0000000..d9747b6
--- /dev/null
+++ b/src/benchmark.zig
@@ -0,0 +1,107 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const Io = std.Io;
+const CountingAllocator = @import("alloc.zig").CountingAllocator;
+
+/// Signature of a user benchmark function; it is handed the `Benchmark` state and may return any error.
+pub const BenchFn = *const fn (b: *Benchmark) anyerror!void;
+
+/// Callback the Suite installs so `b.run(name, f)` can start a fresh adaptive run without benchmark.zig importing suite.zig.
+pub const SubRunFn = *const fn (
+    ctx: *anyopaque,
+    sub_name: []const u8,
+    f: BenchFn,
+) anyerror!void;
+
+/// State for one invocation of a user benchmark function. The runner mutates
+/// `n` between attempts; the user reads it inside the hot loop.
+pub const Benchmark = struct {
+    /// Target iteration count for the current attempt. Set by the runner.
+    n: u64,
+    /// Counting-wrapped allocator. Use it inside the bench to get accurate
+    /// B/op and allocs/op. Backed by the GPA passed to `Suite.init`.
+    allocator: Allocator,
+    /// I/O capability — passed through so user benchmarks can do I/O if
+    /// they need to.
+    io: Io,
+    /// Hierarchical name used when reporting sub-benchmark results.
+    name: []const u8,
+
+    /// Counter whose totals `reset_timer` zeroes.
+    counter: *CountingAllocator,
+    /// Callback (and its context) that `run` invokes to launch a sub-benchmark.
+    sub_run: SubRunFn,
+    sub_run_ctx: *anyopaque,
+
+    /// Nanoseconds measured so far while the timer was running.
+    accumulated_ns: i128 = 0,
+    timer_running: bool = true,
+    start_ts: Io.Timestamp = Io.Timestamp.zero,
+    /// Total bytes for the attempt (`bytes_per_op * n`, saturating), written by `set_bytes`.
+    bytes_processed: u64 = 0,
+    /// If the user calls `b.run`, this Benchmark is just a container — the
+    /// outer runner will skip reporting its own result.
+    is_container: bool = false,
+    /// Forces the reporter to print B/op and allocs/op even when zero.
+    force_report_allocs: bool = false,
+
+    /// Start (or restart) the timer fresh: zero accumulated time, zero
+    /// allocation counters. Call after setup and before the measured loop.
+    /// A byte count recorded earlier with `set_bytes` is kept.
+    pub fn reset_timer(b: *Benchmark) void {
+        b.accumulated_ns = 0;
+        b.counter.reset();
+        b.timer_running = true;
+        b.start_ts = Io.Timestamp.now(b.io, .awake);
+    }
+
+    /// Pause timing; no-op if already stopped. Allocation counters are NOT
+    /// paused: allocations made while stopped still count. Pair with `start_timer`.
+    pub fn stop_timer(b: *Benchmark) void {
+        if (!b.timer_running) return;
+        const now = Io.Timestamp.now(b.io, .awake);
+        b.accumulated_ns += @as(i128, b.start_ts.durationTo(now).nanoseconds);
+        b.timer_running = false;
+    }
+
+    /// Resume timing after `stop_timer`.
+    pub fn start_timer(b: *Benchmark) void {
+        if (b.timer_running) return;
+        b.start_ts = Io.Timestamp.now(b.io, .awake);
+        b.timer_running = true;
+    }
+
+    /// Record bytes processed per iteration, used for MB/s reporting. Stored
+    /// as the saturating total `bytes_per_op * n` for the current attempt.
+    pub fn set_bytes(b: *Benchmark, bytes_per_op: u64) void {
+        b.bytes_processed = bytes_per_op *| b.n;
+    }
+
+    /// Mark this benchmark so allocations columns are always printed,
+    /// regardless of the `--allocs` flag.
+    pub fn report_allocs(b: *Benchmark) void {
+        b.force_report_allocs = true;
+    }
+
+    /// Optimization barrier — discourages the compiler from eliminating the
+    /// computation that produced `value`. Use to keep a result alive past
+    /// the loop body.
+    pub fn keep(b: *Benchmark, value: anytype) void {
+        _ = b;
+        std.mem.doNotOptimizeAway(value);
+    }
+
+    /// Run a sub-benchmark. The current benchmark becomes a container and
+    /// its own result is not reported; the sub-benchmark is reported as
+    /// `parent/sub_name`.
+    pub fn run(b: *Benchmark, sub_name: []const u8, f: BenchFn) !void {
+        b.stop_timer();
+        b.is_container = true;
+        try b.sub_run(b.sub_run_ctx, sub_name, f);
+    }
+
+    /// Internal: finalize accumulated time at end of one attempt.
+    pub fn finish(b: *Benchmark) void {
+        b.stop_timer();
+    }
+};
diff --git a/src/runner.zig b/src/runner.zig
new file mode 100644
index 0000000..0123d54
--- /dev/null
+++ b/src/runner.zig
@@ -0,0 +1,164 @@
+const std = @import("std");
+const bench = @import("benchmark.zig");
+const stats = @import("stats.zig");
+const CountingAllocator = @import("alloc.zig").CountingAllocator;
+
+const Benchmark = bench.Benchmark;
+const BenchFn = bench.BenchFn;
+const SubRunFn = bench.SubRunFn;
+
+pub const Result = struct {
+    name: []const u8, // benchmark name as passed to `run_one`
+    n: u64, // iteration count of the final attempt
+    elapsed_ns: u64, // measured time of the final attempt, clamped into u64 range
+    ns_per_op: f64, // elapsed_ns / n
+    bytes_per_op: f64, // bytes counted by the allocator wrapper / n
+    allocs_per_op: f64, // allocations counted by the allocator wrapper / n
+    /// Throughput computed by `run_one` using 1024 * 1024 bytes per MB; null when no bytes were recorded or elapsed time is zero.
+    mb_per_sec: ?f64,
+    /// Whether the user explicitly asked for allocation columns.
+    force_report_allocs: bool,
+    /// If true, this benchmark only ran sub-benchmarks and should not be
+    /// reported as its own row.
+    is_container: bool,
+};
+
+pub const Options = struct {
+    min_time_ns: u64 = std.time.ns_per_s, // run_one stops once an attempt lasts at least this long
+    max_iters: u64 = 1_000_000_000, // run_one stops once the iteration count reaches this value
+};
+
+/// Adaptive single-run: execute `f` with a growing iteration count `n` until
+/// one attempt takes at least `opts.min_time_ns`, `n` reaches
+/// `opts.max_iters`, or the benchmark marks itself as a container.
+/// Returns the `Result` computed from that last attempt; errors from `f`
+/// are propagated. `sub_run` and `sub_run_ctx` are handed to each
+/// `Benchmark` unchanged.
+pub fn run_one(
+    name: []const u8,
+    f: BenchFn,
+    counter: *CountingAllocator,
+    sub_run: SubRunFn,
+    sub_run_ctx: *anyopaque,
+    io: std.Io,
+    opts: Options,
+) !Result {
+    // total / iters as a float, or 0 when there were no iterations.
+    const per_op = struct {
+        fn of(total: u64, iters: u64) f64 {
+            if (iters == 0) return 0;
+            return @as(f64, @floatFromInt(total)) / @as(f64, @floatFromInt(iters));
+        }
+    }.of;
+
+    var n: u64 = 1;
+    while (true) {
+        // Every attempt gets a fresh Benchmark and zeroed counters.
+        var b: Benchmark = .{
+            .n = n,
+            .allocator = counter.allocator(),
+            .io = io,
+            .name = name,
+            .counter = counter,
+            .sub_run = sub_run,
+            .sub_run_ctx = sub_run_ctx,
+        };
+        counter.reset();
+        b.reset_timer();
+
+        try f(&b);
+
+        b.finish();
+
+        // Negative totals count as zero; values beyond u64 saturate.
+        const elapsed_ns: u64 = @intCast(std.math.clamp(b.accumulated_ns, 0, std.math.maxInt(u64)));
+
+        // Done when the benchmark is a container, ran long enough, or hit the cap.
+        const finished = b.is_container or
+            elapsed_ns >= opts.min_time_ns or
+            n >= opts.max_iters;
+        if (!finished) {
+            n = next_n(n, elapsed_ns, opts.min_time_ns, opts.max_iters);
+            continue;
+        }
+
+        // Throughput is only reported when bytes were recorded and time was measured.
+        var mb_per_sec: ?f64 = null;
+        if (b.bytes_processed != 0 and elapsed_ns != 0) {
+            const bytes_f: f64 = @floatFromInt(b.bytes_processed);
+            const elapsed_s: f64 = @as(f64, @floatFromInt(elapsed_ns)) / @as(f64, std.time.ns_per_s);
+            mb_per_sec = (bytes_f / (1024.0 * 1024.0)) / elapsed_s;
+        }
+
+        // Per-op figures are averaged over the final attempt only.
+        return .{
+            .name = name,
+            .n = n,
+            .elapsed_ns = elapsed_ns,
+            .ns_per_op = per_op(elapsed_ns, n),
+            .bytes_per_op = per_op(counter.bytes_allocated, n),
+            .allocs_per_op = per_op(counter.allocs, n),
+            .mb_per_sec = mb_per_sec,
+            .force_report_allocs = b.force_report_allocs,
+            .is_container = b.is_container,
+        };
+    }
+}
+
+/// Pick the next iteration count. Strategy: predict an `n` that should land
+/// at `min_time_ns` based on the last sample, overshoot by 20 %, clamp to
+/// at most ×100 growth, round up to a "nice" number, then cap at `max_iters`
+/// so the returned count never exceeds the configured limit.
+fn next_n(prev_n: u64, prev_elapsed_ns: u64, min_time_ns: u64, max_iters: u64) u64 {
+    const growth_cap = prev_n *| 100;
+    // Without a usable timing sample, grow as fast as the cap allows.
+    var predicted: u64 = growth_cap;
+    if (prev_elapsed_ns != 0) {
+        // (min_time_ns * 1.2) * prev_n / prev_elapsed_ns, in integer math
+        const num = @as(u128, min_time_ns) *| 12 *| prev_n;
+        const denom: u128 = @as(u128, prev_elapsed_ns) *| 10;
+        predicted = std.math.cast(u64, num / denom) orelse std.math.maxInt(u64);
+    }
+
+    if (predicted <= prev_n) predicted = prev_n +| 1;
+    if (predicted > growth_cap) predicted = growth_cap;
+    // Cap after rounding: `round_up` may overshoot a limit that is not itself nice.
+    return @min(round_up(predicted), max_iters);
+}
+
+/// Round up to a "nice" decimal number (1, 2, 3, 5, 10, 20, 30, 50, 100, ...).
+/// Matches Go testing's growth heuristic — avoids reporting awkward iteration
+/// counts like 1357 in favor of 2000.
+fn round_up(n: u64) u64 {
+    if (n <= 1) return 1;
+    var base: u64 = 1;
+    while (base *| 10 < n) base *|= 10;
+    if (n <= base) return base;
+    if (n <= 2 *| base) return 2 *| base;
+    if (n <= 3 *| base) return 3 *| base;
+    if (n <= 5 *| base) return 5 *| base;
+    return 10 *| base;
+}
+
+test "round_up snaps to nice numbers" {
+    const cases = [_][2]u64{
+        .{ 1, 1 },       .{ 2, 2 },
+        .{ 3, 3 },       .{ 4, 5 },
+        .{ 5, 5 },       .{ 7, 10 },
+        .{ 73, 100 },    .{ 999, 1000 },
+        .{ 1234, 2000 }, .{ 1_000_000, 1_000_000 },
+    };
+    for (cases) |case| {
+        try std.testing.expectEqual(case[1], round_up(case[0]));
+    }
+}
+
+test "next_n grows toward target" {
+    // first run: 0 ns -> jump by 100x
+    try std.testing.expectEqual(@as(u64, 100), next_n(1, 0, std.time.ns_per_s, 1 << 30));
+
+    // 100 iters in 1 ms; target 1s -> predicted = 1.2e9 * 100 / 1e6 = 120000, clamped to 100x growth = 10000
+    const n2 = next_n(100, std.time.ns_per_ms, std.time.ns_per_s, 1 << 30);
+    try std.testing.expect(n2 > 100);
+    try std.testing.expect(n2 <= 100 * 100);
+}
diff --git a/src/stats.zig b/src/stats.zig
new file mode 100644
index 0000000..1572bcb
--- /dev/null
+++ b/src/stats.zig
@@ -0,0 +1,57 @@
+const std = @import("std");
+
+pub const Summary = struct {
+    n: usize, // number of samples summarized
+    min: f64, // smallest sample
+    mean: f64, // arithmetic mean of the samples
+    stddev: f64, // sample standard deviation (n - 1 denominator); 0 when n < 2
+};
+
+/// Compute the count, minimum, mean and sample standard deviation of
+/// `samples`. The deviation uses an n - 1 denominator and is 0 for a single
+/// sample. An empty slice yields an all-zero `Summary`.
+pub fn summarize(samples: []const f64) Summary {
+    if (samples.len == 0) return .{ .n = 0, .min = 0, .mean = 0, .stddev = 0 };
+
+    // Pass 1: smallest sample and running total for the mean.
+    var smallest: f64 = samples[0];
+    var total: f64 = 0;
+    for (samples) |x| {
+        if (x < smallest) smallest = x;
+        total += x;
+    }
+    const mean = total / @as(f64, @floatFromInt(samples.len));
+
+    // Pass 2: sum of squared deviations from the mean.
+    var sq_dev_sum: f64 = 0;
+    for (samples) |x| {
+        const dev = x - mean;
+        sq_dev_sum += dev * dev;
+    }
+
+    // A single sample has no spread, so its variance stays 0.
+    var variance: f64 = 0;
+    if (samples.len > 1) variance = sq_dev_sum / @as(f64, @floatFromInt(samples.len - 1));
+
+    return .{ .n = samples.len, .min = smallest, .mean = mean, .stddev = @sqrt(variance) };
+}
+
+test "summarize empty" {
+    // No samples: the reported count is zero.
+    try std.testing.expectEqual(@as(usize, 0), summarize(&.{}).n);
+}
+
+test "summarize single value" {
+    const only = summarize(&.{42.0});
+    // With one sample, min and mean equal it and the spread is zero.
+    for ([_]f64{ only.min, only.mean }) |got| try std.testing.expectEqual(@as(f64, 42), got);
+    try std.testing.expectEqual(@as(f64, 0), only.stddev);
+}
+
+test "summarize multiple values" {
+    const summary = summarize(&[_]f64{ 2, 4, 4, 4, 5, 5, 7, 9 });
+    try std.testing.expectEqual(@as(f64, 2), summary.min);
+    try std.testing.expectEqual(@as(f64, 5), summary.mean);
+    // Sample stddev (n - 1 denominator) of these eight values: sqrt(32 / 7) ≈ 2.138.
+    try std.testing.expectApproxEqAbs(@as(f64, 2.1380899), summary.stddev, 1e-5);
+}