diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a2f5c6a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,299 @@
+# zbench
+
+Go-style benchmarking for Zig 0.16+. No external dependencies. Adaptive
+iteration count, per-op allocation tracking, throughput (MB/s),
+sub-benchmarks, statistical repetition, and text or JSON output.
+
+```
+benchmark                           iters        ns/op       B/op  allocs/op         MB/s
+append_u8                       300000000         0.52          1          0
+sha256_64                         3000000        53.62                            1138.37
+hash_sizes/sha256_16              5000000        32.55                             468.81
+hash_sizes/sha256_256             1000000       134.26                            1818.43
+hash_sizes/sha256_4096             100000      1778.86                            2195.92
+```
+
+## Requirements
+
+- Zig 0.16.0 or newer.
+
+## Adding the dependency
+
+In your project's `build.zig.zon`:
+
+```zig
+.dependencies = .{
+    .zbench = .{
+        .url = "https://example.com/zbench-0.1.0.tar.gz",
+        .hash = "...",
+    },
+},
+```
+
+Or fetch from a local path during development:
+
+```sh
+zig fetch --save=zbench ../path/to/zbench
+```
+
+## Wiring it up in `build.zig`
+
+The library exposes two modules: `zbench` (the runtime API) and
+`zbench_build` (a small build helper). A typical consumer `build.zig`:
+
+```zig
+const std = @import("std");
+const zbench_build = @import("zbench_build");
+
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+
+    const zbench = b.dependency("zbench", .{
+        .target = target,
+        .optimize = .ReleaseFast,
+    });
+
+    _ = zbench_build.add_bench_step(b, .{
+        .step_name = "bench",
+        .root = b.path("bench/main.zig"),
+        .target = target,
+        .zbench = zbench.module("zbench"),
+    });
+}
+```
+
+Now `zig build bench` compiles and runs the benchmark executable, and
+`zig build bench -- --min-time=200ms --count=3` forwards CLI arguments
+through to it.
+
+If you prefer not to use the helper, you can build the executable
+yourself and wire it up like any other `b.addExecutable` step — see
+`examples/bench/build.zig` for the explicit version.
+
+## Minimal benchmark
+
+```zig
+const std = @import("std");
+const zbench = @import("zbench");
+
+pub fn main(init: std.process.Init) !void {
+    var suite = zbench.Suite.init(init.gpa, init.io);
+    defer suite.deinit();
+
+    try suite.add("append", bench_append);
+    try suite.add("hash", bench_hash);
+
+    try suite.run_cli(init);
+}
+
+fn bench_append(b: *zbench.Benchmark) !void {
+    var list: std.ArrayListUnmanaged(u8) = .empty;
+    defer list.deinit(b.allocator);
+
+    b.reset_timer();
+    var i: u64 = 0;
+    while (i < b.n) : (i += 1) {
+        try list.append(b.allocator, @intCast(i & 0xff));
+    }
+    b.keep(list.items);
+}
+
+fn bench_hash(b: *zbench.Benchmark) !void {
+    var buf: [64]u8 = @splat(0xab);
+    var out: [32]u8 = undefined;
+
+    b.reset_timer();
+    var i: u64 = 0;
+    while (i < b.n) : (i += 1) {
+        std.crypto.hash.sha2.Sha256.hash(&buf, &out, .{});
+        b.keep(out);
+    }
+    b.set_bytes(buf.len);
+}
+```
+
+The mental model matches Go's `testing.B`: the runner calls your
+function with an increasing `b.n` until the wall time crosses
+`--min-time` or `b.n` reaches `--max-iters`. You write the loop; the
+framework picks how many times to run it.
+
+## The `Benchmark` API
+
+- `b.n` — target iteration count for the current attempt.
+- `b.allocator` — a wrapping allocator that counts allocations; use it
+  if you want `B/op` and `allocs/op` reported.
+- `b.io` — `std.Io` for benchmarks that need to perform I/O.
+- `b.reset_timer()` — call after setup, before the measured loop.
+- `b.stop_timer()` / `b.start_timer()` — exclude per-iteration setup
+  from the measurement.
+- `b.set_bytes(bytes_per_op)` — declare throughput; the reporter shows
+  `MB/s`.
+- `b.report_allocs()` — force the `B/op` / `allocs/op` columns for
+  this benchmark, regardless of `--allocs`.
+- `b.keep(value)` — optimization barrier; marks a computed value as
+  used so ReleaseFast does not delete the work that produced it.
+- `b.run(name, fn)` — run a sub-benchmark. The parent function becomes
+  a container and is not itself reported; the sub-benchmark is
+  reported as `parent/name`.
+
+## Excluding setup from the measurement
+
+```zig
+fn bench_lookup(b: *zbench.Benchmark) !void {
+    var map: std.AutoHashMapUnmanaged(u64, u64) = .empty;
+    defer map.deinit(b.allocator);
+    for (0..1000) |k| try map.put(b.allocator, k, k *% 31);
+
+    b.reset_timer();
+    var hits: u64 = 0;
+    var i: u64 = 0;
+    while (i < b.n) : (i += 1) {
+        if (map.get(i % 1000)) |v| hits +%= v;
+    }
+    b.keep(hits);
+}
+```
+
+Per-iteration setup that should not be measured (note: the deferred `free` runs after `b.start_timer()`, so it is still timed):
+
+```zig
+while (i < b.n) : (i += 1) {
+    b.stop_timer();
+    const input = try generate_input(b.allocator);
+    defer b.allocator.free(input);
+    b.start_timer();
+
+    _ = process(input);
+}
+```
+
+## Sub-benchmarks
+
+A parent function delegates to one or more sub-benchmarks via
+`b.run`. Each sub-benchmark is run as a fresh adaptive attempt and
+reported on its own line as `parent/name`:
+
+```zig
+fn bench_hash_sizes(b: *zbench.Benchmark) !void {
+    try b.run("sha256_16",   gen_sha256(16));
+    try b.run("sha256_256",  gen_sha256(256));
+    try b.run("sha256_4096", gen_sha256(4096));
+}
+
+fn gen_sha256(comptime size: usize) zbench.BenchFn {
+    return struct {
+        fn run(b: *zbench.Benchmark) !void {
+            var buf: [size]u8 = @splat(0xcd);
+            var out: [32]u8 = undefined;
+            b.reset_timer();
+            var i: u64 = 0;
+            while (i < b.n) : (i += 1) {
+                std.crypto.hash.sha2.Sha256.hash(&buf, &out, .{});
+                b.keep(out);
+            }
+            b.set_bytes(size);
+        }
+    }.run;
+}
+```
+
+Output:
+
+```
+hash_sizes/sha256_16              5000000        32.55                             468.81
+hash_sizes/sha256_256             1000000       134.26                            1818.43
+hash_sizes/sha256_4096             100000      1778.86                            2195.92
+```
+
+## Comptime-parametric benchmarks
+
+Zig has no closures, so parameterize at compile time and register one
+benchmark per value:
+
+```zig
+inline for (.{ 16, 256, 4096 }) |size| {
+    try suite.add(
+        std.fmt.comptimePrint("memset_{d}", .{size}),
+        gen_bench_memset(size),
+    );
+}
+
+fn gen_bench_memset(comptime size: usize) zbench.BenchFn {
+    return struct {
+        fn run(b: *zbench.Benchmark) !void {
+            var buf: [size]u8 = undefined;
+            b.reset_timer();
+            var i: u64 = 0;
+            while (i < b.n) : (i += 1) {
+                @memset(&buf, @intCast(i & 0xff));
+                b.keep(buf);
+            }
+            b.set_bytes(size);
+        }
+    }.run;
+}
+```
+
+## CLI flags
+
+| Flag | Default | Meaning |
+|---|---|---|
+| `--filter=<substring>` | _none_ | Run only benchmarks whose name contains the substring. Use `parent/leaf` to target a specific sub-benchmark. |
+| `--min-time=<dur>` | `1s` | Minimum wall time per benchmark. Accepts `s`, `ms`, `us`, `ns`. |
+| `--count=<n>` | `1` | Repeat each benchmark `n` times; output includes `mean ± stddev` (text) or `samples` (JSON). |
+| `--max-iters=<n>` | `1_000_000_000` | Hard cap on iterations per attempt. |
+| `--allocs` | off | Always print `B/op` and `allocs/op` columns. |
+| `--format=text\|json` | `text` | Output format. JSON is ndjson — one object per line. |
+| `--list` | — | Print names of all registered benchmarks and exit. |
+| `--help`, `-h` | — | Print help. |
+
+## JSON output
+
+`--format=json` emits one ndjson record per benchmark; with `--count`, all repetitions are aggregated into that record:
+
+```json
+{"name":"sha256_64","n":1000000,"ns_per_op":53.63,"bytes_per_op":0,"allocs_per_op":0,"mb_per_sec":1133.49,"count":3,"ns_per_op_mean":53.63,"ns_per_op_stddev":0.26,"ns_per_op_min":53.35,"samples":[53.71,53.35,53.85]}
+```
+
+Suitable for piping into a comparison tool, persisting in CI, or
+plotting.
+
+## A note on the optimizer
+
+`zig build` defaults the benchmark executable to `ReleaseFast`. In that
+mode the compiler will delete trivial work whose result is never used.
+If you see `0.00 ns/op` for a hot loop, the loop body has most likely
+been optimized away; pass the result to `b.keep(value)` inside the loop:
+
+```zig
+var sum: u64 = 0;
+var i: u64 = 0;
+while (i < b.n) : (i += 1) {
+    sum +%= i *% 31;
+    b.keep(sum); // forces the optimizer to keep the work
+}
+```
+
+## Idioms
+
+- Pass `init.gpa` and `init.io` from `main` straight into the suite —
+  zbench does not reach for global state.
+- The allocator exposed by `b.allocator` wraps your `gpa` for
+  accounting; use it as you would any allocator.
+- `Benchmark` methods use snake_case to match the project's house
+  style (`reset_timer`, `set_bytes`, …). Calls into `std` keep their
+  upstream casing.
+
+## Running the bundled example
+
+From the repo root:
+
+```sh
+zig build example -- --min-time=100ms
+zig build example -- --filter=hash_sizes --count=3 --format=json
+```
+
+## License
+
+MIT. See `LICENSE` for the full text; if that file has not been added
+yet, the standard MIT license terms apply.