Files
dbx/cluster.go
Aleksey Shakhmatov 2c9af28548
Some checks failed
CI / test (push) Failing after 13s
Add production features: slog adapter, scan helpers, slow query logging, pool stats, tracer passthrough, test tx isolation
- slog.go: SlogLogger adapts *slog.Logger to dbx.Logger interface
- scan.go: Collect[T] and CollectOne[T] generic helpers using pgx.RowToStructByName
- cluster.go: slow query logging via Config.SlowQueryThreshold (Warn level in queryEnd)
- stats.go: PoolStats with Cluster.Stats() aggregating pool stats across all nodes
- config.go/node.go: NodeConfig.Tracer passthrough for pgx.QueryTracer (OpenTelemetry)
- options.go: WithSlowQueryThreshold and WithTracer functional options
- dbxtest/tx.go: RunInTx runs callback in always-rolled-back transaction for test isolation

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-23 00:19:26 +03:00

257 lines
6.6 KiB
Go

package dbx
import (
"context"
"sync/atomic"
"time"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgconn"
)
// Cluster manages a master node and optional replicas.
// Write operations go to master; read operations are distributed across replicas
// with automatic fallback to master.
type Cluster struct {
master *Node // primary node; all writes and transaction begins target it
replicas []*Node // read-only nodes; may be empty, in which case reads hit master
all []*Node // master + replicas for health checker
balancer Balancer // selects the next replica for reads (round-robin by default)
retrier *retrier // retry policy wrapped around query execution
health *healthChecker // background prober over all nodes; started in NewCluster
logger Logger // destination for slow-query warnings in queryEnd
metrics *MetricsHook // optional per-query start/end callbacks; may be nil
slowQueryThreshold time.Duration // queries taking >= this are logged at Warn; 0 disables
closed atomic.Bool // flipped once by Close; every operation checks it first
}
// NewCluster creates a Cluster, connecting to all configured nodes.
// The master connection is established first; if any replica fails to
// connect, every pool opened so far is closed and the error is returned.
// On success the background health checker is already running.
func NewCluster(ctx context.Context, cfg Config) (*Cluster, error) {
	cfg.defaults()

	master, err := connectNode(ctx, cfg.Master)
	if err != nil {
		return nil, err
	}

	replicas := make([]*Node, 0, len(cfg.Replicas))
	for _, replicaCfg := range cfg.Replicas {
		replica, connErr := connectNode(ctx, replicaCfg)
		if connErr != nil {
			// Roll back: close everything we managed to open.
			master.Close()
			for _, opened := range replicas {
				opened.Close()
			}
			return nil, connErr
		}
		replicas = append(replicas, replica)
	}

	// master first, then replicas — the health checker watches them all.
	all := append(append(make([]*Node, 0, 1+len(replicas)), master), replicas...)

	c := &Cluster{
		master:             master,
		replicas:           replicas,
		all:                all,
		balancer:           NewRoundRobinBalancer(),
		retrier:            newRetrier(cfg.Retry, cfg.Logger, cfg.Metrics),
		logger:             cfg.Logger,
		metrics:            cfg.Metrics,
		slowQueryThreshold: cfg.SlowQueryThreshold,
	}
	c.health = newHealthChecker(all, cfg.HealthCheck, cfg.Logger, cfg.Metrics)
	c.health.start()
	return c, nil
}
// Master returns the master node as a DB.
func (c *Cluster) Master() DB {
	return c.master
}
// Replica returns a healthy replica as a DB, or falls back to master.
func (c *Cluster) Replica() DB {
	next := c.balancer.Next(c.replicas)
	if next == nil {
		// No healthy replica available (or none configured).
		return c.master
	}
	return next
}
// Close shuts down the health checker and closes all connection pools.
// Only the first call has any effect; subsequent calls are no-ops.
func (c *Cluster) Close() {
	if c.closed.CompareAndSwap(false, true) {
		c.health.shutdown()
		for _, node := range c.all {
			node.Close()
		}
	}
}
// Ping checks connectivity to the master node.
// It returns ErrClusterClosed after Close has been called.
func (c *Cluster) Ping(ctx context.Context) error {
	if !c.closed.Load() {
		return c.master.Ping(ctx)
	}
	return ErrClusterClosed
}
// --- Write operations (always master) ---

// Exec runs a statement on the master with retry and query instrumentation.
// It returns ErrClusterClosed after Close has been called.
func (c *Cluster) Exec(ctx context.Context, sql string, args ...any) (pgconn.CommandTag, error) {
	if c.closed.Load() {
		return pgconn.CommandTag{}, ErrClusterClosed
	}
	var tag pgconn.CommandTag
	attempt := func(ctx context.Context, n *Node) error {
		began := time.Now()
		c.queryStart(ctx, n.name, sql)
		var execErr error
		tag, execErr = n.Exec(ctx, sql, args...)
		c.queryEnd(ctx, n.name, sql, execErr, time.Since(began))
		return execErr
	}
	err := c.retrier.do(ctx, []*Node{c.master}, attempt)
	return tag, err
}
// Query runs a query on the master with retry and query instrumentation.
// It returns ErrClusterClosed after Close has been called.
func (c *Cluster) Query(ctx context.Context, sql string, args ...any) (pgx.Rows, error) {
	if c.closed.Load() {
		return nil, ErrClusterClosed
	}
	var rows pgx.Rows
	attempt := func(ctx context.Context, n *Node) error {
		began := time.Now()
		c.queryStart(ctx, n.name, sql)
		var queryErr error
		rows, queryErr = n.Query(ctx, sql, args...)
		c.queryEnd(ctx, n.name, sql, queryErr, time.Since(began))
		return queryErr
	}
	err := c.retrier.do(ctx, []*Node{c.master}, attempt)
	return rows, err
}
// QueryRow runs a single-row query on the master with instrumentation.
// Errors surface from Scan on the returned row, matching pgx semantics;
// after Close the row's Scan reports ErrClusterClosed.
func (c *Cluster) QueryRow(ctx context.Context, sql string, args ...any) pgx.Row {
	if c.closed.Load() {
		return errRow{err: ErrClusterClosed}
	}
	var row pgx.Row
	// QueryRow itself never fails (errors are deferred to Scan), so the
	// retrier callback always reports success and its error is ignored.
	_ = c.retrier.do(ctx, []*Node{c.master}, func(ctx context.Context, n *Node) error {
		began := time.Now()
		c.queryStart(ctx, n.name, sql)
		row = n.QueryRow(ctx, sql, args...)
		c.queryEnd(ctx, n.name, sql, nil, time.Since(began))
		return nil
	})
	return row
}
// SendBatch sends a batch of queries to the master node.
//
// Like every other cluster operation, it honors Close: once the cluster
// is closed, the returned BatchResults reports ErrClusterClosed from all
// of its methods instead of touching the (already closed) pool. This
// matches the closed-check behavior of Exec, Query, Begin, etc.
func (c *Cluster) SendBatch(ctx context.Context, b *pgx.Batch) pgx.BatchResults {
	if c.closed.Load() {
		return errBatchResults{err: ErrClusterClosed}
	}
	return c.master.SendBatch(ctx, b)
}

// errBatchResults implements pgx.BatchResults for error cases,
// mirroring the errRow pattern used by QueryRow.
type errBatchResults struct {
	err error
}

// Exec reports the stored error with an empty command tag.
func (r errBatchResults) Exec() (pgconn.CommandTag, error) { return pgconn.CommandTag{}, r.err }

// Query reports the stored error with nil rows.
func (r errBatchResults) Query() (pgx.Rows, error) { return nil, r.err }

// QueryRow defers the stored error to the returned row's Scan.
func (r errBatchResults) QueryRow() pgx.Row { return errRow{err: r.err} }

// Close reports the stored error.
func (r errBatchResults) Close() error { return r.err }
// CopyFrom bulk-copies rows into a table on the master node.
// It returns ErrClusterClosed after Close has been called.
func (c *Cluster) CopyFrom(ctx context.Context, tableName pgx.Identifier, columnNames []string, rowSrc pgx.CopyFromSource) (int64, error) {
	if !c.closed.Load() {
		return c.master.CopyFrom(ctx, tableName, columnNames, rowSrc)
	}
	return 0, ErrClusterClosed
}
// Begin starts a transaction on the master node with default options.
// It returns ErrClusterClosed after Close has been called.
func (c *Cluster) Begin(ctx context.Context) (pgx.Tx, error) {
	if !c.closed.Load() {
		return c.master.Begin(ctx)
	}
	return nil, ErrClusterClosed
}
// BeginTx starts a transaction on the master node with explicit options.
// It returns ErrClusterClosed after Close has been called.
func (c *Cluster) BeginTx(ctx context.Context, txOptions pgx.TxOptions) (pgx.Tx, error) {
	if !c.closed.Load() {
		return c.master.BeginTx(ctx, txOptions)
	}
	return nil, ErrClusterClosed
}
// --- Read operations (replicas with master fallback) ---

// ReadQuery executes a read query on a replica, falling back to master if no replicas are healthy.
func (c *Cluster) ReadQuery(ctx context.Context, sql string, args ...any) (pgx.Rows, error) {
	if c.closed.Load() {
		return nil, ErrClusterClosed
	}
	candidates := c.readNodes()
	var rows pgx.Rows
	attempt := func(ctx context.Context, n *Node) error {
		began := time.Now()
		c.queryStart(ctx, n.name, sql)
		var queryErr error
		rows, queryErr = n.Query(ctx, sql, args...)
		c.queryEnd(ctx, n.name, sql, queryErr, time.Since(began))
		return queryErr
	}
	err := c.retrier.do(ctx, candidates, attempt)
	return rows, err
}
// ReadQueryRow executes a read query returning a single row, using replicas with master fallback.
// Errors surface from Scan on the returned row; after Close the row's
// Scan reports ErrClusterClosed.
func (c *Cluster) ReadQueryRow(ctx context.Context, sql string, args ...any) pgx.Row {
	if c.closed.Load() {
		return errRow{err: ErrClusterClosed}
	}
	candidates := c.readNodes()
	var row pgx.Row
	// QueryRow defers errors to Scan, so the retrier callback never fails.
	_ = c.retrier.do(ctx, candidates, func(ctx context.Context, n *Node) error {
		began := time.Now()
		c.queryStart(ctx, n.name, sql)
		row = n.QueryRow(ctx, sql, args...)
		c.queryEnd(ctx, n.name, sql, nil, time.Since(began))
		return nil
	})
	return row
}
// readNodes returns replicas followed by master for fallback ordering.
func (c *Cluster) readNodes() []*Node {
	if len(c.replicas) == 0 {
		return []*Node{c.master}
	}
	ordered := make([]*Node, len(c.replicas), len(c.replicas)+1)
	copy(ordered, c.replicas)
	return append(ordered, c.master)
}
// --- metrics helpers ---

// queryStart invokes the optional per-query start hook, if one is configured.
func (c *Cluster) queryStart(ctx context.Context, node, sql string) {
	if c.metrics == nil || c.metrics.OnQueryStart == nil {
		return
	}
	c.metrics.OnQueryStart(ctx, node, sql)
}
// queryEnd invokes the optional per-query end hook and, when a positive
// slow-query threshold is configured and the query met or exceeded it,
// logs a Warn-level "slow query" line with node, duration, and SQL.
//
// NOTE(review): assumes c.logger is never nil here — presumably
// cfg.defaults() installs a fallback Logger; verify.
func (c *Cluster) queryEnd(ctx context.Context, node, sql string, err error, d time.Duration) {
	if c.metrics != nil && c.metrics.OnQueryEnd != nil {
		c.metrics.OnQueryEnd(ctx, node, sql, err, d)
	}
	isSlow := c.slowQueryThreshold > 0 && d >= c.slowQueryThreshold
	if isSlow {
		c.logger.Warn(ctx, "dbx: slow query",
			"node", node,
			"duration", d,
			"sql", sql,
		)
	}
}
// errRow implements pgx.Row for error cases.
type errRow struct {
err error
}
func (r errRow) Scan(...any) error {
return r.err
}
// Compile-time check that *Cluster implements DB; a build failure here
// means a method of the DB interface is missing or has the wrong signature.
var _ DB = (*Cluster)(nil)