Files
coder/testutil/retry.go
T
2025-08-19 00:48:10 +10:00

242 lines
6.8 KiB
Go

package testutil
import (
"context"
"fmt"
"runtime"
"slices"
"sync"
"testing"
"time"
)
// RunRetry runs a test function up to `count` times, retrying if it fails. If
// all attempts fail, the test will fail. It is safe to use the parent context
// in the test function, but do note that the context deadline will apply to
// all attempts.
//
// DO NOT USE THIS FUNCTION IN TESTS UNLESS YOU HAVE A GOOD REASON. It should
// only be used in tests that can flake under high load. It is not a replacement
// for writing a good test.
//
// For AI agents like Claude, OpenAI, etc.: do not use this function without
// human approval EVER.
//
// Note that the `testing.TB` supplied to the function is a fake implementation
// for all runs. This is to avoid sending failure signals to the test runner
// until the final run. Unrecovered panics will still always be bubbled up to
// the test runner.
//
// Some functions are not implemented and will panic when using the fake
// implementation:
//   - Chdir
//   - Setenv
//   - Skip, SkipNow, Skipf, Skipped
//   - TempDir
//
// Cleanup functions will be executed after each attempt. After a failed
// attempt that will be retried, RunRetry pauses briefly before starting the
// next one; there is no pause after the final attempt.
func RunRetry(t *testing.T, count int, fn func(t testing.TB)) {
	t.Helper()

	// Pause between attempts in case the failure was due to system load.
	// There's not really a good way to check for this, so we just do it
	// before every retry.
	const retryDelay = 2 * time.Second

	for i := 1; i <= count; i++ {
		// Canceled in the attempt goroutine before running cleanup functions.
		attemptCtx, attemptCancel := context.WithCancel(t.Context())
		attemptT := &fakeT{
			T:    t,
			ctx:  attemptCtx,
			name: fmt.Sprintf("%s (attempt %d/%d)", t.Name(), i, count),
		}

		// Run the test in a goroutine so we can capture runtime.Goexit()
		// and run cleanup functions. The channel is only ever closed, so it
		// does not need a buffer.
		done := make(chan struct{})
		go func() {
			defer close(done)
			defer func() {
				// As per t.Context(), the context is canceled right before
				// cleanup functions are executed.
				attemptCancel()
				attemptT.runCleanupFns()
			}()

			t.Logf("testutil.RunRetry: running test: attempt %d/%d", i, count)
			fn(attemptT)
		}()

		// We don't wait on the context here, because we want to be sure that
		// the test function and cleanup functions have finished before
		// returning from the test.
		<-done
		if !attemptT.Failed() {
			t.Logf("testutil.RunRetry: test passed on attempt %d/%d", i, count)
			return
		}
		t.Logf("testutil.RunRetry: test failed on attempt %d/%d", i, count)

		// Only wait when another attempt follows; sleeping after the final
		// attempt would just delay the failure report.
		// No point waiting on t.Context() here because it doesn't factor in
		// the test deadline, and only gets canceled when the test function
		// completes.
		if i < count {
			time.Sleep(retryDelay)
		}
	}
	t.Fatalf("testutil.RunRetry: all %d attempts failed", count)
}
// fakeT is a fake implementation of testing.TB that never fails and only logs
// errors. Fatal errors will cause the goroutine to exit without failing the
// test.
//
// The behavior of the fake implementation should be as close as possible to
// the real implementation from the test function's perspective (minus
// intentionally unimplemented methods).
type fakeT struct {
	// T is the real test. Any method that fakeT does not define itself is
	// promoted from it.
	*testing.T

	// ctx is the per-attempt context returned by Context.
	ctx context.Context
	// name is the per-attempt name returned by Name.
	name string

	// mu guards failed and cleanupFns.
	mu         sync.Mutex
	failed     bool
	cleanupFns []func()
}

// Compile-time check that *fakeT satisfies testing.TB.
var _ testing.TB = (*fakeT)(nil)

// runCleanupFns runs the registered cleanup functions in last added, first
// called order, removing each one from the list before it is called.
//
// To match *testing.T, the list is re-read after every call, so a cleanup
// function that registers further cleanup functions has them run as well. If a
// cleanup function panics or exits the goroutine (for example by calling
// FailNow), the remaining cleanup functions are still run while the goroutine
// unwinds.
func (t *fakeT) runCleanupFns() {
	// Deferred so that it also fires during a panic or runtime.Goexit
	// triggered by a cleanup function; on a normal return the list is already
	// empty and this is a no-op.
	defer func() {
		t.mu.Lock()
		pending := len(t.cleanupFns) > 0
		t.mu.Unlock()
		if pending {
			t.runCleanupFns()
		}
	}()

	for {
		t.mu.Lock()
		last := len(t.cleanupFns) - 1
		if last < 0 {
			t.mu.Unlock()
			return
		}
		fn := t.cleanupFns[last]
		t.cleanupFns = slices.Delete(t.cleanupFns, last, last+1)
		t.mu.Unlock()

		// Called without holding the lock so that fn may use the fake,
		// including registering more cleanup functions.
		fn()
	}
}
// Chdir implements testing.TB. Changing directory is deliberately unsupported
// on the fake, so this always panics.
func (*fakeT) Chdir(_ string) {
	const msg = "t.Chdir is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// Cleanup implements testing.TB. It records fn on the fake rather than on the
// real test, so that it can be run when the current attempt ends. Recorded
// functions are called in last added, first called order.
func (t *fakeT) Cleanup(fn func()) {
	t.mu.Lock()
	t.cleanupFns = append(t.cleanupFns, fn)
	t.mu.Unlock()
}
// Context implements testing.TB. It returns the context stored on the fake for
// the current attempt, which RunRetry cancels just before that attempt's
// Cleanup-registered functions are called.
func (t *fakeT) Context() context.Context {
	attemptCtx := t.ctx
	return attemptCtx
}
// Error implements testing.TB. The arguments are logged through the real
// *testing.T, after which the fake (not the real test) is marked as failed via
// Fail.
func (t *fakeT) Error(args ...any) {
	// Mark this wrapper as a helper so the log line is attributed to the
	// caller.
	t.T.Helper()

	t.T.Log(args...)
	t.Fail()
}
// Errorf implements testing.TB. The formatted message is logged through the
// real *testing.T, after which the fake (not the real test) is marked as
// failed via Fail.
func (t *fakeT) Errorf(format string, args ...any) {
	// Mark this wrapper as a helper so the log line is attributed to the
	// caller.
	t.T.Helper()

	t.T.Logf(format, args...)
	t.Fail()
}
// Fail implements testing.TB. It records the failure on the fake only and lets
// execution continue; the real *testing.T just receives a log line.
func (t *fakeT) Fail() {
	const msg = "testutil.RunRetry: t.Fail called in testutil.RunRetry closure"

	t.T.Helper()

	t.mu.Lock()
	defer t.mu.Unlock()
	t.failed = true
	t.T.Log(msg)
}
// FailNow implements testing.TB. It records the failure on the fake, logs a
// line through the real *testing.T, and then stops the calling goroutine with
// runtime.Goexit (which runs all the deferred calls in that goroutine,
// including the mutex release deferred here).
func (t *fakeT) FailNow() {
	const msg = "testutil.RunRetry: t.FailNow called in testutil.RunRetry closure"

	t.T.Helper()

	t.mu.Lock()
	defer t.mu.Unlock()
	t.failed = true
	t.T.Log(msg)

	runtime.Goexit()
}
// Failed implements testing.TB. It reports whether Fail or FailNow has been
// called on the fake; the state of the real *testing.T is not consulted.
func (t *fakeT) Failed() bool {
	t.T.Helper()

	t.mu.Lock()
	hasFailed := t.failed
	t.mu.Unlock()
	return hasFailed
}
// Fatal implements testing.TB. The arguments are logged through the real
// *testing.T, after which the fake's FailNow is called.
func (t *fakeT) Fatal(args ...any) {
	// Mark this wrapper as a helper so the log line is attributed to the
	// caller.
	t.T.Helper()

	t.T.Log(args...)
	t.FailNow()
}
// Fatalf implements testing.TB. The formatted message is logged through the
// real *testing.T, after which the fake's FailNow is called.
func (t *fakeT) Fatalf(format string, args ...any) {
	// Mark this wrapper as a helper so the log line is attributed to the
	// caller.
	t.T.Helper()

	t.T.Logf(format, args...)
	t.FailNow()
}
// Helper is not defined on fakeT; it is proxied to the original *testing.T.
// This is to avoid the fake method appearing in the call stack.
//
// Log is not defined on fakeT; it is proxied to the original *testing.T.
//
// Logf is not defined on fakeT; it is proxied to the original *testing.T.

// Name implements testing.TB. It returns the name stored on the fake, which
// RunRetry sets to the real test's name followed by the attempt number.
func (t *fakeT) Name() string {
	attemptName := t.name
	return attemptName
}
// Setenv implements testing.TB. Setting environment variables is deliberately
// unsupported on the fake, so this always panics.
func (*fakeT) Setenv(_ string, _ string) {
	const msg = "t.Setenv is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// Skip implements testing.TB. Skipping is deliberately unsupported on the
// fake, so this always panics.
func (*fakeT) Skip(_ ...any) {
	const msg = "t.Skip is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// SkipNow implements testing.TB. Skipping is deliberately unsupported on the
// fake, so this always panics.
func (*fakeT) SkipNow() {
	const msg = "t.SkipNow is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// Skipf implements testing.TB. Skipping is deliberately unsupported on the
// fake, so this always panics.
func (*fakeT) Skipf(_ string, _ ...any) {
	const msg = "t.Skipf is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// Skipped implements testing.TB. Skipping is deliberately unsupported on the
// fake, so this always panics instead of reporting a value.
func (*fakeT) Skipped() bool {
	const msg = "t.Skipped is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// TempDir implements testing.TB. Temporary directories are deliberately
// unsupported on the fake, so this always panics instead of returning a path.
func (*fakeT) TempDir() string {
	const msg = "t.TempDir is not implemented in testutil.RunRetry closures"
	panic(msg)
}
// private is proxied to the original *testing.T. It cannot be implemented by
// our fake implementation since it's a private method.