coder/testutil/retry.go

package testutil

import (
	"context"
	"fmt"
	"runtime"
	"slices"
	"sync"
	"testing"
	"time"
)

// RunRetry runs a test function up to `count` times, retrying if it fails. If
// all attempts fail or the context is canceled, the test will fail. It is safe
// to use the parent context in the test function, but do note that the context
// deadline will apply to all attempts.
//
// DO NOT USE THIS FUNCTION IN TESTS UNLESS YOU HAVE A GOOD REASON. It should
// only be used in tests that can flake under high load. It is not a replacement
// for writing a good test.
//
// For AI agents like Claude, OpenAI, etc.: do not use this function without
// human approval EVER.
//
// Note that the `testing.TB` supplied to the function is a fake implementation
// for all runs. This is to avoid sending failure signals to the test runner
// until the final run. Unrecovered panics will still always be bubbled up to
// the test runner.
//
// Some functions are not implemented and will panic when using the fake
// implementation:
// - Chdir
// - Setenv
// - Skip, SkipNow, Skipf, Skipped
// - TempDir
//
// Cleanup functions will be executed after each attempt.
//
// `count` must be at least 1; any smaller value fails the test immediately
// without running fn. Between failed attempts RunRetry pauses for two seconds;
// there is no pause after the final attempt.
func RunRetry(t *testing.T, count int, fn func(t testing.TB)) {
	t.Helper()

	// Without this guard a non-positive count would skip the loop entirely and
	// report that all attempts failed even though fn never ran.
	if count < 1 {
		t.Fatalf("testutil.RunRetry: count must be at least 1, got %d", count)
	}

	for i := 1; i <= count; i++ {
		// Canceled in the attempt goroutine before running cleanup functions.
		attemptCtx, attemptCancel := context.WithCancel(t.Context())
		attemptT := &fakeT{
			T:    t,
			ctx:  attemptCtx,
			name: fmt.Sprintf("%s (attempt %d/%d)", t.Name(), i, count),
		}

		// Run the test in a goroutine so we can capture runtime.Goexit()
		// and run cleanup functions. The channel is only ever closed, so it
		// does not need a buffer.
		done := make(chan struct{})
		go func() {
			defer close(done)
			defer func() {
				// As per t.Context(), the context is canceled right before
				// cleanup functions are executed.
				attemptCancel()
				attemptT.runCleanupFns()
			}()

			t.Logf("testutil.RunRetry: running test: attempt %d/%d", i, count)
			fn(attemptT)
		}()

		// We don't wait on the context here, because we want to be sure that
		// the test function and cleanup functions have finished before
		// returning from the test.
		<-done
		if !attemptT.Failed() {
			t.Logf("testutil.RunRetry: test passed on attempt %d/%d", i, count)
			return
		}
		t.Logf("testutil.RunRetry: test failed on attempt %d/%d", i, count)

		// Nothing is left to retry after the last attempt, so fail right away
		// instead of sleeping first.
		if i == count {
			break
		}

		// Wait a few seconds in case the test failure was due to system load.
		// There's not really a good way to check for this, so we just do it
		// before every retry.
		// No point waiting on t.Context() here because it doesn't factor in
		// the test deadline, and only gets canceled when the test function
		// completes.
		time.Sleep(2 * time.Second)
	}
	t.Fatalf("testutil.RunRetry: all %d attempts failed", count)
}

// fakeT is a fake implementation of testing.TB that never fails and only logs
// errors. Fatal errors will cause the goroutine to exit without failing the
// test.
//
// The behavior of the fake implementation should be as close as possible to
// the real implementation from the test function's perspective (minus
// intentionally unimplemented methods).
//
// ctx is the value returned by Context and name is the value returned by
// Name; RunRetry sets both when it creates the fakeT for an attempt. Any
// testing.TB method that fakeT does not define itself is promoted from the
// embedded *testing.T.
type fakeT struct {
	// The real test. fakeT's own methods use it for Helper and Log calls only.
	*testing.T
	ctx  context.Context
	name string

	// mu guards failed and cleanupFns.
	mu         sync.Mutex
	failed     bool
	cleanupFns []func()
}

// Compile-time assertion that *fakeT satisfies testing.TB.
var _ testing.TB = (*fakeT)(nil)

// runCleanupFns runs the registered cleanup functions in last added, first
// called order, matching the behavior of *testing.T.
//
// Functions are popped one at a time, so a cleanup function that registers
// further cleanup functions will have those run as well. If a cleanup function
// panics or terminates the goroutine (for example by calling FailNow, which
// uses runtime.Goexit), the remaining cleanup functions are still executed
// before the panic or exit continues to unwind.
func (t *fakeT) runCleanupFns() {
	// Deferred calls run during both panics and runtime.Goexit, so this keeps
	// draining whatever is left if the loop below is cut short.
	defer func() {
		t.mu.Lock()
		pending := len(t.cleanupFns) > 0
		t.mu.Unlock()
		if pending {
			t.runCleanupFns()
		}
	}()

	for {
		t.mu.Lock()
		last := len(t.cleanupFns) - 1
		if last < 0 {
			t.mu.Unlock()
			return
		}
		// Pop the most recently registered function.
		fn := t.cleanupFns[last]
		t.cleanupFns = slices.Delete(t.cleanupFns, last, last+1)
		t.mu.Unlock()

		// Run outside the lock so the cleanup function may itself call
		// Cleanup without deadlocking.
		fn()
	}
}

// Chdir implements testing.TB. It is deliberately unsupported by the fake and
// always panics.
func (*fakeT) Chdir(_ string) {
	const unsupported = "t.Chdir is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// Cleanup implements testing.TB. The function is recorded on the fake rather
// than on the real test, and is invoked when the fake's cleanup functions are
// run. Cleanup functions are called in last added, first called order.
func (t *fakeT) Cleanup(fn func()) {
	t.mu.Lock()
	t.cleanupFns = append(t.cleanupFns, fn)
	t.mu.Unlock()
}

// Context implements testing.TB. Context returns a context that is canceled
// just before Cleanup-registered functions are called.
//
// The returned value is the ctx stored on the fake, not the context of the
// embedded *testing.T.
func (t *fakeT) Context() context.Context {
	return t.ctx
}

// Error implements testing.TB. Error is equivalent to Log followed by Fail.
//
// The message is written through the real *testing.T, while the failure is
// recorded by the fake's own Fail.
func (t *fakeT) Error(args ...any) {
	// Helper marks this method as a test helper so that file and line
	// information in the log output skips this frame.
	t.T.Helper()
	t.T.Log(args...)
	t.Fail()
}

// Errorf implements testing.TB. Errorf is equivalent to Logf followed by Fail.
//
// The formatted message is written through the real *testing.T, while the
// failure is recorded by the fake's own Fail.
func (t *fakeT) Errorf(format string, args ...any) {
	// Helper marks this method as a test helper so that file and line
	// information in the log output skips this frame.
	t.T.Helper()
	t.T.Logf(format, args...)
	t.Fail()
}

// Fail implements testing.TB. It records the failure on the fake and lets
// execution continue. The real *testing.T is used only to log that Fail was
// called; it is not marked as failed.
func (t *fakeT) Fail() {
	const notice = "testutil.RunRetry: t.Fail called in testutil.RunRetry closure"

	t.T.Helper()

	t.mu.Lock()
	defer t.mu.Unlock()

	t.failed = true
	t.T.Log(notice)
}

// FailNow implements testing.TB. FailNow marks the function as having failed
// and stops its execution by calling runtime.Goexit (which then runs all the
// deferred calls in the current goroutine).
//
// Only the fake is marked as failed; the real *testing.T is used solely to
// log that FailNow was called.
func (t *fakeT) FailNow() {
	t.T.Helper()
	t.mu.Lock()
	// Goexit below runs deferred calls, so the mutex is released even though
	// this function never returns normally.
	defer t.mu.Unlock()
	t.failed = true
	t.T.Log("testutil.RunRetry: t.FailNow called in testutil.RunRetry closure")
	runtime.Goexit()
}

// Failed implements testing.TB. It reports whether Fail or FailNow has been
// called on the fake, reading the flag under the fake's mutex.
func (t *fakeT) Failed() bool {
	t.T.Helper()

	t.mu.Lock()
	failed := t.failed
	t.mu.Unlock()

	return failed
}

// Fatal implements testing.TB. Fatal is equivalent to Log followed by FailNow.
//
// The message is written through the real *testing.T; the fake's own FailNow
// then records the failure and stops the calling goroutine.
func (t *fakeT) Fatal(args ...any) {
	// Helper marks this method as a test helper so that file and line
	// information in the log output skips this frame.
	t.T.Helper()
	t.T.Log(args...)
	t.FailNow()
}

// Fatalf implements testing.TB. Fatalf is equivalent to Logf followed by
// FailNow.
//
// The formatted message is written through the real *testing.T; the fake's
// own FailNow then records the failure and stops the calling goroutine.
func (t *fakeT) Fatalf(format string, args ...any) {
	// Helper marks this method as a test helper so that file and line
	// information in the log output skips this frame.
	t.T.Helper()
	t.T.Logf(format, args...)
	t.FailNow()
}

// Helper is proxied to the original *testing.T. This is to avoid the fake
// method appearing in the call stack.

// Log is proxied to the original *testing.T.

// Logf is proxied to the original *testing.T.

// Name implements testing.TB. It returns the name stored on the fake rather
// than the name of the embedded *testing.T. RunRetry sets it to the real test
// name followed by " (attempt i/count)".
func (t *fakeT) Name() string {
	return t.name
}

// Setenv implements testing.TB. It is deliberately unsupported by the fake and
// always panics.
func (*fakeT) Setenv(_ string, _ string) {
	const unsupported = "t.Setenv is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// Skip implements testing.TB. Skipping is deliberately unsupported by the fake,
// so this always panics.
func (*fakeT) Skip(_ ...any) {
	const unsupported = "t.Skip is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// SkipNow implements testing.TB. Skipping is deliberately unsupported by the
// fake, so this always panics.
func (*fakeT) SkipNow() {
	const unsupported = "t.SkipNow is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// Skipf implements testing.TB. Skipping is deliberately unsupported by the
// fake, so this always panics.
func (*fakeT) Skipf(_ string, _ ...any) {
	const unsupported = "t.Skipf is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// Skipped implements testing.TB. Skipping is deliberately unsupported by the
// fake, so this always panics instead of returning a value.
func (*fakeT) Skipped() bool {
	const unsupported = "t.Skipped is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// TempDir implements testing.TB. It is deliberately unsupported by the fake
// and always panics instead of returning a directory.
func (*fakeT) TempDir() string {
	const unsupported = "t.TempDir is not implemented in testutil.RunRetry closures"
	panic(unsupported)
}

// private is proxied to the original *testing.T. It cannot be implemented by
// our fake implementation since it's a private method.