fix: recreate ai_provider_type instead of ADD VALUE (#25895)

Coder runs all migrations in a single transaction (`pgTxnDriver`).
Postgres forbids using an enum value added by `ALTER TYPE ... ADD VALUE`
within the same transaction that added it. Migration `000499` widened
`ai_provider_type` with `ADD VALUE`, and `000504` casts existing
`chat_providers` rows to that enum in the same transaction. On
deployments with a legacy provider using one of the new values (for
example `openai-compat`), the batch failed with `unsafe use of new
value` and the server could not start.

Recreate the type (create a new enum, alter the column, drop and rename)
instead of using `ADD VALUE`, matching the existing precedent in
`000144_user_status_dormant`. A freshly created enum's values are usable
immediately in the same transaction, so the cast in `000504` succeeds.
The resulting schema is identical, so `make gen` produces no `dump.sql`
diff and databases that already applied these migrations see no drift.

Added a regression test that seeds an `openai-compat` provider and
applies `000499` through `000504` in a single transaction, reproducing
the production path. The per-step `Stepper` used by the other migration
tests commits each migration separately and cannot surface this class of
bug.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Signed-off-by: Danny Kopping <danny@coder.com>
Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Danny Kopping
2026-06-01 15:30:45 +02:00
committed by GitHub
parent a85462bd49
commit 85f56e4944
4 changed files with 143 additions and 9 deletions
+35
View File
@@ -41,6 +41,41 @@
dimension.
- Avoid `ByX` names for grouped queries.
### Enum Changes Run in a Single Transaction
All migrations run inside one transaction (`pgTxnDriver`). Postgres forbids
*using* an enum value added by `ALTER TYPE ... ADD VALUE` within the same
transaction that added it; any statement that does so fails with `unsafe use of new value`.
Adding the value is fine; using it in the same batch is not. "Using it"
includes a later migration that casts to it (`col::my_enum`), inserts or
updates a row with it, or sets it as a column default. A data-dependent use,
such as a cast over existing rows, only fails when a row actually holds the new
value, so fresh databases and CI pass while deployments with existing data break.
**MUST DO**: If any migration uses a newly added enum value, recreate the type
instead of using `ADD VALUE`. A freshly created enum's values are usable
immediately in the same transaction. Precedent: `000144_user_status_dormant`.
```sql
CREATE TYPE new_my_enum AS ENUM ('existing', 'value', 'new_value');
ALTER TABLE my_table
ALTER COLUMN col TYPE new_my_enum USING (col::text::new_my_enum);
DROP TYPE my_enum;
ALTER TYPE new_my_enum RENAME TO my_enum;
```
Recreating produces an identical schema, so `make gen` yields no `dump.sql`
diff and databases that already applied the migration see no drift.
**Testing**: `migrations.Stepper` commits each migration separately, so tests
built on it cannot surface this. To catch it, seed a row using the new value,
then apply the affected migrations in a single transaction (see
`TestMigration000504AIProvidersBackfillEnumInSingleTxn`).
## Handling Nullable Fields
Use `sql.NullString`, `sql.NullBool`, etc. for optional database fields:
@@ -1,3 +1,4 @@
-- No-op: Postgres does not allow removing enum values safely.
-- Matches the precedent in 000495_ai_providers.down.sql for ALTER
-- TYPE resource_type / api_key_scope ADD VALUE.
-- No-op: the up recreates ai_provider_type with a wider value set, but the
-- down does not narrow it back. Narrowing would drop rows that already use the
-- new values, and 000495_ai_providers.down.sql drops the type wholesale when
-- migrating all the way down.
@@ -7,9 +7,27 @@
-- OpenAI-compatible endpoints. Native gateway-side support for these
-- providers comes later, at which point this enum already carries the
-- right discriminator and no further migration is needed.
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'azure';
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'bedrock';
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'google';
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'openai-compat';
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'openrouter';
ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'vercel';
--
-- Recreate the type rather than using ALTER TYPE ... ADD VALUE. Postgres
-- forbids using a value added by ADD VALUE within the same transaction, and
-- all migrations run in one transaction. 000504 casts existing chat_providers
-- rows to these new values in that same transaction, so ADD VALUE fails with
-- "unsafe use of new value". A freshly created enum's values are usable
-- immediately, so the cast in 000504 succeeds.
-- Step 1: create the replacement enum under a temporary name. It carries the
-- complete, widened value set.
CREATE TYPE new_ai_provider_type AS ENUM (
'openai',
'anthropic',
'azure',
'bedrock',
'google',
'openai-compat',
'openrouter',
'vercel'
);
-- Step 2: repoint ai_providers.type at the replacement enum. Existing values
-- are converted by going through text (type::text::new_ai_provider_type).
ALTER TABLE ai_providers
ALTER COLUMN type TYPE new_ai_provider_type USING (type::text::new_ai_provider_type);
-- Step 3: drop the old enum and give the replacement the original name, so the
-- type is still called ai_provider_type afterwards.
-- NOTE(review): only ai_providers.type is converted above; DROP TYPE would
-- fail if anything else still depended on the old type. Confirm there are no
-- other dependents.
DROP TYPE ai_provider_type;
ALTER TYPE new_ai_provider_type RENAME TO ai_provider_type;
@@ -7,6 +7,7 @@ import (
"os"
"path/filepath"
"slices"
"strings"
"sync"
"testing"
"time"
@@ -1502,6 +1503,85 @@ func TestMigration000504AIProvidersBackfillOverridesNameConflict(t *testing.T) {
require.True(t, fresh.Enabled)
}
// TestMigration000504AIProvidersBackfillEnumInSingleTxn is a regression test
// for the production migration path, in which every pending migration runs
// inside one transaction (see pgTxnDriver).
//
// Migration 000499 originally widened ai_provider_type with ALTER TYPE ... ADD
// VALUE, and 000504 casts existing chat_providers rows to that enum. Postgres
// rejects any use of a value added by ADD VALUE within the transaction that
// added it, so a legacy provider carrying one of the new values (for example
// openai-compat) made the batch fail with "unsafe use of new value".
//
// The per-step Stepper used by the other tests commits each migration
// separately and therefore cannot surface this; the test applies 000499
// through 000504 as a single batch instead.
func TestMigration000504AIProvidersBackfillEnumInSingleTxn(t *testing.T) {
	t.Parallel()

	const (
		// Seeds a legacy provider whose type is one of the enum values
		// introduced by 000499.
		insertLegacyProvider = `
INSERT INTO chat_providers (id, provider, display_name, api_key, enabled, base_url, created_at, updated_at)
VALUES ($1, 'openai-compat', 'OpenAI Compatible', '', TRUE, 'https://api.example.com/v1', $2, $2)
`
		// Reads back the backfilled row from the new table.
		selectBackfilledType = `SELECT type FROM ai_providers WHERE id = $1`
	)

	db := testSQLDB(t)
	ctx := testutil.Context(t, testutil.WaitSuperLong)

	// Bring the schema up to 498 and commit first. This models a deployment
	// that ran an earlier migration batch, so chat_providers already exists
	// and holds data before the batch under test starts.
	applyMigrationsInTxn(ctx, t, db, 1, 498)

	seededAt := time.Now().UTC().Truncate(time.Microsecond)
	legacyID := uuid.New()
	_, insertErr := db.ExecContext(ctx, insertLegacyProvider, legacyID, seededAt)
	require.NoError(t, insertErr)

	// The batch under test: 000499 through 000504 in one transaction, exactly
	// as production applies them.
	applyMigrationsInTxn(ctx, t, db, 499, 504)

	var gotType string
	require.NoError(t, db.QueryRowContext(ctx, selectBackfilledType, legacyID).Scan(&gotType))
	require.Equal(t, "openai-compat", gotType)
}
// applyMigrationsInTxn executes the up SQL for every migration whose version is
// in [from, to] inside a single transaction, mirroring pgTxnDriver. The whole
// batch commits or rolls back together.
//
// Migration files are discovered in the current working directory: every entry
// whose name ends in ".up.sql" and starts with a numeric version followed by
// an underscore is a candidate. Matching files are applied in ascending
// filename order.
//
// The helper fails the test immediately if reading the directory, reading a
// file, executing a migration, or committing fails. It also fails if no file
// falls inside the requested range, so a wrong range or working directory
// cannot commit an empty transaction and pass vacuously.
func applyMigrationsInTxn(ctx context.Context, t *testing.T, sqlDB *sql.DB, from, to int) {
	t.Helper()

	entries, err := os.ReadDir(".")
	require.NoError(t, err)

	var files []string
	for _, entry := range entries {
		name := entry.Name()
		if !strings.HasSuffix(name, ".up.sql") {
			continue
		}
		// Names that do not begin with "<digits>_" are not migrations; skip
		// them rather than failing the test.
		var version int
		if _, err := fmt.Sscanf(name, "%06d_", &version); err != nil {
			continue
		}
		if version >= from && version <= to {
			files = append(files, name)
		}
	}
	require.NotEmptyf(t, files, "no up migrations found for versions [%d, %d]", from, to)
	slices.Sort(files)

	tx, err := sqlDB.BeginTx(ctx, nil)
	require.NoError(t, err)
	// Roll back on any early exit (including require's FailNow). After a
	// successful Commit the rollback only reports that the transaction is
	// already finished, so its error is deliberately discarded.
	defer func() { _ = tx.Rollback() }()

	for _, name := range files {
		query, err := os.ReadFile(name)
		require.NoError(t, err)
		_, err = tx.ExecContext(ctx, string(query))
		require.NoErrorf(t, err, "apply migration %s", name)
	}
	require.NoError(t, tx.Commit())
}
func TestMigration000498SoftDeleteStaleWorkspaceAgents(t *testing.T) {
t.Parallel()