diff --git a/.claude/docs/DATABASE.md b/.claude/docs/DATABASE.md index 84f6125fa4..331d662d20 100644 --- a/.claude/docs/DATABASE.md +++ b/.claude/docs/DATABASE.md @@ -41,6 +41,41 @@ dimension. - Avoid `ByX` names for grouped queries. +### Enum Changes Run in a Single Transaction + +All migrations run inside one transaction (`pgTxnDriver`). Postgres forbids +*using* an enum value added by `ALTER TYPE ... ADD VALUE` within the same +transaction that added it, so it fails with `unsafe use of new value`. + +Adding the value is fine; using it in the same batch is not. "Using it" +includes a later migration that casts to it (`col::my_enum`), inserts or +updates a row with it, or sets it as a column default. This only fails when a +row actually materializes the new value, so fresh databases and CI pass while +deployments with existing data break. + +**MUST DO**: If any migration uses a newly added enum value, recreate the type +instead of using `ADD VALUE`. A freshly created enum's values are usable +immediately in the same transaction. Precedent: `000144_user_status_dormant`. + +```sql +CREATE TYPE new_my_enum AS ENUM ('existing', 'value', 'new_value'); + +ALTER TABLE my_table + ALTER COLUMN col TYPE new_my_enum USING (col::text::new_my_enum); + +DROP TYPE my_enum; + +ALTER TYPE new_my_enum RENAME TO my_enum; +``` + +Recreating produces an identical schema, so `make gen` yields no `dump.sql` +diff and databases that already applied the migration see no drift. + +**Testing**: `migrations.Stepper` commits each migration separately, so tests +built on it cannot surface this. To catch it, seed a row using the new value, +then apply the affected migrations in a single transaction (see +`TestMigration000504AIProvidersBackfillEnumInSingleTxn`). + ## Handling Nullable Fields Use `sql.NullString`, `sql.NullBool`, etc. for optional database fields: diff --git a/coderd/database/migrations/000499_ai_provider_type_chatd_values.down.sql b/coderd/database/migrations/000499_ai_provider_type_chatd_values.down.sql index a8b89359aa..ab84bd795f 100644 --- a/coderd/database/migrations/000499_ai_provider_type_chatd_values.down.sql +++ b/coderd/database/migrations/000499_ai_provider_type_chatd_values.down.sql @@ -1,3 +1,4 @@ --- No-op: Postgres does not allow removing enum values safely. --- Matches the precedent in 000495_ai_providers.down.sql for ALTER --- TYPE resource_type / api_key_scope ADD VALUE. +-- No-op: the up recreates ai_provider_type with a wider value set, but the +-- down does not narrow it back. Narrowing would drop rows that already use the +-- new values, and 000495_ai_providers.down.sql drops the type wholesale when +-- migrating all the way down. diff --git a/coderd/database/migrations/000499_ai_provider_type_chatd_values.up.sql b/coderd/database/migrations/000499_ai_provider_type_chatd_values.up.sql index 104514d32c..30df7758dd 100644 --- a/coderd/database/migrations/000499_ai_provider_type_chatd_values.up.sql +++ b/coderd/database/migrations/000499_ai_provider_type_chatd_values.up.sql @@ -7,9 +7,27 @@ -- OpenAI-compatible endpoints. Native gateway-side support for these -- providers comes later, at which point this enum already carries the -- right discriminator and no further migration is needed. -ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'azure'; -ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'bedrock'; -ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'google'; -ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'openai-compat'; -ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'openrouter'; -ALTER TYPE ai_provider_type ADD VALUE IF NOT EXISTS 'vercel'; +-- +-- Recreate the type rather than using ALTER TYPE ... ADD VALUE. Postgres +-- forbids using a value added by ADD VALUE within the same transaction, and +-- all migrations run in one transaction. 000504 casts existing chat_providers +-- rows to these new values in that same transaction, so ADD VALUE fails with +-- "unsafe use of new value". A freshly created enum's values are usable +-- immediately, so the cast in 000504 succeeds. +CREATE TYPE new_ai_provider_type AS ENUM ( + 'openai', + 'anthropic', + 'azure', + 'bedrock', + 'google', + 'openai-compat', + 'openrouter', + 'vercel' +); + +ALTER TABLE ai_providers + ALTER COLUMN type TYPE new_ai_provider_type USING (type::text::new_ai_provider_type); + +DROP TYPE ai_provider_type; + +ALTER TYPE new_ai_provider_type RENAME TO ai_provider_type; diff --git a/coderd/database/migrations/migrate_test.go b/coderd/database/migrations/migrate_test.go index 0b3e0c240c..f148860bc5 100644 --- a/coderd/database/migrations/migrate_test.go +++ b/coderd/database/migrations/migrate_test.go @@ -7,6 +7,7 @@ import ( "os" "path/filepath" "slices" + "strings" "sync" "testing" "time" @@ -1502,6 +1503,85 @@ func TestMigration000504AIProvidersBackfillOverridesNameConflict(t *testing.T) { require.True(t, fresh.Enabled) } +// TestMigration000504AIProvidersBackfillEnumInSingleTxn reproduces the +// production migration path, where every pending migration runs inside a +// single transaction (see pgTxnDriver). Migration 000499 widens +// ai_provider_type with ALTER TYPE ... ADD VALUE, and 000504 casts existing +// chat_providers rows to that enum. Postgres forbids using an enum value +// added by ADD VALUE within the same transaction, so when a legacy provider +// uses one of the new values (for example openai-compat) the batch fails with +// "unsafe use of new value". The per-step Stepper used by the other tests +// commits each migration separately and cannot surface this. +func TestMigration000504AIProvidersBackfillEnumInSingleTxn(t *testing.T) { + t.Parallel() + + sqlDB := testSQLDB(t) + ctx := testutil.Context(t, testutil.WaitSuperLong) + + // Apply everything through 498 and commit, so chat_providers exists and is + // populated before the batch under test runs, matching a deployment that + // ran an earlier migration batch before this one. + applyMigrationsInTxn(ctx, t, sqlDB, 1, 498) + + now := time.Now().UTC().Truncate(time.Microsecond) + providerID := uuid.New() + + // A legacy provider whose type is one of the values added in 000499. + _, err := sqlDB.ExecContext(ctx, ` + INSERT INTO chat_providers (id, provider, display_name, api_key, enabled, base_url, created_at, updated_at) + VALUES ($1, 'openai-compat', 'OpenAI Compatible', '', TRUE, 'https://api.example.com/v1', $2, $2) + `, providerID, now) + require.NoError(t, err) + + // Apply 000499 through 000504 in a single transaction, as production does. + applyMigrationsInTxn(ctx, t, sqlDB, 499, 504) + + var typ string + err = sqlDB.QueryRowContext(ctx, + `SELECT type FROM ai_providers WHERE id = $1`, providerID, + ).Scan(&typ) + require.NoError(t, err) + require.Equal(t, "openai-compat", typ) +} + +// applyMigrationsInTxn executes the up SQL for every migration whose version is +// in [from, to] inside a single transaction, mirroring pgTxnDriver. The whole +// batch commits or rolls back together. +func applyMigrationsInTxn(ctx context.Context, t *testing.T, sqlDB *sql.DB, from, to int) { + t.Helper() + + entries, err := os.ReadDir(".") + require.NoError(t, err) + + var files []string + for _, entry := range entries { + name := entry.Name() + if !strings.HasSuffix(name, ".up.sql") { + continue + } + var version int + if _, err := fmt.Sscanf(name, "%06d_", &version); err != nil { + continue + } + if version >= from && version <= to { + files = append(files, name) + } + } + slices.Sort(files) + + tx, err := sqlDB.BeginTx(ctx, nil) + require.NoError(t, err) + defer tx.Rollback() + + for _, name := range files { + query, err := os.ReadFile(name) + require.NoError(t, err) + _, err = tx.ExecContext(ctx, string(query)) + require.NoErrorf(t, err, "apply migration %s", name) + } + require.NoError(t, tx.Commit()) +} + func TestMigration000498SoftDeleteStaleWorkspaceAgents(t *testing.T) { t.Parallel()