fix(agent/agentcontainer): allow lifecycle script error on devcontainer up (#21020)

When devcontainer up fails due to a lifecycle script error like
postCreateCommand, the CLI still returns a container ID. Previously
this was discarded and the devcontainer marked as failed. Now we
continue with agent injection if a container ID is available,
allowing users to debug the issue in the running container.

Fixes coder/internal#1137
This commit is contained in:
Mathias Fredriksson
2025-12-02 15:33:01 +02:00
committed by GitHub
parent bf40d678ec
commit 74d0c39cb3
5 changed files with 381 additions and 75 deletions
+42 -19
View File
@@ -1039,6 +1039,10 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code
logger.Error(ctx, "inject subagent into container failed", slog.Error(err))
dc.Error = err.Error()
} else {
// TODO(mafredri): Preserve the error from devcontainer
// up if it was a lifecycle script error. Currently
// this results in a brief flicker for the user if
// injection is fast, as the error is shown then erased.
dc.Error = ""
}
}
@@ -1347,27 +1351,41 @@ func (api *API) CreateDevcontainer(workspaceFolder, configPath string, opts ...D
upOptions := []DevcontainerCLIUpOptions{WithUpOutput(infoW, errW)}
upOptions = append(upOptions, opts...)
_, err := api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, upOptions...)
if err != nil {
containerID, upErr := api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, upOptions...)
if upErr != nil {
// No need to log if the API is closing (context canceled), as this
// is expected behavior when the API is shutting down.
if !errors.Is(err, context.Canceled) {
logger.Error(ctx, "devcontainer creation failed", slog.Error(err))
if !errors.Is(upErr, context.Canceled) {
logger.Error(ctx, "devcontainer creation failed", slog.Error(upErr))
}
api.mu.Lock()
dc = api.knownDevcontainers[dc.WorkspaceFolder]
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusError
dc.Error = err.Error()
api.knownDevcontainers[dc.WorkspaceFolder] = dc
api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes")
api.mu.Unlock()
// If we don't have a container ID, the error is fatal, so we
// should mark the devcontainer as errored and return.
if containerID == "" {
api.mu.Lock()
dc = api.knownDevcontainers[dc.WorkspaceFolder]
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusError
dc.Error = upErr.Error()
api.knownDevcontainers[dc.WorkspaceFolder] = dc
api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes")
api.broadcastUpdatesLocked()
api.mu.Unlock()
return xerrors.Errorf("start devcontainer: %w", err)
return xerrors.Errorf("start devcontainer: %w", upErr)
}
// If we have a container ID, it means the container was created
// but a lifecycle script (e.g. postCreateCommand) failed. In this
// case, we still want to refresh containers to pick up the new
// container, inject the agent, and allow the user to debug the
// issue. We store the error to surface it to the user.
logger.Warn(ctx, "devcontainer created with errors (e.g. lifecycle script failure), container is available",
slog.F("container_id", containerID),
)
} else {
logger.Info(ctx, "devcontainer created successfully")
}
logger.Info(ctx, "devcontainer created successfully")
api.mu.Lock()
dc = api.knownDevcontainers[dc.WorkspaceFolder]
// Update the devcontainer status to Running or Stopped based on the
@@ -1376,13 +1394,18 @@ func (api *API) CreateDevcontainer(workspaceFolder, configPath string, opts ...D
// to minimize the time between API consistency, we guess the status
// based on the container state.
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped
if dc.Container != nil {
if dc.Container.Running {
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
}
if dc.Container != nil && dc.Container.Running {
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
}
dc.Dirty = false
dc.Error = ""
if upErr != nil {
// If there was a lifecycle script error but we have a container ID,
// the container is running so we should set the status to Running.
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
dc.Error = upErr.Error()
} else {
dc.Error = ""
}
api.recreateSuccessTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "successTimes")
api.knownDevcontainers[dc.WorkspaceFolder] = dc
api.broadcastUpdatesLocked()
+112
View File
@@ -2070,6 +2070,118 @@ func TestAPI(t *testing.T) {
require.Equal(t, "", response.Devcontainers[0].Error)
})
// This test verifies that when devcontainer up fails due to a
// lifecycle script error (such as postCreateCommand failing) but the
// container was successfully created, we still proceed with the
// devcontainer. The container should be available for use and the
// agent should be injected.
t.Run("DuringUpWithContainerID", func(t *testing.T) {
t.Parallel()
var (
ctx = testutil.Context(t, testutil.WaitMedium)
logger = slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
mClock = quartz.NewMock(t)
testContainer = codersdk.WorkspaceAgentContainer{
ID: "test-container-id",
FriendlyName: "test-container",
Image: "test-image",
Running: true,
CreatedAt: time.Now(),
Labels: map[string]string{
agentcontainers.DevcontainerLocalFolderLabel: "/workspaces/project",
agentcontainers.DevcontainerConfigFileLabel: "/workspaces/project/.devcontainer/devcontainer.json",
},
}
fCCLI = &fakeContainerCLI{
containers: codersdk.WorkspaceAgentListContainersResponse{
Containers: []codersdk.WorkspaceAgentContainer{testContainer},
},
arch: "amd64",
}
fDCCLI = &fakeDevcontainerCLI{
upID: testContainer.ID,
upErrC: make(chan func() error, 1),
}
fSAC = &fakeSubAgentClient{
logger: logger.Named("fakeSubAgentClient"),
}
testDevcontainer = codersdk.WorkspaceAgentDevcontainer{
ID: uuid.New(),
Name: "test-devcontainer",
WorkspaceFolder: "/workspaces/project",
ConfigPath: "/workspaces/project/.devcontainer/devcontainer.json",
Status: codersdk.WorkspaceAgentDevcontainerStatusStopped,
}
)
mClock.Set(time.Now()).MustWait(ctx)
tickerTrap := mClock.Trap().TickerFunc("updaterLoop")
nowRecreateSuccessTrap := mClock.Trap().Now("recreate", "successTimes")
api := agentcontainers.NewAPI(logger,
agentcontainers.WithClock(mClock),
agentcontainers.WithContainerCLI(fCCLI),
agentcontainers.WithDevcontainerCLI(fDCCLI),
agentcontainers.WithDevcontainers(
[]codersdk.WorkspaceAgentDevcontainer{testDevcontainer},
[]codersdk.WorkspaceAgentScript{{ID: testDevcontainer.ID, LogSourceID: uuid.New()}},
),
agentcontainers.WithSubAgentClient(fSAC),
agentcontainers.WithSubAgentURL("test-subagent-url"),
agentcontainers.WithWatcher(watcher.NewNoop()),
)
api.Start()
defer func() {
close(fDCCLI.upErrC)
api.Close()
}()
r := chi.NewRouter()
r.Mount("/", api.Routes())
tickerTrap.MustWait(ctx).MustRelease(ctx)
tickerTrap.Close()
// Send a recreate request to trigger devcontainer up.
req := httptest.NewRequest(http.MethodPost, "/devcontainers/"+testDevcontainer.ID.String()+"/recreate", nil)
rec := httptest.NewRecorder()
r.ServeHTTP(rec, req)
require.Equal(t, http.StatusAccepted, rec.Code)
// Simulate a lifecycle script failure. The devcontainer CLI
// will return an error but also provide a container ID since
// the container was created before the script failed.
simulatedError := xerrors.New("postCreateCommand failed with exit code 1")
testutil.RequireSend(ctx, t, fDCCLI.upErrC, func() error { return simulatedError })
// Wait for the recreate operation to complete. We expect it to
// record a success time because the container was created.
nowRecreateSuccessTrap.MustWait(ctx).MustRelease(ctx)
nowRecreateSuccessTrap.Close()
req = httptest.NewRequest(http.MethodGet, "/", nil)
rec = httptest.NewRecorder()
r.ServeHTTP(rec, req)
require.Equal(t, http.StatusOK, rec.Code)
var response codersdk.WorkspaceAgentListContainersResponse
err := json.NewDecoder(rec.Body).Decode(&response)
require.NoError(t, err)
// Verify that the devcontainer is running and has the container
// associated with it despite the lifecycle script error. The
// error may be cleared during refresh if agent injection
// succeeds, but the important thing is that the container is
// available for use.
require.Len(t, response.Devcontainers, 1)
assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, response.Devcontainers[0].Status)
assert.NotNil(t, response.Devcontainers[0].Container)
assert.Equal(t, testContainer.ID, response.Devcontainers[0].Container.ID)
})
t.Run("DuringInjection", func(t *testing.T) {
t.Parallel()
+16 -15
View File
@@ -263,11 +263,14 @@ func (d *devcontainerCLI) Up(ctx context.Context, workspaceFolder, configPath st
}
if err := cmd.Run(); err != nil {
_, err2 := parseDevcontainerCLILastLine[devcontainerCLIResult](ctx, logger, stdoutBuf.Bytes())
result, err2 := parseDevcontainerCLILastLine[devcontainerCLIResult](ctx, logger, stdoutBuf.Bytes())
if err2 != nil {
err = errors.Join(err, err2)
}
return "", err
// Return the container ID if available, even if there was an error.
// This can happen if the container was created successfully but a
// lifecycle script (e.g. postCreateCommand) failed.
return result.ContainerID, err
}
result, err := parseDevcontainerCLILastLine[devcontainerCLIResult](ctx, logger, stdoutBuf.Bytes())
@@ -275,6 +278,13 @@ func (d *devcontainerCLI) Up(ctx context.Context, workspaceFolder, configPath st
return "", err
}
// Check if the result indicates an error (e.g. lifecycle script failure)
// but still has a container ID, allowing the caller to potentially
// continue with the container that was created.
if err := result.Err(); err != nil {
return result.ContainerID, err
}
return result.ContainerID, nil
}
@@ -394,7 +404,10 @@ func parseDevcontainerCLILastLine[T any](ctx context.Context, logger slog.Logger
type devcontainerCLIResult struct {
Outcome string `json:"outcome"` // "error", "success".
// The following fields are set if outcome is success.
// The following fields are typically set if outcome is success, but
// ContainerID may also be present when outcome is error if the
// container was created but a lifecycle script (e.g. postCreateCommand)
// failed.
ContainerID string `json:"containerId"`
RemoteUser string `json:"remoteUser"`
RemoteWorkspaceFolder string `json:"remoteWorkspaceFolder"`
@@ -404,18 +417,6 @@ type devcontainerCLIResult struct {
Description string `json:"description"`
}
func (r *devcontainerCLIResult) UnmarshalJSON(data []byte) error {
type wrapperResult devcontainerCLIResult
var wrappedResult wrapperResult
if err := json.Unmarshal(data, &wrappedResult); err != nil {
return err
}
*r = devcontainerCLIResult(wrappedResult)
return r.Err()
}
func (r devcontainerCLIResult) Err() error {
if r.Outcome == "success" {
return nil
+64 -41
View File
@@ -42,56 +42,63 @@ func TestDevcontainerCLI_ArgsAndParsing(t *testing.T) {
t.Parallel()
tests := []struct {
name string
logFile string
workspace string
config string
opts []agentcontainers.DevcontainerCLIUpOptions
wantArgs string
wantError bool
name string
logFile string
workspace string
config string
opts []agentcontainers.DevcontainerCLIUpOptions
wantArgs string
wantError bool
wantContainerID bool // If true, expect a container ID even when wantError is true.
}{
{
name: "success",
logFile: "up.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: false,
name: "success",
logFile: "up.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: false,
wantContainerID: true,
},
{
name: "success with config",
logFile: "up.log",
workspace: "/test/workspace",
config: "/test/config.json",
wantArgs: "up --log-format json --workspace-folder /test/workspace --config /test/config.json",
wantError: false,
name: "success with config",
logFile: "up.log",
workspace: "/test/workspace",
config: "/test/config.json",
wantArgs: "up --log-format json --workspace-folder /test/workspace --config /test/config.json",
wantError: false,
wantContainerID: true,
},
{
name: "already exists",
logFile: "up-already-exists.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: false,
name: "already exists",
logFile: "up-already-exists.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: false,
wantContainerID: true,
},
{
name: "docker error",
logFile: "up-error-docker.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
name: "docker error",
logFile: "up-error-docker.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
wantContainerID: false,
},
{
name: "bad outcome",
logFile: "up-error-bad-outcome.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
name: "bad outcome",
logFile: "up-error-bad-outcome.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
wantContainerID: false,
},
{
name: "does not exist",
logFile: "up-error-does-not-exist.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
name: "does not exist",
logFile: "up-error-does-not-exist.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
wantContainerID: false,
},
{
name: "with remove existing container",
@@ -100,8 +107,21 @@ func TestDevcontainerCLI_ArgsAndParsing(t *testing.T) {
opts: []agentcontainers.DevcontainerCLIUpOptions{
agentcontainers.WithRemoveExistingContainer(),
},
wantArgs: "up --log-format json --workspace-folder /test/workspace --remove-existing-container",
wantError: false,
wantArgs: "up --log-format json --workspace-folder /test/workspace --remove-existing-container",
wantError: false,
wantContainerID: true,
},
{
// This test verifies that when a lifecycle script like
// postCreateCommand fails, the CLI returns both an error
// and a container ID. The caller can then proceed with
// agent injection into the created container.
name: "lifecycle script failure with container",
logFile: "up-error-lifecycle-script.log",
workspace: "/test/workspace",
wantArgs: "up --log-format json --workspace-folder /test/workspace",
wantError: true,
wantContainerID: true,
},
}
@@ -122,10 +142,13 @@ func TestDevcontainerCLI_ArgsAndParsing(t *testing.T) {
containerID, err := dccli.Up(ctx, tt.workspace, tt.config, tt.opts...)
if tt.wantError {
assert.Error(t, err, "want error")
assert.Empty(t, containerID, "expected empty container ID")
} else {
assert.NoError(t, err, "want no error")
}
if tt.wantContainerID {
assert.NotEmpty(t, containerID, "expected non-empty container ID")
} else {
assert.Empty(t, containerID, "expected empty container ID")
}
})
}
File diff suppressed because one or more lines are too long