mirror of
https://github.com/coder/coder.git
synced 2026-06-02 20:48:20 +00:00
fix(agent/agentcontainer): allow lifecycle script error on devcontainer up (#21020)
When devcontainer up fails due to a lifecycle script error like postCreateCommand, the CLI still returns a container ID. Previously this was discarded and the devcontainer marked as failed. Now we continue with agent injection if a container ID is available, allowing users to debug the issue in the running container. Fixes coder/internal#1137
This commit is contained in:
committed by
GitHub
parent
bf40d678ec
commit
74d0c39cb3
@@ -1039,6 +1039,10 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code
|
||||
logger.Error(ctx, "inject subagent into container failed", slog.Error(err))
|
||||
dc.Error = err.Error()
|
||||
} else {
|
||||
// TODO(mafredri): Preserve the error from devcontainer
|
||||
// up if it was a lifecycle script error. Currently
|
||||
// this results in a brief flicker for the user if
|
||||
// injection is fast, as the error is shown then erased.
|
||||
dc.Error = ""
|
||||
}
|
||||
}
|
||||
@@ -1347,27 +1351,41 @@ func (api *API) CreateDevcontainer(workspaceFolder, configPath string, opts ...D
|
||||
upOptions := []DevcontainerCLIUpOptions{WithUpOutput(infoW, errW)}
|
||||
upOptions = append(upOptions, opts...)
|
||||
|
||||
_, err := api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, upOptions...)
|
||||
if err != nil {
|
||||
containerID, upErr := api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, upOptions...)
|
||||
if upErr != nil {
|
||||
// No need to log if the API is closing (context canceled), as this
|
||||
// is expected behavior when the API is shutting down.
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
logger.Error(ctx, "devcontainer creation failed", slog.Error(err))
|
||||
if !errors.Is(upErr, context.Canceled) {
|
||||
logger.Error(ctx, "devcontainer creation failed", slog.Error(upErr))
|
||||
}
|
||||
|
||||
api.mu.Lock()
|
||||
dc = api.knownDevcontainers[dc.WorkspaceFolder]
|
||||
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusError
|
||||
dc.Error = err.Error()
|
||||
api.knownDevcontainers[dc.WorkspaceFolder] = dc
|
||||
api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes")
|
||||
api.mu.Unlock()
|
||||
// If we don't have a container ID, the error is fatal, so we
|
||||
// should mark the devcontainer as errored and return.
|
||||
if containerID == "" {
|
||||
api.mu.Lock()
|
||||
dc = api.knownDevcontainers[dc.WorkspaceFolder]
|
||||
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusError
|
||||
dc.Error = upErr.Error()
|
||||
api.knownDevcontainers[dc.WorkspaceFolder] = dc
|
||||
api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes")
|
||||
api.broadcastUpdatesLocked()
|
||||
api.mu.Unlock()
|
||||
|
||||
return xerrors.Errorf("start devcontainer: %w", err)
|
||||
return xerrors.Errorf("start devcontainer: %w", upErr)
|
||||
}
|
||||
|
||||
// If we have a container ID, it means the container was created
|
||||
// but a lifecycle script (e.g. postCreateCommand) failed. In this
|
||||
// case, we still want to refresh containers to pick up the new
|
||||
// container, inject the agent, and allow the user to debug the
|
||||
// issue. We store the error to surface it to the user.
|
||||
logger.Warn(ctx, "devcontainer created with errors (e.g. lifecycle script failure), container is available",
|
||||
slog.F("container_id", containerID),
|
||||
)
|
||||
} else {
|
||||
logger.Info(ctx, "devcontainer created successfully")
|
||||
}
|
||||
|
||||
logger.Info(ctx, "devcontainer created successfully")
|
||||
|
||||
api.mu.Lock()
|
||||
dc = api.knownDevcontainers[dc.WorkspaceFolder]
|
||||
// Update the devcontainer status to Running or Stopped based on the
|
||||
@@ -1376,13 +1394,18 @@ func (api *API) CreateDevcontainer(workspaceFolder, configPath string, opts ...D
|
||||
// to minimize the time between API consistency, we guess the status
|
||||
// based on the container state.
|
||||
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped
|
||||
if dc.Container != nil {
|
||||
if dc.Container.Running {
|
||||
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
|
||||
}
|
||||
if dc.Container != nil && dc.Container.Running {
|
||||
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
|
||||
}
|
||||
dc.Dirty = false
|
||||
dc.Error = ""
|
||||
if upErr != nil {
|
||||
// If there was a lifecycle script error but we have a container ID,
|
||||
// the container is running so we should set the status to Running.
|
||||
dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning
|
||||
dc.Error = upErr.Error()
|
||||
} else {
|
||||
dc.Error = ""
|
||||
}
|
||||
api.recreateSuccessTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "successTimes")
|
||||
api.knownDevcontainers[dc.WorkspaceFolder] = dc
|
||||
api.broadcastUpdatesLocked()
|
||||
|
||||
@@ -2070,6 +2070,118 @@ func TestAPI(t *testing.T) {
|
||||
require.Equal(t, "", response.Devcontainers[0].Error)
|
||||
})
|
||||
|
||||
// This test verifies that when devcontainer up fails due to a
|
||||
// lifecycle script error (such as postCreateCommand failing) but the
|
||||
// container was successfully created, we still proceed with the
|
||||
// devcontainer. The container should be available for use and the
|
||||
// agent should be injected.
|
||||
t.Run("DuringUpWithContainerID", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
var (
|
||||
ctx = testutil.Context(t, testutil.WaitMedium)
|
||||
logger = slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
|
||||
mClock = quartz.NewMock(t)
|
||||
|
||||
testContainer = codersdk.WorkspaceAgentContainer{
|
||||
ID: "test-container-id",
|
||||
FriendlyName: "test-container",
|
||||
Image: "test-image",
|
||||
Running: true,
|
||||
CreatedAt: time.Now(),
|
||||
Labels: map[string]string{
|
||||
agentcontainers.DevcontainerLocalFolderLabel: "/workspaces/project",
|
||||
agentcontainers.DevcontainerConfigFileLabel: "/workspaces/project/.devcontainer/devcontainer.json",
|
||||
},
|
||||
}
|
||||
fCCLI = &fakeContainerCLI{
|
||||
containers: codersdk.WorkspaceAgentListContainersResponse{
|
||||
Containers: []codersdk.WorkspaceAgentContainer{testContainer},
|
||||
},
|
||||
arch: "amd64",
|
||||
}
|
||||
fDCCLI = &fakeDevcontainerCLI{
|
||||
upID: testContainer.ID,
|
||||
upErrC: make(chan func() error, 1),
|
||||
}
|
||||
fSAC = &fakeSubAgentClient{
|
||||
logger: logger.Named("fakeSubAgentClient"),
|
||||
}
|
||||
|
||||
testDevcontainer = codersdk.WorkspaceAgentDevcontainer{
|
||||
ID: uuid.New(),
|
||||
Name: "test-devcontainer",
|
||||
WorkspaceFolder: "/workspaces/project",
|
||||
ConfigPath: "/workspaces/project/.devcontainer/devcontainer.json",
|
||||
Status: codersdk.WorkspaceAgentDevcontainerStatusStopped,
|
||||
}
|
||||
)
|
||||
|
||||
mClock.Set(time.Now()).MustWait(ctx)
|
||||
tickerTrap := mClock.Trap().TickerFunc("updaterLoop")
|
||||
nowRecreateSuccessTrap := mClock.Trap().Now("recreate", "successTimes")
|
||||
|
||||
api := agentcontainers.NewAPI(logger,
|
||||
agentcontainers.WithClock(mClock),
|
||||
agentcontainers.WithContainerCLI(fCCLI),
|
||||
agentcontainers.WithDevcontainerCLI(fDCCLI),
|
||||
agentcontainers.WithDevcontainers(
|
||||
[]codersdk.WorkspaceAgentDevcontainer{testDevcontainer},
|
||||
[]codersdk.WorkspaceAgentScript{{ID: testDevcontainer.ID, LogSourceID: uuid.New()}},
|
||||
),
|
||||
agentcontainers.WithSubAgentClient(fSAC),
|
||||
agentcontainers.WithSubAgentURL("test-subagent-url"),
|
||||
agentcontainers.WithWatcher(watcher.NewNoop()),
|
||||
)
|
||||
api.Start()
|
||||
defer func() {
|
||||
close(fDCCLI.upErrC)
|
||||
api.Close()
|
||||
}()
|
||||
|
||||
r := chi.NewRouter()
|
||||
r.Mount("/", api.Routes())
|
||||
|
||||
tickerTrap.MustWait(ctx).MustRelease(ctx)
|
||||
tickerTrap.Close()
|
||||
|
||||
// Send a recreate request to trigger devcontainer up.
|
||||
req := httptest.NewRequest(http.MethodPost, "/devcontainers/"+testDevcontainer.ID.String()+"/recreate", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
r.ServeHTTP(rec, req)
|
||||
require.Equal(t, http.StatusAccepted, rec.Code)
|
||||
|
||||
// Simulate a lifecycle script failure. The devcontainer CLI
|
||||
// will return an error but also provide a container ID since
|
||||
// the container was created before the script failed.
|
||||
simulatedError := xerrors.New("postCreateCommand failed with exit code 1")
|
||||
testutil.RequireSend(ctx, t, fDCCLI.upErrC, func() error { return simulatedError })
|
||||
|
||||
// Wait for the recreate operation to complete. We expect it to
|
||||
// record a success time because the container was created.
|
||||
nowRecreateSuccessTrap.MustWait(ctx).MustRelease(ctx)
|
||||
nowRecreateSuccessTrap.Close()
|
||||
|
||||
req = httptest.NewRequest(http.MethodGet, "/", nil)
|
||||
rec = httptest.NewRecorder()
|
||||
r.ServeHTTP(rec, req)
|
||||
require.Equal(t, http.StatusOK, rec.Code)
|
||||
|
||||
var response codersdk.WorkspaceAgentListContainersResponse
|
||||
err := json.NewDecoder(rec.Body).Decode(&response)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Verify that the devcontainer is running and has the container
|
||||
// associated with it despite the lifecycle script error. The
|
||||
// error may be cleared during refresh if agent injection
|
||||
// succeeds, but the important thing is that the container is
|
||||
// available for use.
|
||||
require.Len(t, response.Devcontainers, 1)
|
||||
assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, response.Devcontainers[0].Status)
|
||||
assert.NotNil(t, response.Devcontainers[0].Container)
|
||||
assert.Equal(t, testContainer.ID, response.Devcontainers[0].Container.ID)
|
||||
})
|
||||
|
||||
t.Run("DuringInjection", func(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -263,11 +263,14 @@ func (d *devcontainerCLI) Up(ctx context.Context, workspaceFolder, configPath st
|
||||
}
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
_, err2 := parseDevcontainerCLILastLine[devcontainerCLIResult](ctx, logger, stdoutBuf.Bytes())
|
||||
result, err2 := parseDevcontainerCLILastLine[devcontainerCLIResult](ctx, logger, stdoutBuf.Bytes())
|
||||
if err2 != nil {
|
||||
err = errors.Join(err, err2)
|
||||
}
|
||||
return "", err
|
||||
// Return the container ID if available, even if there was an error.
|
||||
// This can happen if the container was created successfully but a
|
||||
// lifecycle script (e.g. postCreateCommand) failed.
|
||||
return result.ContainerID, err
|
||||
}
|
||||
|
||||
result, err := parseDevcontainerCLILastLine[devcontainerCLIResult](ctx, logger, stdoutBuf.Bytes())
|
||||
@@ -275,6 +278,13 @@ func (d *devcontainerCLI) Up(ctx context.Context, workspaceFolder, configPath st
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Check if the result indicates an error (e.g. lifecycle script failure)
|
||||
// but still has a container ID, allowing the caller to potentially
|
||||
// continue with the container that was created.
|
||||
if err := result.Err(); err != nil {
|
||||
return result.ContainerID, err
|
||||
}
|
||||
|
||||
return result.ContainerID, nil
|
||||
}
|
||||
|
||||
@@ -394,7 +404,10 @@ func parseDevcontainerCLILastLine[T any](ctx context.Context, logger slog.Logger
|
||||
type devcontainerCLIResult struct {
|
||||
Outcome string `json:"outcome"` // "error", "success".
|
||||
|
||||
// The following fields are set if outcome is success.
|
||||
// The following fields are typically set if outcome is success, but
|
||||
// ContainerID may also be present when outcome is error if the
|
||||
// container was created but a lifecycle script (e.g. postCreateCommand)
|
||||
// failed.
|
||||
ContainerID string `json:"containerId"`
|
||||
RemoteUser string `json:"remoteUser"`
|
||||
RemoteWorkspaceFolder string `json:"remoteWorkspaceFolder"`
|
||||
@@ -404,18 +417,6 @@ type devcontainerCLIResult struct {
|
||||
Description string `json:"description"`
|
||||
}
|
||||
|
||||
func (r *devcontainerCLIResult) UnmarshalJSON(data []byte) error {
|
||||
type wrapperResult devcontainerCLIResult
|
||||
|
||||
var wrappedResult wrapperResult
|
||||
if err := json.Unmarshal(data, &wrappedResult); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
*r = devcontainerCLIResult(wrappedResult)
|
||||
return r.Err()
|
||||
}
|
||||
|
||||
func (r devcontainerCLIResult) Err() error {
|
||||
if r.Outcome == "success" {
|
||||
return nil
|
||||
|
||||
@@ -42,56 +42,63 @@ func TestDevcontainerCLI_ArgsAndParsing(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
logFile string
|
||||
workspace string
|
||||
config string
|
||||
opts []agentcontainers.DevcontainerCLIUpOptions
|
||||
wantArgs string
|
||||
wantError bool
|
||||
name string
|
||||
logFile string
|
||||
workspace string
|
||||
config string
|
||||
opts []agentcontainers.DevcontainerCLIUpOptions
|
||||
wantArgs string
|
||||
wantError bool
|
||||
wantContainerID bool // If true, expect a container ID even when wantError is true.
|
||||
}{
|
||||
{
|
||||
name: "success",
|
||||
logFile: "up.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: false,
|
||||
name: "success",
|
||||
logFile: "up.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: false,
|
||||
wantContainerID: true,
|
||||
},
|
||||
{
|
||||
name: "success with config",
|
||||
logFile: "up.log",
|
||||
workspace: "/test/workspace",
|
||||
config: "/test/config.json",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace --config /test/config.json",
|
||||
wantError: false,
|
||||
name: "success with config",
|
||||
logFile: "up.log",
|
||||
workspace: "/test/workspace",
|
||||
config: "/test/config.json",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace --config /test/config.json",
|
||||
wantError: false,
|
||||
wantContainerID: true,
|
||||
},
|
||||
{
|
||||
name: "already exists",
|
||||
logFile: "up-already-exists.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: false,
|
||||
name: "already exists",
|
||||
logFile: "up-already-exists.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: false,
|
||||
wantContainerID: true,
|
||||
},
|
||||
{
|
||||
name: "docker error",
|
||||
logFile: "up-error-docker.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
name: "docker error",
|
||||
logFile: "up-error-docker.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
wantContainerID: false,
|
||||
},
|
||||
{
|
||||
name: "bad outcome",
|
||||
logFile: "up-error-bad-outcome.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
name: "bad outcome",
|
||||
logFile: "up-error-bad-outcome.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
wantContainerID: false,
|
||||
},
|
||||
{
|
||||
name: "does not exist",
|
||||
logFile: "up-error-does-not-exist.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
name: "does not exist",
|
||||
logFile: "up-error-does-not-exist.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
wantContainerID: false,
|
||||
},
|
||||
{
|
||||
name: "with remove existing container",
|
||||
@@ -100,8 +107,21 @@ func TestDevcontainerCLI_ArgsAndParsing(t *testing.T) {
|
||||
opts: []agentcontainers.DevcontainerCLIUpOptions{
|
||||
agentcontainers.WithRemoveExistingContainer(),
|
||||
},
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace --remove-existing-container",
|
||||
wantError: false,
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace --remove-existing-container",
|
||||
wantError: false,
|
||||
wantContainerID: true,
|
||||
},
|
||||
{
|
||||
// This test verifies that when a lifecycle script like
|
||||
// postCreateCommand fails, the CLI returns both an error
|
||||
// and a container ID. The caller can then proceed with
|
||||
// agent injection into the created container.
|
||||
name: "lifecycle script failure with container",
|
||||
logFile: "up-error-lifecycle-script.log",
|
||||
workspace: "/test/workspace",
|
||||
wantArgs: "up --log-format json --workspace-folder /test/workspace",
|
||||
wantError: true,
|
||||
wantContainerID: true,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -122,10 +142,13 @@ func TestDevcontainerCLI_ArgsAndParsing(t *testing.T) {
|
||||
containerID, err := dccli.Up(ctx, tt.workspace, tt.config, tt.opts...)
|
||||
if tt.wantError {
|
||||
assert.Error(t, err, "want error")
|
||||
assert.Empty(t, containerID, "expected empty container ID")
|
||||
} else {
|
||||
assert.NoError(t, err, "want no error")
|
||||
}
|
||||
if tt.wantContainerID {
|
||||
assert.NotEmpty(t, containerID, "expected non-empty container ID")
|
||||
} else {
|
||||
assert.Empty(t, containerID, "expected empty container ID")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Generated
Vendored
+147
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user