chore: implement oom/ood processing component (#16436)

Implements the processing logic as set out in the OOM/OOD RFC.
This commit is contained in:
Danielle Maywood
2025-02-17 16:56:52 +00:00
committed by GitHub
parent b5329ae1cd
commit d6b9806098
26 changed files with 1823 additions and 113 deletions
+25 -3
View File
@@ -17,10 +17,12 @@ import (
"cdr.dev/slog"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/appearance"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/externalauth"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/prometheusmetrics"
"github.com/coder/coder/v2/coderd/tracing"
"github.com/coder/coder/v2/coderd/workspacestats"
@@ -29,6 +31,7 @@ import (
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/tailnet"
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/quartz"
)
// API implements the DRPC agent API interface from agent/proto. This struct is
@@ -59,7 +62,9 @@ type Options struct {
Ctx context.Context
Log slog.Logger
Clock quartz.Clock
Database database.Store
NotificationsEnqueuer notifications.Enqueuer
Pubsub pubsub.Pubsub
DerpMapFn func() *tailcfg.DERPMap
TailnetCoordinator *atomic.Pointer[tailnet.Coordinator]
@@ -82,6 +87,10 @@ type Options struct {
}
func New(opts Options) *API {
if opts.Clock == nil {
opts.Clock = quartz.NewReal()
}
api := &API{
opts: opts,
mu: sync.Mutex{},
@@ -104,9 +113,22 @@ func New(opts Options) *API {
}
api.ResourcesMonitoringAPI = &ResourcesMonitoringAPI{
Log: opts.Log,
AgentID: opts.AgentID,
Database: opts.Database,
AgentID: opts.AgentID,
WorkspaceID: opts.WorkspaceID,
Clock: opts.Clock,
Database: opts.Database,
NotificationsEnqueuer: opts.NotificationsEnqueuer,
Debounce: 5 * time.Minute,
Config: resourcesmonitor.Config{
NumDatapoints: 20,
CollectionInterval: 10 * time.Second,
Alert: resourcesmonitor.AlertConfig{
MinimumNOKsPercent: 20,
ConsecutiveNOKsPercent: 50,
},
},
}
api.StatsAPI = &StatsAPI{
+198 -9
View File
@@ -4,20 +4,35 @@ import (
"context"
"database/sql"
"errors"
"fmt"
"time"
"golang.org/x/xerrors"
"cdr.dev/slog"
"github.com/google/uuid"
"cdr.dev/slog"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/quartz"
)
// ResourcesMonitoringAPI holds the dependencies and tuning values used by the
// agent resource-monitoring RPCs (GetResourcesMonitoringConfiguration and
// PushResourcesMonitoringUsage).
//
// NOTE(review): this listing appears to be a rendered diff without +/-
// markers, so AgentID, Database and Log each show up twice (old and new
// placement) — confirm against the real file.
type ResourcesMonitoringAPI struct {
AgentID uuid.UUID
Database database.Store
Log slog.Logger
// AgentID selects which agent's memory and volume monitors are fetched and updated.
AgentID uuid.UUID
// WorkspaceID is used to load the workspace whose name and owner go into notifications.
WorkspaceID uuid.UUID
Log slog.Logger
// Clock supplies the current time for debounce checks, update timestamps
// and the notification timestamp.
Clock quartz.Clock
Database database.Store
// NotificationsEnqueuer receives the out-of-memory and out-of-disk notifications.
NotificationsEnqueuer notifications.Enqueuer
// Debounce is the duration handed to each monitor's Debounce check.
Debounce time.Duration
// Config provides the collection settings reported back to the agent and
// is passed to resourcesmonitor.NextState when evaluating datapoints.
Config resourcesmonitor.Config
}
func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
@@ -33,8 +48,8 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
return &proto.GetResourcesMonitoringConfigurationResponse{
Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
CollectionIntervalSeconds: 10,
NumDatapoints: 20,
CollectionIntervalSeconds: int32(a.Config.CollectionInterval.Seconds()),
NumDatapoints: a.Config.NumDatapoints,
},
Memory: func() *proto.GetResourcesMonitoringConfigurationResponse_Memory {
if memoryErr != nil {
@@ -60,8 +75,182 @@ func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context
}
// PushResourcesMonitoringUsage hands the datapoints in req to the memory
// monitor and then to the volume monitors. A failure in one does not prevent
// the other from running; failures are combined with errors.Join and returned
// alongside an empty response.
//
// NOTE(review): this listing appears to be a rendered diff without +/-
// markers. The log call and the bare `return ..., nil` below look like the
// removed lines of the previous implementation — confirm against the real
// file before relying on the statement order shown here.
func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
a.Log.Info(ctx, "resources monitoring usage received",
slog.F("request", req))
// err accumulates the failures of both monitors.
var err error
return &proto.PushResourcesMonitoringUsageResponse{}, nil
if memoryErr := a.monitorMemory(ctx, req.Datapoints); memoryErr != nil {
err = errors.Join(err, xerrors.Errorf("monitor memory: %w", memoryErr))
}
if volumeErr := a.monitorVolumes(ctx, req.Datapoints); volumeErr != nil {
err = errors.Join(err, xerrors.Errorf("monitor volume: %w", volumeErr))
}
return &proto.PushResourcesMonitoringUsageResponse{}, err
}
// monitorMemory evaluates the agent's memory monitor against the pushed
// datapoints, stores the resulting state and debounce deadline, and enqueues
// an out-of-memory notification for the workspace owner when the monitor's
// debounce check reports that one is due.
//
// A missing or disabled monitor is not an error; in both cases nothing is
// written and nil is returned.
func (a *ResourcesMonitoringAPI) monitorMemory(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
	monitor, err := a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, a.AgentID)
	switch {
	case errors.Is(err, sql.ErrNoRows):
		// An agent is allowed to have no memory monitor configured.
		return nil
	case err != nil:
		return xerrors.Errorf("fetch memory resource monitor: %w", err)
	case !monitor.Enabled:
		return nil
	}

	// Pull the memory reading out of every datapoint, keeping positions so
	// that a datapoint without a reading stays a nil entry.
	memoryUsages := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage, len(datapoints))
	for i := range datapoints {
		memoryUsages[i] = datapoints[i].Memory
	}

	previous := monitor.State
	next := resourcesmonitor.NextState(
		a.Config,
		previous,
		resourcesmonitor.CalculateMemoryUsageStates(monitor, memoryUsages),
	)
	debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), previous, next)

	// The state is persisted before any notification is attempted.
	//nolint:gocritic // We need to be able to update the resource monitor here.
	if err := a.Database.UpdateMemoryResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateMemoryResourceMonitorParams{
		AgentID:        a.AgentID,
		State:          next,
		UpdatedAt:      dbtime.Time(a.Clock.Now()),
		DebouncedUntil: dbtime.Time(debouncedUntil),
	}); err != nil {
		return xerrors.Errorf("update workspace monitor: %w", err)
	}

	if !shouldNotify {
		return nil
	}

	workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
	if err != nil {
		return xerrors.Errorf("get workspace by id: %w", err)
	}

	labels := map[string]string{
		"workspace": workspace.Name,
		"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
	}
	data := map[string]any{
		// NOTE(DanielleMaywood):
		// The enqueuer deduplicates notifications within a single day, so
		// a second OOM notification for the same workspace on the same
		// day would be dropped. We inject a timestamp so that each
		// notification looks different enough to get past that
		// deduplication.
		"timestamp": a.Clock.Now(),
	}

	if _, err := a.NotificationsEnqueuer.EnqueueWithData(
		// nolint:gocritic // We need to be able to send the notification.
		dbauthz.AsNotifier(ctx),
		workspace.OwnerID,
		notifications.TemplateWorkspaceOutOfMemory,
		labels,
		data,
		"workspace-monitor-memory",
	); err != nil {
		return xerrors.Errorf("notify workspace OOM: %w", err)
	}

	return nil
}
// monitorVolumes evaluates every enabled volume monitor of the agent against
// the pushed datapoints, stores each monitor's new state and debounce
// deadline, and sends one out-of-disk notification to the workspace owner
// listing every volume whose debounce check reports a notification is due.
//
// A failed update for one monitor does not abort the pass. The remaining
// monitors are still processed, and volumes whose new state was already
// stored are still reported; returning early would drop their notification
// even though their debounce deadline has been persisted. A volume whose
// update failed is left out of the notification so that it can be picked up
// by a later push. All failures are returned together via errors.Join.
func (a *ResourcesMonitoringAPI) monitorVolumes(ctx context.Context, datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint) error {
	volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, a.AgentID)
	if err != nil {
		return xerrors.Errorf("fetch volume resource monitors: %w", err)
	}

	// errs accumulates per-monitor failures; it stays nil when everything
	// succeeds.
	var errs error
	outOfDiskVolumes := make([]map[string]any, 0)

	for _, monitor := range volumeMonitors {
		if !monitor.Enabled {
			continue
		}

		// For each datapoint pick the reading that belongs to this
		// monitor's path. A datapoint without one contributes a nil entry
		// so that positions line up with the datapoints.
		usageDatapoints := make([]*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(datapoints))
		for _, datapoint := range datapoints {
			var usage *proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage
			for _, volume := range datapoint.Volumes {
				if volume.Volume == monitor.Path {
					usage = volume
					break
				}
			}
			usageDatapoints = append(usageDatapoints, usage)
		}

		usageStates := resourcesmonitor.CalculateVolumeUsageStates(monitor, usageDatapoints)

		oldState := monitor.State
		newState := resourcesmonitor.NextState(a.Config, oldState, usageStates)

		debouncedUntil, shouldNotify := monitor.Debounce(a.Debounce, a.Clock.Now(), oldState, newState)

		//nolint:gocritic // We need to be able to update the resource monitor here.
		if err := a.Database.UpdateVolumeResourceMonitor(dbauthz.AsResourceMonitor(ctx), database.UpdateVolumeResourceMonitorParams{
			AgentID:        a.AgentID,
			Path:           monitor.Path,
			State:          newState,
			UpdatedAt:      dbtime.Time(a.Clock.Now()),
			DebouncedUntil: dbtime.Time(debouncedUntil),
		}); err != nil {
			errs = errors.Join(errs, xerrors.Errorf("update workspace monitor: %w", err))
			continue
		}

		// Only report volumes whose state and debounce deadline were
		// actually stored.
		if shouldNotify {
			outOfDiskVolumes = append(outOfDiskVolumes, map[string]any{
				"path":      monitor.Path,
				"threshold": fmt.Sprintf("%d%%", monitor.Threshold),
			})
		}
	}

	if len(outOfDiskVolumes) == 0 {
		return errs
	}

	workspace, err := a.Database.GetWorkspaceByID(ctx, a.WorkspaceID)
	if err != nil {
		return errors.Join(errs, xerrors.Errorf("get workspace by id: %w", err))
	}

	if _, err := a.NotificationsEnqueuer.EnqueueWithData(
		// nolint:gocritic // We need to be able to send the notification.
		dbauthz.AsNotifier(ctx),
		workspace.OwnerID,
		notifications.TemplateWorkspaceOutOfDisk,
		map[string]string{
			"workspace": workspace.Name,
		},
		map[string]any{
			"volumes": outOfDiskVolumes,
			// NOTE(DanielleMaywood):
			// The enqueuer deduplicates notifications within a single
			// day, so a second OOD notification for the same workspace
			// on the same day would be dropped. We inject a timestamp so
			// that each notification looks different enough to get past
			// that deduplication.
			"timestamp": a.Clock.Now(),
		},
		"workspace-monitor-volumes",
	); err != nil {
		return errors.Join(errs, xerrors.Errorf("notify workspace OOD: %w", err))
	}

	return errs
}
@@ -0,0 +1,944 @@
package agentapi_test
import (
"context"
"testing"
"time"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/timestamppb"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
"github.com/coder/quartz"
)
// resourceMonitorAPI builds a ResourcesMonitoringAPI backed by a test
// database that has been seeded with a user, organization, template,
// workspace, build and agent. It returns the API together with the workspace
// owner, the mock clock driving the API, and the fake enqueuer that records
// the notifications it sends.
func resourceMonitorAPI(t *testing.T) (*agentapi.ResourcesMonitoringAPI, database.User, *quartz.Mock, *notificationstest.FakeEnqueuer) {
	t.Helper()

	db, _ := dbtestutil.NewDB(t)

	// Seed the chain of rows an agent hangs off, in dependency order.
	owner := dbgen.User(t, db, database.User{})
	organization := dbgen.Organization(t, db, database.Organization{})
	tpl := dbgen.Template(t, db, database.Template{
		OrganizationID: organization.ID,
		CreatedBy:      owner.ID,
	})
	tplVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
		TemplateID:     uuid.NullUUID{Valid: true, UUID: tpl.ID},
		OrganizationID: organization.ID,
		CreatedBy:      owner.ID,
	})
	ws := dbgen.Workspace(t, db, database.WorkspaceTable{
		OrganizationID: organization.ID,
		TemplateID:     tpl.ID,
		OwnerID:        owner.ID,
	})
	provisionerJob := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
		Type: database.ProvisionerJobTypeWorkspaceBuild,
	})
	wsBuild := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
		JobID:             provisionerJob.ID,
		WorkspaceID:       ws.ID,
		TemplateVersionID: tplVersion.ID,
	})
	wsResource := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{
		JobID: wsBuild.JobID,
	})
	wsAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
		ResourceID: wsResource.ID,
	})

	enqueuer := &notificationstest.FakeEnqueuer{}
	mockClock := quartz.NewMock(t)

	monitorConfig := resourcesmonitor.Config{
		NumDatapoints:      20,
		CollectionInterval: 10 * time.Second,
		Alert: resourcesmonitor.AlertConfig{
			MinimumNOKsPercent:     20,
			ConsecutiveNOKsPercent: 50,
		},
	}

	api := &agentapi.ResourcesMonitoringAPI{
		AgentID:               wsAgent.ID,
		WorkspaceID:           ws.ID,
		Clock:                 mockClock,
		Database:              db,
		NotificationsEnqueuer: enqueuer,
		Config:                monitorConfig,
		Debounce:              1 * time.Minute,
	}

	return api, owner, mockClock, enqueuer
}
// TestMemoryResourceMonitorDebounce checks that once the memory monitor has
// alerted, no further out-of-memory notification is sent until the debounce
// period has passed.
func TestMemoryResourceMonitorDebounce(t *testing.T) {
	t.Parallel()

	// Scenario walked through below:
	//
	// 1. OK -> NOK  |> sends a notification
	// 2. NOK -> OK  |> does nothing
	// 3. OK -> NOK  |> does nothing due to debounce period
	// 4. NOK -> OK  |> does nothing
	// 5. OK -> NOK  |> sends a notification as debounce period exceeded

	api, user, clock, notifyEnq := resourceMonitorAPI(t)
	api.Config.Alert.ConsecutiveNOKsPercent = 100

	// pushUsage reports one memory datapoint, collected at the current
	// clock time, with the given usage out of a total of 10.
	pushUsage := func(used int64) {
		t.Helper()
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  used,
						Total: 10,
					},
				},
			},
		})
		require.NoError(t, err)
	}

	// requireNotifications asserts how many out-of-memory notifications
	// have been recorded and, when there is one, that it went to the
	// workspace owner.
	requireNotifications := func(want int) {
		t.Helper()
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
		require.Len(t, sent, want)
		if want > 0 {
			require.Equal(t, user.ID, sent[0].UserID)
		}
	}

	// Given: A monitor in an OK state
	dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
		AgentID:   api.AgentID,
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})

	// When: The monitor is given a state that will trigger NOK
	pushUsage(10)

	// Then: We expect there to be a notification sent
	requireNotifications(1)
	notifyEnq.Clear()

	// When: The monitor moves to an OK state from NOK
	clock.Advance(api.Debounce / 4)
	pushUsage(1)

	// Then: We expect no new notifications
	requireNotifications(0)
	notifyEnq.Clear()

	// When: The monitor moves back to a NOK state before the debounced time.
	clock.Advance(api.Debounce / 4)
	pushUsage(10)

	// Then: We expect no new notifications (showing the debouncer working)
	requireNotifications(0)
	notifyEnq.Clear()

	// When: The monitor moves back to an OK state from NOK
	clock.Advance(api.Debounce / 4)
	pushUsage(1)

	// Then: We still expect no new notifications
	requireNotifications(0)
	notifyEnq.Clear()

	// When: The monitor moves back to a NOK state after the debounce period.
	clock.Advance(api.Debounce/4 + 1*time.Second)
	pushUsage(10)

	// Then: We expect a notification
	requireNotifications(1)
}
// TestMemoryResourceMonitor pushes a series of memory datapoints at a monitor
// with an 80% threshold and checks, for each starting state, whether an
// out-of-memory notification is sent and which state the monitor is left in.
func TestMemoryResourceMonitor(t *testing.T) {
	t.Parallel()

	// Usage series shared by the OK and NOK starting-state cases. Every
	// series is reported against the same total (memoryTotal below).
	var (
		neverExceeds      = []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2}
		singleSpike       = []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2}
		consecutiveSpikes = []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9}
		scatteredSpikes   = []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9}
	)

	tests := []struct {
		name          string
		memoryUsage   []int64
		memoryTotal   int64
		previousState database.WorkspaceAgentMonitorState
		expectState   database.WorkspaceAgentMonitorState
		shouldNotify  bool
	}{
		{
			name:          "WhenOK/NeverExceedsThreshold",
			memoryUsage:   neverExceeds,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenOK/ShouldStayInOK",
			memoryUsage:   singleSpike,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenOK/ConsecutiveExceedsThreshold",
			memoryUsage:   consecutiveSpikes,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  true,
		},
		{
			name:          "WhenOK/MinimumExceedsThreshold",
			memoryUsage:   scatteredSpikes,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  true,
		},
		{
			name:          "WhenNOK/NeverExceedsThreshold",
			memoryUsage:   neverExceeds,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/ShouldStayInNOK",
			memoryUsage:   singleSpike,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/ConsecutiveExceedsThreshold",
			memoryUsage:   consecutiveSpikes,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/MinimumExceedsThreshold",
			memoryUsage:   scatteredSpikes,
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
	}

	for _, tt := range tests {
		tt := tt

		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			api, user, clock, notifyEnq := resourceMonitorAPI(t)

			// Space the datapoints 15 seconds apart, starting after the
			// current clock time.
			datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.memoryUsage))
			collectedAt := clock.Now()
			for _, usage := range tt.memoryUsage {
				collectedAt = collectedAt.Add(15 * time.Second)
				datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
					CollectedAt: timestamppb.New(collectedAt),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  usage,
						Total: tt.memoryTotal,
					},
				})
			}

			dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
				AgentID:   api.AgentID,
				State:     tt.previousState,
				Threshold: 80,
			})

			// Move the clock to the time of the last datapoint before
			// pushing.
			clock.Set(collectedAt)
			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
				Datapoints: datapoints,
			})
			require.NoError(t, err)

			sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
			if tt.shouldNotify {
				require.Len(t, sent, 1)
				require.Equal(t, user.ID, sent[0].UserID)
			} else {
				require.Len(t, sent, 0)
			}

			// The table declares the state each case should end in, so
			// read the monitor back and check it.
			monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
			require.NoError(t, err)
			require.Equal(t, tt.expectState, monitor.State)
		})
	}
}
func TestMemoryResourceMonitorMissingData(t *testing.T) {
t.Parallel()
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
t.Parallel()
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in an OK state.
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two NOK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Memory: nil,
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 0)
// Then: We expect the monitor to still be in an OK state.
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitor.State)
})
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
t.Parallel()
api, _, clock, _ := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in a NOK state.
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateNOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two OK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Memory: nil,
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect the monitor to still be in a NOK state.
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitor.State)
})
}
// TestVolumeResourceMonitorDebounce checks that the debounce period is
// tracked per volume monitor by alternating which of two monitors is over
// its threshold on every push.
func TestVolumeResourceMonitorDebounce(t *testing.T) {
	t.Parallel()

	// This test is an even longer one. We're testing
	// that the debounce logic is independent per
	// volume monitor. We interleave the triggering
	// of each monitor to ensure the debounce logic
	// is monitor independent.
	//
	// First Monitor:
	// 1. OK -> NOK  |> sends a notification
	// 2. NOK -> OK  |> does nothing
	// 3. OK -> NOK  |> does nothing due to debounce period
	// 4. NOK -> OK  |> does nothing
	// 5. OK -> NOK  |> sends a notification as debounce period exceeded
	// 6. NOK -> OK  |> does nothing
	//
	// Second Monitor:
	// 1. OK -> OK   |> does nothing
	// 2. OK -> NOK  |> sends a notification
	// 3. NOK -> OK  |> does nothing
	// 4. OK -> NOK  |> does nothing due to debounce period
	// 5. NOK -> OK  |> does nothing
	// 6. OK -> NOK  |> sends a notification as debounce period exceeded
	//

	firstVolumePath := "/home/coder"
	secondVolumePath := "/dev/coder"

	api, _, clock, notifyEnq := resourceMonitorAPI(t)

	// pushUsage reports one datapoint, collected at the current clock
	// time, carrying the given usage (out of 10) for each volume.
	pushUsage := func(firstUsed, secondUsed int64) {
		t.Helper()
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
						{Volume: firstVolumePath, Used: firstUsed, Total: 10},
						{Volume: secondVolumePath, Used: secondUsed, Total: 10},
					},
				},
			},
		})
		require.NoError(t, err)
	}

	// requireNotifiedFor asserts that exactly one out-of-disk notification
	// was recorded and that it lists only the given volume.
	requireNotifiedFor := func(wantPath string) {
		t.Helper()
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
		require.Len(t, sent, 1)
		volumes := requireVolumeData(t, sent[0])
		require.Len(t, volumes, 1)
		require.Equal(t, wantPath, volumes[0]["path"])
	}

	// requireNoNotifications asserts that no out-of-disk notification was
	// recorded.
	requireNoNotifications := func() {
		t.Helper()
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
		require.Len(t, sent, 0)
	}

	// Given:
	//  - First monitor in an OK state
	//  - Second monitor in an OK state
	dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
		AgentID:   api.AgentID,
		Path:      firstVolumePath,
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})
	dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
		AgentID: api.AgentID,
		Path:    secondVolumePath,
		// The scenario above starts the second monitor in OK (step 1 is
		// OK -> OK), so seed it that way.
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})

	// When:
	//  - First monitor is in a NOK state
	//  - Second monitor is in an OK state
	pushUsage(10, 1)

	// Then:
	//  - We expect a notification from only the first monitor
	requireNotifiedFor(firstVolumePath)
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to OK
	//  - Second monitor moves to NOK
	clock.Advance(api.Debounce / 4)
	pushUsage(1, 10)

	// Then:
	//  - We expect a notification from only the second monitor
	requireNotifiedFor(secondVolumePath)
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to NOK before debounce period has ended
	//  - Second monitor moves back to OK
	clock.Advance(api.Debounce / 4)
	pushUsage(10, 1)

	// Then:
	//  - We expect no new notifications
	requireNoNotifications()
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to OK
	//  - Second monitor moves back to NOK
	clock.Advance(api.Debounce / 4)
	pushUsage(1, 10)

	// Then:
	//  - We expect no new notifications.
	requireNoNotifications()
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to a NOK state after the debounce period
	//  - Second monitor moves back to OK
	clock.Advance(api.Debounce/4 + 1*time.Second)
	pushUsage(10, 1)

	// Then:
	//  - We expect a notification from only the first monitor
	requireNotifiedFor(firstVolumePath)
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to OK
	//  - Second monitor moves back to NOK after the debounce period
	clock.Advance(api.Debounce/4 + 1*time.Second)
	pushUsage(1, 10)

	// Then:
	//  - We expect a notification from only the second monitor
	requireNotifiedFor(secondVolumePath)
}
// TestVolumeResourceMonitor pushes a series of usage datapoints for a single
// volume at a monitor and checks, for each starting state, whether an
// out-of-disk notification is sent and which state the monitor is left in.
func TestVolumeResourceMonitor(t *testing.T) {
	t.Parallel()

	// Usage series shared by the OK and NOK starting-state cases. Every
	// series is reported against the same total (volumeTotal below).
	var (
		neverExceeds      = []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2}
		singleSpike       = []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2}
		consecutiveSpikes = []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9}
		scatteredSpikes   = []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9}
	)

	tests := []struct {
		name             string
		volumePath       string
		volumeUsage      []int64
		volumeTotal      int64
		thresholdPercent int32
		previousState    database.WorkspaceAgentMonitorState
		expectState      database.WorkspaceAgentMonitorState
		shouldNotify     bool
	}{
		{
			name:             "WhenOK/NeverExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      neverExceeds,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenOK/ShouldStayInOK",
			volumePath:       "/home/coder",
			volumeUsage:      singleSpike,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenOK/ConsecutiveExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      consecutiveSpikes,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     true,
		},
		{
			name:             "WhenOK/MinimumExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      scatteredSpikes,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     true,
		},
		{
			name:             "WhenNOK/NeverExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      neverExceeds,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/ShouldStayInNOK",
			volumePath:       "/home/coder",
			volumeUsage:      singleSpike,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/ConsecutiveExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      consecutiveSpikes,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/MinimumExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      scatteredSpikes,
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
	}

	for _, tt := range tests {
		tt := tt

		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			api, user, clock, notifyEnq := resourceMonitorAPI(t)

			// Space the datapoints 15 seconds apart, starting after the
			// current clock time. Each one carries a single reading for
			// the volume under test.
			datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.volumeUsage))
			collectedAt := clock.Now()
			for _, volumeUsage := range tt.volumeUsage {
				collectedAt = collectedAt.Add(15 * time.Second)

				volumeDatapoints := []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
					{
						Volume: tt.volumePath,
						Used:   volumeUsage,
						Total:  tt.volumeTotal,
					},
				}

				datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
					CollectedAt: timestamppb.New(collectedAt),
					Volumes:     volumeDatapoints,
				})
			}

			dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
				AgentID:   api.AgentID,
				Path:      tt.volumePath,
				State:     tt.previousState,
				Threshold: tt.thresholdPercent,
			})

			// Move the clock to the time of the last datapoint before
			// pushing.
			clock.Set(collectedAt)
			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
				Datapoints: datapoints,
			})
			require.NoError(t, err)

			sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
			if tt.shouldNotify {
				require.Len(t, sent, 1)
				require.Equal(t, user.ID, sent[0].UserID)
			} else {
				require.Len(t, sent, 0)
			}

			// The table declares the state each case should end in, so
			// read the only monitor of this agent back and check it.
			monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
			require.NoError(t, err)
			require.Len(t, monitors, 1)
			require.Equal(t, tt.expectState, monitors[0].State)
		})
	}
}
// TestVolumeResourceMonitorMultiple checks that when two volume monitors of
// the same agent cross their threshold in a single push, one out-of-disk
// notification is sent and its data lists both volumes.
func TestVolumeResourceMonitorMultiple(t *testing.T) {
	t.Parallel()

	api, _, clock, notifyEnq := resourceMonitorAPI(t)
	api.Config.Alert.ConsecutiveNOKsPercent = 100

	// The order here is both the insertion order of the monitors and the
	// order asserted on the notification payload below.
	monitoredPaths := []string{"/home/coder", "/dev/coder"}

	// Given: two different volume resource monitors, both starting OK, and a
	// completely full usage reading prepared for each of them.
	fullUsages := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(monitoredPaths))
	for _, path := range monitoredPaths {
		dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
			AgentID:   api.AgentID,
			Path:      path,
			State:     database.WorkspaceAgentMonitorStateOK,
			Threshold: 80,
		})
		fullUsages = append(fullUsages, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
			Volume: path,
			Used:   10,
			Total:  10,
		})
	}

	// When: a single datapoint moves both of them to a NOK state.
	request := &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Volumes:     fullUsages,
			},
		},
	}
	_, err := api.PushResourcesMonitoringUsage(context.Background(), request)
	require.NoError(t, err)

	// Then: exactly one notification is sent, carrying both volumes.
	sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
	require.Len(t, sent, 1)

	volumes := requireVolumeData(t, sent[0])
	require.Len(t, volumes, 2)
	for idx, path := range monitoredPaths {
		require.Equal(t, path, volumes[idx]["path"])
	}
}
func TestVolumeResourceMonitorMissingData(t *testing.T) {
t.Parallel()
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
t.Parallel()
volumePath := "/home/coder"
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in an OK state.
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: volumePath,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two NOK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 10,
Total: 10,
},
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 10,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 0)
// Then: We expect the monitor to still be in an OK state.
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Len(t, monitors, 1)
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitors[0].State)
})
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
t.Parallel()
volumePath := "/home/coder"
api, _, clock, _ := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in a NOK state.
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: volumePath,
State: database.WorkspaceAgentMonitorStateNOK,
Threshold: 80,
})
// When: A datapoint is missing, surrounded by two OK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 1,
Total: 10,
},
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 1,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect the monitor to still be in a NOK state.
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Len(t, monitors, 1)
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitors[0].State)
})
}
// requireVolumeData returns the "volumes" entry of the notification's data as
// a slice of maps, failing the test if the entry does not have that type.
func requireVolumeData(t *testing.T, notif *notificationstest.FakeNotification) []map[string]any {
	t.Helper()

	rawVolumes := notif.Data["volumes"]
	// The type is asserted through require first so that a mismatch is
	// reported as a test failure before the conversion below is reached.
	require.IsType(t, []map[string]any{}, rawVolumes)

	return rawVolumes.([]map[string]any)
}
@@ -0,0 +1,129 @@
package resourcesmonitor
import (
"math"
"time"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/util/slice"
)
// State is the classification of a single usage datapoint relative to a
// monitor's threshold.
type State int
const (
// StateOK marks a datapoint whose usage is below the threshold.
StateOK State = iota
// StateNOK marks a datapoint whose usage is at or above the threshold.
StateNOK
// StateUnknown marks a datapoint for which no usage was available.
StateUnknown
)
// AlertConfig holds the percentages NextState compares against when deciding
// whether a monitor should be in the alert (NOK) state. Both percentages are
// measured against the total number of datapoints in the window.
type AlertConfig struct {
// ConsecutiveNOKsPercent is the percentage of datapoints that must be
// NOK in a row to put the monitor in an alert state.
ConsecutiveNOKsPercent int
// MinimumNOKsPercent is the percentage of datapoints in the window that
// must be NOK, consecutive or not, to put the monitor in an alert state.
MinimumNOKsPercent int
}
// Config describes how resource usage is collected and when it should alert.
type Config struct {
// NumDatapoints is how many datapoints the agent should send.
NumDatapoints int32
// CollectionInterval is how long the agent should wait between
// collecting each datapoint.
CollectionInterval time.Duration
// Alert holds the thresholds used to move a monitor into the alert state.
Alert AlertConfig
}
// CalculateMemoryUsageStates classifies each memory datapoint against the
// monitor's threshold and returns one State per datapoint, in input order.
//
// A nil datapoint yields StateUnknown. A datapoint whose Total is not positive
// also yields StateUnknown: dividing by it would produce NaN or an infinity,
// and converting those to an integer is implementation-dependent in Go, so
// the resulting state would differ between platforms.
//
// Otherwise usage is Used/Total expressed as a whole percentage (truncated
// towards zero); the datapoint is StateOK when that percentage is strictly
// below monitor.Threshold and StateNOK when it is equal or above.
func CalculateMemoryUsageStates(
	monitor database.WorkspaceAgentMemoryResourceMonitor,
	datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage,
) []State {
	states := make([]State, 0, len(datapoints))

	for _, datapoint := range datapoints {
		state := StateUnknown

		// Only classify datapoints that carry a usable total; everything
		// else stays unknown rather than guessing.
		if datapoint != nil && datapoint.Total > 0 {
			usedPercent := int32(float64(datapoint.Used) / float64(datapoint.Total) * 100)

			if usedPercent < monitor.Threshold {
				state = StateOK
			} else {
				state = StateNOK
			}
		}

		states = append(states, state)
	}

	return states
}
// CalculateVolumeUsageStates classifies each volume datapoint against the
// monitor's threshold and returns one State per datapoint, in input order.
//
// A nil datapoint yields StateUnknown. A datapoint whose Total is not positive
// also yields StateUnknown: dividing by it would produce NaN or an infinity,
// and converting those to an integer is implementation-dependent in Go, so
// the resulting state would differ between platforms.
//
// Otherwise usage is Used/Total expressed as a whole percentage (truncated
// towards zero); the datapoint is StateOK when that percentage is strictly
// below monitor.Threshold and StateNOK when it is equal or above.
func CalculateVolumeUsageStates(
	monitor database.WorkspaceAgentVolumeResourceMonitor,
	datapoints []*proto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage,
) []State {
	states := make([]State, 0, len(datapoints))

	for _, datapoint := range datapoints {
		state := StateUnknown

		// Only classify datapoints that carry a usable total; everything
		// else stays unknown rather than guessing.
		if datapoint != nil && datapoint.Total > 0 {
			usedPercent := int32(float64(datapoint.Used) / float64(datapoint.Total) * 100)

			if usedPercent < monitor.Threshold {
				state = StateOK
			} else {
				state = StateNOK
			}
		}

		states = append(states, state)
	}

	return states
}
// NextState derives the monitor state that should follow oldState, given the
// per-datapoint states of the latest window.
//
// The rules are applied in order:
//  1. enough NOK datapoints in a row (c.Alert.ConsecutiveNOKsPercent of the
//     window) => NOK;
//  2. enough NOK datapoints overall (c.Alert.MinimumNOKsPercent of the
//     window) => NOK;
//  3. every datapoint is OK => OK;
//  4. anything else => oldState is kept.
func NextState(c Config, oldState database.WorkspaceAgentMonitorState, states []State) database.WorkspaceAgentMonitorState {
	windowSize := len(states)

	// NOTE(review): presumably the length of the longest run of NOK states;
	// confirm against slice.CountConsecutive.
	nokRun := slice.CountConsecutive(StateNOK, states...)
	if percent(nokRun, windowSize) >= c.Alert.ConsecutiveNOKsPercent {
		return database.WorkspaceAgentMonitorStateNOK
	}

	// StateUnknown is deliberately not counted either way: had collection
	// succeeded it could have been OK or NOK, so the best we can do is to
	// leave it out of both tallies.
	var oks, noks int
	for idx := range states {
		if states[idx] == StateOK {
			oks++
		} else if states[idx] == StateNOK {
			noks++
		}
	}

	switch {
	case percent(noks, windowSize) >= c.Alert.MinimumNOKsPercent:
		// Enough of the window is NOK to alert.
		return database.WorkspaceAgentMonitorStateNOK
	case oks == windowSize:
		// Only a window made up entirely of OK datapoints clears the alert.
		return database.WorkspaceAgentMonitorStateOK
	default:
		// Not enough evidence to change anything.
		return oldState
	}
}
// percent returns numerator as a percentage of denominator, rounded to the
// nearest whole number (math.Round, so halves round away from zero).
//
// A zero denominator yields 0. Without that guard the division produces NaN
// or an infinity, and converting either to int is implementation-dependent.
func percent[T int](numerator, denominator T) int {
	if denominator == 0 {
		return 0
	}

	percent := float64(numerator*100) / float64(denominator)
	return int(math.Round(percent))
}
+40
View File
@@ -289,6 +289,24 @@ var (
Scope: rbac.ScopeAll,
}.WithCachedASTValue()
// subjectResourceMonitor is the actor installed into a context by
// AsResourceMonitor. Its single role carries one site-level permission, the
// update action on workspace agent resource monitors, and no org- or
// user-level permissions.
subjectResourceMonitor = rbac.Subject{
FriendlyName: "Resource Monitor",
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
{
Identifier: rbac.RoleIdentifier{Name: "resourcemonitor"},
DisplayName: "Resource Monitor",
Site: rbac.Permissions(map[string][]policy.Action{
// The workspace monitor needs to be able to update monitors
rbac.ResourceWorkspaceAgentResourceMonitor.Type: {policy.ActionUpdate},
}),
Org: map[string][]rbac.Permission{},
User: []rbac.Permission{},
},
}),
Scope: rbac.ScopeAll,
}.WithCachedASTValue()
subjectSystemRestricted = rbac.Subject{
FriendlyName: "System",
ID: uuid.Nil.String(),
@@ -376,6 +394,12 @@ func AsNotifier(ctx context.Context) context.Context {
return context.WithValue(ctx, authContextKey{}, subjectNotifier)
}
// AsResourceMonitor derives a context from ctx whose actor is the resource
// monitor subject, which holds the permissions required for updating
// resource monitors.
func AsResourceMonitor(ctx context.Context) context.Context {
	actorCtx := context.WithValue(ctx, authContextKey{}, subjectResourceMonitor)
	return actorCtx
}
// AsSystemRestricted returns a context with an actor that has permissions
// required for various system operations (login, logout, metrics cache).
func AsSystemRestricted(ctx context.Context) context.Context {
@@ -3677,6 +3701,14 @@ func (q *querier) UpdateMemberRoles(ctx context.Context, arg database.UpdateMemb
return q.db.UpdateMemberRoles(ctx, arg)
}
// UpdateMemoryResourceMonitor forwards the update to the wrapped store, but
// only after the actor in ctx has been authorized for the update action on
// workspace agent resource monitors. An authorization failure is returned
// unchanged and the store is not called.
func (q *querier) UpdateMemoryResourceMonitor(ctx context.Context, arg database.UpdateMemoryResourceMonitorParams) error {
	authErr := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceWorkspaceAgentResourceMonitor)
	if authErr != nil {
		return authErr
	}

	return q.db.UpdateMemoryResourceMonitor(ctx, arg)
}
func (q *querier) UpdateNotificationTemplateMethodByID(ctx context.Context, arg database.UpdateNotificationTemplateMethodByIDParams) (database.NotificationTemplate, error) {
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceNotificationTemplate); err != nil {
return database.NotificationTemplate{}, err
@@ -4073,6 +4105,14 @@ func (q *querier) UpdateUserStatus(ctx context.Context, arg database.UpdateUserS
return updateWithReturn(q.log, q.auth, fetch, q.db.UpdateUserStatus)(ctx, arg)
}
// UpdateVolumeResourceMonitor forwards the update to the wrapped store, but
// only after the actor in ctx has been authorized for the update action on
// workspace agent resource monitors. An authorization failure is returned
// unchanged and the store is not called.
func (q *querier) UpdateVolumeResourceMonitor(ctx context.Context, arg database.UpdateVolumeResourceMonitorParams) error {
	authErr := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceWorkspaceAgentResourceMonitor)
	if authErr != nil {
		return authErr
	}

	return q.db.UpdateVolumeResourceMonitor(ctx, arg)
}
func (q *querier) UpdateWorkspace(ctx context.Context, arg database.UpdateWorkspaceParams) (database.WorkspaceTable, error) {
fetch := func(ctx context.Context, arg database.UpdateWorkspaceParams) (database.WorkspaceTable, error) {
w, err := q.db.GetWorkspaceByID(ctx, arg.ID)
+56 -45
View File
@@ -4725,43 +4725,78 @@ func (s *MethodTestSuite) TestOAuth2ProviderAppTokens() {
}
func (s *MethodTestSuite) TestResourcesMonitor() {
s.Run("InsertMemoryResourceMonitor", s.Subtest(func(db database.Store, check *expects) {
dbtestutil.DisableForeignKeysAndTriggers(s.T(), db)
check.Args(database.InsertMemoryResourceMonitorParams{}).Asserts(rbac.ResourceWorkspaceAgentResourceMonitor, policy.ActionCreate)
}))
createAgent := func(t *testing.T, db database.Store) (database.WorkspaceAgent, database.WorkspaceTable) {
t.Helper()
s.Run("InsertVolumeResourceMonitor", s.Subtest(func(db database.Store, check *expects) {
dbtestutil.DisableForeignKeysAndTriggers(s.T(), db)
check.Args(database.InsertVolumeResourceMonitorParams{}).Asserts(rbac.ResourceWorkspaceAgentResourceMonitor, policy.ActionCreate)
}))
s.Run("FetchMemoryResourceMonitorsByAgentID", s.Subtest(func(db database.Store, check *expects) {
u := dbgen.User(s.T(), db, database.User{})
o := dbgen.Organization(s.T(), db, database.Organization{})
tpl := dbgen.Template(s.T(), db, database.Template{
u := dbgen.User(t, db, database.User{})
o := dbgen.Organization(t, db, database.Organization{})
tpl := dbgen.Template(t, db, database.Template{
OrganizationID: o.ID,
CreatedBy: u.ID,
})
tv := dbgen.TemplateVersion(s.T(), db, database.TemplateVersion{
tv := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: o.ID,
CreatedBy: u.ID,
})
w := dbgen.Workspace(s.T(), db, database.WorkspaceTable{
w := dbgen.Workspace(t, db, database.WorkspaceTable{
TemplateID: tpl.ID,
OrganizationID: o.ID,
OwnerID: u.ID,
})
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{
j := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
Type: database.ProvisionerJobTypeWorkspaceBuild,
})
b := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{
b := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
JobID: j.ID,
WorkspaceID: w.ID,
TemplateVersionID: tv.ID,
})
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: b.JobID})
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
res := dbgen.WorkspaceResource(t, db, database.WorkspaceResource{JobID: b.JobID})
agt := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{ResourceID: res.ID})
return agt, w
}
s.Run("InsertMemoryResourceMonitor", s.Subtest(func(db database.Store, check *expects) {
agt, _ := createAgent(s.T(), db)
check.Args(database.InsertMemoryResourceMonitorParams{
AgentID: agt.ID,
State: database.WorkspaceAgentMonitorStateOK,
}).Asserts(rbac.ResourceWorkspaceAgentResourceMonitor, policy.ActionCreate)
}))
s.Run("InsertVolumeResourceMonitor", s.Subtest(func(db database.Store, check *expects) {
agt, _ := createAgent(s.T(), db)
check.Args(database.InsertVolumeResourceMonitorParams{
AgentID: agt.ID,
State: database.WorkspaceAgentMonitorStateOK,
}).Asserts(rbac.ResourceWorkspaceAgentResourceMonitor, policy.ActionCreate)
}))
s.Run("UpdateMemoryResourceMonitor", s.Subtest(func(db database.Store, check *expects) {
agt, _ := createAgent(s.T(), db)
check.Args(database.UpdateMemoryResourceMonitorParams{
AgentID: agt.ID,
State: database.WorkspaceAgentMonitorStateOK,
}).Asserts(rbac.ResourceWorkspaceAgentResourceMonitor, policy.ActionUpdate)
}))
s.Run("UpdateVolumeResourceMonitor", s.Subtest(func(db database.Store, check *expects) {
agt, _ := createAgent(s.T(), db)
check.Args(database.UpdateVolumeResourceMonitorParams{
AgentID: agt.ID,
State: database.WorkspaceAgentMonitorStateOK,
}).Asserts(rbac.ResourceWorkspaceAgentResourceMonitor, policy.ActionUpdate)
}))
s.Run("FetchMemoryResourceMonitorsByAgentID", s.Subtest(func(db database.Store, check *expects) {
agt, w := createAgent(s.T(), db)
dbgen.WorkspaceAgentMemoryResourceMonitor(s.T(), db, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: agt.ID,
Enabled: true,
@@ -4776,32 +4811,8 @@ func (s *MethodTestSuite) TestResourcesMonitor() {
}))
s.Run("FetchVolumesResourceMonitorsByAgentID", s.Subtest(func(db database.Store, check *expects) {
u := dbgen.User(s.T(), db, database.User{})
o := dbgen.Organization(s.T(), db, database.Organization{})
tpl := dbgen.Template(s.T(), db, database.Template{
OrganizationID: o.ID,
CreatedBy: u.ID,
})
tv := dbgen.TemplateVersion(s.T(), db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: tpl.ID, Valid: true},
OrganizationID: o.ID,
CreatedBy: u.ID,
})
w := dbgen.Workspace(s.T(), db, database.WorkspaceTable{
TemplateID: tpl.ID,
OrganizationID: o.ID,
OwnerID: u.ID,
})
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{
Type: database.ProvisionerJobTypeWorkspaceBuild,
})
b := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{
JobID: j.ID,
WorkspaceID: w.ID,
TemplateVersionID: tv.ID,
})
res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: b.JobID})
agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID})
agt, w := createAgent(s.T(), db)
dbgen.WorkspaceAgentVolumeResourceMonitor(s.T(), db, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: agt.ID,
Path: "/var/lib",
+15 -9
View File
@@ -1038,10 +1038,13 @@ func OAuth2ProviderAppToken(t testing.TB, db database.Store, seed database.OAuth
func WorkspaceAgentMemoryResourceMonitor(t testing.TB, db database.Store, seed database.WorkspaceAgentMemoryResourceMonitor) database.WorkspaceAgentMemoryResourceMonitor {
monitor, err := db.InsertMemoryResourceMonitor(genCtx, database.InsertMemoryResourceMonitorParams{
AgentID: takeFirst(seed.AgentID, uuid.New()),
Enabled: takeFirst(seed.Enabled, true),
Threshold: takeFirst(seed.Threshold, 100),
CreatedAt: takeFirst(seed.CreatedAt, dbtime.Now()),
AgentID: takeFirst(seed.AgentID, uuid.New()),
Enabled: takeFirst(seed.Enabled, true),
State: takeFirst(seed.State, database.WorkspaceAgentMonitorStateOK),
Threshold: takeFirst(seed.Threshold, 100),
CreatedAt: takeFirst(seed.CreatedAt, dbtime.Now()),
UpdatedAt: takeFirst(seed.UpdatedAt, dbtime.Now()),
DebouncedUntil: takeFirst(seed.DebouncedUntil, time.Time{}),
})
require.NoError(t, err, "insert workspace agent memory resource monitor")
return monitor
@@ -1049,11 +1052,14 @@ func WorkspaceAgentMemoryResourceMonitor(t testing.TB, db database.Store, seed d
func WorkspaceAgentVolumeResourceMonitor(t testing.TB, db database.Store, seed database.WorkspaceAgentVolumeResourceMonitor) database.WorkspaceAgentVolumeResourceMonitor {
monitor, err := db.InsertVolumeResourceMonitor(genCtx, database.InsertVolumeResourceMonitorParams{
AgentID: takeFirst(seed.AgentID, uuid.New()),
Path: takeFirst(seed.Path, "/"),
Enabled: takeFirst(seed.Enabled, true),
Threshold: takeFirst(seed.Threshold, 100),
CreatedAt: takeFirst(seed.CreatedAt, dbtime.Now()),
AgentID: takeFirst(seed.AgentID, uuid.New()),
Path: takeFirst(seed.Path, "/"),
Enabled: takeFirst(seed.Enabled, true),
State: takeFirst(seed.State, database.WorkspaceAgentMonitorStateOK),
Threshold: takeFirst(seed.Threshold, 100),
CreatedAt: takeFirst(seed.CreatedAt, dbtime.Now()),
UpdatedAt: takeFirst(seed.UpdatedAt, dbtime.Now()),
DebouncedUntil: takeFirst(seed.DebouncedUntil, time.Time{}),
})
require.NoError(t, err, "insert workspace agent volume resource monitor")
return monitor
+66 -6
View File
@@ -7989,7 +7989,16 @@ func (q *FakeQuerier) InsertMemoryResourceMonitor(_ context.Context, arg databas
q.mutex.Lock()
defer q.mutex.Unlock()
monitor := database.WorkspaceAgentMemoryResourceMonitor(arg)
//nolint:unconvert // The structs field-order differs so this is needed.
monitor := database.WorkspaceAgentMemoryResourceMonitor(database.WorkspaceAgentMemoryResourceMonitor{
AgentID: arg.AgentID,
Enabled: arg.Enabled,
State: arg.State,
Threshold: arg.Threshold,
CreatedAt: arg.CreatedAt,
UpdatedAt: arg.UpdatedAt,
DebouncedUntil: arg.DebouncedUntil,
})
q.workspaceAgentMemoryResourceMonitors = append(q.workspaceAgentMemoryResourceMonitors, monitor)
return monitor, nil
@@ -8676,11 +8685,14 @@ func (q *FakeQuerier) InsertVolumeResourceMonitor(_ context.Context, arg databas
defer q.mutex.Unlock()
monitor := database.WorkspaceAgentVolumeResourceMonitor{
AgentID: arg.AgentID,
Path: arg.Path,
Enabled: arg.Enabled,
Threshold: arg.Threshold,
CreatedAt: arg.CreatedAt,
AgentID: arg.AgentID,
Path: arg.Path,
Enabled: arg.Enabled,
State: arg.State,
Threshold: arg.Threshold,
CreatedAt: arg.CreatedAt,
UpdatedAt: arg.UpdatedAt,
DebouncedUntil: arg.DebouncedUntil,
}
q.workspaceAgentVolumeResourceMonitors = append(q.workspaceAgentVolumeResourceMonitors, monitor)
@@ -9691,6 +9703,30 @@ func (q *FakeQuerier) UpdateMemberRoles(_ context.Context, arg database.UpdateMe
return database.OrganizationMember{}, sql.ErrNoRows
}
// UpdateMemoryResourceMonitor overwrites State, UpdatedAt and DebouncedUntil
// on the first in-memory memory monitor belonging to arg.AgentID. When no
// monitor matches, nothing is changed and nil is still returned.
func (q *FakeQuerier) UpdateMemoryResourceMonitor(_ context.Context, arg database.UpdateMemoryResourceMonitorParams) error {
	if err := validateDatabaseType(arg); err != nil {
		return err
	}

	q.mutex.Lock()
	defer q.mutex.Unlock()

	for idx := range q.workspaceAgentMemoryResourceMonitors {
		// Edit the stored element in place through a pointer into the slice.
		stored := &q.workspaceAgentMemoryResourceMonitors[idx]
		if stored.AgentID == arg.AgentID {
			stored.State = arg.State
			stored.UpdatedAt = arg.UpdatedAt
			stored.DebouncedUntil = arg.DebouncedUntil
			return nil
		}
	}

	return nil
}
func (*FakeQuerier) UpdateNotificationTemplateMethodByID(_ context.Context, _ database.UpdateNotificationTemplateMethodByIDParams) (database.NotificationTemplate, error) {
// Not implementing this function because it relies on state in the database which is created with migrations.
// We could consider using code-generation to align the database state and dbmem, but it's not worth it right now.
@@ -10469,6 +10505,30 @@ func (q *FakeQuerier) UpdateUserStatus(_ context.Context, arg database.UpdateUse
return database.User{}, sql.ErrNoRows
}
// UpdateVolumeResourceMonitor overwrites State, UpdatedAt and DebouncedUntil
// on the first in-memory volume monitor matching both arg.AgentID and
// arg.Path. When no monitor matches, nothing is changed and nil is still
// returned.
func (q *FakeQuerier) UpdateVolumeResourceMonitor(_ context.Context, arg database.UpdateVolumeResourceMonitorParams) error {
	if err := validateDatabaseType(arg); err != nil {
		return err
	}

	q.mutex.Lock()
	defer q.mutex.Unlock()

	for idx := range q.workspaceAgentVolumeResourceMonitors {
		// Edit the stored element in place through a pointer into the slice.
		stored := &q.workspaceAgentVolumeResourceMonitors[idx]
		if stored.AgentID == arg.AgentID && stored.Path == arg.Path {
			stored.State = arg.State
			stored.UpdatedAt = arg.UpdatedAt
			stored.DebouncedUntil = arg.DebouncedUntil
			return nil
		}
	}

	return nil
}
func (q *FakeQuerier) UpdateWorkspace(_ context.Context, arg database.UpdateWorkspaceParams) (database.WorkspaceTable, error) {
if err := validateDatabaseType(arg); err != nil {
return database.WorkspaceTable{}, err
+14
View File
@@ -2331,6 +2331,13 @@ func (m queryMetricsStore) UpdateMemberRoles(ctx context.Context, arg database.U
return member, err
}
// UpdateMemoryResourceMonitor calls the wrapped store and records how long
// the call took under the "UpdateMemoryResourceMonitor" label.
func (m queryMetricsStore) UpdateMemoryResourceMonitor(ctx context.Context, arg database.UpdateMemoryResourceMonitorParams) error {
	begin := time.Now()
	err := m.s.UpdateMemoryResourceMonitor(ctx, arg)

	latency := m.queryLatencies.WithLabelValues("UpdateMemoryResourceMonitor")
	latency.Observe(time.Since(begin).Seconds())

	return err
}
func (m queryMetricsStore) UpdateNotificationTemplateMethodByID(ctx context.Context, arg database.UpdateNotificationTemplateMethodByIDParams) (database.NotificationTemplate, error) {
start := time.Now()
r0, r1 := m.s.UpdateNotificationTemplateMethodByID(ctx, arg)
@@ -2569,6 +2576,13 @@ func (m queryMetricsStore) UpdateUserStatus(ctx context.Context, arg database.Up
return user, err
}
// UpdateVolumeResourceMonitor calls the wrapped store and records how long
// the call took under the "UpdateVolumeResourceMonitor" label.
func (m queryMetricsStore) UpdateVolumeResourceMonitor(ctx context.Context, arg database.UpdateVolumeResourceMonitorParams) error {
	begin := time.Now()
	err := m.s.UpdateVolumeResourceMonitor(ctx, arg)

	latency := m.queryLatencies.WithLabelValues("UpdateVolumeResourceMonitor")
	latency.Observe(time.Since(begin).Seconds())

	return err
}
func (m queryMetricsStore) UpdateWorkspace(ctx context.Context, arg database.UpdateWorkspaceParams) (database.WorkspaceTable, error) {
start := time.Now()
workspace, err := m.s.UpdateWorkspace(ctx, arg)
+28
View File
@@ -4965,6 +4965,20 @@ func (mr *MockStoreMockRecorder) UpdateMemberRoles(ctx, arg any) *gomock.Call {
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateMemberRoles", reflect.TypeOf((*MockStore)(nil).UpdateMemberRoles), ctx, arg)
}
// UpdateMemoryResourceMonitor mocks base method.
func (m *MockStore) UpdateMemoryResourceMonitor(ctx context.Context, arg database.UpdateMemoryResourceMonitorParams) error {
	m.ctrl.T.Helper()

	results := m.ctrl.Call(m, "UpdateMemoryResourceMonitor", ctx, arg)
	// When the first result is not an error value, the failed assertion
	// leaves err nil.
	err, _ := results[0].(error)
	return err
}
// UpdateMemoryResourceMonitor indicates an expected call of UpdateMemoryResourceMonitor.
func (mr *MockStoreMockRecorder) UpdateMemoryResourceMonitor(ctx, arg any) *gomock.Call {
	mr.mock.ctrl.T.Helper()

	methodType := reflect.TypeOf((*MockStore)(nil).UpdateMemoryResourceMonitor)
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateMemoryResourceMonitor", methodType, ctx, arg)
}
// UpdateNotificationTemplateMethodByID mocks base method.
func (m *MockStore) UpdateNotificationTemplateMethodByID(ctx context.Context, arg database.UpdateNotificationTemplateMethodByIDParams) (database.NotificationTemplate, error) {
m.ctrl.T.Helper()
@@ -5456,6 +5470,20 @@ func (mr *MockStoreMockRecorder) UpdateUserStatus(ctx, arg any) *gomock.Call {
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateUserStatus", reflect.TypeOf((*MockStore)(nil).UpdateUserStatus), ctx, arg)
}
// UpdateVolumeResourceMonitor mocks base method.
func (m *MockStore) UpdateVolumeResourceMonitor(ctx context.Context, arg database.UpdateVolumeResourceMonitorParams) error {
	m.ctrl.T.Helper()

	results := m.ctrl.Call(m, "UpdateVolumeResourceMonitor", ctx, arg)
	// When the first result is not an error value, the failed assertion
	// leaves err nil.
	err, _ := results[0].(error)
	return err
}
// UpdateVolumeResourceMonitor indicates an expected call of UpdateVolumeResourceMonitor.
func (mr *MockStoreMockRecorder) UpdateVolumeResourceMonitor(ctx, arg any) *gomock.Call {
	mr.mock.ctrl.T.Helper()

	methodType := reflect.TypeOf((*MockStore)(nil).UpdateVolumeResourceMonitor)
	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateVolumeResourceMonitor", methodType, ctx, arg)
}
// UpdateWorkspace mocks base method.
func (m *MockStore) UpdateWorkspace(ctx context.Context, arg database.UpdateWorkspaceParams) (database.WorkspaceTable, error) {
m.ctrl.T.Helper()
+13 -2
View File
@@ -244,6 +244,11 @@ CREATE TYPE workspace_agent_lifecycle_state AS ENUM (
'off'
);
-- State of a workspace agent resource monitor, as stored in the "state"
-- column of the memory and volume resource monitor tables.
CREATE TYPE workspace_agent_monitor_state AS ENUM (
'OK',
'NOK'
);
CREATE TYPE workspace_agent_script_timing_stage AS ENUM (
'start',
'stop',
@@ -1510,7 +1515,10 @@ CREATE TABLE workspace_agent_memory_resource_monitors (
agent_id uuid NOT NULL,
enabled boolean NOT NULL,
threshold integer NOT NULL,
created_at timestamp with time zone NOT NULL
created_at timestamp with time zone NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
state workspace_agent_monitor_state DEFAULT 'OK'::workspace_agent_monitor_state NOT NULL,
debounced_until timestamp with time zone DEFAULT '0001-01-01 00:00:00+00'::timestamp with time zone NOT NULL
);
CREATE UNLOGGED TABLE workspace_agent_metadata (
@@ -1595,7 +1603,10 @@ CREATE TABLE workspace_agent_volume_resource_monitors (
enabled boolean NOT NULL,
threshold integer NOT NULL,
path text NOT NULL,
created_at timestamp with time zone NOT NULL
created_at timestamp with time zone NOT NULL,
updated_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL,
state workspace_agent_monitor_state DEFAULT 'OK'::workspace_agent_monitor_state NOT NULL,
debounced_until timestamp with time zone DEFAULT '0001-01-01 00:00:00+00'::timestamp with time zone NOT NULL
);
CREATE TABLE workspace_agents (
@@ -0,0 +1,11 @@
-- Down migration: remove the state-tracking columns from both resource
-- monitor tables.
ALTER TABLE workspace_agent_volume_resource_monitors
DROP COLUMN updated_at,
DROP COLUMN state,
DROP COLUMN debounced_until;
ALTER TABLE workspace_agent_memory_resource_monitors
DROP COLUMN updated_at,
DROP COLUMN state,
DROP COLUMN debounced_until;
-- The enum is dropped last, once no "state" column refers to it any more.
DROP TYPE workspace_agent_monitor_state;
@@ -0,0 +1,14 @@
-- State of a workspace agent resource monitor.
CREATE TYPE workspace_agent_monitor_state AS ENUM (
'OK',
'NOK'
);
-- Add state tracking to the memory monitors. Every new column is NOT NULL
-- with a default: updated_at defaults to the current timestamp, state to
-- 'OK', and debounced_until to 0001-01-01 00:00:00.
ALTER TABLE workspace_agent_memory_resource_monitors
ADD COLUMN updated_at timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
ADD COLUMN state workspace_agent_monitor_state NOT NULL DEFAULT 'OK',
ADD COLUMN debounced_until timestamp with time zone NOT NULL DEFAULT '0001-01-01 00:00:00'::timestamptz;
-- Add the same three columns, with the same defaults, to the volume monitors.
ALTER TABLE workspace_agent_volume_resource_monitors
ADD COLUMN updated_at timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
ADD COLUMN state workspace_agent_monitor_state NOT NULL DEFAULT 'OK',
ADD COLUMN debounced_until timestamp with time zone NOT NULL DEFAULT '0001-01-01 00:00:00'::timestamptz;
+28
View File
@@ -527,3 +527,31 @@ func (k CryptoKey) CanVerify(now time.Time) bool {
// RBACObject returns the RBAC object of the row's ProvisionerJob.
func (r GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow) RBACObject() rbac.Object {
return r.ProvisionerJob.RBACObject()
}
// Debounce decides whether a state change of the memory monitor should
// trigger a notification, and returns the debounce deadline that applies
// afterwards.
//
// A notification is due only when the monitor moves from OK to NOK and now is
// strictly after m.DebouncedUntil; in that case the new deadline is now+by and
// shouldNotify is true. In every other case the existing m.DebouncedUntil is
// returned unchanged and shouldNotify is false.
//
// The results are named to match the Debounce method of
// WorkspaceAgentVolumeResourceMonitor.
func (m WorkspaceAgentMemoryResourceMonitor) Debounce(
	by time.Duration,
	now time.Time,
	oldState, newState WorkspaceAgentMonitorState,
) (debouncedUntil time.Time, shouldNotify bool) {
	if now.After(m.DebouncedUntil) &&
		oldState == WorkspaceAgentMonitorStateOK &&
		newState == WorkspaceAgentMonitorStateNOK {
		return now.Add(by), true
	}

	return m.DebouncedUntil, false
}
// Debounce decides whether a state change of the volume monitor should
// trigger a notification, and returns the debounce deadline that applies
// afterwards.
//
// A notification is due only when the monitor moves from OK to NOK and now is
// strictly after m.DebouncedUntil; in that case the new deadline is now+by and
// shouldNotify is true. In every other case the existing m.DebouncedUntil is
// returned unchanged and shouldNotify is false.
func (m WorkspaceAgentVolumeResourceMonitor) Debounce(
	by time.Duration,
	now time.Time,
	oldState, newState WorkspaceAgentMonitorState,
) (debouncedUntil time.Time, shouldNotify bool) {
	windowElapsed := now.After(m.DebouncedUntil)
	enteredAlert := oldState == WorkspaceAgentMonitorStateOK &&
		newState == WorkspaceAgentMonitorStateNOK

	if !windowElapsed || !enteredAlert {
		// Either still debounced or not an OK -> NOK transition: keep the
		// current deadline and stay quiet.
		return m.DebouncedUntil, false
	}

	return now.Add(by), true
}
+73 -9
View File
@@ -1976,6 +1976,64 @@ func AllWorkspaceAgentLifecycleStateValues() []WorkspaceAgentLifecycleState {
}
}
// WorkspaceAgentMonitorState is the string-backed state of a workspace agent
// resource monitor.
type WorkspaceAgentMonitorState string

// The known monitor states.
const (
	WorkspaceAgentMonitorStateOK  WorkspaceAgentMonitorState = "OK"
	WorkspaceAgentMonitorStateNOK WorkspaceAgentMonitorState = "NOK"
)

// Scan stores src in e when src is a []byte or a string, and returns an
// error naming the offending type otherwise. The value itself is not
// validated here; use Valid for that.
func (e *WorkspaceAgentMonitorState) Scan(src interface{}) error {
	if raw, isBytes := src.([]byte); isBytes {
		*e = WorkspaceAgentMonitorState(raw)
		return nil
	}
	if text, isString := src.(string); isString {
		*e = WorkspaceAgentMonitorState(text)
		return nil
	}
	return fmt.Errorf("unsupported scan type for WorkspaceAgentMonitorState: %T", src)
}

// NullWorkspaceAgentMonitorState is a WorkspaceAgentMonitorState that may be
// NULL.
type NullWorkspaceAgentMonitorState struct {
	WorkspaceAgentMonitorState WorkspaceAgentMonitorState `json:"workspace_agent_monitor_state"`
	Valid                      bool                       `json:"valid"` // Valid is true if WorkspaceAgentMonitorState is not NULL
}

// Scan implements the Scanner interface.
func (ns *NullWorkspaceAgentMonitorState) Scan(value interface{}) error {
	if value != nil {
		// Valid is set before delegating, so it stays true even when the
		// inner Scan rejects the value.
		ns.Valid = true
		return ns.WorkspaceAgentMonitorState.Scan(value)
	}

	ns.WorkspaceAgentMonitorState, ns.Valid = "", false
	return nil
}

// Value implements the driver Valuer interface.
func (ns NullWorkspaceAgentMonitorState) Value() (driver.Value, error) {
	if ns.Valid {
		return string(ns.WorkspaceAgentMonitorState), nil
	}
	return nil, nil
}

// Valid reports whether e is one of the known monitor states.
func (e WorkspaceAgentMonitorState) Valid() bool {
	return e == WorkspaceAgentMonitorStateOK || e == WorkspaceAgentMonitorStateNOK
}

// AllWorkspaceAgentMonitorStateValues returns every known monitor state, OK
// first and NOK second.
func AllWorkspaceAgentMonitorStateValues() []WorkspaceAgentMonitorState {
	all := make([]WorkspaceAgentMonitorState, 0, 2)
	all = append(all, WorkspaceAgentMonitorStateOK, WorkspaceAgentMonitorStateNOK)
	return all
}
// What stage the script was ran in.
type WorkspaceAgentScriptTimingStage string
@@ -3185,10 +3243,13 @@ type WorkspaceAgentLogSource struct {
}
type WorkspaceAgentMemoryResourceMonitor struct {
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Enabled bool `db:"enabled" json:"enabled"`
Threshold int32 `db:"threshold" json:"threshold"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Enabled bool `db:"enabled" json:"enabled"`
Threshold int32 `db:"threshold" json:"threshold"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
State WorkspaceAgentMonitorState `db:"state" json:"state"`
DebouncedUntil time.Time `db:"debounced_until" json:"debounced_until"`
}
type WorkspaceAgentMetadatum struct {
@@ -3259,11 +3320,14 @@ type WorkspaceAgentStat struct {
}
type WorkspaceAgentVolumeResourceMonitor struct {
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Enabled bool `db:"enabled" json:"enabled"`
Threshold int32 `db:"threshold" json:"threshold"`
Path string `db:"path" json:"path"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Enabled bool `db:"enabled" json:"enabled"`
Threshold int32 `db:"threshold" json:"threshold"`
Path string `db:"path" json:"path"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
State WorkspaceAgentMonitorState `db:"state" json:"state"`
DebouncedUntil time.Time `db:"debounced_until" json:"debounced_until"`
}
type WorkspaceApp struct {
+2
View File
@@ -480,6 +480,7 @@ type sqlcQuerier interface {
UpdateGroupByID(ctx context.Context, arg UpdateGroupByIDParams) (Group, error)
UpdateInactiveUsersToDormant(ctx context.Context, arg UpdateInactiveUsersToDormantParams) ([]UpdateInactiveUsersToDormantRow, error)
UpdateMemberRoles(ctx context.Context, arg UpdateMemberRolesParams) (OrganizationMember, error)
UpdateMemoryResourceMonitor(ctx context.Context, arg UpdateMemoryResourceMonitorParams) error
UpdateNotificationTemplateMethodByID(ctx context.Context, arg UpdateNotificationTemplateMethodByIDParams) (NotificationTemplate, error)
UpdateOAuth2ProviderAppByID(ctx context.Context, arg UpdateOAuth2ProviderAppByIDParams) (OAuth2ProviderApp, error)
UpdateOAuth2ProviderAppSecretByID(ctx context.Context, arg UpdateOAuth2ProviderAppSecretByIDParams) (OAuth2ProviderAppSecret, error)
@@ -514,6 +515,7 @@ type sqlcQuerier interface {
UpdateUserQuietHoursSchedule(ctx context.Context, arg UpdateUserQuietHoursScheduleParams) (User, error)
UpdateUserRoles(ctx context.Context, arg UpdateUserRolesParams) (User, error)
UpdateUserStatus(ctx context.Context, arg UpdateUserStatusParams) (User, error)
UpdateVolumeResourceMonitor(ctx context.Context, arg UpdateVolumeResourceMonitorParams) error
UpdateWorkspace(ctx context.Context, arg UpdateWorkspaceParams) (WorkspaceTable, error)
UpdateWorkspaceAgentConnectionByID(ctx context.Context, arg UpdateWorkspaceAgentConnectionByIDParams) error
UpdateWorkspaceAgentLifecycleStateByID(ctx context.Context, arg UpdateWorkspaceAgentLifecycleStateByIDParams) error
+101 -15
View File
@@ -12044,7 +12044,7 @@ func (q *sqlQuerier) UpsertWorkspaceAgentPortShare(ctx context.Context, arg Upse
const fetchMemoryResourceMonitorsByAgentID = `-- name: FetchMemoryResourceMonitorsByAgentID :one
SELECT
agent_id, enabled, threshold, created_at
agent_id, enabled, threshold, created_at, updated_at, state, debounced_until
FROM
workspace_agent_memory_resource_monitors
WHERE
@@ -12059,13 +12059,16 @@ func (q *sqlQuerier) FetchMemoryResourceMonitorsByAgentID(ctx context.Context, a
&i.Enabled,
&i.Threshold,
&i.CreatedAt,
&i.UpdatedAt,
&i.State,
&i.DebouncedUntil,
)
return i, err
}
const fetchVolumesResourceMonitorsByAgentID = `-- name: FetchVolumesResourceMonitorsByAgentID :many
SELECT
agent_id, enabled, threshold, path, created_at
agent_id, enabled, threshold, path, created_at, updated_at, state, debounced_until
FROM
workspace_agent_volume_resource_monitors
WHERE
@@ -12087,6 +12090,9 @@ func (q *sqlQuerier) FetchVolumesResourceMonitorsByAgentID(ctx context.Context,
&i.Threshold,
&i.Path,
&i.CreatedAt,
&i.UpdatedAt,
&i.State,
&i.DebouncedUntil,
); err != nil {
return nil, err
}
@@ -12106,26 +12112,35 @@ INSERT INTO
workspace_agent_memory_resource_monitors (
agent_id,
enabled,
state,
threshold,
created_at
created_at,
updated_at,
debounced_until
)
VALUES
($1, $2, $3, $4) RETURNING agent_id, enabled, threshold, created_at
($1, $2, $3, $4, $5, $6, $7) RETURNING agent_id, enabled, threshold, created_at, updated_at, state, debounced_until
`
// InsertMemoryResourceMonitorParams holds the column values that
// InsertMemoryResourceMonitor writes into a new
// workspace_agent_memory_resource_monitors row.
type InsertMemoryResourceMonitorParams struct {
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Enabled bool `db:"enabled" json:"enabled"`
Threshold int32 `db:"threshold" json:"threshold"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Enabled bool `db:"enabled" json:"enabled"`
State WorkspaceAgentMonitorState `db:"state" json:"state"`
Threshold int32 `db:"threshold" json:"threshold"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
DebouncedUntil time.Time `db:"debounced_until" json:"debounced_until"`
}
func (q *sqlQuerier) InsertMemoryResourceMonitor(ctx context.Context, arg InsertMemoryResourceMonitorParams) (WorkspaceAgentMemoryResourceMonitor, error) {
row := q.db.QueryRowContext(ctx, insertMemoryResourceMonitor,
arg.AgentID,
arg.Enabled,
arg.State,
arg.Threshold,
arg.CreatedAt,
arg.UpdatedAt,
arg.DebouncedUntil,
)
var i WorkspaceAgentMemoryResourceMonitor
err := row.Scan(
@@ -12133,6 +12148,9 @@ func (q *sqlQuerier) InsertMemoryResourceMonitor(ctx context.Context, arg Insert
&i.Enabled,
&i.Threshold,
&i.CreatedAt,
&i.UpdatedAt,
&i.State,
&i.DebouncedUntil,
)
return i, err
}
@@ -12143,19 +12161,25 @@ INSERT INTO
agent_id,
path,
enabled,
state,
threshold,
created_at
created_at,
updated_at,
debounced_until
)
VALUES
($1, $2, $3, $4, $5) RETURNING agent_id, enabled, threshold, path, created_at
($1, $2, $3, $4, $5, $6, $7, $8) RETURNING agent_id, enabled, threshold, path, created_at, updated_at, state, debounced_until
`
// InsertVolumeResourceMonitorParams holds the column values that
// InsertVolumeResourceMonitor writes into a new
// workspace_agent_volume_resource_monitors row.
type InsertVolumeResourceMonitorParams struct {
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Path string `db:"path" json:"path"`
Enabled bool `db:"enabled" json:"enabled"`
Threshold int32 `db:"threshold" json:"threshold"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Path string `db:"path" json:"path"`
Enabled bool `db:"enabled" json:"enabled"`
State WorkspaceAgentMonitorState `db:"state" json:"state"`
Threshold int32 `db:"threshold" json:"threshold"`
CreatedAt time.Time `db:"created_at" json:"created_at"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
DebouncedUntil time.Time `db:"debounced_until" json:"debounced_until"`
}
func (q *sqlQuerier) InsertVolumeResourceMonitor(ctx context.Context, arg InsertVolumeResourceMonitorParams) (WorkspaceAgentVolumeResourceMonitor, error) {
@@ -12163,8 +12187,11 @@ func (q *sqlQuerier) InsertVolumeResourceMonitor(ctx context.Context, arg Insert
arg.AgentID,
arg.Path,
arg.Enabled,
arg.State,
arg.Threshold,
arg.CreatedAt,
arg.UpdatedAt,
arg.DebouncedUntil,
)
var i WorkspaceAgentVolumeResourceMonitor
err := row.Scan(
@@ -12173,10 +12200,69 @@ func (q *sqlQuerier) InsertVolumeResourceMonitor(ctx context.Context, arg Insert
&i.Threshold,
&i.Path,
&i.CreatedAt,
&i.UpdatedAt,
&i.State,
&i.DebouncedUntil,
)
return i, err
}
// updateMemoryResourceMonitor is the SQL text executed by
// UpdateMemoryResourceMonitor. Placeholder $1 selects the row by agent_id;
// $2, $3 and $4 are the new updated_at, state and debounced_until values.
const updateMemoryResourceMonitor = `-- name: UpdateMemoryResourceMonitor :exec
UPDATE workspace_agent_memory_resource_monitors
SET
updated_at = $2,
state = $3,
debounced_until = $4
WHERE
agent_id = $1
`
// UpdateMemoryResourceMonitorParams holds the arguments for
// UpdateMemoryResourceMonitor. AgentID selects the row to update; the
// remaining fields are the new values written to it.
type UpdateMemoryResourceMonitorParams struct {
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
State WorkspaceAgentMonitorState `db:"state" json:"state"`
DebouncedUntil time.Time `db:"debounced_until" json:"debounced_until"`
}
// UpdateMemoryResourceMonitor overwrites the updated_at, state and
// debounced_until columns of the memory resource monitor row whose agent_id
// equals arg.AgentID. It returns whatever error executing the statement
// produces; the statement result (e.g. rows affected) is discarded.
func (q *sqlQuerier) UpdateMemoryResourceMonitor(ctx context.Context, arg UpdateMemoryResourceMonitorParams) error {
	// Arguments are positional and must line up with $1..$4 in the query text.
	if _, err := q.db.ExecContext(
		ctx,
		updateMemoryResourceMonitor,
		arg.AgentID,
		arg.UpdatedAt,
		arg.State,
		arg.DebouncedUntil,
	); err != nil {
		return err
	}
	return nil
}
// updateVolumeResourceMonitor is the SQL text executed by
// UpdateVolumeResourceMonitor. Placeholders $1 and $2 select the row by
// agent_id and path; $3, $4 and $5 are the new updated_at, state and
// debounced_until values.
const updateVolumeResourceMonitor = `-- name: UpdateVolumeResourceMonitor :exec
UPDATE workspace_agent_volume_resource_monitors
SET
updated_at = $3,
state = $4,
debounced_until = $5
WHERE
agent_id = $1 AND path = $2
`
// UpdateVolumeResourceMonitorParams holds the arguments for
// UpdateVolumeResourceMonitor. AgentID and Path together select the row to
// update; the remaining fields are the new values written to it.
type UpdateVolumeResourceMonitorParams struct {
AgentID uuid.UUID `db:"agent_id" json:"agent_id"`
Path string `db:"path" json:"path"`
UpdatedAt time.Time `db:"updated_at" json:"updated_at"`
State WorkspaceAgentMonitorState `db:"state" json:"state"`
DebouncedUntil time.Time `db:"debounced_until" json:"debounced_until"`
}
// UpdateVolumeResourceMonitor overwrites the updated_at, state and
// debounced_until columns of the volume resource monitor row matching both
// arg.AgentID and arg.Path. It returns whatever error executing the statement
// produces; the statement result (e.g. rows affected) is discarded.
func (q *sqlQuerier) UpdateVolumeResourceMonitor(ctx context.Context, arg UpdateVolumeResourceMonitorParams) error {
	// Arguments are positional and must line up with $1..$5 in the query text.
	if _, err := q.db.ExecContext(
		ctx,
		updateVolumeResourceMonitor,
		arg.AgentID,
		arg.Path,
		arg.UpdatedAt,
		arg.State,
		arg.DebouncedUntil,
	); err != nil {
		return err
	}
	return nil
}
const deleteOldWorkspaceAgentLogs = `-- name: DeleteOldWorkspaceAgentLogs :exec
WITH
latest_builds AS (
@@ -19,11 +19,14 @@ INSERT INTO
workspace_agent_memory_resource_monitors (
agent_id,
enabled,
state,
threshold,
created_at
created_at,
updated_at,
debounced_until
)
VALUES
($1, $2, $3, $4) RETURNING *;
($1, $2, $3, $4, $5, $6, $7) RETURNING *;
-- name: InsertVolumeResourceMonitor :one
INSERT INTO
@@ -31,8 +34,29 @@ INSERT INTO
agent_id,
path,
enabled,
state,
threshold,
created_at
created_at,
updated_at,
debounced_until
)
VALUES
($1, $2, $3, $4, $5) RETURNING *;
($1, $2, $3, $4, $5, $6, $7, $8) RETURNING *;
-- name: UpdateMemoryResourceMonitor :exec
-- Overwrites updated_at, state and debounced_until on the memory resource
-- monitor row of the given agent. The first parameter is the agent ID.
UPDATE workspace_agent_memory_resource_monitors
SET
updated_at = $2,
state = $3,
debounced_until = $4
WHERE
agent_id = $1;
-- name: UpdateVolumeResourceMonitor :exec
-- Overwrites updated_at, state and debounced_until on the volume resource
-- monitor row matching the given agent and path. The first two parameters
-- are the agent ID and the path.
UPDATE workspace_agent_volume_resource_monitors
SET
updated_at = $3,
state = $4,
debounced_until = $5
WHERE
agent_id = $1 AND path = $2;
@@ -1981,10 +1981,13 @@ func InsertWorkspaceResource(ctx context.Context, db database.Store, jobID uuid.
if prAgent.ResourcesMonitoring != nil {
if prAgent.ResourcesMonitoring.Memory != nil {
_, err = db.InsertMemoryResourceMonitor(ctx, database.InsertMemoryResourceMonitorParams{
AgentID: agentID,
Enabled: prAgent.ResourcesMonitoring.Memory.Enabled,
Threshold: prAgent.ResourcesMonitoring.Memory.Threshold,
CreatedAt: dbtime.Now(),
AgentID: agentID,
Enabled: prAgent.ResourcesMonitoring.Memory.Enabled,
Threshold: prAgent.ResourcesMonitoring.Memory.Threshold,
State: database.WorkspaceAgentMonitorStateOK,
CreatedAt: dbtime.Now(),
UpdatedAt: dbtime.Now(),
DebouncedUntil: time.Time{},
})
if err != nil {
return xerrors.Errorf("failed to insert agent memory resource monitor into db: %w", err)
@@ -1992,11 +1995,14 @@ func InsertWorkspaceResource(ctx context.Context, db database.Store, jobID uuid.
}
for _, volume := range prAgent.ResourcesMonitoring.Volumes {
_, err = db.InsertVolumeResourceMonitor(ctx, database.InsertVolumeResourceMonitorParams{
AgentID: agentID,
Path: volume.Path,
Enabled: volume.Enabled,
Threshold: volume.Threshold,
CreatedAt: dbtime.Now(),
AgentID: agentID,
Path: volume.Path,
Enabled: volume.Enabled,
Threshold: volume.Threshold,
State: database.WorkspaceAgentMonitorStateOK,
CreatedAt: dbtime.Now(),
UpdatedAt: dbtime.Now(),
DebouncedUntil: time.Time{},
})
if err != nil {
return xerrors.Errorf("failed to insert agent volume resource monitor into db: %w", err)
+1
View File
@@ -299,6 +299,7 @@ var (
// Valid Actions
// - "ActionCreate" :: create workspace agent resource monitor
// - "ActionRead" :: read workspace agent resource monitor
// - "ActionUpdate" :: update workspace agent resource monitor
ResourceWorkspaceAgentResourceMonitor = Object{
Type: "workspace_agent_resource_monitor",
}
+1
View File
@@ -306,6 +306,7 @@ var RBACPermissions = map[string]PermissionDefinition{
Actions: map[Action]ActionDefinition{
ActionRead: actDef("read workspace agent resource monitor"),
ActionCreate: actDef("create workspace agent resource monitor"),
ActionUpdate: actDef("update workspace agent resource monitor"),
},
},
}
+1 -1
View File
@@ -779,7 +779,7 @@ func TestRolePermissions(t *testing.T) {
},
{
Name: "ResourceMonitor",
Actions: []policy.Action{policy.ActionRead, policy.ActionCreate},
Actions: []policy.Action{policy.ActionRead, policy.ActionCreate, policy.ActionUpdate},
Resource: rbac.ResourceWorkspaceAgentResourceMonitor,
AuthorizeMap: map[bool][]hasAuthSubjects{
true: {owner},
+16
View File
@@ -177,3 +177,19 @@ func DifferenceFunc[T any](a []T, b []T, equal func(a, b T) bool) []T {
}
return tmp
}
// CountConsecutive returns the length of the longest unbroken run of elements
// in haystack that are equal to needle. It returns 0 when haystack is empty or
// contains no element equal to needle.
func CountConsecutive[T comparable](needle T, haystack ...T) int {
	longest, run := 0, 0
	for i := range haystack {
		if haystack[i] != needle {
			// A non-matching element ends the current run.
			run = 0
			continue
		}
		run++
		if run > longest {
			longest = run
		}
	}
	return longest
}
+2
View File
@@ -143,7 +143,9 @@ func (api *API) workspaceAgentRPC(rw http.ResponseWriter, r *http.Request) {
Ctx: api.ctx,
Log: logger,
Clock: api.Clock,
Database: api.Database,
NotificationsEnqueuer: api.NotificationsEnqueuer,
Pubsub: api.Pubsub,
DerpMapFn: api.DERPMap,
TailnetCoordinator: &api.TailnetCoordinator,
+1 -1
View File
@@ -92,7 +92,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{
ResourceTemplate: {ActionCreate, ActionDelete, ActionRead, ActionUpdate, ActionUse, ActionViewInsights},
ResourceUser: {ActionCreate, ActionDelete, ActionRead, ActionReadPersonal, ActionUpdate, ActionUpdatePersonal},
ResourceWorkspace: {ActionApplicationConnect, ActionCreate, ActionDelete, ActionRead, ActionSSH, ActionWorkspaceStart, ActionWorkspaceStop, ActionUpdate},
ResourceWorkspaceAgentResourceMonitor: {ActionCreate, ActionRead},
ResourceWorkspaceAgentResourceMonitor: {ActionCreate, ActionRead, ActionUpdate},
ResourceWorkspaceDormant: {ActionApplicationConnect, ActionCreate, ActionDelete, ActionRead, ActionSSH, ActionWorkspaceStart, ActionWorkspaceStop, ActionUpdate},
ResourceWorkspaceProxy: {ActionCreate, ActionDelete, ActionRead, ActionUpdate},
}
+1
View File
@@ -171,6 +171,7 @@ export const RBACResourceActions: Partial<
workspace_agent_resource_monitor: {
create: "create workspace agent resource monitor",
read: "read workspace agent resource monitor",
update: "update workspace agent resource monitor",
},
workspace_dormant: {
application_connect: "connect to workspace apps via browser",