Files
coder/coderd/agentapi/resources_monitoring_test.go
Callum Styan 45c43d4ec4 fix: refactor agent resource monitoring API to avoid excessive calls to DB (#20430)
This should resolve https://github.com/coder/internal/issues/728 by
refactoring the ResourceMonitorAPI struct to only require querying the
resource monitor once for memory and once for volumes, then using the
stored monitors on the API struct from that point on. This should
eliminate the vast majority of calls to `GetWorkspaceByAgentID` and
`FetchVolumesResourceMonitorsUpdatedAfter`/`FetchMemoryResourceMonitorsUpdatedAfter`
(millions of calls per week).

Tests passed, and I ran an instance of coder via a workspace with a
template that added resource monitoring every 10s. Note that this is the
default docker container, so there are other sources of
`GetWorkspaceByAgentID` db queries. Note that this workspace was running
for ~15 minutes at the time I gathered this data.

Over 30s for the `ResourceMonitor` calls:
```
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep ResourceMonitor | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsUpdatedAfter"} 2
100  288k    0  288k    0     0  58.3M      0 --:--:-- --:--:-- --:--:-- 70.4M
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsUpdatedAfter"} 2
coderd_db_query_latencies_seconds_count{query="UpdateMemoryResourceMonitor"} 155
coderd_db_query_latencies_seconds_count{query="UpdateVolumeResourceMonitor"} 155
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep ResourceMonitor | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchMemoryResourceMonitorsUpdatedAfter"} 2
100  288k    0  288k    0     0  34.7M      0 --:--:-- --:--:-- --:--:-- 40.2M
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsByAgentID"} 2
coderd_db_query_latencies_seconds_count{query="FetchVolumesResourceMonitorsUpdatedAfter"} 2
coderd_db_query_latencies_seconds_count{query="UpdateMemoryResourceMonitor"} 158
coderd_db_query_latencies_seconds_count{query="UpdateVolumeResourceMonitor"} 158
```

And over 1m for the `GetWorkspaceByAgentID` calls; the majority are from
the workspace metadata stats updates:
```
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep GetWorkspaceByAgentID | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  284k    0  284k    0     0  42.4M      0 --:--:-- --:--:-- --:--:-- 46.3M
coderd_db_query_latencies_seconds_count{query="GetWorkspaceByAgentID"} 876
coder@callum-coder-2:~/coder$ curl localhost:19090/metrics | grep GetWorkspaceByAgentID | grep count
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  284k    0  284k    0     0  75.4M      0 --:--:-- --:--:-- --:--:-- 92.7M
coderd_db_query_latencies_seconds_count{query="GetWorkspaceByAgentID"} 918
```

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
2025-10-28 13:38:16 -07:00

967 lines
32 KiB
Go

package agentapi_test
import (
"context"
"testing"
"time"
"github.com/google/uuid"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/types/known/timestamppb"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/agentapi"
"github.com/coder/coder/v2/coderd/agentapi/resourcesmonitor"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/notifications/notificationstest"
"github.com/coder/quartz"
)
// resourceMonitorAPI seeds a fresh test database with a user, organization,
// template, template version, workspace, provisioner job, workspace build,
// workspace resource and workspace agent, and returns a
// ResourcesMonitoringAPI bound to that agent and workspace.
//
// The workspace owner, the mock clock and the fake notification enqueuer
// wired into the API are returned as well so tests can control time and
// inspect what was enqueued. No resource monitors are created here; each
// test inserts the monitors it needs and then calls InitMonitors.
func resourceMonitorAPI(t *testing.T) (*agentapi.ResourcesMonitoringAPI, database.User, *quartz.Mock, *notificationstest.FakeEnqueuer) {
	t.Helper()

	store, _ := dbtestutil.NewDB(t)

	owner := dbgen.User(t, store, database.User{})
	organization := dbgen.Organization(t, store, database.Organization{})

	tpl := dbgen.Template(t, store, database.Template{
		OrganizationID: organization.ID,
		CreatedBy:      owner.ID,
	})
	tplVersion := dbgen.TemplateVersion(t, store, database.TemplateVersion{
		TemplateID:     uuid.NullUUID{Valid: true, UUID: tpl.ID},
		OrganizationID: organization.ID,
		CreatedBy:      owner.ID,
	})

	ws := dbgen.Workspace(t, store, database.WorkspaceTable{
		OrganizationID: organization.ID,
		TemplateID:     tpl.ID,
		OwnerID:        owner.ID,
	})
	provisionerJob := dbgen.ProvisionerJob(t, store, nil, database.ProvisionerJob{
		Type: database.ProvisionerJobTypeWorkspaceBuild,
	})
	wsBuild := dbgen.WorkspaceBuild(t, store, database.WorkspaceBuild{
		JobID:             provisionerJob.ID,
		WorkspaceID:       ws.ID,
		TemplateVersionID: tplVersion.ID,
	})
	wsResource := dbgen.WorkspaceResource(t, store, database.WorkspaceResource{
		JobID: wsBuild.JobID,
	})
	wsAgent := dbgen.WorkspaceAgent(t, store, database.WorkspaceAgent{
		ResourceID: wsResource.ID,
	})

	enqueuer := &notificationstest.FakeEnqueuer{}
	mockClock := quartz.NewMock(t)

	monitoringAPI := &agentapi.ResourcesMonitoringAPI{
		AgentID:               wsAgent.ID,
		WorkspaceID:           ws.ID,
		Clock:                 mockClock,
		Database:              store,
		NotificationsEnqueuer: enqueuer,
		Config: resourcesmonitor.Config{
			NumDatapoints:      20,
			CollectionInterval: 10 * time.Second,
			Alert: resourcesmonitor.AlertConfig{
				MinimumNOKsPercent:     20,
				ConsecutiveNOKsPercent: 50,
			},
		},
		Debounce: 1 * time.Minute,
	}

	return monitoringAPI, owner, mockClock, enqueuer
}
// TestMemoryResourceMonitorDebounce verifies that once a memory monitor has
// alerted, a second alert is suppressed until the debounce period has passed.
func TestMemoryResourceMonitorDebounce(t *testing.T) {
	t.Parallel()

	// Scenario, driven step by step with the mock clock:
	//
	//  1. OK -> NOK  |> sends a notification
	//  2. NOK -> OK  |> does nothing
	//  3. OK -> NOK  |> does nothing due to debounce period
	//  4. NOK -> OK  |> does nothing
	//  5. OK -> NOK  |> sends a notification as debounce period exceeded

	api, user, clock, notifyEnq := resourceMonitorAPI(t)
	api.Config.Alert.ConsecutiveNOKsPercent = 100

	// pushMemory sends a single memory datapoint collected "now" (per the
	// mock clock) reporting `used` out of a total of 10.
	pushMemory := func(used int64) {
		t.Helper()
		req := &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  used,
						Total: 10,
					},
				},
			},
		}
		_, err := api.PushResourcesMonitoringUsage(context.Background(), req)
		require.NoError(t, err)
	}

	// requireNotified asserts exactly one out-of-memory notification is
	// pending and that it is addressed to the workspace owner.
	requireNotified := func() {
		t.Helper()
		pending := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
		require.Len(t, pending, 1)
		require.Equal(t, user.ID, pending[0].UserID)
	}

	// requireSilent asserts no out-of-memory notification is pending.
	requireSilent := func() {
		t.Helper()
		pending := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
		require.Len(t, pending, 0)
	}

	// Given: A monitor in an OK state
	dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
		AgentID:   api.AgentID,
		State:     database.WorkspaceAgentMonitorStateOK,
		Threshold: 80,
	})

	// The API caches its monitors, so load them after inserting.
	require.NoError(t, api.InitMonitors(context.Background()))

	// When: usage is reported that will trigger NOK
	pushMemory(10)
	// Then: a notification is sent
	requireNotified()
	notifyEnq.Clear()

	// When: the monitor moves from NOK back to OK
	clock.Advance(api.Debounce / 4)
	pushMemory(1)
	// Then: no new notifications
	requireSilent()
	notifyEnq.Clear()

	// When: the monitor moves back to NOK before the debounce time
	clock.Advance(api.Debounce / 4)
	pushMemory(10)
	// Then: no new notifications (the debouncer is working)
	requireSilent()
	notifyEnq.Clear()

	// When: the monitor moves from NOK back to OK again
	clock.Advance(api.Debounce / 4)
	pushMemory(1)
	// Then: still no new notifications
	requireSilent()
	notifyEnq.Clear()

	// When: the monitor moves back to NOK after the debounce period
	clock.Advance(api.Debounce/4 + 1*time.Second)
	pushMemory(10)
	// Then: a notification is sent
	requireNotified()
}
// TestMemoryResourceMonitor is a table-driven test of the memory monitor.
// Each case seeds a monitor in previousState with an 80% threshold, pushes
// one datapoint per memoryUsage entry (15s apart, each out of memoryTotal)
// in a single request, and then checks both whether an out-of-memory
// notification was sent and which state the monitor was persisted in.
func TestMemoryResourceMonitor(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name          string
		memoryUsage   []int64
		memoryTotal   int64
		previousState database.WorkspaceAgentMonitorState
		expectState   database.WorkspaceAgentMonitorState
		shouldNotify  bool
	}{
		{
			name:          "WhenOK/NeverExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenOK/ShouldStayInOK",
			memoryUsage:   []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenOK/ConsecutiveExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  true,
		},
		{
			name:          "WhenOK/MinimumExceedsThreshold",
			memoryUsage:   []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  true,
		},
		{
			name:          "WhenNOK/NeverExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/ShouldStayInNOK",
			memoryUsage:   []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/ConsecutiveExceedsThreshold",
			memoryUsage:   []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
		{
			name:          "WhenNOK/MinimumExceedsThreshold",
			memoryUsage:   []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			memoryTotal:   10,
			previousState: database.WorkspaceAgentMonitorStateNOK,
			expectState:   database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:  false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			api, user, clock, notifyEnq := resourceMonitorAPI(t)

			// Build one datapoint per usage sample, 15s apart, starting
			// 15s after the mock clock's initial time.
			datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.memoryUsage))
			collectedAt := clock.Now()
			for _, usage := range tt.memoryUsage {
				collectedAt = collectedAt.Add(15 * time.Second)
				datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
					CollectedAt: timestamppb.New(collectedAt),
					Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
						Used:  usage,
						Total: tt.memoryTotal,
					},
				})
			}

			dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
				AgentID:   api.AgentID,
				State:     tt.previousState,
				Threshold: 80,
			})

			// Initialize API to fetch and cache the monitors
			require.NoError(t, api.InitMonitors(context.Background()))

			// Move the clock to the time of the last datapoint before pushing.
			clock.Set(collectedAt)
			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
				Datapoints: datapoints,
			})
			require.NoError(t, err)

			sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
			if tt.shouldNotify {
				require.Len(t, sent, 1)
				require.Equal(t, user.ID, sent[0].UserID)
			} else {
				require.Len(t, sent, 0)
			}

			// The table declares the state each case should end in; verify
			// it against what was persisted rather than leaving it unchecked.
			monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
			require.NoError(t, err)
			require.Equal(t, tt.expectState, monitor.State)
		})
	}
}
func TestMemoryResourceMonitorMissingData(t *testing.T) {
t.Parallel()
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
t.Parallel()
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in an OK state.
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// Initialize API to fetch and cache the monitors
require.NoError(t, api.InitMonitors(context.Background()))
// When: A datapoint is missing, surrounded by two NOK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Memory: nil,
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 10,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfMemory))
require.Len(t, sent, 0)
// Then: We expect the monitor to still be in an OK state.
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitor.State)
})
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
t.Parallel()
api, _, clock, _ := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in a NOK state.
dbgen.WorkspaceAgentMemoryResourceMonitor(t, api.Database, database.WorkspaceAgentMemoryResourceMonitor{
AgentID: api.AgentID,
State: database.WorkspaceAgentMonitorStateNOK,
Threshold: 80,
})
// Initialize API to fetch and cache the monitors
require.NoError(t, api.InitMonitors(context.Background()))
// When: A datapoint is missing, surrounded by two OK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Memory: nil,
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Memory: &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_MemoryUsage{
Used: 1,
Total: 10,
},
},
},
})
require.NoError(t, err)
// Then: We expect the monitor to still be in a NOK state.
monitor, err := api.Database.FetchMemoryResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitor.State)
})
}
// TestVolumeResourceMonitorDebounce verifies that the debounce period is
// tracked independently for each volume monitor by interleaving the alerts
// of two monitors on different paths.
func TestVolumeResourceMonitorDebounce(t *testing.T) {
	t.Parallel()

	// First Monitor:
	//  1. OK -> NOK  |> sends a notification
	//  2. NOK -> OK  |> does nothing
	//  3. OK -> NOK  |> does nothing due to debounce period
	//  4. NOK -> OK  |> does nothing
	//  5. OK -> NOK  |> sends a notification as debounce period exceeded
	//  6. NOK -> OK  |> does nothing
	//
	// Second Monitor:
	//  1. OK -> OK   |> does nothing
	//  2. OK -> NOK  |> sends a notification
	//  3. NOK -> OK  |> does nothing
	//  4. OK -> NOK  |> does nothing due to debounce period
	//  5. NOK -> OK  |> does nothing
	//  6. OK -> NOK  |> sends a notification as debounce period exceeded

	firstVolumePath := "/home/coder"
	secondVolumePath := "/dev/coder"

	api, _, clock, notifyEnq := resourceMonitorAPI(t)

	// Given:
	//  - First monitor in an OK state
	//  - Second monitor in an OK state
	//
	// Both monitors start in OK so the fixture matches the scenario above
	// (the second monitor's first step is OK -> OK).
	for _, path := range []string{firstVolumePath, secondVolumePath} {
		dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
			AgentID:   api.AgentID,
			Path:      path,
			State:     database.WorkspaceAgentMonitorStateOK,
			Threshold: 80,
		})
	}

	// Initialize API to fetch and cache the monitors
	require.NoError(t, api.InitMonitors(context.Background()))

	// push sends one datapoint collected "now" (per the mock clock) with
	// the given usage, out of a total of 10, for each of the two volumes.
	push := func(firstUsed, secondUsed int64) {
		t.Helper()
		_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
			Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
				{
					CollectedAt: timestamppb.New(clock.Now()),
					Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
						{Volume: firstVolumePath, Used: firstUsed, Total: 10},
						{Volume: secondVolumePath, Used: secondUsed, Total: 10},
					},
				},
			},
		})
		require.NoError(t, err)
	}

	// requireOnly asserts exactly one out-of-disk notification is pending
	// and that it reports only the volume at `path`.
	requireOnly := func(path string) {
		t.Helper()
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
		require.Len(t, sent, 1)
		volumes := requireVolumeData(t, sent[0])
		require.Len(t, volumes, 1)
		require.Equal(t, path, volumes[0]["path"])
	}

	// requireNone asserts no out-of-disk notification is pending.
	requireNone := func() {
		t.Helper()
		sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
		require.Len(t, sent, 0)
	}

	// When:
	//  - First monitor is in a NOK state
	//  - Second monitor is in an OK state
	push(10, 1)
	// Then:
	//  - We expect a notification from only the first monitor
	requireOnly(firstVolumePath)
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to OK
	//  - Second monitor moves to NOK
	clock.Advance(api.Debounce / 4)
	push(1, 10)
	// Then:
	//  - We expect a notification from only the second monitor
	requireOnly(secondVolumePath)
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to NOK before debounce period has ended
	//  - Second monitor moves back to OK
	clock.Advance(api.Debounce / 4)
	push(10, 1)
	// Then:
	//  - We expect no new notifications
	requireNone()
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to OK
	//  - Second monitor moves back to NOK
	clock.Advance(api.Debounce / 4)
	push(1, 10)
	// Then:
	//  - We expect no new notifications.
	requireNone()
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to a NOK state after the debounce period
	//  - Second monitor moves back to OK
	clock.Advance(api.Debounce/4 + 1*time.Second)
	push(10, 1)
	// Then:
	//  - We expect a notification from only the first monitor
	requireOnly(firstVolumePath)
	notifyEnq.Clear()

	// When:
	//  - First monitor moves back to OK
	//  - Second monitor moves back to NOK after the debounce period
	clock.Advance(api.Debounce/4 + 1*time.Second)
	push(1, 10)
	// Then:
	//  - We expect a notification from only the second monitor
	requireOnly(secondVolumePath)
}
// TestVolumeResourceMonitor is a table-driven test of a single volume
// monitor. Each case seeds a monitor for volumePath in previousState with
// thresholdPercent, pushes one datapoint per volumeUsage entry (15s apart,
// each out of volumeTotal) in a single request, and then checks both whether
// an out-of-disk notification was sent and which state the monitor was
// persisted in.
func TestVolumeResourceMonitor(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name             string
		volumePath       string
		volumeUsage      []int64
		volumeTotal      int64
		thresholdPercent int32
		previousState    database.WorkspaceAgentMonitorState
		expectState      database.WorkspaceAgentMonitorState
		shouldNotify     bool
	}{
		{
			name:             "WhenOK/NeverExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenOK/ShouldStayInOK",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenOK/ConsecutiveExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     true,
		},
		{
			name:             "WhenOK/MinimumExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     true,
		},
		{
			name:             "WhenNOK/NeverExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/ShouldStayInNOK",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{9, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 2, 3, 1, 2},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/ConsecutiveExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 3, 2, 4, 2, 3, 2, 1, 2, 3, 4, 4, 1, 8, 9, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
		{
			name:             "WhenNOK/MinimumExceedsThreshold",
			volumePath:       "/home/coder",
			volumeUsage:      []int64{2, 8, 2, 9, 2, 8, 2, 9, 2, 8, 4, 9, 1, 8, 2, 8, 9},
			volumeTotal:      10,
			thresholdPercent: 80,
			previousState:    database.WorkspaceAgentMonitorStateNOK,
			expectState:      database.WorkspaceAgentMonitorStateNOK,
			shouldNotify:     false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()

			api, user, clock, notifyEnq := resourceMonitorAPI(t)

			// Build one datapoint per usage sample, 15s apart, starting
			// 15s after the mock clock's initial time.
			datapoints := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint, 0, len(tt.volumeUsage))
			collectedAt := clock.Now()
			for _, volumeUsage := range tt.volumeUsage {
				collectedAt = collectedAt.Add(15 * time.Second)

				volumeDatapoints := []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
					{
						Volume: tt.volumePath,
						Used:   volumeUsage,
						Total:  tt.volumeTotal,
					},
				}

				datapoints = append(datapoints, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
					CollectedAt: timestamppb.New(collectedAt),
					Volumes:     volumeDatapoints,
				})
			}

			dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
				AgentID:   api.AgentID,
				Path:      tt.volumePath,
				State:     tt.previousState,
				Threshold: tt.thresholdPercent,
			})

			// Initialize API to fetch and cache the monitors
			require.NoError(t, api.InitMonitors(context.Background()))

			// Move the clock to the time of the last datapoint before pushing.
			clock.Set(collectedAt)
			_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
				Datapoints: datapoints,
			})
			require.NoError(t, err)

			sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
			if tt.shouldNotify {
				require.Len(t, sent, 1)
				require.Equal(t, user.ID, sent[0].UserID)
			} else {
				require.Len(t, sent, 0)
			}

			// The table declares the state each case should end in; verify
			// it against what was persisted rather than leaving it unchecked.
			monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
			require.NoError(t, err)
			require.Len(t, monitors, 1)
			require.Equal(t, tt.expectState, monitors[0].State)
		})
	}
}
// TestVolumeResourceMonitorMultiple verifies that when two volume monitors
// move to NOK in the same push, a single out-of-disk notification is sent
// that carries data for both volumes.
func TestVolumeResourceMonitorMultiple(t *testing.T) {
	t.Parallel()

	api, _, clock, notifyEnq := resourceMonitorAPI(t)
	api.Config.Alert.ConsecutiveNOKsPercent = 100

	// The assertions at the end expect the volumes to be reported in this
	// order.
	paths := []string{"/home/coder", "/dev/coder"}

	// Given: two different volume resource monitors, both OK. A full-disk
	// reading (10 of 10) is prepared for each of them.
	fullVolumes := make([]*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage, 0, len(paths))
	for _, path := range paths {
		dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
			AgentID:   api.AgentID,
			Path:      path,
			State:     database.WorkspaceAgentMonitorStateOK,
			Threshold: 80,
		})
		fullVolumes = append(fullVolumes, &agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
			Volume: path,
			Used:   10,
			Total:  10,
		})
	}

	// Load the freshly inserted monitors into the API's cache.
	require.NoError(t, api.InitMonitors(context.Background()))

	// When: both of them move to a NOK state
	request := &agentproto.PushResourcesMonitoringUsageRequest{
		Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
			{
				CollectedAt: timestamppb.New(clock.Now()),
				Volumes:     fullVolumes,
			},
		},
	}
	_, err := api.PushResourcesMonitoringUsage(context.Background(), request)
	require.NoError(t, err)

	// Then: one notification is sent, with information about both volumes
	pending := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
	require.Len(t, pending, 1)

	reported := requireVolumeData(t, pending[0])
	require.Len(t, reported, 2)
	for i, path := range paths {
		require.Equal(t, path, reported[i]["path"])
	}
}
func TestVolumeResourceMonitorMissingData(t *testing.T) {
t.Parallel()
t.Run("UnknownPreventsMovingIntoAlertState", func(t *testing.T) {
t.Parallel()
volumePath := "/home/coder"
api, _, clock, notifyEnq := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in an OK state.
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: volumePath,
State: database.WorkspaceAgentMonitorStateOK,
Threshold: 80,
})
// Initialize API to fetch and cache the monitors
require.NoError(t, api.InitMonitors(context.Background()))
// When: A datapoint is missing, surrounded by two NOK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 10,
Total: 10,
},
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 10,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect no notifications, as this unknown prevents us knowing we should alert.
sent := notifyEnq.Sent(notificationstest.WithTemplateID(notifications.TemplateWorkspaceOutOfDisk))
require.Len(t, sent, 0)
// Then: We expect the monitor to still be in an OK state.
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Len(t, monitors, 1)
require.Equal(t, database.WorkspaceAgentMonitorStateOK, monitors[0].State)
})
t.Run("UnknownPreventsMovingOutOfAlertState", func(t *testing.T) {
t.Parallel()
volumePath := "/home/coder"
api, _, clock, _ := resourceMonitorAPI(t)
api.Config.Alert.ConsecutiveNOKsPercent = 50
api.Config.Alert.MinimumNOKsPercent = 100
// Given: A monitor in a NOK state.
dbgen.WorkspaceAgentVolumeResourceMonitor(t, api.Database, database.WorkspaceAgentVolumeResourceMonitor{
AgentID: api.AgentID,
Path: volumePath,
State: database.WorkspaceAgentMonitorStateNOK,
Threshold: 80,
})
// Initialize API to fetch and cache the monitors
require.NoError(t, api.InitMonitors(context.Background()))
// When: A datapoint is missing, surrounded by two OK datapoints.
_, err := api.PushResourcesMonitoringUsage(context.Background(), &agentproto.PushResourcesMonitoringUsageRequest{
Datapoints: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint{
{
CollectedAt: timestamppb.New(clock.Now()),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 1,
Total: 10,
},
},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(10 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{},
},
{
CollectedAt: timestamppb.New(clock.Now().Add(20 * time.Second)),
Volumes: []*agentproto.PushResourcesMonitoringUsageRequest_Datapoint_VolumeUsage{
{
Volume: volumePath,
Used: 1,
Total: 10,
},
},
},
},
})
require.NoError(t, err)
// Then: We expect the monitor to still be in a NOK state.
monitors, err := api.Database.FetchVolumesResourceMonitorsByAgentID(context.Background(), api.AgentID)
require.NoError(t, err)
require.Len(t, monitors, 1)
require.Equal(t, database.WorkspaceAgentMonitorStateNOK, monitors[0].State)
})
}
// requireVolumeData returns the "volumes" entry of the notification's data
// as a slice of maps, failing the test if the entry is not of type
// []map[string]any.
func requireVolumeData(t *testing.T, notif *notificationstest.FakeNotification) []map[string]any {
	t.Helper()

	raw := notif.Data["volumes"]

	// require.IsType stops the test on a mismatch, so the type assertion
	// below is only reached when it is safe.
	require.IsType(t, []map[string]any{}, raw)

	volumes := raw.([]map[string]any)
	return volumes
}