Files
coder/coderd/database/queries/workspaceagentstats.sql
T
Callum Styan 6abb889fab perf: optimize GetDeploymentWorkspaceAgentStats by eliminating 2nd select (#21112)
Tracking issue here: https://github.com/coder/internal/issues/1009

To summarize, the current version of this query selects from
`workspace_agent_stats` twice. The expensive portion of this query is
the bitmap heap scan we have to do for each of these selects. We can
easily cut the cost of this query by 40-50% by cutting this down to a
single select, and using those rows for both sets of calculations.

Eliminating the heap scan itself would require a follow up PR to
introduce a new index. Blink helped with the rewrite of the query.

The current plan looks like this:
```
 Nested Loop  (cost=6101.64..6101.69 rows=1 width=64) (actual time=11.782..11.787 rows=1 loops=1)
   ->  Aggregate  (cost=2996.17..2996.19 rows=1 width=32) (actual time=3.356..3.357 rows=1 loops=1)
         ->  Bitmap Heap Scan on workspace_agent_stats  (cost=54.80..2992.86 rows=440 width=24) (actu
al time=0.346..2.927 rows=818 loops=1)
               Recheck Cond: (created_at > (now() - '00:15:00'::interval))
               Filter: (connection_median_latency_ms > '0'::double precision)
               Rows Removed by Filter: 1070
               Heap Blocks: exact=486
               ->  Bitmap Index Scan on idx_agent_stats_created_at  (cost=0.00..54.69 rows=1368 width
=0) (actual time=0.241..0.241 rows=1888 loops=1)
                     Index Cond: (created_at > (now() - '00:15:00'::interval))
   ->  Aggregate  (cost=3105.47..3105.49 rows=1 width=32) (actual time=8.418..8.420 rows=1 loops=1)
         ->  Subquery Scan on a  (cost=3060.95..3105.39 rows=7 width=32) (actual time=7.851..8.394 ro
ws=63 loops=1)
               Filter: (a.rn = 1)
               ->  WindowAgg  (cost=3060.95..3088.29 rows=1368 width=209) (actual time=7.850..8.382 r
ows=63 loops=1)
                     Run Condition: (row_number() OVER (?) <= 1)
                     ->  Sort  (cost=3060.93..3064.35 rows=1368 width=56) (actual time=7.836..8.036 r
ows=1888 loops=1)
                           Sort Key: workspace_agent_stats_1.agent_id, workspace_agent_stats_1.create
d_at DESC
                           Sort Method: quicksort  Memory: 181kB
                           ->  Bitmap Heap Scan on workspace_agent_stats workspace_agent_stats_1  (co
st=55.03..2989.67 rows=1368 width=56) (actual time=0.388..2.096 rows=1888 loops=1)
                                 Recheck Cond: (created_at > (now() - '00:15:00'::interval))
                                 Heap Blocks: exact=486
                                 ->  Bitmap Index Scan on idx_agent_stats_created_at  (cost=0.00..54.
69 rows=1368 width=0) (actual time=0.295..0.295 rows=1888 loops=1)
                                       Index Cond: (created_at > (now() - '00:15:00'::interval))
 Planning Time: 2.350 ms
 Execution Time: 13.152 ms
(24 rows)
```

The new plan looks like this
```
 Aggregate  (cost=2966.96..2966.98 rows=1 width=64) (actual time=3.812..3.814 rows=1 loops=1)
   ->  WindowAgg  (cost=2891.96..2916.94 rows=1250 width=88) (actual time=2.696..3.412 rows=1890 loop
s=1)
         ->  Sort  (cost=2891.94..2895.06 rows=1250 width=80) (actual time=2.686..2.780 rows=1890 loo
ps=1)
               Sort Key: workspace_agent_stats.agent_id, workspace_agent_stats.created_at DESC
               Sort Method: quicksort  Memory: 226kB
               ->  Bitmap Heap Scan on workspace_agent_stats  (cost=50.11..2827.64 rows=1250 width=80
) (actual time=0.218..1.551 rows=1890 loops=1)
                     Recheck Cond: (created_at > (now() - '00:15:00'::interval))
                     Heap Blocks: exact=474
                     ->  Bitmap Index Scan on idx_agent_stats_created_at  (cost=0.00..49.80 rows=1250
 width=0) (actual time=0.146..0.147 rows=1890 loops=1)
                           Index Cond: (created_at > (now() - '00:15:00'::interval))
 Planning Time: 0.534 ms
 Execution Time: 3.969 ms
(12 rows)
```

If we compare the results of the query they're similar enough that any
differences can be attributed to slightly different timestamps for
`now()` in the version of the query I am using to generate results for
comparison:
```
 workspace_rx_bytes | workspace_tx_bytes | workspace_connection_latency_50 | workspace_connection_latency_95 | session_count_vscode | session_count_ssh | session_count_jetbrains | session_count_reconnecting_pty 
--------------------+--------------------+---------------------------------+---------------------------------+----------------------+-------------------+-------------------------+--------------------------------
           15263563 |           74555854 |                          47.933 |                        250.5522 |                  239 |                59 |                       3 |                              3
(1 row)

 workspace_rx_bytes | workspace_tx_bytes | workspace_connection_latency_50 | workspace_connection_latency_95 | session_count_vscode | session_count_ssh | session_count_jetbrains | session_count_reconnecting_pty 
--------------------+--------------------+---------------------------------+---------------------------------+----------------------+-------------------+-------------------------+--------------------------------
           15295819 |           74598410 |                          47.933 |                        250.5522 |                  239 |                59 |                       3 |                              3           
```

---------

Signed-off-by: Callum Styan <callumstyan@gmail.com>
2025-12-09 15:19:55 -08:00

387 lines
14 KiB
SQL

-- name: InsertWorkspaceAgentStats :exec
INSERT INTO
workspace_agent_stats (
id,
created_at,
user_id,
workspace_id,
template_id,
agent_id,
connections_by_proto,
connection_count,
rx_packets,
rx_bytes,
tx_packets,
tx_bytes,
session_count_vscode,
session_count_jetbrains,
session_count_reconnecting_pty,
session_count_ssh,
connection_median_latency_ms,
usage
)
SELECT
unnest(@id :: uuid[]) AS id,
unnest(@created_at :: timestamptz[]) AS created_at,
unnest(@user_id :: uuid[]) AS user_id,
unnest(@workspace_id :: uuid[]) AS workspace_id,
unnest(@template_id :: uuid[]) AS template_id,
unnest(@agent_id :: uuid[]) AS agent_id,
jsonb_array_elements(@connections_by_proto :: jsonb) AS connections_by_proto,
unnest(@connection_count :: bigint[]) AS connection_count,
unnest(@rx_packets :: bigint[]) AS rx_packets,
unnest(@rx_bytes :: bigint[]) AS rx_bytes,
unnest(@tx_packets :: bigint[]) AS tx_packets,
unnest(@tx_bytes :: bigint[]) AS tx_bytes,
unnest(@session_count_vscode :: bigint[]) AS session_count_vscode,
unnest(@session_count_jetbrains :: bigint[]) AS session_count_jetbrains,
unnest(@session_count_reconnecting_pty :: bigint[]) AS session_count_reconnecting_pty,
unnest(@session_count_ssh :: bigint[]) AS session_count_ssh,
unnest(@connection_median_latency_ms :: double precision[]) AS connection_median_latency_ms,
unnest(@usage :: boolean[]) AS usage;
-- name: GetTemplateDAUs :many
SELECT
(created_at at TIME ZONE cast(@tz_offset::integer as text))::date as date,
user_id
FROM
workspace_agent_stats
WHERE
template_id = $1 AND
connection_count > 0
GROUP BY
date, user_id
ORDER BY
date ASC;
-- name: GetDeploymentDAUs :many
SELECT
(created_at at TIME ZONE cast(@tz_offset::integer as text))::date as date,
user_id
FROM
workspace_agent_stats
WHERE
connection_count > 0
GROUP BY
date, user_id
ORDER BY
date ASC;
-- name: DeleteOldWorkspaceAgentStats :exec
DELETE FROM
workspace_agent_stats
WHERE
created_at < (
SELECT
COALESCE(
-- When generating initial template usage stats, all the
-- raw agent stats are needed, after that only ~30 mins
-- from last rollup is needed. Deployment stats seem to
-- use between 15 mins and 1 hour of data. We keep a
-- little bit more (1 day) just in case.
MAX(start_time) - '1 days'::interval,
-- Fall back to ~6 months ago if there are no template
-- usage stats so that we don't delete the data before
-- it's rolled up.
NOW() - '180 days'::interval
)
FROM
template_usage_stats
)
AND created_at < (
-- Delete at most in batches of 4 hours (with this batch size, assuming
-- 1 iteration / 10 minutes, we can clear out the previous 6 months of
-- data in 7.5 days) whilst keeping the DB load low.
SELECT
COALESCE(MIN(created_at) + '4 hours'::interval, NOW())
FROM
workspace_agent_stats
);
-- name: GetDeploymentWorkspaceAgentStats :one
WITH stats AS (
SELECT
agent_id,
created_at,
rx_bytes,
tx_bytes,
connection_median_latency_ms,
session_count_vscode,
session_count_ssh,
session_count_jetbrains,
session_count_reconnecting_pty,
ROW_NUMBER() OVER (PARTITION BY agent_id ORDER BY created_at DESC) AS rn
FROM workspace_agent_stats
WHERE created_at > $1
)
SELECT
coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes,
coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes,
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
coalesce((PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY connection_median_latency_ms) FILTER (WHERE connection_median_latency_ms > 0)), -1)::FLOAT AS workspace_connection_latency_50,
coalesce((PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY connection_median_latency_ms) FILTER (WHERE connection_median_latency_ms > 0)), -1)::FLOAT AS workspace_connection_latency_95,
coalesce(SUM(session_count_vscode) FILTER (WHERE rn = 1), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh) FILTER (WHERE rn = 1), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains) FILTER (WHERE rn = 1), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty) FILTER (WHERE rn = 1), 0)::bigint AS session_count_reconnecting_pty
FROM stats;
-- name: GetDeploymentWorkspaceAgentUsageStats :one
WITH agent_stats AS (
SELECT
coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes,
coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes,
coalesce((PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY connection_median_latency_ms)), -1)::FLOAT AS workspace_connection_latency_50,
coalesce((PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY connection_median_latency_ms)), -1)::FLOAT AS workspace_connection_latency_95
FROM workspace_agent_stats
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
WHERE workspace_agent_stats.created_at > $1 AND connection_median_latency_ms > 0
),
minute_buckets AS (
SELECT
agent_id,
date_trunc('minute', created_at) AS minute_bucket,
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty
FROM
workspace_agent_stats
WHERE
created_at >= $1
AND created_at < date_trunc('minute', now()) -- Exclude current partial minute
AND usage = true
GROUP BY
agent_id,
minute_bucket
),
latest_buckets AS (
SELECT DISTINCT ON (agent_id)
agent_id,
minute_bucket,
session_count_vscode,
session_count_jetbrains,
session_count_reconnecting_pty,
session_count_ssh
FROM
minute_buckets
ORDER BY
agent_id,
minute_bucket DESC
),
latest_agent_stats AS (
SELECT
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty
FROM
latest_buckets
)
SELECT * FROM agent_stats, latest_agent_stats;
-- name: GetWorkspaceAgentStats :many
WITH agent_stats AS (
SELECT
user_id,
agent_id,
workspace_id,
template_id,
MIN(created_at)::timestamptz AS aggregated_from,
coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes,
coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes,
coalesce((PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY connection_median_latency_ms)), -1)::FLOAT AS workspace_connection_latency_50,
coalesce((PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY connection_median_latency_ms)), -1)::FLOAT AS workspace_connection_latency_95
FROM workspace_agent_stats
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
WHERE workspace_agent_stats.created_at > $1 AND connection_median_latency_ms > 0
GROUP BY user_id, agent_id, workspace_id, template_id
), latest_agent_stats AS (
SELECT
a.agent_id,
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty
FROM (
SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn
FROM workspace_agent_stats WHERE created_at > $1
) AS a WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id
)
SELECT * FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id;
-- name: GetWorkspaceAgentUsageStats :many
WITH agent_stats AS (
SELECT
user_id,
agent_id,
workspace_id,
template_id,
MIN(created_at)::timestamptz AS aggregated_from,
coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes,
coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes,
coalesce((PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY connection_median_latency_ms)), -1)::FLOAT AS workspace_connection_latency_50,
coalesce((PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY connection_median_latency_ms)), -1)::FLOAT AS workspace_connection_latency_95
FROM workspace_agent_stats
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
WHERE workspace_agent_stats.created_at > $1 AND connection_median_latency_ms > 0
GROUP BY user_id, agent_id, workspace_id, template_id
),
minute_buckets AS (
SELECT
agent_id,
date_trunc('minute', created_at) AS minute_bucket,
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty
FROM
workspace_agent_stats
WHERE
created_at >= $1
AND created_at < date_trunc('minute', now()) -- Exclude current partial minute
AND usage = true
GROUP BY
agent_id,
minute_bucket,
user_id,
agent_id,
workspace_id,
template_id
),
latest_buckets AS (
SELECT DISTINCT ON (agent_id)
agent_id,
session_count_vscode,
session_count_ssh,
session_count_jetbrains,
session_count_reconnecting_pty
FROM
minute_buckets
ORDER BY
agent_id,
minute_bucket DESC
)
SELECT user_id,
agent_stats.agent_id,
workspace_id,
template_id,
aggregated_from,
workspace_rx_bytes,
workspace_tx_bytes,
workspace_connection_latency_50,
workspace_connection_latency_95,
-- `minute_buckets` could return 0 rows if there are no usage stats since `created_at`.
coalesce(latest_buckets.agent_id,agent_stats.agent_id) AS agent_id,
coalesce(session_count_vscode, 0)::bigint AS session_count_vscode,
coalesce(session_count_ssh, 0)::bigint AS session_count_ssh,
coalesce(session_count_jetbrains, 0)::bigint AS session_count_jetbrains,
coalesce(session_count_reconnecting_pty, 0)::bigint AS session_count_reconnecting_pty
FROM agent_stats LEFT JOIN latest_buckets ON agent_stats.agent_id = latest_buckets.agent_id;
-- name: GetWorkspaceAgentStatsAndLabels :many
WITH agent_stats AS (
SELECT
user_id,
agent_id,
workspace_id,
coalesce(SUM(rx_bytes), 0)::bigint AS rx_bytes,
coalesce(SUM(tx_bytes), 0)::bigint AS tx_bytes
FROM workspace_agent_stats
WHERE workspace_agent_stats.created_at > $1
GROUP BY user_id, agent_id, workspace_id
), latest_agent_stats AS (
SELECT
a.agent_id,
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty,
coalesce(SUM(connection_count), 0)::bigint AS connection_count,
coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
FROM (
SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn
FROM workspace_agent_stats
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
WHERE created_at > $1 AND connection_median_latency_ms > 0
) AS a
WHERE a.rn = 1
GROUP BY a.user_id, a.agent_id, a.workspace_id
)
SELECT
users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes,
session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty,
connection_count, connection_median_latency_ms
FROM
agent_stats
JOIN
latest_agent_stats
ON
agent_stats.agent_id = latest_agent_stats.agent_id
JOIN
users
ON
users.id = agent_stats.user_id
JOIN
workspace_agents
ON
workspace_agents.id = agent_stats.agent_id
JOIN
workspaces
ON
workspaces.id = agent_stats.workspace_id;
-- name: GetWorkspaceAgentUsageStatsAndLabels :many
WITH agent_stats AS (
SELECT
user_id,
agent_id,
workspace_id,
coalesce(SUM(rx_bytes), 0)::bigint AS rx_bytes,
coalesce(SUM(tx_bytes), 0)::bigint AS tx_bytes,
coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms
FROM workspace_agent_stats
-- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms.
WHERE workspace_agent_stats.created_at > $1 AND connection_median_latency_ms > 0
GROUP BY user_id, agent_id, workspace_id
), latest_agent_stats AS (
SELECT
agent_id,
coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode,
coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh,
coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains,
coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty,
coalesce(SUM(connection_count), 0)::bigint AS connection_count
FROM workspace_agent_stats
-- We only want the latest stats, but those stats might be
-- spread across multiple rows.
WHERE usage = true AND created_at > now() - '1 minute'::interval
GROUP BY user_id, agent_id, workspace_id
)
SELECT
users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes,
coalesce(session_count_vscode, 0)::bigint AS session_count_vscode,
coalesce(session_count_ssh, 0)::bigint AS session_count_ssh,
coalesce(session_count_jetbrains, 0)::bigint AS session_count_jetbrains,
coalesce(session_count_reconnecting_pty, 0)::bigint AS session_count_reconnecting_pty,
coalesce(connection_count, 0)::bigint AS connection_count,
connection_median_latency_ms
FROM
agent_stats
LEFT JOIN
latest_agent_stats
ON
agent_stats.agent_id = latest_agent_stats.agent_id
JOIN
users
ON
users.id = agent_stats.user_id
JOIN
workspace_agents
ON
workspace_agents.id = agent_stats.agent_id
JOIN
workspaces
ON
workspaces.id = agent_stats.workspace_id;